Merge pull request #10553 from Icinga/jsonrpc-utilization-metrics

Add JSON-RPC utilization metrics and troubleshooting documentation
This commit is contained in:
Julian Brost 2025-09-23 16:05:44 +02:00 committed by GitHub
commit 3674af75de
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 132 additions and 0 deletions

View File

@ -2055,3 +2055,40 @@ Value data: 65534
More details in [this blogpost](https://www.netways.de/blog/2019/01/24/windows-blocking-icinga-2-with-ephemeral-port-range/) More details in [this blogpost](https://www.netways.de/blog/2019/01/24/windows-blocking-icinga-2-with-ephemeral-port-range/)
and this [MS help entry](https://support.microsoft.com/en-us/help/196271/when-you-try-to-connect-from-tcp-ports-greater-than-5000-you-receive-t). and this [MS help entry](https://support.microsoft.com/en-us/help/196271/when-you-try-to-connect-from-tcp-ports-greater-than-5000-you-receive-t).
### Cluster Troubleshooting: Overloaded JSON-RPC Connections <a id="troubleshooting-jsonrpc-overload"></a>
If JSON-RPC connections are overloaded, messages are processed with a delay. This can show in symptoms like the master
lagging behind, for example showing check results only minutes after they were available on a satellite.
There are two ways this situation can be identified:
First, if a connection is overloaded, Icinga 2 will read data from it slower than it arrives, so pending messages are
accumulating in the TCP receive queue on the overloaded endpoint and the TCP send queue of other endpoints sending to
it. This can be checked by querying information about open TCP connections using the command
`ss --tcp --processes --numeric`. High values for Recv-Q on a socket used by the `icinga2` process can be a hint that
the local endpoint is not able to keep up with the messages from this connection. Note that small values (a few
kilobytes) are perfectly fine as messages can be in flight. Also, while the replay log is received, messages are
processed as fast as possible, so the connection is temporarily operating at capacity; hence, the size of the TCP
receive queue is only meaningful after processing of the replay log has finished.
Second, Icinga 2.15.1 introduced a metric that can be used to estimate how much load there is on a particular
connection: the `seconds_processing_messages` attribute of `Endpoint` objects which can be
[queried using the API](12-icinga2-api.md#icinga2-api-config-objects-query). This value accumulates the total time spent
processing JSON-RPC messages from connections to that endpoint. In order to interpret this number, you have to query it
at least twice and calculate the rate at which the number increased. For example, a rate of 0.4 (increases by 0.4s every
second) means that the connection is at around 40% of its maximum capacity. In practice, the rate will never reach the
theoretical maximum of 1 as there's also some time spent reading the messages, so if it's close to 1, the connection
might be overloaded or is close to its capacity limit.
This limit in capacity exists because there can be implicit dependencies between different JSON-RPC messages,
requiring them to be processed in the same order that they were sent. This is currently ensured by processing all
messages from the same connection sequentially.
To work around this limit, the following approaches are possible:
1. Try to redistribute load between connections, for example if the overloaded connection is between the master and
a satellite zone, try splitting this zone into two, distributing the load across two connections.
2. Reduce the load on that connection. Typically, the most frequent message type will be check results, so reducing
the check frequency (i.e. increasing the check interval) can be a first step.
3. As the messages are processed sequentially, the throughput is limited by the single-core CPU performance of the
machine Icinga 2 is running on; switching to a more powerful machine can increase the capacity of individual connections.

View File

@ -4,6 +4,7 @@
#define ATOMIC_H #define ATOMIC_H
#include <atomic> #include <atomic>
#include <chrono>
#include <mutex> #include <mutex>
#include <type_traits> #include <type_traits>
#include <utility> #include <utility>
@ -34,6 +35,43 @@ public:
} }
}; };
/**
* Accumulates time durations atomically.
*
* @ingroup base
*/
class AtomicDuration
{
public:
using Clock = std::chrono::steady_clock;
/**
* Adds the elapsedTime to this instance.
*
* May be called multiple times to accumulate time.
*
* @param elapsedTime The distance between two time points
*
* @return This instance for method chaining
*/
AtomicDuration& operator+=(const Clock::duration& elapsedTime) noexcept
{
m_Sum.fetch_add(elapsedTime.count(), std::memory_order_relaxed);
return *this;
}
/**
* @return The total accumulated time in seconds
*/
operator double() const noexcept
{
return std::chrono::duration<double>(Clock::duration(m_Sum.load(std::memory_order_relaxed))).count();
}
private:
Atomic<Clock::duration::rep> m_Sum {0};
};
/** /**
* Wraps any T into a std::atomic<T>-like interface that locks using a mutex. * Wraps any T into a std::atomic<T>-like interface that locks using a mutex.
* *

View File

@ -130,6 +130,11 @@ void Endpoint::AddMessageReceived(const intrusive_ptr<ApiFunction>& method)
m_MessageCounters.at(method).fetch_add(1, std::memory_order_relaxed); m_MessageCounters.at(method).fetch_add(1, std::memory_order_relaxed);
} }
/**
 * Accumulates the time spent processing one JSON-RPC message received from this endpoint.
 *
 * Thread-safe: AtomicDuration accumulates via an atomic fetch-add, so connection
 * handlers may call this concurrently.
 *
 * @param duration Time taken to process a single message.
 */
void Endpoint::AddMessageProcessed(const AtomicDuration::Clock::duration& duration)
{
m_InputProcessingTime += duration;
}
double Endpoint::GetMessagesSentPerSecond() const double Endpoint::GetMessagesSentPerSecond() const
{ {
return m_MessagesSent.CalculateRate(Utility::GetTime(), 60); return m_MessagesSent.CalculateRate(Utility::GetTime(), 60);
@ -162,3 +167,8 @@ Dictionary::Ptr Endpoint::GetMessagesReceivedPerType() const
return new Dictionary(std::move(result)); return new Dictionary(std::move(result));
} }
/**
 * @return Total accumulated time (in seconds) spent processing JSON-RPC messages
 *         from connections to this endpoint.
 *
 * The value only ever grows; consumers are expected to sample it at least twice
 * and compute the rate of increase to estimate the load on the connection.
 */
double Endpoint::GetSecondsProcessingMessages() const
{
return m_InputProcessingTime;
}

View File

@ -49,6 +49,7 @@ public:
void AddMessageSent(int bytes); void AddMessageSent(int bytes);
void AddMessageReceived(int bytes); void AddMessageReceived(int bytes);
void AddMessageReceived(const intrusive_ptr<ApiFunction>& method); void AddMessageReceived(const intrusive_ptr<ApiFunction>& method);
void AddMessageProcessed(const AtomicDuration::Clock::duration& duration);
double GetMessagesSentPerSecond() const override; double GetMessagesSentPerSecond() const override;
double GetMessagesReceivedPerSecond() const override; double GetMessagesReceivedPerSecond() const override;
@ -58,6 +59,8 @@ public:
Dictionary::Ptr GetMessagesReceivedPerType() const override; Dictionary::Ptr GetMessagesReceivedPerType() const override;
double GetSecondsProcessingMessages() const override;
protected: protected:
void OnAllConfigLoaded() override; void OnAllConfigLoaded() override;
@ -71,6 +74,8 @@ private:
mutable RingBuffer m_MessagesReceived{60}; mutable RingBuffer m_MessagesReceived{60};
mutable RingBuffer m_BytesSent{60}; mutable RingBuffer m_BytesSent{60};
mutable RingBuffer m_BytesReceived{60}; mutable RingBuffer m_BytesReceived{60};
AtomicDuration m_InputProcessingTime;
}; };
} }

View File

@ -58,6 +58,10 @@ class Endpoint : ConfigObject
[no_user_modify, no_storage] Dictionary::Ptr messages_received_per_type { [no_user_modify, no_storage] Dictionary::Ptr messages_received_per_type {
get; get;
}; };
[no_user_modify, no_storage] double seconds_processing_messages {
get;
};
}; };
} }

View File

@ -106,6 +106,9 @@ void JsonRpcConnection::HandleIncomingMessages(boost::asio::yield_context yc)
l_TaskStats.InsertValue(Utility::GetTime(), 1); l_TaskStats.InsertValue(Utility::GetTime(), 1);
auto total = ch::steady_clock::now() - start; auto total = ch::steady_clock::now() - start;
if (m_Endpoint) {
m_Endpoint->AddMessageProcessed(total);
}
Log msg(total >= ch::seconds(5) ? LogWarning : LogDebug, "JsonRpcConnection"); Log msg(total >= ch::seconds(5) ? LogWarning : LogDebug, "JsonRpcConnection");
msg << "Processed JSON-RPC '" << rpcMethod << "' message for identity '" << m_Identity msg << "Processed JSON-RPC '" << rpcMethod << "' message for identity '" << m_Identity

View File

@ -89,6 +89,7 @@ set(base_test_SOURCES
icingaapplication-fixture.cpp icingaapplication-fixture.cpp
utils.cpp utils.cpp
base-array.cpp base-array.cpp
base-atomic.cpp
base-base64.cpp base-base64.cpp
base-convert.cpp base-convert.cpp
base-dictionary.cpp base-dictionary.cpp

34
test/base-atomic.cpp Normal file
View File

@ -0,0 +1,34 @@
/* Icinga 2 | (c) 2025 Icinga GmbH | GPLv2+ */
#include "base/atomic.hpp"
#include <BoostTestTargetConfig.h>
using namespace icinga;
BOOST_AUTO_TEST_SUITE(base_atomic)

BOOST_AUTO_TEST_CASE(duration_none)
{
	// A default-constructed accumulator reports zero seconds.
	BOOST_CHECK_EQUAL(static_cast<double>(AtomicDuration()), 0);
}

BOOST_AUTO_TEST_CASE(duration_one)
{
	// A single accumulation is reported as-is.
	AtomicDuration sum;
	sum += std::chrono::seconds(1);
	BOOST_CHECK_EQUAL(static_cast<double>(sum), 1);
}

BOOST_AUTO_TEST_CASE(duration_two)
{
	// Multiple accumulations add up.
	AtomicDuration sum;
	sum += std::chrono::seconds(1);
	sum += std::chrono::seconds(2);
	BOOST_CHECK_EQUAL(static_cast<double>(sum), 3);
}

BOOST_AUTO_TEST_CASE(duration_subsecond)
{
	// Sub-second durations must survive the conversion to fractional seconds
	// (1.5 is exactly representable as a double, so an exact comparison is safe).
	AtomicDuration sum;
	sum += std::chrono::milliseconds(1500);
	BOOST_CHECK_EQUAL(static_cast<double>(sum), 1.5);
}

BOOST_AUTO_TEST_SUITE_END()