diff --git a/doc/15-troubleshooting.md b/doc/15-troubleshooting.md
index 67ba70cb2..28acb8060 100644
--- a/doc/15-troubleshooting.md
+++ b/doc/15-troubleshooting.md
@@ -2055,3 +2055,40 @@ Value data: 65534
 
 More details in [this blogpost](https://www.netways.de/blog/2019/01/24/windows-blocking-icinga-2-with-ephemeral-port-range/)
 and this [MS help entry](https://support.microsoft.com/en-us/help/196271/when-you-try-to-connect-from-tcp-ports-greater-than-5000-you-receive-t).
+
+### Cluster Troubleshooting: Overloaded JSON-RPC Connections
+
+If JSON-RPC connections are overloaded, messages are processed with a delay. Symptoms include the master
+lagging behind, for example showing check results only minutes after they became available on a satellite.
+
+There are two ways to identify this situation:
+
+First, if a connection is overloaded, Icinga 2 reads data from it more slowly than it arrives, so pending messages
+accumulate in the TCP receive queue of the overloaded endpoint and in the TCP send queues of the endpoints sending to
+it. This can be checked by querying information about open TCP connections using the command
+`ss --tcp --processes --numeric`. High Recv-Q values on a socket used by the `icinga2` process can be a hint that
+the local endpoint is not able to keep up with the messages from this connection. Note that small values (a few
+kilobytes) are perfectly fine as some messages may simply be in flight. Also, while the replay log is being received,
+messages are processed as fast as possible and the connection operates at capacity, so the size of the TCP receive
+queue is only meaningful after the replay log has been processed.
+
+Second, Icinga 2.15.1 introduced a metric that can be used to estimate how much load there is on a particular
+connection: the `seconds_processing_messages` attribute of `Endpoint` objects, which can be
+[queried using the API](12-icinga2-api.md#icinga2-api-config-objects-query). This value accumulates the total time spent
+processing JSON-RPC messages from connections to that endpoint. To interpret this number, query it at least twice and
+calculate the rate at which it increases. For example, a rate of 0.4 (the value increases by 0.4s every second) means
+that the connection is at around 40% of its maximum capacity. In practice, the rate will never reach the theoretical
+maximum of 1 as some time is also spent reading the messages, so if the rate is close to 1, the connection is probably
+overloaded or close to its capacity limit.
+
+This capacity limit exists because there can be implicit dependencies between different JSON-RPC messages,
+requiring them to be processed in the same order in which they were sent. This is currently ensured by processing all
+messages from the same connection sequentially.
+
+To work around this limit, the following approaches are possible:
+1. Redistribute load between connections. For example, if the overloaded connection is between the master and
+   a satellite zone, consider splitting that zone in two so that the load is spread across two connections.
+2. Reduce the load on that connection. Typically, the most frequent message type will be check results, so checking
+   less frequently, i.e. increasing the check interval, can be a first step.
+3. As the messages are processed sequentially, the throughput is limited by the single-core CPU performance of the
+   machine Icinga 2 is running on. Switching to a more powerful machine can increase the capacity of individual connections.
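Reviewer note, not part of the patch: the rate calculation described in the new troubleshooting section can be illustrated with a minimal C++ sketch. The sample values, variable names and the 60-second polling interval are hypothetical; in practice the two values would come from querying the `seconds_processing_messages` attribute via the API as documented above.

```cpp
#include <iostream>

int main()
{
	// Two hypothetical samples of the seconds_processing_messages attribute,
	// taken 60 seconds apart (e.g. via /v1/objects/endpoints).
	double firstSample = 1200.0;
	double secondSample = 1224.0;
	double sampleInterval = 60.0; // wall-clock seconds between the two queries

	// Seconds spent processing messages per wall-clock second.
	double rate = (secondSample - firstSample) / sampleInterval;

	// 0.4 here means the connection runs at roughly 40% of its capacity;
	// values close to 1 indicate the connection is (close to) overloaded.
	std::cout << "rate: " << rate << " (~" << rate * 100 << "% of capacity)\n";
	return 0;
}
```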
diff --git a/lib/base/atomic.hpp b/lib/base/atomic.hpp
index 855850336..841151612 100644
--- a/lib/base/atomic.hpp
+++ b/lib/base/atomic.hpp
@@ -4,6 +4,7 @@
 #define ATOMIC_H
 
 #include <atomic>
+#include <chrono>
 #include <mutex>
 #include <type_traits>
 #include <utility>
@@ -34,6 +35,43 @@ public:
 	}
 };
 
+/**
+ * Accumulates time durations atomically.
+ *
+ * @ingroup base
+ */
+class AtomicDuration
+{
+public:
+	using Clock = std::chrono::steady_clock;
+
+	/**
+	 * Adds the elapsedTime to this instance.
+	 *
+	 * May be called multiple times to accumulate time.
+	 *
+	 * @param elapsedTime The distance between two time points
+	 *
+	 * @return This instance for method chaining
+	 */
+	AtomicDuration& operator+=(const Clock::duration& elapsedTime) noexcept
+	{
+		m_Sum.fetch_add(elapsedTime.count(), std::memory_order_relaxed);
+		return *this;
+	}
+
+	/**
+	 * @return The total accumulated time in seconds
+	 */
+	operator double() const noexcept
+	{
+		return std::chrono::duration<double>(Clock::duration(m_Sum.load(std::memory_order_relaxed))).count();
+	}
+
+private:
+	Atomic<Clock::duration::rep> m_Sum {0};
+};
+
 /**
  * Wraps any T into a std::atomic-like interface that locks using a mutex.
  *
diff --git a/lib/remote/endpoint.cpp b/lib/remote/endpoint.cpp
index 42a5a5af1..751f89539 100644
--- a/lib/remote/endpoint.cpp
+++ b/lib/remote/endpoint.cpp
@@ -130,6 +130,11 @@ void Endpoint::AddMessageReceived(const intrusive_ptr& method)
 	m_MessageCounters.at(method).fetch_add(1, std::memory_order_relaxed);
 }
 
+void Endpoint::AddMessageProcessed(const AtomicDuration::Clock::duration& duration)
+{
+	m_InputProcessingTime += duration;
+}
+
 double Endpoint::GetMessagesSentPerSecond() const
 {
 	return m_MessagesSent.CalculateRate(Utility::GetTime(), 60);
@@ -162,3 +167,8 @@ Dictionary::Ptr Endpoint::GetMessagesReceivedPerType() const
 
 	return new Dictionary(std::move(result));
 }
+
+double Endpoint::GetSecondsProcessingMessages() const
+{
+	return m_InputProcessingTime;
+}
diff --git a/lib/remote/endpoint.hpp b/lib/remote/endpoint.hpp
index 09ad8a4fd..0cd015bf9 100644
--- a/lib/remote/endpoint.hpp
+++ b/lib/remote/endpoint.hpp
@@ -49,6 +49,7 @@ public:
 	void AddMessageSent(int bytes);
 	void AddMessageReceived(int bytes);
 	void AddMessageReceived(const intrusive_ptr& method);
+	void AddMessageProcessed(const AtomicDuration::Clock::duration& duration);
 
 	double GetMessagesSentPerSecond() const override;
 	double GetMessagesReceivedPerSecond() const override;
@@ -58,6 +59,8 @@ public:
 
 	Dictionary::Ptr GetMessagesReceivedPerType() const override;
 
+	double GetSecondsProcessingMessages() const override;
+
 protected:
 	void OnAllConfigLoaded() override;
 
@@ -71,6 +74,8 @@ private:
 	mutable RingBuffer m_MessagesReceived{60};
 	mutable RingBuffer m_BytesSent{60};
 	mutable RingBuffer m_BytesReceived{60};
+
+	AtomicDuration m_InputProcessingTime;
 };
 
 }
diff --git a/lib/remote/endpoint.ti b/lib/remote/endpoint.ti
index 2fa874b5e..c246af4d6 100644
--- a/lib/remote/endpoint.ti
+++ b/lib/remote/endpoint.ti
@@ -58,6 +58,10 @@ class Endpoint : ConfigObject
 	[no_user_modify, no_storage] Dictionary::Ptr messages_received_per_type {
 		get;
 	};
+
+	[no_user_modify, no_storage] double seconds_processing_messages {
+		get;
+	};
 };
 
 }
diff --git a/lib/remote/jsonrpcconnection.cpp b/lib/remote/jsonrpcconnection.cpp
index b85944867..0dab1ed5f 100644
--- a/lib/remote/jsonrpcconnection.cpp
+++ b/lib/remote/jsonrpcconnection.cpp
@@ -106,6 +106,9 @@ void JsonRpcConnection::HandleIncomingMessages(boost::asio::yield_context yc)
 		l_TaskStats.InsertValue(Utility::GetTime(), 1);
 
 		auto total = ch::steady_clock::now() - start;
+		if (m_Endpoint) {
+			m_Endpoint->AddMessageProcessed(total);
+		}
 
 		Log msg(total >= ch::seconds(5) ? LogWarning : LogDebug, "JsonRpcConnection");
 		msg << "Processed JSON-RPC '" << rpcMethod << "' message for identity '" << m_Identity
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index cb815da73..c317d4ebe 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -89,6 +89,7 @@ set(base_test_SOURCES
   icingaapplication-fixture.cpp
   utils.cpp
   base-array.cpp
+  base-atomic.cpp
   base-base64.cpp
   base-convert.cpp
   base-dictionary.cpp
diff --git a/test/base-atomic.cpp b/test/base-atomic.cpp
new file mode 100644
index 000000000..1bdead38d
--- /dev/null
+++ b/test/base-atomic.cpp
@@ -0,0 +1,34 @@
+/* Icinga 2 | (c) 2025 Icinga GmbH | GPLv2+ */
+
+#include "base/atomic.hpp"
+#include <BoostTestTargetConfig.h>
+
+using namespace icinga;
+
+BOOST_AUTO_TEST_SUITE(base_atomic)
+
+BOOST_AUTO_TEST_CASE(duration_none)
+{
+	BOOST_CHECK_EQUAL(static_cast<double>(AtomicDuration()), 0);
+}
+
+BOOST_AUTO_TEST_CASE(duration_one)
+{
+	AtomicDuration sum;
+
+	sum += std::chrono::seconds(1);
+
+	BOOST_CHECK_EQUAL(static_cast<double>(sum), 1);
+}
+
+BOOST_AUTO_TEST_CASE(duration_two)
+{
+	AtomicDuration sum;
+
+	sum += std::chrono::seconds(1);
+	sum += std::chrono::seconds(2);
+
+	BOOST_CHECK_EQUAL(static_cast<double>(sum), 3);
+}
+
+BOOST_AUTO_TEST_SUITE_END()
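Reviewer note, not part of the patch: a standalone usage sketch of the new `AtomicDuration` helper, mirroring how `JsonRpcConnection::HandleIncomingMessages()` accumulates per-endpoint processing time via `Endpoint::AddMessageProcessed()`. The class body is repeated here on top of plain `std::atomic` instead of `icinga::Atomic` so the sketch compiles without Icinga headers; `processingTime` and the simulated work are illustrative only.

```cpp
#include <atomic>
#include <chrono>
#include <iostream>
#include <thread>

// Stand-in for the AtomicDuration class added to lib/base/atomic.hpp,
// using std::atomic directly so this example has no Icinga dependencies.
class AtomicDuration
{
public:
	using Clock = std::chrono::steady_clock;

	AtomicDuration& operator+=(const Clock::duration& elapsedTime) noexcept
	{
		m_Sum.fetch_add(elapsedTime.count(), std::memory_order_relaxed);
		return *this;
	}

	operator double() const noexcept
	{
		return std::chrono::duration<double>(Clock::duration(m_Sum.load(std::memory_order_relaxed))).count();
	}

private:
	std::atomic<Clock::duration::rep> m_Sum {0};
};

int main()
{
	AtomicDuration processingTime; // analogous to Endpoint::m_InputProcessingTime

	for (int i = 0; i < 3; ++i) {
		auto start = AtomicDuration::Clock::now();

		// Simulated message handling work.
		std::this_thread::sleep_for(std::chrono::milliseconds(10));

		// Accumulate the elapsed time, as AddMessageProcessed() does.
		processingTime += AtomicDuration::Clock::now() - start;
	}

	// The double conversion yields the accumulated time in seconds, which is
	// what the seconds_processing_messages attribute exposes per endpoint.
	std::cout << "total: " << static_cast<double>(processingTime) << "s\n";
	return 0;
}
```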