mirror of
https://github.com/Icinga/icinga2.git
synced 2025-09-25 18:48:50 +02:00
Merge pull request #10553 from Icinga/jsonrpc-utilization-metrics
Add JSON-RPC utilization metrics and troubleshooting documentation
This commit is contained in:
commit
3674af75de
@ -2055,3 +2055,40 @@ Value data: 65534
|
|||||||
|
|
||||||
More details in [this blogpost](https://www.netways.de/blog/2019/01/24/windows-blocking-icinga-2-with-ephemeral-port-range/)
|
More details in [this blogpost](https://www.netways.de/blog/2019/01/24/windows-blocking-icinga-2-with-ephemeral-port-range/)
|
||||||
and this [MS help entry](https://support.microsoft.com/en-us/help/196271/when-you-try-to-connect-from-tcp-ports-greater-than-5000-you-receive-t).
|
and this [MS help entry](https://support.microsoft.com/en-us/help/196271/when-you-try-to-connect-from-tcp-ports-greater-than-5000-you-receive-t).
|
||||||
|
|
||||||
|
### Cluster Troubleshooting: Overloaded JSON-RPC Connections <a id="troubleshooting-jsonrpc-overload"></a>
|
||||||
|
|
||||||
|
If JSON-RPC connections are overloaded, messages are processed with a delay. This can show in symptoms like the master
|
||||||
|
lagging behind, for example showing check results only minutes after they were available on a satellite.
|
||||||
|
|
||||||
|
There are two ways this situation can be identified:
|
||||||
|
|
||||||
|
First, if a connection is overloaded, Icinga 2 will read data from it slower than it arrives, so pending messages are
|
||||||
|
accumulating in the TCP receive queue on the overloaded endpoint and the TCP send queue of other endpoints sending to
|
||||||
|
it. This can be checked by querying information about open TCP connections using the command
|
||||||
|
`ss --tcp --processes --numeric`. High values for Recv-Q on a socket used by the `icinga2` process can be a hint that
|
||||||
|
the local endpoint is not able to keep up with the messages from this connection. Note that small values (a few
|
||||||
|
kilobytes) are perfectly fine as messages can be in flight. Also, while the replay log is received, messages are
|
||||||
|
processed as fast as possible and thus the connection is operating at capacity; therefore, the size of the TCP receive queue
|
||||||
|
is only meaningful after processing the replay log has finished.
|
||||||
|
|
||||||
|
Second, Icinga 2.15.1 introduced a metric that can be used to estimate how much load there is on a particular
|
||||||
|
connection: the `seconds_processing_messages` attribute of `Endpoint` objects which can be
|
||||||
|
[queried using the API](12-icinga2-api.md#icinga2-api-config-objects-query). This value accumulates the total time spent
|
||||||
|
processing JSON-RPC messages from connections to that endpoint. In order to interpret this number, you have to query it
|
||||||
|
at least twice and calculate the rate at which the number increased. For example, a rate of 0.4 (increases by 0.4s every
|
||||||
|
second) means that the connection is at around 40% of its maximum capacity. In practice, the rate will never reach the
|
||||||
|
theoretical maximum of 1 as there's also some time spent reading the messages, so if it's close to 1, the connection
|
||||||
|
might be overloaded or is close to its capacity limit.
|
||||||
|
|
||||||
|
This limit in capacity exists because there can be implicit dependencies between different JSON-RPC messages,
|
||||||
|
requiring them to be processed in the same order that they were sent. This is currently ensured by processing all
|
||||||
|
messages from the same connection sequentially.
|
||||||
|
|
||||||
|
To work around this limit, the following approaches are possible:
|
||||||
|
1. Try to redistribute load between connections, for example if the overloaded connection is between the master and
|
||||||
|
a satellite zone, try splitting this zone into two, distributing the load across two connections.
|
||||||
|
2. Reduce the load on that connection. Typically, the most frequent message type will be check results, so increasing
|
||||||
|
the check interval can be a first step.
|
||||||
|
3. As the messages are processed sequentially, the throughput is limited by the single core CPU performance of the
|
||||||
|
machine Icinga 2 is running on; switching to a more powerful one can increase the capacity of individual connections.
|
||||||
|
@ -4,6 +4,7 @@
|
|||||||
#define ATOMIC_H
|
#define ATOMIC_H
|
||||||
|
|
||||||
#include <atomic>
|
#include <atomic>
|
||||||
|
#include <chrono>
|
||||||
#include <mutex>
|
#include <mutex>
|
||||||
#include <type_traits>
|
#include <type_traits>
|
||||||
#include <utility>
|
#include <utility>
|
||||||
@ -34,6 +35,43 @@ public:
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/**
 * Accumulates time durations atomically.
 *
 * Thread-safe: concurrent operator+= calls from multiple threads are safe.
 * Only the counter value itself is consistent; no other memory accesses are
 * ordered by it (relaxed atomics).
 *
 * @ingroup base
 */
class AtomicDuration
{
public:
	using Clock = std::chrono::steady_clock;

	/**
	 * Adds the elapsedTime to this instance.
	 *
	 * May be called multiple times to accumulate time.
	 *
	 * @param elapsedTime The distance between two time points
	 *
	 * @return This instance for method chaining
	 */
	AtomicDuration& operator+=(const Clock::duration& elapsedTime) noexcept
	{
		// Relaxed ordering suffices: this is a pure statistics counter and
		// does not synchronize any other data.
		m_Sum.fetch_add(elapsedTime.count(), std::memory_order_relaxed);
		return *this;
	}

	/**
	 * @return The total accumulated time in seconds
	 */
	operator double() const noexcept
	{
		// Rebuild a Clock::duration from the raw tick count, then convert
		// to a floating-point duration expressed in seconds.
		return std::chrono::duration<double>(Clock::duration(m_Sum.load(std::memory_order_relaxed))).count();
	}

private:
	// Raw tick count of Clock::duration. Atomic is the project's wrapper
	// around std::atomic (declared earlier in this header).
	Atomic<Clock::duration::rep> m_Sum {0};
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Wraps any T into a std::atomic<T>-like interface that locks using a mutex.
|
* Wraps any T into a std::atomic<T>-like interface that locks using a mutex.
|
||||||
*
|
*
|
||||||
|
@ -130,6 +130,11 @@ void Endpoint::AddMessageReceived(const intrusive_ptr<ApiFunction>& method)
|
|||||||
m_MessageCounters.at(method).fetch_add(1, std::memory_order_relaxed);
|
m_MessageCounters.at(method).fetch_add(1, std::memory_order_relaxed);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Accumulates the time spent processing JSON-RPC messages received from this
 * endpoint. The total is exposed via GetSecondsProcessingMessages().
 *
 * @param duration Time taken to process one message.
 */
void Endpoint::AddMessageProcessed(const AtomicDuration::Clock::duration& duration)
{
	// AtomicDuration::operator+= is thread-safe (relaxed atomic add).
	m_InputProcessingTime += duration;
}
|
||||||
|
|
||||||
double Endpoint::GetMessagesSentPerSecond() const
|
double Endpoint::GetMessagesSentPerSecond() const
|
||||||
{
|
{
|
||||||
return m_MessagesSent.CalculateRate(Utility::GetTime(), 60);
|
return m_MessagesSent.CalculateRate(Utility::GetTime(), 60);
|
||||||
@ -162,3 +167,8 @@ Dictionary::Ptr Endpoint::GetMessagesReceivedPerType() const
|
|||||||
|
|
||||||
return new Dictionary(std::move(result));
|
return new Dictionary(std::move(result));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * @return Total time (in seconds) spent processing JSON-RPC messages from
 *         connections to this endpoint. Backs the `seconds_processing_messages`
 *         API attribute.
 */
double Endpoint::GetSecondsProcessingMessages() const
{
	// AtomicDuration converts implicitly to the accumulated time in seconds.
	return m_InputProcessingTime;
}
|
||||||
|
@ -49,6 +49,7 @@ public:
|
|||||||
void AddMessageSent(int bytes);
|
void AddMessageSent(int bytes);
|
||||||
void AddMessageReceived(int bytes);
|
void AddMessageReceived(int bytes);
|
||||||
void AddMessageReceived(const intrusive_ptr<ApiFunction>& method);
|
void AddMessageReceived(const intrusive_ptr<ApiFunction>& method);
|
||||||
|
void AddMessageProcessed(const AtomicDuration::Clock::duration& duration);
|
||||||
|
|
||||||
double GetMessagesSentPerSecond() const override;
|
double GetMessagesSentPerSecond() const override;
|
||||||
double GetMessagesReceivedPerSecond() const override;
|
double GetMessagesReceivedPerSecond() const override;
|
||||||
@ -58,6 +59,8 @@ public:
|
|||||||
|
|
||||||
Dictionary::Ptr GetMessagesReceivedPerType() const override;
|
Dictionary::Ptr GetMessagesReceivedPerType() const override;
|
||||||
|
|
||||||
|
double GetSecondsProcessingMessages() const override;
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
void OnAllConfigLoaded() override;
|
void OnAllConfigLoaded() override;
|
||||||
|
|
||||||
@ -71,6 +74,8 @@ private:
|
|||||||
mutable RingBuffer m_MessagesReceived{60};
|
mutable RingBuffer m_MessagesReceived{60};
|
||||||
mutable RingBuffer m_BytesSent{60};
|
mutable RingBuffer m_BytesSent{60};
|
||||||
mutable RingBuffer m_BytesReceived{60};
|
mutable RingBuffer m_BytesReceived{60};
|
||||||
|
|
||||||
|
AtomicDuration m_InputProcessingTime;
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -58,6 +58,10 @@ class Endpoint : ConfigObject
|
|||||||
[no_user_modify, no_storage] Dictionary::Ptr messages_received_per_type {
|
[no_user_modify, no_storage] Dictionary::Ptr messages_received_per_type {
|
||||||
get;
|
get;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
[no_user_modify, no_storage] double seconds_processing_messages {
|
||||||
|
get;
|
||||||
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -106,6 +106,9 @@ void JsonRpcConnection::HandleIncomingMessages(boost::asio::yield_context yc)
|
|||||||
l_TaskStats.InsertValue(Utility::GetTime(), 1);
|
l_TaskStats.InsertValue(Utility::GetTime(), 1);
|
||||||
|
|
||||||
auto total = ch::steady_clock::now() - start;
|
auto total = ch::steady_clock::now() - start;
|
||||||
|
if (m_Endpoint) {
|
||||||
|
m_Endpoint->AddMessageProcessed(total);
|
||||||
|
}
|
||||||
|
|
||||||
Log msg(total >= ch::seconds(5) ? LogWarning : LogDebug, "JsonRpcConnection");
|
Log msg(total >= ch::seconds(5) ? LogWarning : LogDebug, "JsonRpcConnection");
|
||||||
msg << "Processed JSON-RPC '" << rpcMethod << "' message for identity '" << m_Identity
|
msg << "Processed JSON-RPC '" << rpcMethod << "' message for identity '" << m_Identity
|
||||||
|
@ -89,6 +89,7 @@ set(base_test_SOURCES
|
|||||||
icingaapplication-fixture.cpp
|
icingaapplication-fixture.cpp
|
||||||
utils.cpp
|
utils.cpp
|
||||||
base-array.cpp
|
base-array.cpp
|
||||||
|
base-atomic.cpp
|
||||||
base-base64.cpp
|
base-base64.cpp
|
||||||
base-convert.cpp
|
base-convert.cpp
|
||||||
base-dictionary.cpp
|
base-dictionary.cpp
|
||||||
|
34
test/base-atomic.cpp
Normal file
34
test/base-atomic.cpp
Normal file
@ -0,0 +1,34 @@
|
|||||||
|
/* Icinga 2 | (c) 2025 Icinga GmbH | GPLv2+ */

// Unit tests for AtomicDuration (base/atomic.hpp).

#include "base/atomic.hpp"
#include <BoostTestTargetConfig.h>

using namespace icinga;

BOOST_AUTO_TEST_SUITE(base_atomic)

// A default-constructed AtomicDuration has accumulated no time.
BOOST_AUTO_TEST_CASE(duration_none)
{
	BOOST_CHECK_EQUAL(static_cast<double>(AtomicDuration()), 0);
}

// A single += adds exactly the given duration (here: 1s -> 1.0).
BOOST_AUTO_TEST_CASE(duration_one)
{
	AtomicDuration sum;

	sum += std::chrono::seconds(1);

	BOOST_CHECK_EQUAL(static_cast<double>(sum), 1);
}

// Repeated += calls accumulate (1s + 2s -> 3.0).
BOOST_AUTO_TEST_CASE(duration_two)
{
	AtomicDuration sum;

	sum += std::chrono::seconds(1);
	sum += std::chrono::seconds(2);

	BOOST_CHECK_EQUAL(static_cast<double>(sum), 3);
}

BOOST_AUTO_TEST_SUITE_END()
|
Loading…
x
Reference in New Issue
Block a user