Add metrics about RemoteCheckQueue to Icinga check, API and logs

refs #4841
This commit is contained in:
Noah Hilverling 2018-01-18 15:22:16 +01:00
parent d8c31353e4
commit 06e381ceea
5 changed files with 53 additions and 2 deletions

View File

@ -256,7 +256,7 @@ Configuration Attributes:
Name | Type | Description
--------------------------|-----------------------|----------------------------------
concurrent\_checks | Number | **Optional and Deprecated.** The maximum number of concurrent checks. Was replaced by global constant `MaxConcurrentChecks` which will be set if you still use `concurrent_checks`.
concurrent\_checks | Number | **Optional and deprecated.** The maximum number of concurrent checks. Was replaced by global constant `MaxConcurrentChecks` which will be set if you still use `concurrent_checks`.
## CheckResultReader <a id="objecttype-checkresultreader"></a>

View File

@ -20,6 +20,7 @@
#include "icinga/cib.hpp"
#include "icinga/host.hpp"
#include "icinga/service.hpp"
#include "icinga/clusterevents.hpp"
#include "base/objectlock.hpp"
#include "base/utility.hpp"
#include "base/perfdatavalue.hpp"
@ -305,6 +306,8 @@ void CIB::StatsFunc(const Dictionary::Ptr& status, const Array::Ptr& perfdata) {
status->Set("active_service_checks_15min", GetActiveServiceChecksStatistics(60 * 15));
status->Set("passive_service_checks_15min", GetPassiveServiceChecksStatistics(60 * 15));
status->Set("remote_check_queue", ClusterEvents::GetCheckRequestQueueSize());
CheckableCheckStatistics scs = CalculateServiceCheckStats();
status->Set("min_latency", scs.min_latency);

View File

@ -21,6 +21,7 @@
#include "remote/apilistener.hpp"
#include "base/serializer.hpp"
#include "base/exception.hpp"
#include <boost/thread/once.hpp>
#include <thread>
using namespace icinga;
@ -28,6 +29,9 @@ using namespace icinga;
boost::mutex ClusterEvents::m_Mutex;
std::deque<std::function<void ()>> ClusterEvents::m_CheckRequestQueue;
bool ClusterEvents::m_CheckSchedulerRunning;
int ClusterEvents::m_ChecksExecutedDuringInterval;
int ClusterEvents::m_ChecksDroppedDuringInterval;
Timer::Ptr ClusterEvents::m_LogTimer;
void ClusterEvents::RemoteCheckThreadProc()
{
@ -45,6 +49,7 @@ void ClusterEvents::RemoteCheckThreadProc()
auto callback = m_CheckRequestQueue.front();
m_CheckRequestQueue.pop_front();
m_ChecksExecutedDuringInterval++;
lock.unlock();
callback();
@ -58,10 +63,19 @@ void ClusterEvents::RemoteCheckThreadProc()
void ClusterEvents::EnqueueCheck(const MessageOrigin::Ptr& origin, const Dictionary::Ptr& params)
{
static boost::once_flag once = BOOST_ONCE_INIT;
boost::call_once(once, []() {
m_LogTimer = new Timer();
m_LogTimer->SetInterval(10);
m_LogTimer->OnTimerExpired.connect(std::bind(ClusterEvents::LogRemoteCheckQueueInformation));
m_LogTimer->Start();
});
boost::mutex::scoped_lock lock(m_Mutex);
if (m_CheckRequestQueue.size() >= 25000) {
Log(LogCritical, "ClusterEvents", "Remote check queue ran out of slots. Discarding remote check request.");
m_ChecksDroppedDuringInterval++;
return;
}
@ -184,3 +198,28 @@ void ClusterEvents::ExecuteCheckFromQueue(const MessageOrigin::Ptr& origin, cons
}
}
int ClusterEvents::GetCheckRequestQueueSize()
{
return m_CheckRequestQueue.size();
}
void ClusterEvents::LogRemoteCheckQueueInformation() {
if (m_ChecksDroppedDuringInterval > 0) {
Log(LogCritical, "ClusterEvents")
<< "Remote check queue ran out of slots. "
<< m_ChecksDroppedDuringInterval << " checks dropped.";
m_ChecksDroppedDuringInterval = 0;
}
if (m_ChecksExecutedDuringInterval == 0)
return;
Log(LogInformation, "RemoteCheckQueue")
<< "items: " << m_CheckRequestQueue.size()
<< ", rate: " << m_ChecksExecutedDuringInterval / 10 << "/s "
<< "(" << m_ChecksExecutedDuringInterval * 6 << "/min "
<< m_ChecksExecutedDuringInterval * 6 * 5 << "/5min "
<< m_ChecksExecutedDuringInterval * 6 * 15 << "/15min" << ");";
m_ChecksExecutedDuringInterval = 0;
}

View File

@ -75,10 +75,16 @@ public:
NotificationType notificationType, const CheckResult::Ptr& cr, const String& author, const String& commentText, const MessageOrigin::Ptr& origin);
static Value NotificationSentToAllUsersAPIHandler(const MessageOrigin::Ptr& origin, const Dictionary::Ptr& params);
static int GetCheckRequestQueueSize();
static void LogRemoteCheckQueueInformation();
private:
static boost::mutex m_Mutex;
static std::deque<std::function<void ()>> m_CheckRequestQueue;
static bool m_CheckSchedulerRunning;
static int m_ChecksExecutedDuringInterval;
static int m_ChecksDroppedDuringInterval;
static Timer::Ptr m_LogTimer;
static void RemoteCheckThreadProc();
static void EnqueueCheck(const MessageOrigin::Ptr& origin, const Dictionary::Ptr& params);

View File

@ -23,6 +23,7 @@
#include "icinga/checkcommand.hpp"
#include "icinga/macroprocessor.hpp"
#include "icinga/icingaapplication.hpp"
#include "icinga/clusterevents.hpp"
#include "base/application.hpp"
#include "base/objectlock.hpp"
#include "base/utility.hpp"
@ -84,6 +85,8 @@ void IcingaCheckTask::ScriptFunc(const Checkable::Ptr& checkable, const CheckRes
perfdata->Add(new PerfdataValue("active_service_checks_15min", CIB::GetActiveServiceChecksStatistics(60 * 15)));
perfdata->Add(new PerfdataValue("passive_service_checks_15min", CIB::GetPassiveServiceChecksStatistics(60 * 15)));
perfdata->Add(new PerfdataValue("remote_check_queue", ClusterEvents::GetCheckRequestQueueSize()));
CheckableCheckStatistics scs = CIB::CalculateServiceCheckStats();
perfdata->Add(new PerfdataValue("min_latency", scs.min_latency));