Introduce Icinga DB check (like the IDO one)

This commit is contained in:
Alexander A. Klimov 2022-06-01 11:38:17 +02:00
parent 51a9c61859
commit 92c886a153
9 changed files with 502 additions and 2 deletions

View File

@ -2,8 +2,11 @@
mkclass_target(icingadb.ti icingadb-ti.cpp icingadb-ti.hpp) mkclass_target(icingadb.ti icingadb-ti.cpp icingadb-ti.hpp)
mkembedconfig_target(icingadb-itl.conf icingadb-itl.cpp)
set(icingadb_SOURCES set(icingadb_SOURCES
icingadb.cpp icingadb-objects.cpp icingadb-stats.cpp icingadb-utility.cpp redisconnection.cpp icingadb-ti.hpp icingadb.cpp icingadb-objects.cpp icingadb-stats.cpp icingadb-utility.cpp redisconnection.cpp icingadb-ti.hpp
icingadbchecktask.cpp icingadb-itl.cpp
) )
if(ICINGA2_UNITY_BUILD) if(ICINGA2_UNITY_BUILD)

View File

@ -0,0 +1,35 @@
/* Icinga 2 | (c) 2022 Icinga GmbH | GPLv2+ */
/* Defines the built-in "icingadb" CheckCommand and its default thresholds. */
System.assert(Internal.run_with_activation_context(function() {
/* The check function is captured in the closure variable checkFunc here,
 * because Internal.IcingadbCheck is removed again at the end of this file. */
template CheckCommand "icingadb-check-command" use (checkFunc = Internal.IcingadbCheck) {
execute = checkFunc
}
object CheckCommand "icingadb" {
import "icingadb-check-command"
/* Inputs for deriving the default pending-queries thresholds.
 * NOTE(review): the sizes are presumably in bytes -- confirm. */
var criticalPendingSize = 1000000000
var objectSize = 1000
var maxObjectsPerQuery = 100
var additionalSmallerQueriesPerObject = 1
/* With the values above: (1000000000 / (100 * 1000)) * (1 + 1) = 20000 */
var criticalPendingQueries = (criticalPendingSize / (maxObjectsPerQuery * objectSize)) * (1 + additionalSmallerQueriesPerObject)
/* Name of the IcingaDB object to check. */
vars.icingadb_name = "icingadb"
vars.icingadb_downfor_warning = 10s
vars.icingadb_downfor_critical = 60s
vars.icingadb_heartbeat_warning = 10s
vars.icingadb_heartbeat_critical = 60s
vars.icingadb_idlefor_warning = 1.5m
vars.icingadb_idlefor_critical = 3m
vars.icingadb_history_backlog_warning = 1m
vars.icingadb_history_backlog_critical = 15m
vars.icingadb_pending_queries_warning = criticalPendingQueries / 2
vars.icingadb_pending_queries_critical = criticalPendingQueries
/* NOTE(review): these multi-year defaults are presumably meant to trigger only
 * when no sync/dump has ever happened -- confirm against the check function. */
vars.icingadb_syncago_warning = 5 * 366d
vars.icingadb_syncago_critical = 10 * 366d
vars.icingadb_dumpago_warning = 5 * 366d
vars.icingadb_dumpago_critical = 10 * 366d
}
}))
/* From here on the check function is reachable only through the template above. */
Internal.remove("IcingadbCheck")

View File

@ -524,8 +524,14 @@ void IcingaDB::UpdateAllConfigObjects()
m_Rcon->EnqueueCallback([&p](boost::asio::yield_context& yc) { p.set_value(); }, Prio::Config); m_Rcon->EnqueueCallback([&p](boost::asio::yield_context& yc) { p.set_value(); }, Prio::Config);
p.get_future().wait(); p.get_future().wait();
auto endTime (Utility::GetTime());
auto took (endTime - startTime);
SetLastdumpTook(took);
SetLastdumpEnd(endTime);
Log(LogInformation, "IcingaDB") Log(LogInformation, "IcingaDB")
<< "Initial config/status dump finished in " << Utility::GetTime() - startTime << " seconds."; << "Initial config/status dump finished in " << took << " seconds.";
} }
std::vector<std::vector<intrusive_ptr<ConfigObject>>> IcingaDB::ChunkObjects(std::vector<intrusive_ptr<ConfigObject>> objects, size_t chunkSize) { std::vector<std::vector<intrusive_ptr<ConfigObject>>> IcingaDB::ChunkObjects(std::vector<intrusive_ptr<ConfigObject>> objects, size_t chunkSize) {

View File

@ -45,6 +45,11 @@ public:
String GetEnvironmentId() const override; String GetEnvironmentId() const override;
/**
 * Returns the Redis connection held in m_Rcon.
 *
 * @returns The value of m_Rcon.
 */
inline RedisConnection::Ptr GetConnection()
{
return m_Rcon;
}
template<class T> template<class T>
static void AddKvsToMap(const Array::Ptr& kvs, T& map) static void AddKvsToMap(const Array::Ptr& kvs, T& map)
{ {

View File

@ -48,6 +48,13 @@ class IcingaDB : ConfigObject
[no_storage] String environment_id { [no_storage] String environment_id {
get; get;
}; };
/* Utility::GetTime() value taken when the initial config/status dump finished; defaults to 0. */
[set_protected] double lastdump_end {
default {{{ return 0; }}}
};
/* Duration of the initial config/status dump in seconds (end time minus start time); defaults to 0. */
[set_protected] double lastdump_took {
default {{{ return 0; }}}
};
}; };
} }

View File

@ -0,0 +1,398 @@
/* Icinga 2 | (c) 2022 Icinga GmbH | GPLv2+ */
#include "icingadb/icingadbchecktask.hpp"
#include "icinga/host.hpp"
#include "icinga/checkcommand.hpp"
#include "icinga/macroprocessor.hpp"
#include "remote/apilistener.hpp"
#include "remote/endpoint.hpp"
#include "remote/zone.hpp"
#include "base/function.hpp"
#include "base/json.hpp"
#include "base/utility.hpp"
#include "base/perfdatavalue.hpp"
#include "base/configtype.hpp"
#include "base/convert.hpp"
#include <utility>
using namespace icinga;
/* Registers IcingadbCheckTask::ScriptFunc as Internal.IcingadbCheck.
 * NOTE(review): the last argument presumably lists the function's argument names -- confirm. */
REGISTER_FUNCTION_NONCONST(Internal, IcingadbCheck, &IcingadbCheckTask::ScriptFunc, "checkable:cr:resolvedMacros:useResolvedMacros");
/**
 * Hands the outcome of the Icinga DB check to whoever is waiting for it.
 *
 * If Checkable::ExecuteCommandProcessFinishedHandler is set, the outcome is wrapped
 * into a ProcessResult and passed to that handler together with the command's name.
 * Otherwise state and output are stored in cr and cr is processed by the checkable.
 *
 * @param checkable The checkable which processes cr if no handler is set.
 * @param commandObj The check command whose name is passed to the handler.
 * @param cr The check result to complete if no handler is set.
 * @param output The plugin output.
 * @param state The resulting state, also used as exit status for the handler.
 */
static void ReportIcingadbCheck(
	const Checkable::Ptr& checkable, const CheckCommand::Ptr& commandObj,
	const CheckResult::Ptr& cr, String output, ServiceState state)
{
	if (!Checkable::ExecuteCommandProcessFinishedHandler) {
		cr->SetState(state);
		cr->SetOutput(output);
		checkable->ProcessCheckResult(cr);

		return;
	}

	// Nothing was actually spawned, so start and end are the same instant.
	double timestamp = Utility::GetTime();

	ProcessResult result;
	result.PID = -1;
	result.Output = std::move(output);
	result.ExecutionStart = timestamp;
	result.ExecutionEnd = timestamp;
	result.ExitStatus = state;

	Checkable::ExecuteCommandProcessFinishedHandler(commandObj->GetName(), result);
}
/**
 * Extracts the timestamp from a stream message as passed by the callers in this file.
 *
 * The message's first element is split at "-" and the part before the first "-"
 * is converted to a number and divided by 1000.
 *
 * @param xMessage A stream message whose first element is its ID.
 * @returns The leading ID part divided by 1000.0.
 */
static inline
double GetXMessageTs(const Array::Ptr& xMessage)
{
	String messageId = xMessage->Get(0);
	auto idParts (messageId.Split("-"));

	return Convert::ToLong(idParts[0]) / 1000.0;
}
/**
 * Implementation of the Icinga DB health check (registered as Internal.IcingadbCheck).
 *
 * Resolves the icingadb_* macros, reads the heartbeat, stats and history streams from
 * Redis, combines them with the dump and query statistics of the local IcingaDB object
 * and reports state, output and performance data via ReportIcingadbCheck().
 *
 * @param checkable The host or service the check runs for.
 * @param cr The check result which receives the performance data (and, depending on
 *        ReportIcingadbCheck(), state and output).
 * @param resolvedMacros Passed through to MacroProcessor::ResolveMacros().
 * @param useResolvedMacros Passed through to MacroProcessor::ResolveMacros(). If resolvedMacros
 *        is set and this is false, the function returns right after resolving the macros.
 */
void IcingadbCheckTask::ScriptFunc(const Checkable::Ptr& checkable, const CheckResult::Ptr& cr,
	const Dictionary::Ptr& resolvedMacros, bool useResolvedMacros)
{
	CheckCommand::Ptr commandObj = CheckCommand::ExecuteOverride ? CheckCommand::ExecuteOverride : checkable->GetCheckCommand();

	Host::Ptr host;
	Service::Ptr service;
	tie(host, service) = GetHostService(checkable);

	MacroProcessor::ResolverList resolvers;
	String silenceMissingMacroWarning;

	if (MacroResolver::OverrideMacros)
		resolvers.emplace_back("override", MacroResolver::OverrideMacros);

	if (service)
		resolvers.emplace_back("service", service);

	resolvers.emplace_back("host", host);
	resolvers.emplace_back("command", commandObj);
	resolvers.emplace_back("icinga", IcingaApplication::GetInstance());

	auto resolve ([&resolvers, &checkable, &silenceMissingMacroWarning, &resolvedMacros, useResolvedMacros](const String& macro) {
		return MacroProcessor::ResolveMacros(macro, resolvers, checkable->GetLastCheckResult(),
			&silenceMissingMacroWarning, MacroProcessor::EscapeCallback(), resolvedMacros, useResolvedMacros);
	});

	// A warning/critical pair as resolved from the macros; an empty value disables the comparison.
	struct Thresholds
	{
		Value Warning, Critical;
	};

	auto resolveThresholds ([&resolve](const String& wmacro, const String& cmacro) {
		return Thresholds{resolve(wmacro), resolve(cmacro)};
	});

	String icingadbName = resolve("$icingadb_name$");

	auto downForThresholds (resolveThresholds("$icingadb_downfor_warning$", "$icingadb_downfor_critical$"));
	auto heartbeatThresholds (resolveThresholds("$icingadb_heartbeat_warning$", "$icingadb_heartbeat_critical$"));
	auto idleForThresholds (resolveThresholds("$icingadb_idlefor_warning$", "$icingadb_idlefor_critical$"));
	auto historyBacklogThresholds (resolveThresholds("$icingadb_history_backlog_warning$", "$icingadb_history_backlog_critical$"));
	auto queriesThresholds (resolveThresholds("$icingadb_queries_warning$", "$icingadb_queries_critical$"));
	auto pendingQueriesThresholds (resolveThresholds("$icingadb_pending_queries_warning$", "$icingadb_pending_queries_critical$"));
	auto syncAgoThresholds (resolveThresholds("$icingadb_syncago_warning$", "$icingadb_syncago_critical$"));
	auto syncTookThresholds (resolveThresholds("$icingadb_synctook_warning$", "$icingadb_synctook_critical$"));
	auto dumpAgoThresholds (resolveThresholds("$icingadb_dumpago_warning$", "$icingadb_dumpago_critical$"));
	auto dumpTookThresholds (resolveThresholds("$icingadb_dumptook_warning$", "$icingadb_dumptook_critical$"));

	// Per-operation thresholds ($icingadb_<op>_warning$ / $icingadb_<op>_critical$) for the known stats.
	std::map<String, Thresholds> thresholdsByOp;

	const char * const icingadbKnownStats[] = {
		"sync_config", "sync_state", "sync_history", "sync_overdue", "cleanup_history"
	};

	for (auto metric : icingadbKnownStats) {
		thresholdsByOp.emplace(metric, resolveThresholds(
			String("$icingadb_") + metric + "_warning$",
			String("$icingadb_") + metric + "_critical$"
		));
	}

	// All macros have been resolved above; in this mode nothing else is to be done.
	if (resolvedMacros && !useResolvedMacros)
		return;

	if (icingadbName.IsEmpty()) {
		ReportIcingadbCheck(checkable, commandObj, cr, "Attribute 'icingadb_name' must be set.", ServiceUnknown);
		return;
	}

	auto conn (IcingaDB::GetByName(icingadbName));

	if (!conn) {
		ReportIcingadbCheck(checkable, commandObj, cr, "Icinga DB connection '" + icingadbName + "' does not exist.", ServiceUnknown);
		return;
	}

	auto redis (conn->GetConnection());

	if (!redis->GetConnected()) {
		ReportIcingadbCheck(checkable, commandObj, cr, "Could not connect to Redis.", ServiceCritical);
		return;
	}

	Array::Ptr xReadHeartbeat, xReadStats, xReadHistory;

	try {
		// One reply per query, in the order of the queries.
		auto replies (redis->GetResultsOfQueries(
			{
				{"XREAD", "STREAMS", "icingadb:telemetry:heartbeat", "0-0"},
				{"XREAD", "STREAMS", "icingadb:telemetry:stats", "0-0"},
				{
					"XREAD", "COUNT", "1", "STREAMS",
					"icinga:history:stream:acknowledgement", "icinga:history:stream:comment",
					"icinga:history:stream:downtime", "icinga:history:stream:flapping",
					"icinga:history:stream:notification", "icinga:history:stream:state",
					"0-0", "0-0", "0-0", "0-0", "0-0", "0-0"
				}
			},
			RedisConnection::QueryPriority::Heartbeat
		));

		xReadHeartbeat = std::move(replies.at(0));
		xReadStats = std::move(replies.at(1));
		xReadHistory = std::move(replies.at(2));
	} catch (const std::exception& ex) {
		ReportIcingadbCheck(checkable, commandObj, cr, String("Could not read XREAD responses from Redis: ") + ex.what(), ServiceCritical);
		return;
	}

	if (!xReadHeartbeat) {
		ReportIcingadbCheck(
			checkable, commandObj, cr,
			"The Icinga DB daemon seems to have never run. (Missing heartbeat)",
			ServiceCritical
		);

		return;
	}

	auto dumpWhen (conn->GetLastdumpEnd());
	auto dumpTook (conn->GetLastdumpTook());

	// First message of the first stream in the heartbeat reply.
	Array::Ptr heartbeatMessage = Array::Ptr(Array::Ptr(xReadHeartbeat->Get(0))->Get(1))->Get(0);
	auto heartbeatTime (GetXMessageTs(heartbeatMessage));
	std::map<String, String> heartbeatData;

	IcingaDB::AddKvsToMap(heartbeatMessage->Get(1), heartbeatData);

	// Note: std::map::at() throws std::out_of_range if the daemon didn't send one of these fields.
	String version = heartbeatData.at("meta:version");
	Dictionary::Ptr goMetricsByCumulativity (JsonDecode(heartbeatData.at("go:metrics")));
	String dbErr (heartbeatData.at("db:err"));
	auto ourHeartbeatTs (Convert::ToLong(heartbeatData.at("heartbeat:last-ts")) / 1000.0);
	bool weResponsible = Convert::ToLong(heartbeatData.at("ha:we-responsible"));
	auto weResponsibleTs (Convert::ToLong(heartbeatData.at("ha:we-responsible-ts")) / 1000.0);
	bool otherResponsible = Convert::ToLong(heartbeatData.at("ha:other-responsible"));
	auto syncWhen (Convert::ToLong(heartbeatData.at("sync:when")) / 1000.0);
	auto syncTook (Convert::ToDouble(heartbeatData.at("sync:took")) / 1000);

	auto now (Utility::GetTime());
	auto downFor (now - heartbeatTime);
	auto responsibleFor (now - weResponsibleTs);
	// Negative while this instance is responsible, so that only "not responsible" time can exceed thresholds.
	auto idleFor ((weResponsible ? -1 : 1) * responsibleFor);
	auto heartbeatLag (now - ourHeartbeatTs);
	auto syncAgo (now - syncWhen);
	auto dumpAgo (now - dumpWhen);

	// Age of the oldest message returned across all history streams, 0 if there's none.
	double historyBacklog = 0;

	if (xReadHistory) {
		double minTs = 0;
		ObjectLock lock (xReadHistory);

		for (Array::Ptr stream : xReadHistory) {
			auto ts (GetXMessageTs(Array::Ptr(stream->Get(1))->Get(0)));

			if (minTs == 0 || ts < minTs) {
				minTs = ts;
			}
		}

		if (minTs > 0) {
			historyBacklog = now - minTs;
		}
	}

	Array::Ptr perfdata = new Array();

	// One ring buffer (15 * 60 slots) per operation; the known ones exist even without any stats message.
	std::map<String, RingBuffer> statsPerOp;

	for (auto metric : icingadbKnownStats) {
		statsPerOp.emplace(std::piecewise_construct, std::forward_as_tuple(metric), std::forward_as_tuple(15 * 60));
	}

	if (xReadStats) {
		Array::Ptr messages = Array::Ptr(xReadStats->Get(0))->Get(1);
		ObjectLock lock (messages);

		for (Array::Ptr message : messages) {
			auto ts (GetXMessageTs(message));
			std::map<String, String> opsPerSec;

			IcingaDB::AddKvsToMap(message->Get(1), opsPerSec);

			for (auto& kv : opsPerSec) {
				auto buf (statsPerOp.find(kv.first));

				// Operations not listed in icingadbKnownStats get their buffer on first sight.
				if (buf == statsPerOp.end()) {
					buf = statsPerOp.emplace(
						std::piecewise_construct,
						std::forward_as_tuple(kv.first), std::forward_as_tuple(15 * 60)
					).first;
				}

				buf->second.InsertValue(ts, Convert::ToLong(kv.second));
			}
		}
	}

	ServiceState state = ServiceOK;
	std::ostringstream msgbuf;
	double qps = redis->GetQueryCount(60) / 60.0;
	double pendingQueries = redis->GetPendingQueryCount();

	// Appends a note to the output and raises the state if value is below a (non-empty) threshold.
	auto checkLower ([&state, &msgbuf](double value, const Thresholds& thresholds) {
		if (!thresholds.Critical.IsEmpty() && value < (double)thresholds.Critical) {
			msgbuf << ", lower than CRITICAL threshold (" << thresholds.Critical << ")";
			state = ServiceCritical;
		} else if (!thresholds.Warning.IsEmpty() && value < (double)thresholds.Warning) {
			msgbuf << ", lower than WARNING threshold (" << thresholds.Warning << ")";

			// Never downgrade an already critical state.
			if (state == ServiceOK) {
				state = ServiceWarning;
			}
		}
	});

	// Appends a note to the output and raises the state if value is above a (non-empty) threshold.
	auto checkGreater ([&state, &msgbuf](double value, const Thresholds& thresholds) {
		if (!thresholds.Critical.IsEmpty() && value > (double)thresholds.Critical) {
			msgbuf << ", greater than CRITICAL threshold (" << thresholds.Critical << ")";
			state = ServiceCritical;
		} else if (!thresholds.Warning.IsEmpty() && value > (double)thresholds.Warning) {
			msgbuf << ", greater than WARNING threshold (" << thresholds.Warning << ")";

			// Never downgrade an already critical state.
			if (state == ServiceOK) {
				state = ServiceWarning;
			}
		}
	});

	msgbuf << std::fixed << std::setprecision(3)
		<< "Icinga 2\n--------\n"
		<< "\n* Connected to Redis"
		<< "\n* Queries per second: " << qps;

	checkLower(qps, queriesThresholds);

	msgbuf << "\n* Pending queries: " << pendingQueries;

	checkGreater(pendingQueries, pendingQueriesThresholds);

	msgbuf << "\n* Last dump: ";

	if (dumpWhen) {
		msgbuf << dumpAgo << " seconds ago";
		perfdata->Add(new PerfdataValue("dump_ago", dumpAgo, false, "seconds", dumpAgoThresholds.Warning, dumpAgoThresholds.Critical, 0));
	} else {
		msgbuf << "never";
	}

	// Also evaluated for "never": dumpAgo equals now then.
	// NOTE(review): presumably intended so that a missing dump can trip the thresholds -- confirm.
	checkGreater(dumpAgo, dumpAgoThresholds);

	if (dumpWhen) {
		msgbuf << "\n* Last dump took: " << dumpTook << " seconds";

		checkGreater(dumpTook, dumpTookThresholds);

		perfdata->Add(new PerfdataValue("dump_took", dumpTook, false, "seconds", dumpTookThresholds.Warning, dumpTookThresholds.Critical, 0));
	}

	msgbuf << "\n\nIcinga DB daemon\n----------------\n"
		<< "\n* Version: " << version;

	if (!dbErr.IsEmpty()) {
		msgbuf << "\n* Database ERROR: " << dbErr;
		state = ServiceCritical;
	}

	msgbuf << "\n* Last seen: " << downFor << " seconds ago";

	checkGreater(downFor, downForThresholds);

	msgbuf << "\n* Icinga 2 last seen: " << heartbeatLag << " seconds ago";

	checkGreater(heartbeatLag, heartbeatThresholds);

	msgbuf << "\n* " << (weResponsible ? "Responsible" : "Not responsible") << " for: " << responsibleFor << " seconds";

	if (otherResponsible) {
		msgbuf << " (but other instance is responsible)";
	} else {
		checkGreater(idleFor, idleForThresholds);
	}

	msgbuf << "\n* History backlog: " << historyBacklog << " seconds";

	checkGreater(historyBacklog, historyBacklogThresholds);

	msgbuf << "\n* Last sync: ";

	if (syncWhen) {
		msgbuf << syncAgo << " seconds ago";
		perfdata->Add(new PerfdataValue("sync_ago", syncAgo, false, "seconds", syncAgoThresholds.Warning, syncAgoThresholds.Critical, 0));
	} else {
		msgbuf << "never";
	}

	// Also evaluated for "never": syncAgo equals now then (see the dump section above).
	checkGreater(syncAgo, syncAgoThresholds);

	if (syncWhen) {
		msgbuf << "\n* Last sync took: " << syncTook << " seconds";

		checkGreater(syncTook, syncTookThresholds);

		perfdata->Add(new PerfdataValue("sync_took", syncTook, false, "seconds", syncTookThresholds.Warning, syncTookThresholds.Critical, 0));
	}

	// Argument order used throughout: label, value, counter, unit, warn, crit, min.
	// The queries_* values must pass counter and unit explicitly as well, otherwise
	// the 0 meant as minimum would end up as warning threshold.
	perfdata->Add(new PerfdataValue("queries", qps, false, "", Empty, Empty, 0));
	perfdata->Add(new PerfdataValue("queries_1min", redis->GetQueryCount(60), false, "", Empty, Empty, 0));
	perfdata->Add(new PerfdataValue("queries_5mins", redis->GetQueryCount(5 * 60), false, "", Empty, Empty, 0));
	perfdata->Add(new PerfdataValue("queries_15mins", redis->GetQueryCount(15 * 60), false, "", Empty, Empty, 0));
	perfdata->Add(new PerfdataValue("pending_queries", pendingQueries, false, "", pendingQueriesThresholds.Warning, pendingQueriesThresholds.Critical, 0));
	perfdata->Add(new PerfdataValue("down_for", downFor, false, "seconds", downForThresholds.Warning, downForThresholds.Critical, 0));
	perfdata->Add(new PerfdataValue("heartbeat_lag", heartbeatLag, false, "seconds", heartbeatThresholds.Warning, heartbeatThresholds.Critical));
	// No minimum here: idleFor is negative while this instance is responsible.
	perfdata->Add(new PerfdataValue("idle_for", idleFor, false, "seconds", idleForThresholds.Warning, idleForThresholds.Critical));
	perfdata->Add(new PerfdataValue("history_backlog", historyBacklog, false, "seconds", historyBacklogThresholds.Warning, historyBacklogThresholds.Critical, 0));

	for (auto& kv : statsPerOp) {
		auto perMin (kv.second.UpdateAndGetValues(now, 60));
		auto perSec (perMin / 60.0);
		auto thresholds (thresholdsByOp.find(kv.first));

		msgbuf << "\n* " << perSec << " " << kv.first << "/s";

		// Only the known operations have thresholds.
		if (thresholds != thresholdsByOp.end()) {
			checkLower(perSec, thresholds->second);
		}

		perfdata->Add(new PerfdataValue(kv.first, perSec, false, "", Empty, Empty, 0));
		perfdata->Add(new PerfdataValue(kv.first + "_1min", perMin, false, "", Empty, Empty, 0));
		perfdata->Add(new PerfdataValue(kv.first + "_5mins", kv.second.UpdateAndGetValues(now, 5 * 60), false, "", Empty, Empty, 0));
		perfdata->Add(new PerfdataValue(kv.first + "_15mins", kv.second.UpdateAndGetValues(now, 15 * 60), false, "", Empty, Empty, 0));
	}

	{
		// Metric names ending in ":bytes" or ":seconds" carry their unit in the name.
		static boost::regex wellNamedUnits (":(bytes|seconds)$");
		ObjectLock lock (goMetricsByCumulativity);

		for (auto& kv : goMetricsByCumulativity) {
			bool cumulative = kv.first == "cumulative";
			Dictionary::Ptr goMetricsPerCumulativity = kv.second;
			ObjectLock innerLock (goMetricsPerCumulativity);

			for (auto& goMetric : goMetricsPerCumulativity) {
				std::string unit;
				boost::smatch what;

				if (boost::regex_search(goMetric.first.GetData(), what, wellNamedUnits)) {
					unit = what[1];
				}

				// Cumulative metrics without a recognized unit are reported as counters.
				bool counter = cumulative && unit.empty();

				perfdata->Add(new PerfdataValue(goMetric.first, goMetric.second, counter, std::move(unit)));
			}
		}
	}

	cr->SetPerformanceData(perfdata);
	ReportIcingadbCheck(checkable, commandObj, cr, msgbuf.str(), state);
}

View File

@ -0,0 +1,29 @@
/* Icinga 2 | (c) 2022 Icinga GmbH | GPLv2+ */
#ifndef ICINGADBCHECKTASK_H
#define ICINGADBCHECKTASK_H
#include "icingadb/icingadb.hpp"
#include "icinga/checkable.hpp"
namespace icinga
{
/**
 * Icinga DB check.
 *
 * Only provides the static ScriptFunc(); the constructor is private
 * and therefore not usable from outside the class.
 *
 * @ingroup icingadb
 */
class IcingadbCheckTask
{
public:
/**
 * Entry point of the check.
 *
 * @param checkable The checkable to run the check for.
 * @param cr The check result to fill in.
 * @param resolvedMacros Dictionary of resolved macros, may be null.
 * @param useResolvedMacros Whether resolvedMacros shall be used.
 */
static void ScriptFunc(const Checkable::Ptr& checkable, const CheckResult::Ptr& cr,
const Dictionary::Ptr& resolvedMacros, bool useResolvedMacros);
private:
/* Declared private: this class is not meant to be instantiated from outside. */
IcingadbCheckTask();
};
}
#endif /* ICINGADBCHECKTASK_H */

View File

@ -674,6 +674,11 @@ void RedisConnection::SetConnectedCallback(std::function<void(asio::yield_contex
m_ConnectedCallback = std::move(callback); m_ConnectedCallback = std::move(callback);
} }
/**
 * Queries the m_OutputQueries ring buffer for the given span, as of the current time.
 *
 * @param span The span to pass to RingBuffer::UpdateAndGetValues()
 *        (NOTE(review): presumably in seconds -- confirm).
 * @returns Whatever m_OutputQueries.UpdateAndGetValues() yields for now and span.
 */
int RedisConnection::GetQueryCount(RingBuffer::SizeType span)
{
	const double now = Utility::GetTime();

	return m_OutputQueries.UpdateAndGetValues(now, span);
}
void RedisConnection::IncreasePendingQueries(int count) void RedisConnection::IncreasePendingQueries(int count)
{ {
if (m_Parent) { if (m_Parent) {

View File

@ -98,6 +98,18 @@ namespace icinga
void SetConnectedCallback(std::function<void(boost::asio::yield_context& yc)> callback); void SetConnectedCallback(std::function<void(boost::asio::yield_context& yc)> callback);
/**
 * Returns the current value of m_Connected, read via its load() method.
 *
 * @returns The value of m_Connected.
 */
inline bool GetConnected()
{
return m_Connected.load();
}
/* Returns a query count for the given span; see the definition for which buffer is queried. */
int GetQueryCount(RingBuffer::SizeType span);
/**
 * Returns the current value of m_PendingQueries.
 *
 * NOTE(review): m_PendingQueries is a plain int; if this is called from another
 * thread than the one updating it, the read is unsynchronized -- confirm this is acceptable.
 *
 * @returns The value of m_PendingQueries.
 */
inline int GetPendingQueryCount()
{
return m_PendingQueries;
}
private: private:
/** /**
* What to do with the responses to Redis queries. * What to do with the responses to Redis queries.
@ -225,7 +237,7 @@ namespace icinga
// Stats // Stats
RingBuffer m_InputQueries{10}; RingBuffer m_InputQueries{10};
RingBuffer m_OutputQueries{10}; RingBuffer m_OutputQueries{15 * 60};
int m_PendingQueries{0}; int m_PendingQueries{0};
boost::asio::deadline_timer m_LogStatsTimer; boost::asio::deadline_timer m_LogStatsTimer;
Ptr m_Parent; Ptr m_Parent;