From 6eebc7868f2bac3d084fade2d90581cb550cd8dc Mon Sep 17 00:00:00 2001 From: "Alexander A. Klimov" Date: Wed, 1 Jun 2022 11:38:17 +0200 Subject: [PATCH] Introduce Icinga DB check (like the IDO one) --- doc/10-icinga-template-library.md | 18 + lib/icingadb/CMakeLists.txt | 3 + lib/icingadb/icingadb-itl.conf | 24 ++ lib/icingadb/icingadb-objects.cpp | 28 +- lib/icingadb/icingadb.hpp | 5 + lib/icingadb/icingadb.ti | 10 + lib/icingadb/icingadbchecktask.cpp | 519 +++++++++++++++++++++++++++++ lib/icingadb/icingadbchecktask.hpp | 29 ++ lib/icingadb/redisconnection.cpp | 94 +++++- lib/icingadb/redisconnection.hpp | 55 ++- 10 files changed, 760 insertions(+), 25 deletions(-) create mode 100644 lib/icingadb/icingadb-itl.conf create mode 100644 lib/icingadb/icingadbchecktask.cpp create mode 100644 lib/icingadb/icingadbchecktask.hpp diff --git a/doc/10-icinga-template-library.md b/doc/10-icinga-template-library.md index 9bb1ae3c7..179428b51 100644 --- a/doc/10-icinga-template-library.md +++ b/doc/10-icinga-template-library.md @@ -103,6 +103,24 @@ cluster\_zone | **Required.** The zone name. Defaults to `$host.name$`. cluster\_lag\_warning | **Optional.** Warning threshold for log lag in seconds. Applies if the log lag is greater than the threshold. cluster\_lag\_critical | **Optional.** Critical threshold for log lag in seconds. Applies if the log lag is greater than the threshold. +### icingadb + +Check command for the built-in `icingadb` check. + +Custom variables passed as [command parameters](03-monitoring-basics.md#command-passing-parameters): + +Name | Description +------------------------------------------|----------------------------- +icingadb\_name | **Required.** The name of the Icinga DB connection object. Defaults to `icingadb`. +icingadb\_redis\_dump\_takes\_warning | **Optional.** Warning threshold for ongoing Redis dump duration. Applies if the value is higher than the threshold. Defaults to 5 minutes. +icingadb\_redis\_dump\_takes\_critical | **Optional.** Critical threshold for ongoing Redis dump duration. Applies if the value is higher than the threshold. Defaults to 10 minutes. +icingadb\_database\_sync\_takes\_warning | **Optional.** Warning threshold for ongoing database sync duration. Applies if the value is higher than the threshold. Defaults to 5 minutes. +icingadb\_database\_sync\_takes\_critical | **Optional.** Critical threshold for ongoing database sync duration. Applies if the value is higher than the threshold. Defaults to 10 minutes. +icingadb\_redis\_backlog\_warning | **Optional.** Warning threshold for Redis write backlog. Applies if the value is higher than the threshold. Defaults to 5 minutes. +icingadb\_redis\_backlog\_critical | **Optional.** Critical threshold for Redis write backlog. Applies if the value is higher than the threshold. Defaults to 15 minutes. +icingadb\_database\_backlog\_warning | **Optional.** Warning threshold for database sync backlog. Applies if the value is higher than the threshold. Defaults to 5 minutes. +icingadb\_database\_backlog\_critical | **Optional.** Critical threshold for database sync backlog. Applies if the value is higher than the threshold. Defaults to 15 minutes. + ### ido Check command for the built-in `ido` check. diff --git a/lib/icingadb/CMakeLists.txt b/lib/icingadb/CMakeLists.txt index 71a7c67f2..de8e4adae 100644 --- a/lib/icingadb/CMakeLists.txt +++ b/lib/icingadb/CMakeLists.txt @@ -2,8 +2,11 @@ mkclass_target(icingadb.ti icingadb-ti.cpp icingadb-ti.hpp) +mkembedconfig_target(icingadb-itl.conf icingadb-itl.cpp) + set(icingadb_SOURCES icingadb.cpp icingadb-objects.cpp icingadb-stats.cpp icingadb-utility.cpp redisconnection.cpp icingadb-ti.hpp + icingadbchecktask.cpp icingadb-itl.cpp ) if(ICINGA2_UNITY_BUILD) diff --git a/lib/icingadb/icingadb-itl.conf b/lib/icingadb/icingadb-itl.conf new file mode 100644 index 000000000..af3b9dc00 --- /dev/null +++ b/lib/icingadb/icingadb-itl.conf @@ -0,0 +1,24 @@ +/* Icinga 2 | (c) 2022 Icinga GmbH | GPLv2+ */ + +System.assert(Internal.run_with_activation_context(function() { + template CheckCommand "icingadb-check-command" use (checkFunc = Internal.IcingadbCheck) { + execute = checkFunc + } + + object CheckCommand "icingadb" { + import "icingadb-check-command" + + vars.icingadb_name = "icingadb" + + vars.icingadb_redis_dump_takes_warning = 5m + vars.icingadb_redis_dump_takes_critical = 10m + vars.icingadb_database_sync_takes_warning = 5m + vars.icingadb_database_sync_takes_critical = 10m + vars.icingadb_redis_backlog_warning = 5m + vars.icingadb_redis_backlog_critical = 15m + vars.icingadb_database_backlog_warning = 5m + vars.icingadb_database_backlog_critical = 15m + } +})) + +Internal.remove("IcingadbCheck") diff --git a/lib/icingadb/icingadb-objects.cpp b/lib/icingadb/icingadb-objects.cpp index 7a50fe4c3..e4c3b0b50 100644 --- a/lib/icingadb/icingadb-objects.cpp +++ b/lib/icingadb/icingadb-objects.cpp @@ -171,6 +171,12 @@ void IcingaDB::UpdateAllConfigObjects() Log(LogInformation, "IcingaDB") << "Starting initial config/status dump"; double startTime = Utility::GetTime(); + SetOngoingDumpStart(startTime); + + Defer resetOngoingDumpStart ([this]() { + SetOngoingDumpStart(0); + }); + // Use a Workqueue to pack objects in parallel WorkQueue upq(25000, Configuration::Concurrency, LogNotice); upq.SetName("IcingaDB:ConfigDump"); @@ -402,6 +408,8 @@ void IcingaDB::UpdateAllConfigObjects() auto ourEnd (ourCheckSums.end()); auto flushSets ([&]() { + auto affectedConfig (setObject.size() / 2u); + setChecksum.insert(setChecksum.begin(), {"HMSET", configCheckSum}); setObject.insert(setObject.begin(), {"HMSET", configObject}); @@ -415,10 +423,12 @@ void IcingaDB::UpdateAllConfigObjects() setChecksum.clear(); setObject.clear(); - rcon->FireAndForgetQueries(std::move(transaction), Prio::Config); + rcon->FireAndForgetQueries(std::move(transaction), Prio::Config, {affectedConfig}); }); auto flushDels ([&]() { + auto affectedConfig (delObject.size()); + delChecksum.insert(delChecksum.begin(), {"HDEL", configCheckSum}); delObject.insert(delObject.begin(), {"HDEL", configObject}); @@ -432,7 +442,7 @@ void IcingaDB::UpdateAllConfigObjects() delChecksum.clear(); delObject.clear(); - rcon->FireAndForgetQueries(std::move(transaction), Prio::Config); + rcon->FireAndForgetQueries(std::move(transaction), Prio::Config, {affectedConfig}); }); auto setOne ([&]() { @@ -524,8 +534,14 @@ void IcingaDB::UpdateAllConfigObjects() m_Rcon->EnqueueCallback([&p](boost::asio::yield_context& yc) { p.set_value(); }, Prio::Config); p.get_future().wait(); + auto endTime (Utility::GetTime()); + auto took (endTime - startTime); + + SetLastdumpTook(took); + SetLastdumpEnd(endTime); + Log(LogInformation, "IcingaDB") - << "Initial config/status dump finished in " << Utility::GetTime() - startTime << " seconds."; + << "Initial config/status dump finished in " << took << " seconds."; } std::vector>> IcingaDB::ChunkObjects(std::vector> objects, size_t chunkSize) { @@ -1131,7 +1147,7 @@ void IcingaDB::UpdateState(const Checkable::Ptr& checkable, StateUpdate mode) streamadd.emplace_back(IcingaToStreamValue(kv.second)); } - m_Rcon->FireAndForgetQuery(std::move(streamadd), Prio::RuntimeStateStream); + m_Rcon->FireAndForgetQuery(std::move(streamadd), Prio::RuntimeStateStream, {0, 1}); } } @@ -1178,7 +1194,7 @@ void IcingaDB::SendConfigUpdate(const ConfigObject::Ptr& object, bool runtimeUpd if (transaction.size() > 1) { transaction.push_back({"EXEC"}); - m_Rcon->FireAndForgetQueries(std::move(transaction), Prio::Config); + m_Rcon->FireAndForgetQueries(std::move(transaction), Prio::Config, {1}); } if (checkable) { @@ -2343,7 +2359,7 @@ void IcingaDB::ForwardHistoryEntries() if (m_Rcon && m_Rcon->IsConnected()) { try { - m_Rcon->GetResultsOfQueries(haystack, Prio::History); + m_Rcon->GetResultsOfQueries(haystack, Prio::History, {0, 0, haystack.size()}); break; } catch (const std::exception& ex) { logFailure(ex.what()); diff --git a/lib/icingadb/icingadb.hpp b/lib/icingadb/icingadb.hpp index e4c1be7e5..075eb36ac 100644 --- a/lib/icingadb/icingadb.hpp +++ b/lib/icingadb/icingadb.hpp @@ -44,6 +44,11 @@ public: String GetEnvironmentId() const override; + inline RedisConnection::Ptr GetConnection() + { + return m_Rcon; + } + template static void AddKvsToMap(const Array::Ptr& kvs, T& map) { diff --git a/lib/icingadb/icingadb.ti b/lib/icingadb/icingadb.ti index 00ca95691..1c649c8e4 100644 --- a/lib/icingadb/icingadb.ti +++ b/lib/icingadb/icingadb.ti @@ -48,6 +48,16 @@ class IcingaDB : ConfigObject [no_storage] String environment_id { get; }; + + [set_protected] double ongoing_dump_start { + default {{{ return 0; }}} + }; + [state, set_protected] double lastdump_end { + default {{{ return 0; }}} + }; + [state, set_protected] double lastdump_took { + default {{{ return 0; }}} + }; }; } diff --git a/lib/icingadb/icingadbchecktask.cpp b/lib/icingadb/icingadbchecktask.cpp new file mode 100644 index 000000000..2327a2fba --- /dev/null +++ b/lib/icingadb/icingadbchecktask.cpp @@ -0,0 +1,519 @@ +/* Icinga 2 | (c) 2022 Icinga GmbH | GPLv2+ */ + +#include "icingadb/icingadbchecktask.hpp" +#include "icinga/host.hpp" +#include "icinga/checkcommand.hpp" +#include "icinga/macroprocessor.hpp" +#include "remote/apilistener.hpp" +#include "remote/endpoint.hpp" +#include "remote/zone.hpp" +#include "base/function.hpp" +#include "base/json.hpp" +#include "base/utility.hpp" +#include "base/perfdatavalue.hpp" +#include "base/configtype.hpp" +#include "base/convert.hpp" +#include + +using namespace icinga; + +REGISTER_FUNCTION_NONCONST(Internal, IcingadbCheck, &IcingadbCheckTask::ScriptFunc, "checkable:cr:resolvedMacros:useResolvedMacros"); + +static void ReportIcingadbCheck( + const Checkable::Ptr& checkable, const CheckCommand::Ptr& commandObj, + const CheckResult::Ptr& cr, String output, ServiceState state) +{ + if (Checkable::ExecuteCommandProcessFinishedHandler) { + double now = Utility::GetTime(); + ProcessResult pr; + pr.PID = -1; + pr.Output = std::move(output); + pr.ExecutionStart = now; + pr.ExecutionEnd = now; + pr.ExitStatus = state; + + Checkable::ExecuteCommandProcessFinishedHandler(commandObj->GetName(), pr); + } else { + cr->SetState(state); + cr->SetOutput(output); + checkable->ProcessCheckResult(cr); + } +} + +static inline +double GetXMessageTs(const Array::Ptr& xMessage) +{ + return Convert::ToLong(String(xMessage->Get(0)).Split("-")[0]) / 1000.0; +} + +void IcingadbCheckTask::ScriptFunc(const Checkable::Ptr& checkable, const CheckResult::Ptr& cr, + const Dictionary::Ptr& resolvedMacros, bool useResolvedMacros) +{ + CheckCommand::Ptr commandObj = CheckCommand::ExecuteOverride ? CheckCommand::ExecuteOverride : checkable->GetCheckCommand(); + + Host::Ptr host; + Service::Ptr service; + tie(host, service) = GetHostService(checkable); + + MacroProcessor::ResolverList resolvers; + String silenceMissingMacroWarning; + + if (MacroResolver::OverrideMacros) + resolvers.emplace_back("override", MacroResolver::OverrideMacros); + + if (service) + resolvers.emplace_back("service", service); + resolvers.emplace_back("host", host); + resolvers.emplace_back("command", commandObj); + resolvers.emplace_back("icinga", IcingaApplication::GetInstance()); + + auto resolve ([&](const String& macro) { + return MacroProcessor::ResolveMacros(macro, resolvers, checkable->GetLastCheckResult(), + &silenceMissingMacroWarning, MacroProcessor::EscapeCallback(), resolvedMacros, useResolvedMacros); + }); + + struct Thresholds + { + Value Warning, Critical; + }; + + auto resolveThresholds ([&resolve](const String& wmacro, const String& cmacro) { + return Thresholds{resolve(wmacro), resolve(cmacro)}; + }); + + String icingadbName = resolve("$icingadb_name$"); + + auto dumpTakesThresholds (resolveThresholds("$icingadb_redis_dump_takes_warning$", "$icingadb_redis_dump_takes_critical$")); + auto syncTakesThresholds (resolveThresholds("$icingadb_database_sync_takes_warning$", "$icingadb_database_sync_takes_critical$")); + auto icingaBacklogThresholds (resolveThresholds("$icingadb_redis_backlog_warning$", "$icingadb_redis_backlog_critical$")); + auto icingadbBacklogThresholds (resolveThresholds("$icingadb_database_backlog_warning$", "$icingadb_database_backlog_critical$")); + + if (resolvedMacros && !useResolvedMacros) + return; + + if (icingadbName.IsEmpty()) { + ReportIcingadbCheck(checkable, commandObj, cr, "Icinga DB UNKNOWN: Attribute 'icingadb_name' must be set.", ServiceUnknown); + return; + } + + auto conn (IcingaDB::GetByName(icingadbName)); + + if (!conn) { + ReportIcingadbCheck(checkable, commandObj, cr, "Icinga DB UNKNOWN: Icinga DB connection '" + icingadbName + "' does not exist.", ServiceUnknown); + return; + } + + auto redis (conn->GetConnection()); + + if (!redis->GetConnected()) { + ReportIcingadbCheck(checkable, commandObj, cr, "Icinga DB CRITICAL: Could not connect to Redis.", ServiceCritical); + return; + } + + auto now (Utility::GetTime()); + Array::Ptr redisTime, xReadHeartbeat, xReadStats, xReadRtuHistory; + + try { + auto replies (redis->GetResultsOfQueries( + { + {"TIME"}, + {"XREAD", "STREAMS", "icingadb:telemetry:heartbeat", "0-0"}, + {"XREAD", "STREAMS", "icingadb:telemetry:stats", "0-0"}, + { + "XREAD", "COUNT", "1", "STREAMS", + "icinga:runtime", "icinga:runtime:state", + "icinga:history:stream:acknowledgement", "icinga:history:stream:comment", + "icinga:history:stream:downtime", "icinga:history:stream:flapping", + "icinga:history:stream:notification", "icinga:history:stream:state", + "0-0", "0-0", "0-0", "0-0", "0-0", "0-0", "0-0", "0-0" + } + }, + RedisConnection::QueryPriority::Heartbeat + )); + + redisTime = std::move(replies.at(0)); + xReadHeartbeat = std::move(replies.at(1)); + xReadStats = std::move(replies.at(2)); + xReadRtuHistory = std::move(replies.at(3)); + } catch (const std::exception& ex) { + ReportIcingadbCheck( + checkable, commandObj, cr, + String("Icinga DB CRITICAL: Could not read XREAD responses from Redis: ") + ex.what(), ServiceCritical + ); + return; + } + + if (!xReadHeartbeat) { + ReportIcingadbCheck( + checkable, commandObj, cr, + "Icinga DB CRITICAL: The Icinga DB daemon seems to have never run. (Missing heartbeat)", + ServiceCritical + ); + + return; + } + + auto redisOldestPending (redis->GetOldestPendingQueryTs()); + auto ongoingDumpStart (conn->GetOngoingDumpStart()); + auto dumpWhen (conn->GetLastdumpEnd()); + auto dumpTook (conn->GetLastdumpTook()); + + auto redisNow (Convert::ToLong(redisTime->Get(0)) + Convert::ToLong(redisTime->Get(1)) / 1000000.0); + Array::Ptr heartbeatMessage = Array::Ptr(Array::Ptr(xReadHeartbeat->Get(0))->Get(1))->Get(0); + auto heartbeatTime (GetXMessageTs(heartbeatMessage)); + std::map heartbeatData; + + IcingaDB::AddKvsToMap(heartbeatMessage->Get(1), heartbeatData); + + String version = heartbeatData.at("general:version"); + auto icingadbNow (Convert::ToLong(heartbeatData.at("general:time")) / 1000.0 + (redisNow - heartbeatTime)); + auto icingadbStartTime (Convert::ToLong(heartbeatData.at("general:start-time")) / 1000.0); + String errMsg (heartbeatData.at("general:err")); + auto errSince (Convert::ToLong(heartbeatData.at("general:err-since")) / 1000.0); + Dictionary::Ptr goMetricsByCumulativity (JsonDecode(heartbeatData.at("go:metrics"))); + auto heartbeatLastReceived (Convert::ToLong(heartbeatData.at("heartbeat:last-received")) / 1000.0); + bool weResponsible = Convert::ToLong(heartbeatData.at("ha:responsible")); + auto weResponsibleTs (Convert::ToLong(heartbeatData.at("ha:responsible-ts")) / 1000.0); + bool otherResponsible = Convert::ToLong(heartbeatData.at("ha:other-responsible")); + auto syncOngoingSince (Convert::ToLong(heartbeatData.at("sync:ongoing-since")) / 1000.0); + auto syncSuccessWhen (Convert::ToLong(heartbeatData.at("sync:success-finish")) / 1000.0); + auto syncSuccessTook (Convert::ToLong(heartbeatData.at("sync:success-duration")) / 1000.0); + + std::ostringstream i2okmsgs, idbokmsgs, warnmsgs, critmsgs; + Array::Ptr perfdata = new Array(); + + i2okmsgs << std::fixed << std::setprecision(3); + idbokmsgs << std::fixed << std::setprecision(3); + warnmsgs << std::fixed << std::setprecision(3); + critmsgs << std::fixed << std::setprecision(3); + + const auto downForCritical (10); + auto downFor (redisNow - heartbeatTime); + bool down = false; + + if (downFor > downForCritical) { + down = true; + + critmsgs << " Last seen " << Utility::FormatDuration(downFor) + << " ago, greater than CRITICAL threshold (" << Utility::FormatDuration(downForCritical) << ")!"; + } else { + idbokmsgs << "\n* Last seen: " << Utility::FormatDuration(downFor) << " ago"; + } + + perfdata->Add(new PerfdataValue("icingadb_heartbeat_age", downFor, false, "seconds", Empty, downForCritical, 0)); + + const auto errForCritical (10); + auto err (!errMsg.IsEmpty()); + auto errFor (icingadbNow - errSince); + + if (err) { + if (errFor > errForCritical) { + critmsgs << " ERROR: " << errMsg << "!"; + } + + perfdata->Add(new PerfdataValue("err_for", errFor * (err ? 1 : -1), false, "seconds", Empty, errForCritical, 0)); + } + + if (!down) { + const auto heartbeatLagWarning (3/* Icinga DB read freq. */ + 1/* Icinga DB write freq. */ + 2/* threshold */); + auto heartbeatLag (fmin(icingadbNow - heartbeatLastReceived, 10 * 60)); + + if (!heartbeatLastReceived) { + critmsgs << " Lost Icinga 2 heartbeat!"; + } else if (heartbeatLag > heartbeatLagWarning) { + warnmsgs << " Icinga 2 heartbeat lag: " << Utility::FormatDuration(heartbeatLag) + << ", greater than WARNING threshold (" << Utility::FormatDuration(heartbeatLagWarning) << ")."; + } + + perfdata->Add(new PerfdataValue("icinga_heartbeat_lag", heartbeatLag, false, "seconds", heartbeatLagWarning, Empty, 0)); + } + + if (weResponsible) { + idbokmsgs << "\n* Responsible"; + } else if (otherResponsible) { + idbokmsgs << "\n* Not responsible, but another instance is"; + } else { + critmsgs << " No instance is responsible!"; + } + + perfdata->Add(new PerfdataValue("icingadb_responsible_instances", int(weResponsible || otherResponsible), false, "", Empty, Empty, 0, 1)); + + const auto clockDriftWarning (5); + const auto clockDriftCritical (30); + auto clockDrift (fmax(fabs(now - redisNow), fmax(fabs(redisNow - icingadbNow), fabs(icingadbNow - now)))); + + if (clockDrift > clockDriftCritical) { + critmsgs << " Icinga 2/Redis/Icinga DB clock drift: " << Utility::FormatDuration(clockDrift) + << ", greater than CRITICAL threshold (" << Utility::FormatDuration(clockDriftCritical) << ")!"; + } else if (clockDrift > clockDriftWarning) { + warnmsgs << " Icinga 2/Redis/Icinga DB clock drift: " << Utility::FormatDuration(clockDrift) + << ", greater than WARNING threshold (" << Utility::FormatDuration(clockDriftWarning) << ")."; + } + + perfdata->Add(new PerfdataValue("clock_drift", clockDrift, false, "seconds", clockDriftWarning, clockDriftCritical, 0)); + + if (ongoingDumpStart) { + auto ongoingDumpTakes (now - ongoingDumpStart); + + if (!dumpTakesThresholds.Critical.IsEmpty() && ongoingDumpTakes > dumpTakesThresholds.Critical) { + critmsgs << " Ongoing Icinga 2 dump already takes " << Utility::FormatDuration(ongoingDumpTakes) + << ", greater than CRITICAL threshold (" << Utility::FormatDuration(dumpTakesThresholds.Critical) << ")!"; + } else if (!dumpTakesThresholds.Warning.IsEmpty() && ongoingDumpTakes > dumpTakesThresholds.Warning) { + warnmsgs << " Ongoing Icinga 2 dump already takes " << Utility::FormatDuration(ongoingDumpTakes) + << ", greater than WARNING threshold (" << Utility::FormatDuration(dumpTakesThresholds.Warning) << ")."; + } + + perfdata->Add(new PerfdataValue("redis_dump_takes", ongoingDumpTakes, false, "seconds", + dumpTakesThresholds.Warning, dumpTakesThresholds.Critical, 0)); + } + + if (!down && syncOngoingSince) { + auto ongoingSyncTakes (icingadbNow - syncOngoingSince); + + if (!syncTakesThresholds.Critical.IsEmpty() && ongoingSyncTakes > syncTakesThresholds.Critical) { + critmsgs << " Ongoing sync already takes " << Utility::FormatDuration(ongoingSyncTakes) + << ", greater than CRITICAL threshold (" << Utility::FormatDuration(syncTakesThresholds.Critical) << ")!"; + } else if (!syncTakesThresholds.Warning.IsEmpty() && ongoingSyncTakes > syncTakesThresholds.Warning) { + warnmsgs << " Ongoing sync already takes " << Utility::FormatDuration(ongoingSyncTakes) + << ", greater than WARNING threshold (" << Utility::FormatDuration(syncTakesThresholds.Warning) << ")."; + } + + perfdata->Add(new PerfdataValue("database_sync_takes", ongoingSyncTakes, false, "seconds", + syncTakesThresholds.Warning, syncTakesThresholds.Critical, 0)); + } + + auto redisBacklog (now - redisOldestPending); + + if (!redisOldestPending) { + redisBacklog = 0; + } + + if (!icingaBacklogThresholds.Critical.IsEmpty() && redisBacklog > icingaBacklogThresholds.Critical) { + critmsgs << " Icinga 2 query backlog: " << Utility::FormatDuration(redisBacklog) + << ", greater than CRITICAL threshold (" << Utility::FormatDuration(icingaBacklogThresholds.Critical) << ")!"; + } else if (!icingaBacklogThresholds.Warning.IsEmpty() && redisBacklog > icingaBacklogThresholds.Warning) { + warnmsgs << " Icinga 2 query backlog: " << Utility::FormatDuration(redisBacklog) + << ", greater than WARNING threshold (" << Utility::FormatDuration(icingaBacklogThresholds.Warning) << ")."; + } + + perfdata->Add(new PerfdataValue("redis_backlog", redisBacklog, false, "seconds", + icingaBacklogThresholds.Warning, icingaBacklogThresholds.Critical, 0)); + + if (!down) { + double icingadbBacklog = 0; + + if (xReadRtuHistory) { + double minTs = 0; + ObjectLock lock (xReadRtuHistory); + + for (Array::Ptr stream : xReadRtuHistory) { + if (!weResponsible) { + String name = stream->Get(0); + + if (name == "icinga:runtime" || name == "icinga:runtime:state") { + continue; + } + } + + auto ts (GetXMessageTs(Array::Ptr(stream->Get(1))->Get(0))); + + if (minTs == 0 || ts < minTs) { + minTs = ts; + } + } + + if (minTs > 0) { + icingadbBacklog = redisNow - minTs; + } + } + + if (!icingadbBacklogThresholds.Critical.IsEmpty() && icingadbBacklog > icingadbBacklogThresholds.Critical) { + critmsgs << " Query backlog: " << Utility::FormatDuration(icingadbBacklog) + << ", greater than CRITICAL threshold (" << Utility::FormatDuration(icingadbBacklogThresholds.Critical) << ")!"; + } else if (!icingadbBacklogThresholds.Warning.IsEmpty() && icingadbBacklog > icingadbBacklogThresholds.Warning) { + warnmsgs << " Query backlog: " << Utility::FormatDuration(icingadbBacklog) + << ", greater than WARNING threshold (" << Utility::FormatDuration(icingadbBacklogThresholds.Warning) << ")."; + } + + perfdata->Add(new PerfdataValue("database_backlog", icingadbBacklog, false, "seconds", + icingadbBacklogThresholds.Warning, icingadbBacklogThresholds.Critical, 0)); + } + + auto dumpAgo (now - dumpWhen); + + if (dumpWhen) { + perfdata->Add(new PerfdataValue("redis_dump_ago", dumpAgo, false, "seconds", Empty, Empty, 0)); + } + + if (dumpTook) { + perfdata->Add(new PerfdataValue("redis_dump_took", dumpTook, false, "seconds", Empty, Empty, 0)); + } + + if (dumpWhen && dumpTook) { + i2okmsgs << "\n* Last dump: " << Utility::FormatDuration(dumpAgo) + << " ago, took " << Utility::FormatDuration(dumpTook); + } + + auto icingadbUptime (icingadbNow - icingadbStartTime); + + if (!down) { + perfdata->Add(new PerfdataValue("icingadb_uptime", icingadbUptime, false, "seconds", Empty, Empty, 0)); + } + + { + static boost::regex wellNamedUnits (":(bytes|seconds)$"); + ObjectLock lock (goMetricsByCumulativity); + + for (auto& kv : goMetricsByCumulativity) { + bool cumulative = kv.first == "cumulative"; + Dictionary::Ptr goMetricsPerCumulativity = kv.second; + ObjectLock lock (goMetricsPerCumulativity); + + for (auto& kv : goMetricsPerCumulativity) { + std::string unit; + boost::smatch what; + + if (boost::regex_search(kv.first.GetData(), what, wellNamedUnits)) { + unit = what[1]; + } + + bool counter = cumulative && unit.empty(); + auto label ("go" + kv.first); + + for (auto& c : label) { + if (!('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9')) { + c = '_'; + } + } + + perfdata->Add(new PerfdataValue(std::move(label), kv.second, counter, std::move(unit))); + } + } + } + + if (weResponsibleTs) { + perfdata->Add(new PerfdataValue("icingadb_responsible_for", + (weResponsible ? 1 : -1) * (icingadbNow - weResponsibleTs), false, "seconds")); + } + + auto syncAgo (icingadbNow - syncSuccessWhen); + + if (syncSuccessWhen) { + perfdata->Add(new PerfdataValue("database_sync_ago", syncAgo, false, "seconds", Empty, Empty, 0)); + } + + if (syncSuccessTook) { + perfdata->Add(new PerfdataValue("database_sync_took", syncSuccessTook, false, "seconds", Empty, Empty, 0)); + } + + if (syncSuccessWhen && syncSuccessTook) { + idbokmsgs << "\n* Last sync: " << Utility::FormatDuration(syncAgo) + << " ago, took " << Utility::FormatDuration(syncSuccessTook); + } + + std::map statsPerOp; + + const char * const icingadbKnownStats[] = { + "sync_config", "sync_state", "sync_history", "sync_overdue", "cleanup_history" + }; + + for (auto metric : icingadbKnownStats) { + statsPerOp.emplace(std::piecewise_construct, std::forward_as_tuple(metric), std::forward_as_tuple(15 * 60)); + } + + if (xReadStats) { + Array::Ptr messages = Array::Ptr(xReadStats->Get(0))->Get(1); + ObjectLock lock (messages); + + for (Array::Ptr message : messages) { + auto ts (GetXMessageTs(message)); + std::map opsPerSec; + + IcingaDB::AddKvsToMap(message->Get(1), opsPerSec); + + for (auto& kv : opsPerSec) { + auto buf (statsPerOp.find(kv.first)); + + if (buf == statsPerOp.end()) { + buf = statsPerOp.emplace( + std::piecewise_construct, + std::forward_as_tuple(kv.first), std::forward_as_tuple(15 * 60) + ).first; + } + + buf->second.InsertValue(ts, Convert::ToLong(kv.second)); + } + } + } + + for (auto& kv : statsPerOp) { + auto perMin (kv.second.UpdateAndGetValues(now, 60)); + + perfdata->Add(new PerfdataValue("icingadb_" + kv.first + "_1sec", perMin / 60.0, false, "", Empty, Empty, 0)); + perfdata->Add(new PerfdataValue("icingadb_" + kv.first + "_1min", perMin, false, "", Empty, Empty, 0)); + perfdata->Add(new PerfdataValue("icingadb_" + kv.first + "_5mins", kv.second.UpdateAndGetValues(now, 5 * 60), false, "", Empty, Empty, 0)); + perfdata->Add(new PerfdataValue("icingadb_" + kv.first + "_15mins", kv.second.UpdateAndGetValues(now, 15 * 60), false, "", Empty, Empty, 0)); + } + + auto queriesPerMin (redis->GetQueryCount(60)); + + perfdata->Add(new PerfdataValue("redis_queries_1sec", queriesPerMin / 60.0, false, "", Empty, Empty, 0)); + perfdata->Add(new PerfdataValue("redis_queries_1min", queriesPerMin, false, "", Empty, Empty, 0)); + perfdata->Add(new PerfdataValue("redis_queries_5mins", redis->GetQueryCount(5 * 60), false, "", Empty, Empty, 0)); + perfdata->Add(new PerfdataValue("redis_queries_15mins", redis->GetQueryCount(15 * 60), false, "", Empty, Empty, 0)); + + perfdata->Add(new PerfdataValue("redis_pending_queries", redis->GetPendingQueryCount(), false, "", Empty, Empty, 0)); + + struct { + const char * Name; + int (RedisConnection::* Getter)(RingBuffer::SizeType span, RingBuffer::SizeType tv); + } const icingaWriteSubjects[] = { + {"icinga_dump_config", &RedisConnection::GetWrittenConfigFor}, + {"icinga_dump_state", &RedisConnection::GetWrittenStateFor}, + {"icinga_dump_history", &RedisConnection::GetWrittenHistoryFor} + }; + + for (auto subject : icingaWriteSubjects) { + auto perMin ((redis.get()->*subject.Getter)(60, now)); + + perfdata->Add(new PerfdataValue(String(subject.Name) + "_1sec", perMin / 60.0, false, "", Empty, Empty, 0)); + perfdata->Add(new PerfdataValue(String(subject.Name) + "_1min", perMin, false, "", Empty, Empty, 0)); + perfdata->Add(new PerfdataValue(String(subject.Name) + "_5mins", (redis.get()->*subject.Getter)(5 * 60, now), false, "", Empty, Empty, 0)); + perfdata->Add(new PerfdataValue(String(subject.Name) + "_15mins", (redis.get()->*subject.Getter)(15 * 60, now), false, "", Empty, Empty, 0)); + } + + ServiceState state; + std::ostringstream msgbuf; + auto i2okmsg (i2okmsgs.str()); + auto idbokmsg (idbokmsgs.str()); + auto warnmsg (warnmsgs.str()); + auto critmsg (critmsgs.str()); + + msgbuf << "Icinga DB "; + + if (!critmsg.empty()) { + state = ServiceCritical; + msgbuf << "CRITICAL:" << critmsg; + + if (!warnmsg.empty()) { + msgbuf << "\n\nWARNING:" << warnmsg; + } + } else if (!warnmsg.empty()) { + state = ServiceWarning; + msgbuf << "WARNING:" << warnmsg; + } else { + state = ServiceOK; + msgbuf << "OK: Uptime: " << Utility::FormatDuration(icingadbUptime) << ". Version: " << version << "."; + } + + if (!i2okmsg.empty()) { + msgbuf << "\n\nIcinga 2\n--------\n" << i2okmsg; + } + + if (!idbokmsg.empty()) { + msgbuf << "\n\nIcinga DB\n---------\n" << idbokmsg; + } + + cr->SetPerformanceData(perfdata); + ReportIcingadbCheck(checkable, commandObj, cr, msgbuf.str(), state); +} diff --git a/lib/icingadb/icingadbchecktask.hpp b/lib/icingadb/icingadbchecktask.hpp new file mode 100644 index 000000000..ba7d61b1e --- /dev/null +++ b/lib/icingadb/icingadbchecktask.hpp @@ -0,0 +1,29 @@ +/* Icinga 2 | (c) 2022 Icinga GmbH | GPLv2+ */ + +#ifndef ICINGADBCHECKTASK_H +#define ICINGADBCHECKTASK_H + +#include "icingadb/icingadb.hpp" +#include "icinga/checkable.hpp" + +namespace icinga +{ + +/** + * Icinga DB check. + * + * @ingroup icingadb + */ +class IcingadbCheckTask +{ +public: + static void ScriptFunc(const Checkable::Ptr& checkable, const CheckResult::Ptr& cr, + const Dictionary::Ptr& resolvedMacros, bool useResolvedMacros); + +private: + IcingadbCheckTask(); +}; + +} + +#endif /* ICINGADBCHECKTASK_H */ diff --git a/lib/icingadb/redisconnection.cpp b/lib/icingadb/redisconnection.cpp index f2ae3fa7b..f4c21a60f 100644 --- a/lib/icingadb/redisconnection.cpp +++ b/lib/icingadb/redisconnection.cpp @@ -113,7 +113,7 @@ void LogQuery(RedisConnection::Query& query, Log& msg) * @param query Redis query * @param priority The query's priority */ -void RedisConnection::FireAndForgetQuery(RedisConnection::Query query, RedisConnection::QueryPriority priority) +void RedisConnection::FireAndForgetQuery(RedisConnection::Query query, RedisConnection::QueryPriority priority, QueryAffects affects) { { Log msg (LogDebug, "IcingaDB", "Firing and forgetting query:"); @@ -121,9 +121,10 @@ void RedisConnection::FireAndForgetQuery(RedisConnection::Query query, RedisConn } auto item (Shared::Make(std::move(query))); + auto ctime (Utility::GetTime()); - asio::post(m_Strand, [this, item, priority]() { - m_Queues.Writes[priority].emplace(WriteQueueItem{item, nullptr, nullptr, nullptr}); + asio::post(m_Strand, [this, item, priority, ctime, affects]() { + m_Queues.Writes[priority].emplace(WriteQueueItem{item, nullptr, nullptr, nullptr, nullptr, ctime, affects}); m_QueuedWrites.Set(); IncreasePendingQueries(1); }); @@ -135,7 +136,7 @@ void RedisConnection::FireAndForgetQuery(RedisConnection::Query query, RedisConn * @param queries Redis queries * @param priority The queries' priority */ -void RedisConnection::FireAndForgetQueries(RedisConnection::Queries queries, RedisConnection::QueryPriority priority) +void RedisConnection::FireAndForgetQueries(RedisConnection::Queries queries, RedisConnection::QueryPriority priority, QueryAffects affects) { for (auto& query : queries) { Log msg (LogDebug, "IcingaDB", "Firing and forgetting query:"); @@ -143,9 +144,10 @@ void RedisConnection::FireAndForgetQueries(RedisConnection::Queries queries, Red } auto item (Shared::Make(std::move(queries))); + auto ctime (Utility::GetTime()); - asio::post(m_Strand, [this, item, priority]() { - m_Queues.Writes[priority].emplace(WriteQueueItem{nullptr, item, nullptr, nullptr}); + asio::post(m_Strand, [this, item, priority, ctime, affects]() { + m_Queues.Writes[priority].emplace(WriteQueueItem{nullptr, item, nullptr, nullptr, nullptr, ctime, affects}); m_QueuedWrites.Set(); IncreasePendingQueries(item->size()); }); @@ -159,7 +161,7 @@ void RedisConnection::FireAndForgetQueries(RedisConnection::Queries queries, Red * * @return The response */ -RedisConnection::Reply RedisConnection::GetResultOfQuery(RedisConnection::Query query, RedisConnection::QueryPriority priority) +RedisConnection::Reply RedisConnection::GetResultOfQuery(RedisConnection::Query query, RedisConnection::QueryPriority priority, QueryAffects affects) { { Log msg (LogDebug, "IcingaDB", "Executing query:"); @@ -169,9 +171,10 @@ RedisConnection::Reply RedisConnection::GetResultOfQuery(RedisConnection::Query std::promise promise; auto future (promise.get_future()); auto item (Shared>>::Make(std::move(query), std::move(promise))); + auto ctime (Utility::GetTime()); - asio::post(m_Strand, [this, item, priority]() { - m_Queues.Writes[priority].emplace(WriteQueueItem{nullptr, nullptr, item, nullptr}); + asio::post(m_Strand, [this, item, priority, ctime, affects]() { + m_Queues.Writes[priority].emplace(WriteQueueItem{nullptr, nullptr, item, nullptr, nullptr, ctime, affects}); m_QueuedWrites.Set(); IncreasePendingQueries(1); }); @@ -189,7 +192,7 @@ RedisConnection::Reply RedisConnection::GetResultOfQuery(RedisConnection::Query * * @return The responses */ -RedisConnection::Replies RedisConnection::GetResultsOfQueries(RedisConnection::Queries queries, RedisConnection::QueryPriority priority) +RedisConnection::Replies RedisConnection::GetResultsOfQueries(RedisConnection::Queries queries, RedisConnection::QueryPriority priority, QueryAffects affects) { for (auto& query : queries) { Log msg (LogDebug, "IcingaDB", "Executing query:"); @@ -199,9 +202,10 @@ RedisConnection::Replies RedisConnection::GetResultsOfQueries(RedisConnection::Q std::promise promise; auto future (promise.get_future()); auto item (Shared>>::Make(std::move(queries), std::move(promise))); + auto ctime (Utility::GetTime()); - asio::post(m_Strand, [this, item, priority]() { - m_Queues.Writes[priority].emplace(WriteQueueItem{nullptr, nullptr, nullptr, item}); + asio::post(m_Strand, [this, item, priority, ctime, affects]() { + m_Queues.Writes[priority].emplace(WriteQueueItem{nullptr, nullptr, nullptr, item, nullptr, ctime, affects}); m_QueuedWrites.Set(); IncreasePendingQueries(item->first.size()); }); @@ -213,8 +217,10 @@ RedisConnection::Replies RedisConnection::GetResultsOfQueries(RedisConnection::Q void RedisConnection::EnqueueCallback(const std::function& callback, RedisConnection::QueryPriority priority) { - asio::post(m_Strand, [this, callback, priority]() { - m_Queues.Writes[priority].emplace(WriteQueueItem{nullptr, nullptr, nullptr, nullptr, callback}); + auto ctime (Utility::GetTime()); + + asio::post(m_Strand, [this, callback, priority, ctime]() { + m_Queues.Writes[priority].emplace(WriteQueueItem{nullptr, nullptr, nullptr, nullptr, callback, ctime}); m_QueuedWrites.Set(); }); } @@ -230,6 +236,36 @@ void RedisConnection::Sync() GetResultOfQuery({"PING"}, RedisConnection::QueryPriority::SyncConnection); } +/** + * Get the enqueue time of the oldest still queued Redis query + * + * @return *nix timestamp or 0 + */ +double RedisConnection::GetOldestPendingQueryTs() +{ + auto promise (Shared>::Make()); + auto future (promise->get_future()); + + asio::post(m_Strand, [this, promise]() { + double oldest = 0; + + for (auto& queue : m_Queues.Writes) { + if (!queue.second.empty()) { + auto ctime (queue.second.front().CTime); + + if (ctime < oldest || oldest == 0) { + oldest = ctime; + } + } + } + + promise->set_value(oldest); + }); + + future.wait(); + return future.get(); +} + /** * Mark kind as kind of queries not to actually send yet * @@ -625,6 +661,8 @@ void RedisConnection::WriteItem(boost::asio::yield_context& yc, RedisConnection: if (next.Callback) { next.Callback(yc); } + + RecordAffected(next.Affects, Utility::GetTime()); } /** @@ -674,6 +712,11 @@ void RedisConnection::SetConnectedCallback(std::functionm_Strand, [parent, affected, when]() { + parent->RecordAffected(affected, when); + }); + } else { + if (affected.Config) { + m_WrittenConfig.InsertValue(when, affected.Config); + } + + if (affected.State) { + m_WrittenState.InsertValue(when, affected.State); + } + + if (affected.History) { + m_WrittenHistory.InsertValue(when, affected.History); + } + } +} diff --git a/lib/icingadb/redisconnection.hpp b/lib/icingadb/redisconnection.hpp index 608b11e0d..f3e05f9cb 100644 --- a/lib/icingadb/redisconnection.hpp +++ b/lib/icingadb/redisconnection.hpp @@ -74,6 +74,16 @@ namespace icinga SyncConnection = 255 }; + struct QueryAffects + { + size_t Config; + size_t State; + size_t History; + + QueryAffects(size_t config = 0, size_t state = 0, size_t history = 0) + : Config(config), State(state), History(history) { } + }; + RedisConnection(const String& host, int port, const String& path, const String& password, int db, bool useTls, bool insecure, const String& certPath, const String& keyPath, const String& caPath, const String& crlPath, const String& tlsProtocolmin, const String& cipherList, double connectTimeout, DebugInfo di, const Ptr& parent = nullptr); @@ -84,20 +94,48 @@ namespace icinga bool IsConnected(); - void FireAndForgetQuery(Query query, QueryPriority priority); - void FireAndForgetQueries(Queries queries, QueryPriority priority); + void FireAndForgetQuery(Query query, QueryPriority priority, QueryAffects affects = {}); + void FireAndForgetQueries(Queries queries, QueryPriority priority, QueryAffects affects = {}); - Reply GetResultOfQuery(Query query, QueryPriority priority); - Replies GetResultsOfQueries(Queries queries, QueryPriority priority); + Reply GetResultOfQuery(Query query, QueryPriority priority, QueryAffects affects = {}); + Replies GetResultsOfQueries(Queries queries, QueryPriority priority, QueryAffects affects = {}); void EnqueueCallback(const std::function& callback, QueryPriority priority); void Sync(); + double GetOldestPendingQueryTs(); void SuppressQueryKind(QueryPriority kind); void UnsuppressQueryKind(QueryPriority kind); void SetConnectedCallback(std::function callback); + inline bool GetConnected() + { + return m_Connected.load(); + } + + int GetQueryCount(RingBuffer::SizeType span); + + inline int GetPendingQueryCount() + { + return m_PendingQueries; + } + + inline int GetWrittenConfigFor(RingBuffer::SizeType span, RingBuffer::SizeType tv = Utility::GetTime()) + { + return m_WrittenConfig.UpdateAndGetValues(tv, span); + } + + inline int GetWrittenStateFor(RingBuffer::SizeType span, RingBuffer::SizeType tv = Utility::GetTime()) + { + return m_WrittenState.UpdateAndGetValues(tv, span); + } + + inline int GetWrittenHistoryFor(RingBuffer::SizeType span, RingBuffer::SizeType tv = Utility::GetTime()) + { + return m_WrittenHistory.UpdateAndGetValues(tv, span); + } + private: /** * What to do with the responses to Redis queries. @@ -134,6 +172,9 @@ namespace icinga Shared>>::Ptr GetResultOfQuery; Shared>>::Ptr GetResultsOfQueries; std::function Callback; + + double CTime; + QueryAffects Affects; }; typedef boost::asio::ip::tcp Tcp; @@ -175,6 +216,7 @@ namespace icinga void IncreasePendingQueries(int count); void DecreasePendingQueries(int count); + void RecordAffected(QueryAffects affected, double when); template void Handshake(StreamPtr& stream, boost::asio::yield_context& yc); @@ -225,7 +267,10 @@ namespace icinga // Stats RingBuffer m_InputQueries{10}; - RingBuffer m_OutputQueries{10}; + RingBuffer m_OutputQueries{15 * 60}; + RingBuffer m_WrittenConfig{15 * 60}; + RingBuffer m_WrittenState{15 * 60}; + RingBuffer m_WrittenHistory{15 * 60}; int m_PendingQueries{0}; boost::asio::deadline_timer m_LogStatsTimer; Ptr m_Parent;