From 2c3d2f8b8708968ac25eb95d30c6d0bea7c69aa3 Mon Sep 17 00:00:00 2001 From: "Alexander A. Klimov" Date: Thu, 14 Apr 2022 17:04:30 +0200 Subject: [PATCH 01/24] RedisConnection::ReadRESP(): *-1\r\n is null, not [ ] --- lib/icingadb/redisconnection.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/icingadb/redisconnection.hpp b/lib/icingadb/redisconnection.hpp index fb9d1e584..3023fa3e9 100644 --- a/lib/icingadb/redisconnection.hpp +++ b/lib/icingadb/redisconnection.hpp @@ -555,12 +555,12 @@ Value RedisConnection::ReadRESP(AsyncReadStream& stream, boost::asio::yield_cont throw BadRedisInt(std::move(buf)); } - Array::Ptr arr = new Array(); - if (i < 0) { - i = 0; + return Empty; } + Array::Ptr arr = new Array(); + arr->Reserve(i); for (; i; --i) { From 8eef51afeb3abb22aa78e6f3f057971475dabf5e Mon Sep 17 00:00:00 2001 From: "Alexander A. Klimov" Date: Tue, 19 Apr 2022 13:19:54 +0200 Subject: [PATCH 02/24] Introduce IcingaDB::AddKvsToMap() --- lib/icingadb/icingadb-objects.cpp | 13 +------------ lib/icingadb/icingadb.hpp | 16 ++++++++++++++++ 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/lib/icingadb/icingadb-objects.cpp b/lib/icingadb/icingadb-objects.cpp index ac876b61b..b1e759f99 100644 --- a/lib/icingadb/icingadb-objects.cpp +++ b/lib/icingadb/icingadb-objects.cpp @@ -230,18 +230,7 @@ void IcingaDB::UpdateAllConfigObjects() "HSCAN", configCheckSum, cursor, "COUNT", "1000" }, Prio::Config); - Array::Ptr kvs = res->Get(1); - Value* key = nullptr; - ObjectLock oLock (kvs); - - for (auto& kv : kvs) { - if (key) { - redisCheckSums.emplace(std::move(*key), std::move(kv)); - key = nullptr; - } else { - key = &kv; - } - } + AddKvsToMap(res->Get(1), redisCheckSums); cursor = res->Get(0); } while (cursor != "0"); diff --git a/lib/icingadb/icingadb.hpp b/lib/icingadb/icingadb.hpp index ef4483a7a..9e5525193 100644 --- a/lib/icingadb/icingadb.hpp +++ b/lib/icingadb/icingadb.hpp @@ -45,6 +45,22 @@ public: String GetEnvironmentId() 
const override; + template + static void AddKvsToMap(const Array::Ptr& kvs, T& map) + { + Value* key = nullptr; + ObjectLock oLock (kvs); + + for (auto& kv : kvs) { + if (key) { + map.emplace(std::move(*key), std::move(kv)); + key = nullptr; + } else { + key = &kv; + } + } + } + protected: void ValidateTlsProtocolmin(const Lazy& lvalue, const ValidationUtils& utils) override; void ValidateConnectTimeout(const Lazy& lvalue, const ValidationUtils& utils) override; From 88c8d29ee62ad630d6e1c35f03d2f7a3ca9ea729 Mon Sep 17 00:00:00 2001 From: "Alexander A. Klimov" Date: Tue, 21 Jun 2022 15:34:23 +0200 Subject: [PATCH 03/24] Remove Icinga DB perfdata from Icinga check as the Icinga DB check already yields it. --- lib/icingadb/icingadb.cpp | 24 ------------------------ lib/icingadb/icingadb.hpp | 1 - 2 files changed, 25 deletions(-) diff --git a/lib/icingadb/icingadb.cpp b/lib/icingadb/icingadb.cpp index 21702f3a8..0d80d006e 100644 --- a/lib/icingadb/icingadb.cpp +++ b/lib/icingadb/icingadb.cpp @@ -29,8 +29,6 @@ std::mutex IcingaDB::m_EnvironmentIdInitMutex; REGISTER_TYPE(IcingaDB); -REGISTER_STATSFUNCTION(IcingaDB, &IcingaDB::StatsFunc); - IcingaDB::IcingaDB() : m_Rcon(nullptr) { @@ -42,28 +40,6 @@ IcingaDB::IcingaDB() m_PrefixConfigCheckSum = "icinga:checksum:"; } -/** - * Feature stats interface - * - * @param status Key value pairs for feature stats - */ -void IcingaDB::StatsFunc(const Dictionary::Ptr& status, const Array::Ptr& perfdata) -{ - DictionaryData nodes; - - for (auto& icingadb : ConfigType::GetObjectsByType()) { - auto historyBufferItems (icingadb->m_HistoryBulker.Size()); - - nodes.emplace_back(icingadb->GetName(), new Dictionary({ - { "history_buffer_items", historyBufferItems } - })); - - perfdata->Add(new PerfdataValue("icingadb_" + icingadb->GetName() + "_history_buffer_items", historyBufferItems)); - } - - status->Set("icingadb", new Dictionary(std::move(nodes))); -} - void IcingaDB::Validate(int types, const ValidationUtils& utils) { 
ObjectImpl::Validate(types, utils); diff --git a/lib/icingadb/icingadb.hpp b/lib/icingadb/icingadb.hpp index 9e5525193..e4c1be7e5 100644 --- a/lib/icingadb/icingadb.hpp +++ b/lib/icingadb/icingadb.hpp @@ -37,7 +37,6 @@ public: IcingaDB(); static void ConfigStaticInitialize(); - static void StatsFunc(const Dictionary::Ptr& status, const Array::Ptr& perfdata); void Validate(int types, const ValidationUtils& utils) override; virtual void Start(bool runtimeCreated) override; From e4a36bc217fcb5f2375ff51114af0e09b5643242 Mon Sep 17 00:00:00 2001 From: "Alexander A. Klimov" Date: Wed, 1 Jun 2022 11:38:17 +0200 Subject: [PATCH 04/24] Introduce Icinga DB check (like the IDO one) --- doc/10-icinga-template-library.md | 18 + lib/icingadb/CMakeLists.txt | 3 + lib/icingadb/icingadb-itl.conf | 24 ++ lib/icingadb/icingadb-objects.cpp | 28 +- lib/icingadb/icingadb.hpp | 5 + lib/icingadb/icingadb.ti | 10 + lib/icingadb/icingadbchecktask.cpp | 519 +++++++++++++++++++++++++++++ lib/icingadb/icingadbchecktask.hpp | 29 ++ lib/icingadb/redisconnection.cpp | 94 +++++- lib/icingadb/redisconnection.hpp | 55 ++- 10 files changed, 760 insertions(+), 25 deletions(-) create mode 100644 lib/icingadb/icingadb-itl.conf create mode 100644 lib/icingadb/icingadbchecktask.cpp create mode 100644 lib/icingadb/icingadbchecktask.hpp diff --git a/doc/10-icinga-template-library.md b/doc/10-icinga-template-library.md index f99a2c409..102d07add 100644 --- a/doc/10-icinga-template-library.md +++ b/doc/10-icinga-template-library.md @@ -103,6 +103,24 @@ cluster\_zone | **Required.** The zone name. Defaults to `$host.name$`. cluster\_lag\_warning | **Optional.** Warning threshold for log lag in seconds. Applies if the log lag is greater than the threshold. cluster\_lag\_critical | **Optional.** Critical threshold for log lag in seconds. Applies if the log lag is greater than the threshold. +### icingadb + +Check command for the built-in `icingadb` check. 
+ +Custom variables passed as [command parameters](03-monitoring-basics.md#command-passing-parameters): + +Name | Description +------------------------------------------|----------------------------- +icingadb\_name | **Required.** The name of the Icinga DB connection object. Defaults to `icingadb`. +icingadb\_redis\_dump\_takes\_warning | **Optional.** Warning threshold for ongoing Redis dump duration. Applies if the value is higher than the threshold. Defaults to 5 minutes. +icingadb\_redis\_dump\_takes\_critical | **Optional.** Critical threshold for ongoing Redis dump duration. Applies if the value is higher than the threshold. Defaults to 10 minutes. +icingadb\_database\_sync\_takes\_warning | **Optional.** Warning threshold for ongoing database sync duration. Applies if the value is higher than the threshold. Defaults to 5 minutes. +icingadb\_database\_sync\_takes\_critical | **Optional.** Critical threshold for ongoing database sync duration. Applies if the value is higher than the threshold. Defaults to 10 minutes. +icingadb\_redis\_backlog\_warning | **Optional.** Warning threshold for Redis write backlog. Applies if the value is higher than the threshold. Defaults to 5 minutes. +icingadb\_redis\_backlog\_critical | **Optional.** Critical threshold for Redis write backlog. Applies if the value is higher than the threshold. Defaults to 15 minutes. +icingadb\_database\_backlog\_warning | **Optional.** Warning threshold for database sync backlog. Applies if the value is higher than the threshold. Defaults to 5 minutes. +icingadb\_database\_backlog\_critical | **Optional.** Critical threshold for database sync backlog. Applies if the value is higher than the threshold. Defaults to 15 minutes. + ### ido Check command for the built-in `ido` check. 
diff --git a/lib/icingadb/CMakeLists.txt b/lib/icingadb/CMakeLists.txt index 71a7c67f2..de8e4adae 100644 --- a/lib/icingadb/CMakeLists.txt +++ b/lib/icingadb/CMakeLists.txt @@ -2,8 +2,11 @@ mkclass_target(icingadb.ti icingadb-ti.cpp icingadb-ti.hpp) +mkembedconfig_target(icingadb-itl.conf icingadb-itl.cpp) + set(icingadb_SOURCES icingadb.cpp icingadb-objects.cpp icingadb-stats.cpp icingadb-utility.cpp redisconnection.cpp icingadb-ti.hpp + icingadbchecktask.cpp icingadb-itl.cpp ) if(ICINGA2_UNITY_BUILD) diff --git a/lib/icingadb/icingadb-itl.conf b/lib/icingadb/icingadb-itl.conf new file mode 100644 index 000000000..af3b9dc00 --- /dev/null +++ b/lib/icingadb/icingadb-itl.conf @@ -0,0 +1,24 @@ +/* Icinga 2 | (c) 2022 Icinga GmbH | GPLv2+ */ + +System.assert(Internal.run_with_activation_context(function() { + template CheckCommand "icingadb-check-command" use (checkFunc = Internal.IcingadbCheck) { + execute = checkFunc + } + + object CheckCommand "icingadb" { + import "icingadb-check-command" + + vars.icingadb_name = "icingadb" + + vars.icingadb_redis_dump_takes_warning = 5m + vars.icingadb_redis_dump_takes_critical = 10m + vars.icingadb_database_sync_takes_warning = 5m + vars.icingadb_database_sync_takes_critical = 10m + vars.icingadb_redis_backlog_warning = 5m + vars.icingadb_redis_backlog_critical = 15m + vars.icingadb_database_backlog_warning = 5m + vars.icingadb_database_backlog_critical = 15m + } +})) + +Internal.remove("IcingadbCheck") diff --git a/lib/icingadb/icingadb-objects.cpp b/lib/icingadb/icingadb-objects.cpp index b1e759f99..d433e7b8e 100644 --- a/lib/icingadb/icingadb-objects.cpp +++ b/lib/icingadb/icingadb-objects.cpp @@ -171,6 +171,12 @@ void IcingaDB::UpdateAllConfigObjects() Log(LogInformation, "IcingaDB") << "Starting initial config/status dump"; double startTime = Utility::GetTime(); + SetOngoingDumpStart(startTime); + + Defer resetOngoingDumpStart ([this]() { + SetOngoingDumpStart(0); + }); + // Use a Workqueue to pack objects in parallel 
WorkQueue upq(25000, Configuration::Concurrency, LogNotice); upq.SetName("IcingaDB:ConfigDump"); @@ -402,6 +408,8 @@ void IcingaDB::UpdateAllConfigObjects() auto ourEnd (ourCheckSums.end()); auto flushSets ([&]() { + auto affectedConfig (setObject.size() / 2u); + setChecksum.insert(setChecksum.begin(), {"HMSET", configCheckSum}); setObject.insert(setObject.begin(), {"HMSET", configObject}); @@ -415,10 +423,12 @@ void IcingaDB::UpdateAllConfigObjects() setChecksum.clear(); setObject.clear(); - rcon->FireAndForgetQueries(std::move(transaction), Prio::Config); + rcon->FireAndForgetQueries(std::move(transaction), Prio::Config, {affectedConfig}); }); auto flushDels ([&]() { + auto affectedConfig (delObject.size()); + delChecksum.insert(delChecksum.begin(), {"HDEL", configCheckSum}); delObject.insert(delObject.begin(), {"HDEL", configObject}); @@ -432,7 +442,7 @@ void IcingaDB::UpdateAllConfigObjects() delChecksum.clear(); delObject.clear(); - rcon->FireAndForgetQueries(std::move(transaction), Prio::Config); + rcon->FireAndForgetQueries(std::move(transaction), Prio::Config, {affectedConfig}); }); auto setOne ([&]() { @@ -524,8 +534,14 @@ void IcingaDB::UpdateAllConfigObjects() m_Rcon->EnqueueCallback([&p](boost::asio::yield_context& yc) { p.set_value(); }, Prio::Config); p.get_future().wait(); + auto endTime (Utility::GetTime()); + auto took (endTime - startTime); + + SetLastdumpTook(took); + SetLastdumpEnd(endTime); + Log(LogInformation, "IcingaDB") - << "Initial config/status dump finished in " << Utility::GetTime() - startTime << " seconds."; + << "Initial config/status dump finished in " << took << " seconds."; } std::vector>> IcingaDB::ChunkObjects(std::vector> objects, size_t chunkSize) { @@ -1131,7 +1147,7 @@ void IcingaDB::UpdateState(const Checkable::Ptr& checkable, StateUpdate mode) streamadd.emplace_back(IcingaToStreamValue(kv.second)); } - m_Rcon->FireAndForgetQuery(std::move(streamadd), Prio::RuntimeStateStream); + 
m_Rcon->FireAndForgetQuery(std::move(streamadd), Prio::RuntimeStateStream, {0, 1}); } } @@ -1178,7 +1194,7 @@ void IcingaDB::SendConfigUpdate(const ConfigObject::Ptr& object, bool runtimeUpd if (transaction.size() > 1) { transaction.push_back({"EXEC"}); - m_Rcon->FireAndForgetQueries(std::move(transaction), Prio::Config); + m_Rcon->FireAndForgetQueries(std::move(transaction), Prio::Config, {1}); } if (checkable) { @@ -2343,7 +2359,7 @@ void IcingaDB::ForwardHistoryEntries() if (m_Rcon && m_Rcon->IsConnected()) { try { - m_Rcon->GetResultsOfQueries(haystack, Prio::History); + m_Rcon->GetResultsOfQueries(haystack, Prio::History, {0, 0, haystack.size()}); break; } catch (const std::exception& ex) { logFailure(ex.what()); diff --git a/lib/icingadb/icingadb.hpp b/lib/icingadb/icingadb.hpp index e4c1be7e5..075eb36ac 100644 --- a/lib/icingadb/icingadb.hpp +++ b/lib/icingadb/icingadb.hpp @@ -44,6 +44,11 @@ public: String GetEnvironmentId() const override; + inline RedisConnection::Ptr GetConnection() + { + return m_Rcon; + } + template static void AddKvsToMap(const Array::Ptr& kvs, T& map) { diff --git a/lib/icingadb/icingadb.ti b/lib/icingadb/icingadb.ti index 00ca95691..1c649c8e4 100644 --- a/lib/icingadb/icingadb.ti +++ b/lib/icingadb/icingadb.ti @@ -48,6 +48,16 @@ class IcingaDB : ConfigObject [no_storage] String environment_id { get; }; + + [set_protected] double ongoing_dump_start { + default {{{ return 0; }}} + }; + [state, set_protected] double lastdump_end { + default {{{ return 0; }}} + }; + [state, set_protected] double lastdump_took { + default {{{ return 0; }}} + }; }; } diff --git a/lib/icingadb/icingadbchecktask.cpp b/lib/icingadb/icingadbchecktask.cpp new file mode 100644 index 000000000..2327a2fba --- /dev/null +++ b/lib/icingadb/icingadbchecktask.cpp @@ -0,0 +1,519 @@ +/* Icinga 2 | (c) 2022 Icinga GmbH | GPLv2+ */ + +#include "icingadb/icingadbchecktask.hpp" +#include "icinga/host.hpp" +#include "icinga/checkcommand.hpp" +#include 
"icinga/macroprocessor.hpp" +#include "remote/apilistener.hpp" +#include "remote/endpoint.hpp" +#include "remote/zone.hpp" +#include "base/function.hpp" +#include "base/json.hpp" +#include "base/utility.hpp" +#include "base/perfdatavalue.hpp" +#include "base/configtype.hpp" +#include "base/convert.hpp" +#include + +using namespace icinga; + +REGISTER_FUNCTION_NONCONST(Internal, IcingadbCheck, &IcingadbCheckTask::ScriptFunc, "checkable:cr:resolvedMacros:useResolvedMacros"); + +static void ReportIcingadbCheck( + const Checkable::Ptr& checkable, const CheckCommand::Ptr& commandObj, + const CheckResult::Ptr& cr, String output, ServiceState state) +{ + if (Checkable::ExecuteCommandProcessFinishedHandler) { + double now = Utility::GetTime(); + ProcessResult pr; + pr.PID = -1; + pr.Output = std::move(output); + pr.ExecutionStart = now; + pr.ExecutionEnd = now; + pr.ExitStatus = state; + + Checkable::ExecuteCommandProcessFinishedHandler(commandObj->GetName(), pr); + } else { + cr->SetState(state); + cr->SetOutput(output); + checkable->ProcessCheckResult(cr); + } +} + +static inline +double GetXMessageTs(const Array::Ptr& xMessage) +{ + return Convert::ToLong(String(xMessage->Get(0)).Split("-")[0]) / 1000.0; +} + +void IcingadbCheckTask::ScriptFunc(const Checkable::Ptr& checkable, const CheckResult::Ptr& cr, + const Dictionary::Ptr& resolvedMacros, bool useResolvedMacros) +{ + CheckCommand::Ptr commandObj = CheckCommand::ExecuteOverride ? 
CheckCommand::ExecuteOverride : checkable->GetCheckCommand(); + + Host::Ptr host; + Service::Ptr service; + tie(host, service) = GetHostService(checkable); + + MacroProcessor::ResolverList resolvers; + String silenceMissingMacroWarning; + + if (MacroResolver::OverrideMacros) + resolvers.emplace_back("override", MacroResolver::OverrideMacros); + + if (service) + resolvers.emplace_back("service", service); + resolvers.emplace_back("host", host); + resolvers.emplace_back("command", commandObj); + resolvers.emplace_back("icinga", IcingaApplication::GetInstance()); + + auto resolve ([&](const String& macro) { + return MacroProcessor::ResolveMacros(macro, resolvers, checkable->GetLastCheckResult(), + &silenceMissingMacroWarning, MacroProcessor::EscapeCallback(), resolvedMacros, useResolvedMacros); + }); + + struct Thresholds + { + Value Warning, Critical; + }; + + auto resolveThresholds ([&resolve](const String& wmacro, const String& cmacro) { + return Thresholds{resolve(wmacro), resolve(cmacro)}; + }); + + String icingadbName = resolve("$icingadb_name$"); + + auto dumpTakesThresholds (resolveThresholds("$icingadb_redis_dump_takes_warning$", "$icingadb_redis_dump_takes_critical$")); + auto syncTakesThresholds (resolveThresholds("$icingadb_database_sync_takes_warning$", "$icingadb_database_sync_takes_critical$")); + auto icingaBacklogThresholds (resolveThresholds("$icingadb_redis_backlog_warning$", "$icingadb_redis_backlog_critical$")); + auto icingadbBacklogThresholds (resolveThresholds("$icingadb_database_backlog_warning$", "$icingadb_database_backlog_critical$")); + + if (resolvedMacros && !useResolvedMacros) + return; + + if (icingadbName.IsEmpty()) { + ReportIcingadbCheck(checkable, commandObj, cr, "Icinga DB UNKNOWN: Attribute 'icingadb_name' must be set.", ServiceUnknown); + return; + } + + auto conn (IcingaDB::GetByName(icingadbName)); + + if (!conn) { + ReportIcingadbCheck(checkable, commandObj, cr, "Icinga DB UNKNOWN: Icinga DB connection '" + icingadbName + "' 
does not exist.", ServiceUnknown); + return; + } + + auto redis (conn->GetConnection()); + + if (!redis->GetConnected()) { + ReportIcingadbCheck(checkable, commandObj, cr, "Icinga DB CRITICAL: Could not connect to Redis.", ServiceCritical); + return; + } + + auto now (Utility::GetTime()); + Array::Ptr redisTime, xReadHeartbeat, xReadStats, xReadRtuHistory; + + try { + auto replies (redis->GetResultsOfQueries( + { + {"TIME"}, + {"XREAD", "STREAMS", "icingadb:telemetry:heartbeat", "0-0"}, + {"XREAD", "STREAMS", "icingadb:telemetry:stats", "0-0"}, + { + "XREAD", "COUNT", "1", "STREAMS", + "icinga:runtime", "icinga:runtime:state", + "icinga:history:stream:acknowledgement", "icinga:history:stream:comment", + "icinga:history:stream:downtime", "icinga:history:stream:flapping", + "icinga:history:stream:notification", "icinga:history:stream:state", + "0-0", "0-0", "0-0", "0-0", "0-0", "0-0", "0-0", "0-0" + } + }, + RedisConnection::QueryPriority::Heartbeat + )); + + redisTime = std::move(replies.at(0)); + xReadHeartbeat = std::move(replies.at(1)); + xReadStats = std::move(replies.at(2)); + xReadRtuHistory = std::move(replies.at(3)); + } catch (const std::exception& ex) { + ReportIcingadbCheck( + checkable, commandObj, cr, + String("Icinga DB CRITICAL: Could not read XREAD responses from Redis: ") + ex.what(), ServiceCritical + ); + return; + } + + if (!xReadHeartbeat) { + ReportIcingadbCheck( + checkable, commandObj, cr, + "Icinga DB CRITICAL: The Icinga DB daemon seems to have never run. 
(Missing heartbeat)", + ServiceCritical + ); + + return; + } + + auto redisOldestPending (redis->GetOldestPendingQueryTs()); + auto ongoingDumpStart (conn->GetOngoingDumpStart()); + auto dumpWhen (conn->GetLastdumpEnd()); + auto dumpTook (conn->GetLastdumpTook()); + + auto redisNow (Convert::ToLong(redisTime->Get(0)) + Convert::ToLong(redisTime->Get(1)) / 1000000.0); + Array::Ptr heartbeatMessage = Array::Ptr(Array::Ptr(xReadHeartbeat->Get(0))->Get(1))->Get(0); + auto heartbeatTime (GetXMessageTs(heartbeatMessage)); + std::map heartbeatData; + + IcingaDB::AddKvsToMap(heartbeatMessage->Get(1), heartbeatData); + + String version = heartbeatData.at("general:version"); + auto icingadbNow (Convert::ToLong(heartbeatData.at("general:time")) / 1000.0 + (redisNow - heartbeatTime)); + auto icingadbStartTime (Convert::ToLong(heartbeatData.at("general:start-time")) / 1000.0); + String errMsg (heartbeatData.at("general:err")); + auto errSince (Convert::ToLong(heartbeatData.at("general:err-since")) / 1000.0); + Dictionary::Ptr goMetricsByCumulativity (JsonDecode(heartbeatData.at("go:metrics"))); + auto heartbeatLastReceived (Convert::ToLong(heartbeatData.at("heartbeat:last-received")) / 1000.0); + bool weResponsible = Convert::ToLong(heartbeatData.at("ha:responsible")); + auto weResponsibleTs (Convert::ToLong(heartbeatData.at("ha:responsible-ts")) / 1000.0); + bool otherResponsible = Convert::ToLong(heartbeatData.at("ha:other-responsible")); + auto syncOngoingSince (Convert::ToLong(heartbeatData.at("sync:ongoing-since")) / 1000.0); + auto syncSuccessWhen (Convert::ToLong(heartbeatData.at("sync:success-finish")) / 1000.0); + auto syncSuccessTook (Convert::ToLong(heartbeatData.at("sync:success-duration")) / 1000.0); + + std::ostringstream i2okmsgs, idbokmsgs, warnmsgs, critmsgs; + Array::Ptr perfdata = new Array(); + + i2okmsgs << std::fixed << std::setprecision(3); + idbokmsgs << std::fixed << std::setprecision(3); + warnmsgs << std::fixed << std::setprecision(3); + critmsgs << 
std::fixed << std::setprecision(3); + + const auto downForCritical (10); + auto downFor (redisNow - heartbeatTime); + bool down = false; + + if (downFor > downForCritical) { + down = true; + + critmsgs << " Last seen " << Utility::FormatDuration(downFor) + << " ago, greater than CRITICAL threshold (" << Utility::FormatDuration(downForCritical) << ")!"; + } else { + idbokmsgs << "\n* Last seen: " << Utility::FormatDuration(downFor) << " ago"; + } + + perfdata->Add(new PerfdataValue("icingadb_heartbeat_age", downFor, false, "seconds", Empty, downForCritical, 0)); + + const auto errForCritical (10); + auto err (!errMsg.IsEmpty()); + auto errFor (icingadbNow - errSince); + + if (err) { + if (errFor > errForCritical) { + critmsgs << " ERROR: " << errMsg << "!"; + } + + perfdata->Add(new PerfdataValue("err_for", errFor * (err ? 1 : -1), false, "seconds", Empty, errForCritical, 0)); + } + + if (!down) { + const auto heartbeatLagWarning (3/* Icinga DB read freq. */ + 1/* Icinga DB write freq. */ + 2/* threshold */); + auto heartbeatLag (fmin(icingadbNow - heartbeatLastReceived, 10 * 60)); + + if (!heartbeatLastReceived) { + critmsgs << " Lost Icinga 2 heartbeat!"; + } else if (heartbeatLag > heartbeatLagWarning) { + warnmsgs << " Icinga 2 heartbeat lag: " << Utility::FormatDuration(heartbeatLag) + << ", greater than WARNING threshold (" << Utility::FormatDuration(heartbeatLagWarning) << ")."; + } + + perfdata->Add(new PerfdataValue("icinga_heartbeat_lag", heartbeatLag, false, "seconds", heartbeatLagWarning, Empty, 0)); + } + + if (weResponsible) { + idbokmsgs << "\n* Responsible"; + } else if (otherResponsible) { + idbokmsgs << "\n* Not responsible, but another instance is"; + } else { + critmsgs << " No instance is responsible!"; + } + + perfdata->Add(new PerfdataValue("icingadb_responsible_instances", int(weResponsible || otherResponsible), false, "", Empty, Empty, 0, 1)); + + const auto clockDriftWarning (5); + const auto clockDriftCritical (30); + auto clockDrift 
(fmax(fabs(now - redisNow), fmax(fabs(redisNow - icingadbNow), fabs(icingadbNow - now)))); + + if (clockDrift > clockDriftCritical) { + critmsgs << " Icinga 2/Redis/Icinga DB clock drift: " << Utility::FormatDuration(clockDrift) + << ", greater than CRITICAL threshold (" << Utility::FormatDuration(clockDriftCritical) << ")!"; + } else if (clockDrift > clockDriftWarning) { + warnmsgs << " Icinga 2/Redis/Icinga DB clock drift: " << Utility::FormatDuration(clockDrift) + << ", greater than WARNING threshold (" << Utility::FormatDuration(clockDriftWarning) << ")."; + } + + perfdata->Add(new PerfdataValue("clock_drift", clockDrift, false, "seconds", clockDriftWarning, clockDriftCritical, 0)); + + if (ongoingDumpStart) { + auto ongoingDumpTakes (now - ongoingDumpStart); + + if (!dumpTakesThresholds.Critical.IsEmpty() && ongoingDumpTakes > dumpTakesThresholds.Critical) { + critmsgs << " Ongoing Icinga 2 dump already takes " << Utility::FormatDuration(ongoingDumpTakes) + << ", greater than CRITICAL threshold (" << Utility::FormatDuration(dumpTakesThresholds.Critical) << ")!"; + } else if (!dumpTakesThresholds.Warning.IsEmpty() && ongoingDumpTakes > dumpTakesThresholds.Warning) { + warnmsgs << " Ongoing Icinga 2 dump already takes " << Utility::FormatDuration(ongoingDumpTakes) + << ", greater than WARNING threshold (" << Utility::FormatDuration(dumpTakesThresholds.Warning) << ")."; + } + + perfdata->Add(new PerfdataValue("redis_dump_takes", ongoingDumpTakes, false, "seconds", + dumpTakesThresholds.Warning, dumpTakesThresholds.Critical, 0)); + } + + if (!down && syncOngoingSince) { + auto ongoingSyncTakes (icingadbNow - syncOngoingSince); + + if (!syncTakesThresholds.Critical.IsEmpty() && ongoingSyncTakes > syncTakesThresholds.Critical) { + critmsgs << " Ongoing sync already takes " << Utility::FormatDuration(ongoingSyncTakes) + << ", greater than CRITICAL threshold (" << Utility::FormatDuration(syncTakesThresholds.Critical) << ")!"; + } else if 
(!syncTakesThresholds.Warning.IsEmpty() && ongoingSyncTakes > syncTakesThresholds.Warning) { + warnmsgs << " Ongoing sync already takes " << Utility::FormatDuration(ongoingSyncTakes) + << ", greater than WARNING threshold (" << Utility::FormatDuration(syncTakesThresholds.Warning) << ")."; + } + + perfdata->Add(new PerfdataValue("database_sync_takes", ongoingSyncTakes, false, "seconds", + syncTakesThresholds.Warning, syncTakesThresholds.Critical, 0)); + } + + auto redisBacklog (now - redisOldestPending); + + if (!redisOldestPending) { + redisBacklog = 0; + } + + if (!icingaBacklogThresholds.Critical.IsEmpty() && redisBacklog > icingaBacklogThresholds.Critical) { + critmsgs << " Icinga 2 query backlog: " << Utility::FormatDuration(redisBacklog) + << ", greater than CRITICAL threshold (" << Utility::FormatDuration(icingaBacklogThresholds.Critical) << ")!"; + } else if (!icingaBacklogThresholds.Warning.IsEmpty() && redisBacklog > icingaBacklogThresholds.Warning) { + warnmsgs << " Icinga 2 query backlog: " << Utility::FormatDuration(redisBacklog) + << ", greater than WARNING threshold (" << Utility::FormatDuration(icingaBacklogThresholds.Warning) << ")."; + } + + perfdata->Add(new PerfdataValue("redis_backlog", redisBacklog, false, "seconds", + icingaBacklogThresholds.Warning, icingaBacklogThresholds.Critical, 0)); + + if (!down) { + double icingadbBacklog = 0; + + if (xReadRtuHistory) { + double minTs = 0; + ObjectLock lock (xReadRtuHistory); + + for (Array::Ptr stream : xReadRtuHistory) { + if (!weResponsible) { + String name = stream->Get(0); + + if (name == "icinga:runtime" || name == "icinga:runtime:state") { + continue; + } + } + + auto ts (GetXMessageTs(Array::Ptr(stream->Get(1))->Get(0))); + + if (minTs == 0 || ts < minTs) { + minTs = ts; + } + } + + if (minTs > 0) { + icingadbBacklog = redisNow - minTs; + } + } + + if (!icingadbBacklogThresholds.Critical.IsEmpty() && icingadbBacklog > icingadbBacklogThresholds.Critical) { + critmsgs << " Query backlog: " << 
Utility::FormatDuration(icingadbBacklog) + << ", greater than CRITICAL threshold (" << Utility::FormatDuration(icingadbBacklogThresholds.Critical) << ")!"; + } else if (!icingadbBacklogThresholds.Warning.IsEmpty() && icingadbBacklog > icingadbBacklogThresholds.Warning) { + warnmsgs << " Query backlog: " << Utility::FormatDuration(icingadbBacklog) + << ", greater than WARNING threshold (" << Utility::FormatDuration(icingadbBacklogThresholds.Warning) << ")."; + } + + perfdata->Add(new PerfdataValue("database_backlog", icingadbBacklog, false, "seconds", + icingadbBacklogThresholds.Warning, icingadbBacklogThresholds.Critical, 0)); + } + + auto dumpAgo (now - dumpWhen); + + if (dumpWhen) { + perfdata->Add(new PerfdataValue("redis_dump_ago", dumpAgo, false, "seconds", Empty, Empty, 0)); + } + + if (dumpTook) { + perfdata->Add(new PerfdataValue("redis_dump_took", dumpTook, false, "seconds", Empty, Empty, 0)); + } + + if (dumpWhen && dumpTook) { + i2okmsgs << "\n* Last dump: " << Utility::FormatDuration(dumpAgo) + << " ago, took " << Utility::FormatDuration(dumpTook); + } + + auto icingadbUptime (icingadbNow - icingadbStartTime); + + if (!down) { + perfdata->Add(new PerfdataValue("icingadb_uptime", icingadbUptime, false, "seconds", Empty, Empty, 0)); + } + + { + static boost::regex wellNamedUnits (":(bytes|seconds)$"); + ObjectLock lock (goMetricsByCumulativity); + + for (auto& kv : goMetricsByCumulativity) { + bool cumulative = kv.first == "cumulative"; + Dictionary::Ptr goMetricsPerCumulativity = kv.second; + ObjectLock lock (goMetricsPerCumulativity); + + for (auto& kv : goMetricsPerCumulativity) { + std::string unit; + boost::smatch what; + + if (boost::regex_search(kv.first.GetData(), what, wellNamedUnits)) { + unit = what[1]; + } + + bool counter = cumulative && unit.empty(); + auto label ("go" + kv.first); + + for (auto& c : label) { + if (!('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9')) { + c = '_'; + } + } + + perfdata->Add(new 
PerfdataValue(std::move(label), kv.second, counter, std::move(unit))); + } + } + } + + if (weResponsibleTs) { + perfdata->Add(new PerfdataValue("icingadb_responsible_for", + (weResponsible ? 1 : -1) * (icingadbNow - weResponsibleTs), false, "seconds")); + } + + auto syncAgo (icingadbNow - syncSuccessWhen); + + if (syncSuccessWhen) { + perfdata->Add(new PerfdataValue("database_sync_ago", syncAgo, false, "seconds", Empty, Empty, 0)); + } + + if (syncSuccessTook) { + perfdata->Add(new PerfdataValue("database_sync_took", syncSuccessTook, false, "seconds", Empty, Empty, 0)); + } + + if (syncSuccessWhen && syncSuccessTook) { + idbokmsgs << "\n* Last sync: " << Utility::FormatDuration(syncAgo) + << " ago, took " << Utility::FormatDuration(syncSuccessTook); + } + + std::map statsPerOp; + + const char * const icingadbKnownStats[] = { + "sync_config", "sync_state", "sync_history", "sync_overdue", "cleanup_history" + }; + + for (auto metric : icingadbKnownStats) { + statsPerOp.emplace(std::piecewise_construct, std::forward_as_tuple(metric), std::forward_as_tuple(15 * 60)); + } + + if (xReadStats) { + Array::Ptr messages = Array::Ptr(xReadStats->Get(0))->Get(1); + ObjectLock lock (messages); + + for (Array::Ptr message : messages) { + auto ts (GetXMessageTs(message)); + std::map opsPerSec; + + IcingaDB::AddKvsToMap(message->Get(1), opsPerSec); + + for (auto& kv : opsPerSec) { + auto buf (statsPerOp.find(kv.first)); + + if (buf == statsPerOp.end()) { + buf = statsPerOp.emplace( + std::piecewise_construct, + std::forward_as_tuple(kv.first), std::forward_as_tuple(15 * 60) + ).first; + } + + buf->second.InsertValue(ts, Convert::ToLong(kv.second)); + } + } + } + + for (auto& kv : statsPerOp) { + auto perMin (kv.second.UpdateAndGetValues(now, 60)); + + perfdata->Add(new PerfdataValue("icingadb_" + kv.first + "_1sec", perMin / 60.0, false, "", Empty, Empty, 0)); + perfdata->Add(new PerfdataValue("icingadb_" + kv.first + "_1min", perMin, false, "", Empty, Empty, 0)); + 
perfdata->Add(new PerfdataValue("icingadb_" + kv.first + "_5mins", kv.second.UpdateAndGetValues(now, 5 * 60), false, "", Empty, Empty, 0)); + perfdata->Add(new PerfdataValue("icingadb_" + kv.first + "_15mins", kv.second.UpdateAndGetValues(now, 15 * 60), false, "", Empty, Empty, 0)); + } + + auto queriesPerMin (redis->GetQueryCount(60)); + + perfdata->Add(new PerfdataValue("redis_queries_1sec", queriesPerMin / 60.0, false, "", Empty, Empty, 0)); + perfdata->Add(new PerfdataValue("redis_queries_1min", queriesPerMin, false, "", Empty, Empty, 0)); + perfdata->Add(new PerfdataValue("redis_queries_5mins", redis->GetQueryCount(5 * 60), false, "", Empty, Empty, 0)); + perfdata->Add(new PerfdataValue("redis_queries_15mins", redis->GetQueryCount(15 * 60), false, "", Empty, Empty, 0)); + + perfdata->Add(new PerfdataValue("redis_pending_queries", redis->GetPendingQueryCount(), false, "", Empty, Empty, 0)); + + struct { + const char * Name; + int (RedisConnection::* Getter)(RingBuffer::SizeType span, RingBuffer::SizeType tv); + } const icingaWriteSubjects[] = { + {"icinga_dump_config", &RedisConnection::GetWrittenConfigFor}, + {"icinga_dump_state", &RedisConnection::GetWrittenStateFor}, + {"icinga_dump_history", &RedisConnection::GetWrittenHistoryFor} + }; + + for (auto subject : icingaWriteSubjects) { + auto perMin ((redis.get()->*subject.Getter)(60, now)); + + perfdata->Add(new PerfdataValue(String(subject.Name) + "_1sec", perMin / 60.0, false, "", Empty, Empty, 0)); + perfdata->Add(new PerfdataValue(String(subject.Name) + "_1min", perMin, false, "", Empty, Empty, 0)); + perfdata->Add(new PerfdataValue(String(subject.Name) + "_5mins", (redis.get()->*subject.Getter)(5 * 60, now), false, "", Empty, Empty, 0)); + perfdata->Add(new PerfdataValue(String(subject.Name) + "_15mins", (redis.get()->*subject.Getter)(15 * 60, now), false, "", Empty, Empty, 0)); + } + + ServiceState state; + std::ostringstream msgbuf; + auto i2okmsg (i2okmsgs.str()); + auto idbokmsg (idbokmsgs.str()); + 
auto warnmsg (warnmsgs.str()); + auto critmsg (critmsgs.str()); + + msgbuf << "Icinga DB "; + + if (!critmsg.empty()) { + state = ServiceCritical; + msgbuf << "CRITICAL:" << critmsg; + + if (!warnmsg.empty()) { + msgbuf << "\n\nWARNING:" << warnmsg; + } + } else if (!warnmsg.empty()) { + state = ServiceWarning; + msgbuf << "WARNING:" << warnmsg; + } else { + state = ServiceOK; + msgbuf << "OK: Uptime: " << Utility::FormatDuration(icingadbUptime) << ". Version: " << version << "."; + } + + if (!i2okmsg.empty()) { + msgbuf << "\n\nIcinga 2\n--------\n" << i2okmsg; + } + + if (!idbokmsg.empty()) { + msgbuf << "\n\nIcinga DB\n---------\n" << idbokmsg; + } + + cr->SetPerformanceData(perfdata); + ReportIcingadbCheck(checkable, commandObj, cr, msgbuf.str(), state); +} diff --git a/lib/icingadb/icingadbchecktask.hpp b/lib/icingadb/icingadbchecktask.hpp new file mode 100644 index 000000000..ba7d61b1e --- /dev/null +++ b/lib/icingadb/icingadbchecktask.hpp @@ -0,0 +1,29 @@ +/* Icinga 2 | (c) 2022 Icinga GmbH | GPLv2+ */ + +#ifndef ICINGADBCHECKTASK_H +#define ICINGADBCHECKTASK_H + +#include "icingadb/icingadb.hpp" +#include "icinga/checkable.hpp" + +namespace icinga +{ + +/** + * Icinga DB check. 
+ * + * @ingroup icingadb + */ +class IcingadbCheckTask +{ +public: + static void ScriptFunc(const Checkable::Ptr& checkable, const CheckResult::Ptr& cr, + const Dictionary::Ptr& resolvedMacros, bool useResolvedMacros); + +private: + IcingadbCheckTask(); +}; + +} + +#endif /* ICINGADBCHECKTASK_H */ diff --git a/lib/icingadb/redisconnection.cpp b/lib/icingadb/redisconnection.cpp index f2ae3fa7b..f4c21a60f 100644 --- a/lib/icingadb/redisconnection.cpp +++ b/lib/icingadb/redisconnection.cpp @@ -113,7 +113,7 @@ void LogQuery(RedisConnection::Query& query, Log& msg) * @param query Redis query * @param priority The query's priority */ -void RedisConnection::FireAndForgetQuery(RedisConnection::Query query, RedisConnection::QueryPriority priority) +void RedisConnection::FireAndForgetQuery(RedisConnection::Query query, RedisConnection::QueryPriority priority, QueryAffects affects) { { Log msg (LogDebug, "IcingaDB", "Firing and forgetting query:"); @@ -121,9 +121,10 @@ void RedisConnection::FireAndForgetQuery(RedisConnection::Query query, RedisConn } auto item (Shared::Make(std::move(query))); + auto ctime (Utility::GetTime()); - asio::post(m_Strand, [this, item, priority]() { - m_Queues.Writes[priority].emplace(WriteQueueItem{item, nullptr, nullptr, nullptr}); + asio::post(m_Strand, [this, item, priority, ctime, affects]() { + m_Queues.Writes[priority].emplace(WriteQueueItem{item, nullptr, nullptr, nullptr, nullptr, ctime, affects}); m_QueuedWrites.Set(); IncreasePendingQueries(1); }); @@ -135,7 +136,7 @@ void RedisConnection::FireAndForgetQuery(RedisConnection::Query query, RedisConn * @param queries Redis queries * @param priority The queries' priority */ -void RedisConnection::FireAndForgetQueries(RedisConnection::Queries queries, RedisConnection::QueryPriority priority) +void RedisConnection::FireAndForgetQueries(RedisConnection::Queries queries, RedisConnection::QueryPriority priority, QueryAffects affects) { for (auto& query : queries) { Log msg (LogDebug, "IcingaDB", 
"Firing and forgetting query:"); @@ -143,9 +144,10 @@ void RedisConnection::FireAndForgetQueries(RedisConnection::Queries queries, Red } auto item (Shared::Make(std::move(queries))); + auto ctime (Utility::GetTime()); - asio::post(m_Strand, [this, item, priority]() { - m_Queues.Writes[priority].emplace(WriteQueueItem{nullptr, item, nullptr, nullptr}); + asio::post(m_Strand, [this, item, priority, ctime, affects]() { + m_Queues.Writes[priority].emplace(WriteQueueItem{nullptr, item, nullptr, nullptr, nullptr, ctime, affects}); m_QueuedWrites.Set(); IncreasePendingQueries(item->size()); }); @@ -159,7 +161,7 @@ void RedisConnection::FireAndForgetQueries(RedisConnection::Queries queries, Red * * @return The response */ -RedisConnection::Reply RedisConnection::GetResultOfQuery(RedisConnection::Query query, RedisConnection::QueryPriority priority) +RedisConnection::Reply RedisConnection::GetResultOfQuery(RedisConnection::Query query, RedisConnection::QueryPriority priority, QueryAffects affects) { { Log msg (LogDebug, "IcingaDB", "Executing query:"); @@ -169,9 +171,10 @@ RedisConnection::Reply RedisConnection::GetResultOfQuery(RedisConnection::Query std::promise promise; auto future (promise.get_future()); auto item (Shared>>::Make(std::move(query), std::move(promise))); + auto ctime (Utility::GetTime()); - asio::post(m_Strand, [this, item, priority]() { - m_Queues.Writes[priority].emplace(WriteQueueItem{nullptr, nullptr, item, nullptr}); + asio::post(m_Strand, [this, item, priority, ctime, affects]() { + m_Queues.Writes[priority].emplace(WriteQueueItem{nullptr, nullptr, item, nullptr, nullptr, ctime, affects}); m_QueuedWrites.Set(); IncreasePendingQueries(1); }); @@ -189,7 +192,7 @@ RedisConnection::Reply RedisConnection::GetResultOfQuery(RedisConnection::Query * * @return The responses */ -RedisConnection::Replies RedisConnection::GetResultsOfQueries(RedisConnection::Queries queries, RedisConnection::QueryPriority priority) +RedisConnection::Replies 
RedisConnection::GetResultsOfQueries(RedisConnection::Queries queries, RedisConnection::QueryPriority priority, QueryAffects affects) { for (auto& query : queries) { Log msg (LogDebug, "IcingaDB", "Executing query:"); @@ -199,9 +202,10 @@ RedisConnection::Replies RedisConnection::GetResultsOfQueries(RedisConnection::Q std::promise promise; auto future (promise.get_future()); auto item (Shared>>::Make(std::move(queries), std::move(promise))); + auto ctime (Utility::GetTime()); - asio::post(m_Strand, [this, item, priority]() { - m_Queues.Writes[priority].emplace(WriteQueueItem{nullptr, nullptr, nullptr, item}); + asio::post(m_Strand, [this, item, priority, ctime, affects]() { + m_Queues.Writes[priority].emplace(WriteQueueItem{nullptr, nullptr, nullptr, item, nullptr, ctime, affects}); m_QueuedWrites.Set(); IncreasePendingQueries(item->first.size()); }); @@ -213,8 +217,10 @@ RedisConnection::Replies RedisConnection::GetResultsOfQueries(RedisConnection::Q void RedisConnection::EnqueueCallback(const std::function& callback, RedisConnection::QueryPriority priority) { - asio::post(m_Strand, [this, callback, priority]() { - m_Queues.Writes[priority].emplace(WriteQueueItem{nullptr, nullptr, nullptr, nullptr, callback}); + auto ctime (Utility::GetTime()); + + asio::post(m_Strand, [this, callback, priority, ctime]() { + m_Queues.Writes[priority].emplace(WriteQueueItem{nullptr, nullptr, nullptr, nullptr, callback, ctime}); m_QueuedWrites.Set(); }); } @@ -230,6 +236,36 @@ void RedisConnection::Sync() GetResultOfQuery({"PING"}, RedisConnection::QueryPriority::SyncConnection); } +/** + * Get the enqueue time of the oldest still queued Redis query + * + * @return *nix timestamp or 0 + */ +double RedisConnection::GetOldestPendingQueryTs() +{ + auto promise (Shared>::Make()); + auto future (promise->get_future()); + + asio::post(m_Strand, [this, promise]() { + double oldest = 0; + + for (auto& queue : m_Queues.Writes) { + if (!queue.second.empty()) { + auto ctime 
(queue.second.front().CTime); + + if (ctime < oldest || oldest == 0) { + oldest = ctime; + } + } + } + + promise->set_value(oldest); + }); + + future.wait(); + return future.get(); +} + /** * Mark kind as kind of queries not to actually send yet * @@ -625,6 +661,8 @@ void RedisConnection::WriteItem(boost::asio::yield_context& yc, RedisConnection: if (next.Callback) { next.Callback(yc); } + + RecordAffected(next.Affects, Utility::GetTime()); } /** @@ -674,6 +712,11 @@ void RedisConnection::SetConnectedCallback(std::functionm_Strand, [parent, affected, when]() { + parent->RecordAffected(affected, when); + }); + } else { + if (affected.Config) { + m_WrittenConfig.InsertValue(when, affected.Config); + } + + if (affected.State) { + m_WrittenState.InsertValue(when, affected.State); + } + + if (affected.History) { + m_WrittenHistory.InsertValue(when, affected.History); + } + } +} diff --git a/lib/icingadb/redisconnection.hpp b/lib/icingadb/redisconnection.hpp index 3023fa3e9..f346ba285 100644 --- a/lib/icingadb/redisconnection.hpp +++ b/lib/icingadb/redisconnection.hpp @@ -74,6 +74,16 @@ namespace icinga SyncConnection = 255 }; + struct QueryAffects + { + size_t Config; + size_t State; + size_t History; + + QueryAffects(size_t config = 0, size_t state = 0, size_t history = 0) + : Config(config), State(state), History(history) { } + }; + RedisConnection(const String& host, int port, const String& path, const String& password, int db, bool useTls, bool insecure, const String& certPath, const String& keyPath, const String& caPath, const String& crlPath, const String& tlsProtocolmin, const String& cipherList, double connectTimeout, DebugInfo di, const Ptr& parent = nullptr); @@ -84,20 +94,48 @@ namespace icinga bool IsConnected(); - void FireAndForgetQuery(Query query, QueryPriority priority); - void FireAndForgetQueries(Queries queries, QueryPriority priority); + void FireAndForgetQuery(Query query, QueryPriority priority, QueryAffects affects = {}); + void 
FireAndForgetQueries(Queries queries, QueryPriority priority, QueryAffects affects = {}); - Reply GetResultOfQuery(Query query, QueryPriority priority); - Replies GetResultsOfQueries(Queries queries, QueryPriority priority); + Reply GetResultOfQuery(Query query, QueryPriority priority, QueryAffects affects = {}); + Replies GetResultsOfQueries(Queries queries, QueryPriority priority, QueryAffects affects = {}); void EnqueueCallback(const std::function& callback, QueryPriority priority); void Sync(); + double GetOldestPendingQueryTs(); void SuppressQueryKind(QueryPriority kind); void UnsuppressQueryKind(QueryPriority kind); void SetConnectedCallback(std::function callback); + inline bool GetConnected() + { + return m_Connected.load(); + } + + int GetQueryCount(RingBuffer::SizeType span); + + inline int GetPendingQueryCount() + { + return m_PendingQueries; + } + + inline int GetWrittenConfigFor(RingBuffer::SizeType span, RingBuffer::SizeType tv = Utility::GetTime()) + { + return m_WrittenConfig.UpdateAndGetValues(tv, span); + } + + inline int GetWrittenStateFor(RingBuffer::SizeType span, RingBuffer::SizeType tv = Utility::GetTime()) + { + return m_WrittenState.UpdateAndGetValues(tv, span); + } + + inline int GetWrittenHistoryFor(RingBuffer::SizeType span, RingBuffer::SizeType tv = Utility::GetTime()) + { + return m_WrittenHistory.UpdateAndGetValues(tv, span); + } + private: /** * What to do with the responses to Redis queries. 
@@ -134,6 +172,9 @@ namespace icinga Shared>>::Ptr GetResultOfQuery; Shared>>::Ptr GetResultsOfQueries; std::function Callback; + + double CTime; + QueryAffects Affects; }; typedef boost::asio::ip::tcp Tcp; @@ -175,6 +216,7 @@ namespace icinga void IncreasePendingQueries(int count); void DecreasePendingQueries(int count); + void RecordAffected(QueryAffects affected, double when); template void Handshake(StreamPtr& stream, boost::asio::yield_context& yc); @@ -225,7 +267,10 @@ namespace icinga // Stats RingBuffer m_InputQueries{10}; - RingBuffer m_OutputQueries{10}; + RingBuffer m_OutputQueries{15 * 60}; + RingBuffer m_WrittenConfig{15 * 60}; + RingBuffer m_WrittenState{15 * 60}; + RingBuffer m_WrittenHistory{15 * 60}; int m_PendingQueries{0}; boost::asio::deadline_timer m_LogStatsTimer; Ptr m_Parent; From 44cbd040885123a88a6775179bf72c72c618533e Mon Sep 17 00:00:00 2001 From: Julian Brost Date: Fri, 24 Jun 2022 12:27:07 +0200 Subject: [PATCH 05/24] Icinga DB Check: read performance data string from Redis Use the already existing format to pass performance data to Icinga 2 rather than some new JSON structure. Has the additional benefit of doing more things in Go than in C++. 
--- lib/icingadb/icingadbchecktask.cpp | 34 ++++++------------------------ 1 file changed, 6 insertions(+), 28 deletions(-) diff --git a/lib/icingadb/icingadbchecktask.cpp b/lib/icingadb/icingadbchecktask.cpp index 2327a2fba..73989d442 100644 --- a/lib/icingadb/icingadbchecktask.cpp +++ b/lib/icingadb/icingadbchecktask.cpp @@ -4,11 +4,11 @@ #include "icinga/host.hpp" #include "icinga/checkcommand.hpp" #include "icinga/macroprocessor.hpp" +#include "icinga/pluginutility.hpp" #include "remote/apilistener.hpp" #include "remote/endpoint.hpp" #include "remote/zone.hpp" #include "base/function.hpp" -#include "base/json.hpp" #include "base/utility.hpp" #include "base/perfdatavalue.hpp" #include "base/configtype.hpp" @@ -170,7 +170,7 @@ void IcingadbCheckTask::ScriptFunc(const Checkable::Ptr& checkable, const CheckR auto icingadbStartTime (Convert::ToLong(heartbeatData.at("general:start-time")) / 1000.0); String errMsg (heartbeatData.at("general:err")); auto errSince (Convert::ToLong(heartbeatData.at("general:err-since")) / 1000.0); - Dictionary::Ptr goMetricsByCumulativity (JsonDecode(heartbeatData.at("go:metrics"))); + String perfdataFromRedis = heartbeatData.at("general:performance-data"); auto heartbeatLastReceived (Convert::ToLong(heartbeatData.at("heartbeat:last-received")) / 1000.0); bool weResponsible = Convert::ToLong(heartbeatData.at("ha:responsible")); auto weResponsibleTs (Convert::ToLong(heartbeatData.at("ha:responsible-ts")) / 1000.0); @@ -361,33 +361,11 @@ void IcingadbCheckTask::ScriptFunc(const Checkable::Ptr& checkable, const CheckR } { - static boost::regex wellNamedUnits (":(bytes|seconds)$"); - ObjectLock lock (goMetricsByCumulativity); + Array::Ptr values = PluginUtility::SplitPerfdata(perfdataFromRedis); + ObjectLock lock (values); - for (auto& kv : goMetricsByCumulativity) { - bool cumulative = kv.first == "cumulative"; - Dictionary::Ptr goMetricsPerCumulativity = kv.second; - ObjectLock lock (goMetricsPerCumulativity); - - for (auto& kv : 
goMetricsPerCumulativity) { - std::string unit; - boost::smatch what; - - if (boost::regex_search(kv.first.GetData(), what, wellNamedUnits)) { - unit = what[1]; - } - - bool counter = cumulative && unit.empty(); - auto label ("go" + kv.first); - - for (auto& c : label) { - if (!('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9')) { - c = '_'; - } - } - - perfdata->Add(new PerfdataValue(std::move(label), kv.second, counter, std::move(unit))); - } + for (auto& v : values) { + perfdata->Add(PerfdataValue::Parse(v)); } } From d74fbbbb82d14ceff9158939b1f827ca7b43841f Mon Sep 17 00:00:00 2001 From: Julian Brost Date: Fri, 24 Jun 2022 12:41:01 +0200 Subject: [PATCH 06/24] Icinga DB Check: remove *_1sec metrics They add no additional information compared to the *_1min values as it's always the same value divided by 60 anyways. Adding the actual value from the last second makes little sense for realistic values of check_interval. --- lib/icingadb/icingadbchecktask.cpp | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/lib/icingadb/icingadbchecktask.cpp b/lib/icingadb/icingadbchecktask.cpp index 73989d442..f535d06b6 100644 --- a/lib/icingadb/icingadbchecktask.cpp +++ b/lib/icingadb/icingadbchecktask.cpp @@ -425,18 +425,12 @@ void IcingadbCheckTask::ScriptFunc(const Checkable::Ptr& checkable, const CheckR } for (auto& kv : statsPerOp) { - auto perMin (kv.second.UpdateAndGetValues(now, 60)); - - perfdata->Add(new PerfdataValue("icingadb_" + kv.first + "_1sec", perMin / 60.0, false, "", Empty, Empty, 0)); - perfdata->Add(new PerfdataValue("icingadb_" + kv.first + "_1min", perMin, false, "", Empty, Empty, 0)); + perfdata->Add(new PerfdataValue("icingadb_" + kv.first + "_1min", kv.second.UpdateAndGetValues(now, 60), false, "", Empty, Empty, 0)); perfdata->Add(new PerfdataValue("icingadb_" + kv.first + "_5mins", kv.second.UpdateAndGetValues(now, 5 * 60), false, "", Empty, Empty, 0)); perfdata->Add(new PerfdataValue("icingadb_" + 
kv.first + "_15mins", kv.second.UpdateAndGetValues(now, 15 * 60), false, "", Empty, Empty, 0)); } - auto queriesPerMin (redis->GetQueryCount(60)); - - perfdata->Add(new PerfdataValue("redis_queries_1sec", queriesPerMin / 60.0, false, "", Empty, Empty, 0)); - perfdata->Add(new PerfdataValue("redis_queries_1min", queriesPerMin, false, "", Empty, Empty, 0)); + perfdata->Add(new PerfdataValue("redis_queries_1min", redis->GetQueryCount(60), false, "", Empty, Empty, 0)); perfdata->Add(new PerfdataValue("redis_queries_5mins", redis->GetQueryCount(5 * 60), false, "", Empty, Empty, 0)); perfdata->Add(new PerfdataValue("redis_queries_15mins", redis->GetQueryCount(15 * 60), false, "", Empty, Empty, 0)); @@ -452,10 +446,7 @@ void IcingadbCheckTask::ScriptFunc(const Checkable::Ptr& checkable, const CheckR }; for (auto subject : icingaWriteSubjects) { - auto perMin ((redis.get()->*subject.Getter)(60, now)); - - perfdata->Add(new PerfdataValue(String(subject.Name) + "_1sec", perMin / 60.0, false, "", Empty, Empty, 0)); - perfdata->Add(new PerfdataValue(String(subject.Name) + "_1min", perMin, false, "", Empty, Empty, 0)); + perfdata->Add(new PerfdataValue(String(subject.Name) + "_1min", (redis.get()->*subject.Getter)(60, now), false, "", Empty, Empty, 0)); perfdata->Add(new PerfdataValue(String(subject.Name) + "_5mins", (redis.get()->*subject.Getter)(5 * 60, now), false, "", Empty, Empty, 0)); perfdata->Add(new PerfdataValue(String(subject.Name) + "_15mins", (redis.get()->*subject.Getter)(15 * 60, now), false, "", Empty, Empty, 0)); } From 2b310718e3d0828a5905e8acb4a8d79217bfe0a4 Mon Sep 17 00:00:00 2001 From: Julian Brost Date: Fri, 24 Jun 2022 14:27:21 +0200 Subject: [PATCH 07/24] Icinga DB Check: rename keys in heartbeat stream In both C++ and Go, the keys are only used as constant strings, so namespacing them just adds clutter for the `general:*` keys, therefore remove it. 
--- lib/icingadb/icingadbchecktask.cpp | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/lib/icingadb/icingadbchecktask.cpp b/lib/icingadb/icingadbchecktask.cpp index f535d06b6..e5afe2556 100644 --- a/lib/icingadb/icingadbchecktask.cpp +++ b/lib/icingadb/icingadbchecktask.cpp @@ -165,19 +165,19 @@ void IcingadbCheckTask::ScriptFunc(const Checkable::Ptr& checkable, const CheckR IcingaDB::AddKvsToMap(heartbeatMessage->Get(1), heartbeatData); - String version = heartbeatData.at("general:version"); - auto icingadbNow (Convert::ToLong(heartbeatData.at("general:time")) / 1000.0 + (redisNow - heartbeatTime)); - auto icingadbStartTime (Convert::ToLong(heartbeatData.at("general:start-time")) / 1000.0); - String errMsg (heartbeatData.at("general:err")); - auto errSince (Convert::ToLong(heartbeatData.at("general:err-since")) / 1000.0); - String perfdataFromRedis = heartbeatData.at("general:performance-data"); - auto heartbeatLastReceived (Convert::ToLong(heartbeatData.at("heartbeat:last-received")) / 1000.0); - bool weResponsible = Convert::ToLong(heartbeatData.at("ha:responsible")); - auto weResponsibleTs (Convert::ToLong(heartbeatData.at("ha:responsible-ts")) / 1000.0); - bool otherResponsible = Convert::ToLong(heartbeatData.at("ha:other-responsible")); - auto syncOngoingSince (Convert::ToLong(heartbeatData.at("sync:ongoing-since")) / 1000.0); - auto syncSuccessWhen (Convert::ToLong(heartbeatData.at("sync:success-finish")) / 1000.0); - auto syncSuccessTook (Convert::ToLong(heartbeatData.at("sync:success-duration")) / 1000.0); + String version = heartbeatData.at("version"); + auto icingadbNow (Convert::ToLong(heartbeatData.at("time")) / 1000.0 + (redisNow - heartbeatTime)); + auto icingadbStartTime (Convert::ToLong(heartbeatData.at("start-time")) / 1000.0); + String errMsg (heartbeatData.at("error")); + auto errSince (Convert::ToLong(heartbeatData.at("error-since")) / 1000.0); + String perfdataFromRedis = 
heartbeatData.at("performance-data"); + auto heartbeatLastReceived (Convert::ToLong(heartbeatData.at("last-heartbeat-received")) / 1000.0); + bool weResponsible = Convert::ToLong(heartbeatData.at("ha-responsible")); + auto weResponsibleTs (Convert::ToLong(heartbeatData.at("ha-responsible-ts")) / 1000.0); + bool otherResponsible = Convert::ToLong(heartbeatData.at("ha-other-responsible")); + auto syncOngoingSince (Convert::ToLong(heartbeatData.at("sync-ongoing-since")) / 1000.0); + auto syncSuccessWhen (Convert::ToLong(heartbeatData.at("sync-success-finish")) / 1000.0); + auto syncSuccessTook (Convert::ToLong(heartbeatData.at("sync-success-duration")) / 1000.0); std::ostringstream i2okmsgs, idbokmsgs, warnmsgs, critmsgs; Array::Ptr perfdata = new Array(); From 4f1f70f8430141fe12b3376709dd204596e35e06 Mon Sep 17 00:00:00 2001 From: Julian Brost Date: Fri, 24 Jun 2022 14:31:29 +0200 Subject: [PATCH 08/24] Icinga DB Check: remove unused includes --- lib/icingadb/icingadbchecktask.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/lib/icingadb/icingadbchecktask.cpp b/lib/icingadb/icingadbchecktask.cpp index e5afe2556..5c3a43e32 100644 --- a/lib/icingadb/icingadbchecktask.cpp +++ b/lib/icingadb/icingadbchecktask.cpp @@ -5,13 +5,9 @@ #include "icinga/checkcommand.hpp" #include "icinga/macroprocessor.hpp" #include "icinga/pluginutility.hpp" -#include "remote/apilistener.hpp" -#include "remote/endpoint.hpp" -#include "remote/zone.hpp" #include "base/function.hpp" #include "base/utility.hpp" #include "base/perfdatavalue.hpp" -#include "base/configtype.hpp" #include "base/convert.hpp" #include From 31c7dfee53cdd5c72572f379f6d90fc373ace62e Mon Sep 17 00:00:00 2001 From: Julian Brost Date: Fri, 24 Jun 2022 14:48:31 +0200 Subject: [PATCH 09/24] Icinga DB Check: fix error message on Redis query error Not only XREAD queries are performed, so the previous error message was incorrect. 
--- lib/icingadb/icingadbchecktask.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/icingadb/icingadbchecktask.cpp b/lib/icingadb/icingadbchecktask.cpp index 5c3a43e32..9ac3831ac 100644 --- a/lib/icingadb/icingadbchecktask.cpp +++ b/lib/icingadb/icingadbchecktask.cpp @@ -134,7 +134,7 @@ void IcingadbCheckTask::ScriptFunc(const Checkable::Ptr& checkable, const CheckR } catch (const std::exception& ex) { ReportIcingadbCheck( checkable, commandObj, cr, - String("Icinga DB CRITICAL: Could not read XREAD responses from Redis: ") + ex.what(), ServiceCritical + String("Icinga DB CRITICAL: Could not query Redis: ") + ex.what(), ServiceCritical ); return; } From f3f1373f83914762e8f90da2bb9654fa25f0ea70 Mon Sep 17 00:00:00 2001 From: Julian Brost Date: Fri, 24 Jun 2022 15:05:59 +0200 Subject: [PATCH 10/24] Icinga DB Check: spell out "error" in perfdata --- lib/icingadb/icingadbchecktask.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/icingadb/icingadbchecktask.cpp b/lib/icingadb/icingadbchecktask.cpp index 9ac3831ac..16c611e8b 100644 --- a/lib/icingadb/icingadbchecktask.cpp +++ b/lib/icingadb/icingadbchecktask.cpp @@ -207,7 +207,7 @@ void IcingadbCheckTask::ScriptFunc(const Checkable::Ptr& checkable, const CheckR critmsgs << " ERROR: " << errMsg << "!"; } - perfdata->Add(new PerfdataValue("err_for", errFor * (err ? 1 : -1), false, "seconds", Empty, errForCritical, 0)); + perfdata->Add(new PerfdataValue("error_for", errFor * (err ? 1 : -1), false, "seconds", Empty, errForCritical, 0)); } if (!down) { From 5613412b8191c68a20da0593c799140871655c3a Mon Sep 17 00:00:00 2001 From: Julian Brost Date: Fri, 24 Jun 2022 15:06:14 +0200 Subject: [PATCH 11/24] Icinga DB Check: replace nested calls to fmax() with std::max() Improves readability, even more so after splitting it into separate lines. 
--- lib/icingadb/icingadbchecktask.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/lib/icingadb/icingadbchecktask.cpp b/lib/icingadb/icingadbchecktask.cpp index 16c611e8b..e55258546 100644 --- a/lib/icingadb/icingadbchecktask.cpp +++ b/lib/icingadb/icingadbchecktask.cpp @@ -236,7 +236,11 @@ void IcingadbCheckTask::ScriptFunc(const Checkable::Ptr& checkable, const CheckR const auto clockDriftWarning (5); const auto clockDriftCritical (30); - auto clockDrift (fmax(fabs(now - redisNow), fmax(fabs(redisNow - icingadbNow), fabs(icingadbNow - now)))); + auto clockDrift (std::max({ + fabs(now - redisNow), + fabs(redisNow - icingadbNow), + fabs(icingadbNow - now), + })); if (clockDrift > clockDriftCritical) { critmsgs << " Icinga 2/Redis/Icinga DB clock drift: " << Utility::FormatDuration(clockDrift) From 2a4605f4b79e53b71433a2a10f0e41eec7368e51 Mon Sep 17 00:00:00 2001 From: Julian Brost Date: Fri, 24 Jun 2022 15:08:04 +0200 Subject: [PATCH 12/24] Icinga DB Check: clearly state Icinga 2 Redis backlog Should make it easier to understand that this refers to Redis queries issued by Icinga 2. 
--- lib/icingadb/icingadbchecktask.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/icingadb/icingadbchecktask.cpp b/lib/icingadb/icingadbchecktask.cpp index e55258546..bc56363c9 100644 --- a/lib/icingadb/icingadbchecktask.cpp +++ b/lib/icingadb/icingadbchecktask.cpp @@ -289,14 +289,14 @@ void IcingadbCheckTask::ScriptFunc(const Checkable::Ptr& checkable, const CheckR } if (!icingaBacklogThresholds.Critical.IsEmpty() && redisBacklog > icingaBacklogThresholds.Critical) { - critmsgs << " Icinga 2 query backlog: " << Utility::FormatDuration(redisBacklog) + critmsgs << " Icinga 2 Redis backlog: " << Utility::FormatDuration(redisBacklog) << ", greater than CRITICAL threshold (" << Utility::FormatDuration(icingaBacklogThresholds.Critical) << ")!"; } else if (!icingaBacklogThresholds.Warning.IsEmpty() && redisBacklog > icingaBacklogThresholds.Warning) { - warnmsgs << " Icinga 2 query backlog: " << Utility::FormatDuration(redisBacklog) + warnmsgs << " Icinga 2 Redis backlog: " << Utility::FormatDuration(redisBacklog) << ", greater than WARNING threshold (" << Utility::FormatDuration(icingaBacklogThresholds.Warning) << ")."; } - perfdata->Add(new PerfdataValue("redis_backlog", redisBacklog, false, "seconds", + perfdata->Add(new PerfdataValue("icinga2_redis_backlog", redisBacklog, false, "seconds", icingaBacklogThresholds.Warning, icingaBacklogThresholds.Critical, 0)); if (!down) { From d70a27b9821cdb5cca5477366e4cfb4c6a9417e5 Mon Sep 17 00:00:00 2001 From: Julian Brost Date: Fri, 24 Jun 2022 15:30:08 +0200 Subject: [PATCH 13/24] Icinga DB Check: report history and runtime update backlog separately Probably makes little difference for an end-user, but for support and development it's great to know which of the two is causing problems. 
--- lib/icingadb/icingadbchecktask.cpp | 73 +++++++++++++++++++----------- 1 file changed, 47 insertions(+), 26 deletions(-) diff --git a/lib/icingadb/icingadbchecktask.cpp b/lib/icingadb/icingadbchecktask.cpp index bc56363c9..7638cebc9 100644 --- a/lib/icingadb/icingadbchecktask.cpp +++ b/lib/icingadb/icingadbchecktask.cpp @@ -107,7 +107,7 @@ void IcingadbCheckTask::ScriptFunc(const Checkable::Ptr& checkable, const CheckR } auto now (Utility::GetTime()); - Array::Ptr redisTime, xReadHeartbeat, xReadStats, xReadRtuHistory; + Array::Ptr redisTime, xReadHeartbeat, xReadStats, xReadRuntimeBacklog, xReadHistoryBacklog; try { auto replies (redis->GetResultsOfQueries( @@ -115,13 +115,16 @@ void IcingadbCheckTask::ScriptFunc(const Checkable::Ptr& checkable, const CheckR {"TIME"}, {"XREAD", "STREAMS", "icingadb:telemetry:heartbeat", "0-0"}, {"XREAD", "STREAMS", "icingadb:telemetry:stats", "0-0"}, + {"XREAD", "COUNT", "1", "STREAMS", "icinga:runtime", "icinga:runtime:state", "0-0", "0-0"}, { "XREAD", "COUNT", "1", "STREAMS", - "icinga:runtime", "icinga:runtime:state", - "icinga:history:stream:acknowledgement", "icinga:history:stream:comment", - "icinga:history:stream:downtime", "icinga:history:stream:flapping", - "icinga:history:stream:notification", "icinga:history:stream:state", - "0-0", "0-0", "0-0", "0-0", "0-0", "0-0", "0-0", "0-0" + "icinga:history:stream:acknowledgement", + "icinga:history:stream:comment", + "icinga:history:stream:downtime", + "icinga:history:stream:flapping", + "icinga:history:stream:notification", + "icinga:history:stream:state", + "0-0", "0-0", "0-0", "0-0", "0-0", "0-0", } }, RedisConnection::QueryPriority::Heartbeat @@ -130,7 +133,8 @@ void IcingadbCheckTask::ScriptFunc(const Checkable::Ptr& checkable, const CheckR redisTime = std::move(replies.at(0)); xReadHeartbeat = std::move(replies.at(1)); xReadStats = std::move(replies.at(2)); - xReadRtuHistory = std::move(replies.at(3)); + xReadRuntimeBacklog = std::move(replies.at(3)); + 
xReadHistoryBacklog = std::move(replies.at(4)); } catch (const std::exception& ex) { ReportIcingadbCheck( checkable, commandObj, cr, @@ -300,21 +304,15 @@ void IcingadbCheckTask::ScriptFunc(const Checkable::Ptr& checkable, const CheckR icingaBacklogThresholds.Warning, icingaBacklogThresholds.Critical, 0)); if (!down) { - double icingadbBacklog = 0; + auto getBacklog = [redisNow](const Array::Ptr& streams) -> double { + if (!streams) { + return 0; + } - if (xReadRtuHistory) { double minTs = 0; - ObjectLock lock (xReadRtuHistory); - - for (Array::Ptr stream : xReadRtuHistory) { - if (!weResponsible) { - String name = stream->Get(0); - - if (name == "icinga:runtime" || name == "icinga:runtime:state") { - continue; - } - } + ObjectLock lock (streams); + for (Array::Ptr stream : streams) { auto ts (GetXMessageTs(Array::Ptr(stream->Get(1))->Get(0))); if (minTs == 0 || ts < minTs) { @@ -323,19 +321,42 @@ void IcingadbCheckTask::ScriptFunc(const Checkable::Ptr& checkable, const CheckR } if (minTs > 0) { - icingadbBacklog = redisNow - minTs; + return redisNow - minTs; + } else { + return 0; } - } + }; - if (!icingadbBacklogThresholds.Critical.IsEmpty() && icingadbBacklog > icingadbBacklogThresholds.Critical) { - critmsgs << " Query backlog: " << Utility::FormatDuration(icingadbBacklog) + double historyBacklog = getBacklog(xReadHistoryBacklog); + + if (!icingadbBacklogThresholds.Critical.IsEmpty() && historyBacklog > icingadbBacklogThresholds.Critical) { + critmsgs << " History backlog: " << Utility::FormatDuration(historyBacklog) << ", greater than CRITICAL threshold (" << Utility::FormatDuration(icingadbBacklogThresholds.Critical) << ")!"; - } else if (!icingadbBacklogThresholds.Warning.IsEmpty() && icingadbBacklog > icingadbBacklogThresholds.Warning) { - warnmsgs << " Query backlog: " << Utility::FormatDuration(icingadbBacklog) + } else if (!icingadbBacklogThresholds.Warning.IsEmpty() && historyBacklog > icingadbBacklogThresholds.Warning) { + warnmsgs << " History 
backlog: " << Utility::FormatDuration(historyBacklog) << ", greater than WARNING threshold (" << Utility::FormatDuration(icingadbBacklogThresholds.Warning) << ")."; } - perfdata->Add(new PerfdataValue("database_backlog", icingadbBacklog, false, "seconds", + perfdata->Add(new PerfdataValue("history_backlog", historyBacklog, false, "seconds", + icingadbBacklogThresholds.Warning, icingadbBacklogThresholds.Critical, 0)); + + double runtimeBacklog = 0; + + if (weResponsible) { + // These streams are only processed by one instance, it's fine for the other instance to have some backlog. + runtimeBacklog = getBacklog(xReadRuntimeBacklog); + + if (!icingadbBacklogThresholds.Critical.IsEmpty() && runtimeBacklog > icingadbBacklogThresholds.Critical) { + critmsgs << " Runtime update backlog: " << Utility::FormatDuration(runtimeBacklog) + << ", greater than CRITICAL threshold (" << Utility::FormatDuration(icingadbBacklogThresholds.Critical) << ")!"; + } else if (!icingadbBacklogThresholds.Warning.IsEmpty() && runtimeBacklog > icingadbBacklogThresholds.Warning) { + warnmsgs << " Runtime update backlog: " << Utility::FormatDuration(runtimeBacklog) + << ", greater than WARNING threshold (" << Utility::FormatDuration(icingadbBacklogThresholds.Warning) << ")."; + } + } + + // Also report the perfdata value on the other instance (as 0 in this case). 
+ perfdata->Add(new PerfdataValue("runtime_backlog", runtimeBacklog, false, "seconds", icingadbBacklogThresholds.Warning, icingadbBacklogThresholds.Critical, 0)); } From 3c29b152148e6c894b0f5e52dacc94f0fc945deb Mon Sep 17 00:00:00 2001 From: Julian Brost Date: Fri, 24 Jun 2022 16:35:14 +0200 Subject: [PATCH 14/24] Icinga DB Check: use more natural names for sync/cleanup metrics --- lib/icingadb/icingadbchecktask.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/icingadb/icingadbchecktask.cpp b/lib/icingadb/icingadbchecktask.cpp index 7638cebc9..809dd00c3 100644 --- a/lib/icingadb/icingadbchecktask.cpp +++ b/lib/icingadb/icingadbchecktask.cpp @@ -413,7 +413,7 @@ void IcingadbCheckTask::ScriptFunc(const Checkable::Ptr& checkable, const CheckR std::map statsPerOp; const char * const icingadbKnownStats[] = { - "sync_config", "sync_state", "sync_history", "sync_overdue", "cleanup_history" + "config_sync", "state_sync", "history_sync", "overdue_sync", "history_cleanup" }; for (auto metric : icingadbKnownStats) { From d0382f71ab902c1b02d43de34405eaf7c098073f Mon Sep 17 00:00:00 2001 From: Julian Brost Date: Mon, 27 Jun 2022 09:12:04 +0200 Subject: [PATCH 15/24] Icinga DB Check: rename variables from takes to duration Sounds more natural in my opinion and I doubt that many users would get that due to the difference between takes/took, this refers to ongoing dumps. --- doc/10-icinga-template-library.md | 22 +++++++++++----------- lib/icingadb/icingadb-itl.conf | 8 ++++---- lib/icingadb/icingadbchecktask.cpp | 4 ++-- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/doc/10-icinga-template-library.md b/doc/10-icinga-template-library.md index 102d07add..4e6ae6435 100644 --- a/doc/10-icinga-template-library.md +++ b/doc/10-icinga-template-library.md @@ -109,17 +109,17 @@ Check command for the built-in `icingadb` check. 
Custom variables passed as [command parameters](03-monitoring-basics.md#command-passing-parameters): -Name | Description -------------------------------------------|----------------------------- -icingadb\_name | **Required.** The name of the Icinga DB connection object. Defaults to `icingadb`. -icingadb\_redis\_dump\_takes\_warning | **Optional.** Warning threshold for ongoing Redis dump duration. Applies if the value is higher than the threshold. Defaults to 5 minutes. -icingadb\_redis\_dump\_takes\_critical | **Optional.** Critical threshold for ongoing Redis dump duration. Applies if the value is higher than the threshold. Defaults to 10 minutes. -icingadb\_database\_sync\_takes\_warning | **Optional.** Warning threshold for ongoing database sync duration. Applies if the value is higher than the threshold. Defaults to 5 minutes. -icingadb\_database\_sync\_takes\_critical | **Optional.** Critical threshold for ongoing database sync duration. Applies if the value is higher than the threshold. Defaults to 10 minutes. -icingadb\_redis\_backlog\_warning | **Optional.** Warning threshold for Redis write backlog. Applies if the value is higher than the threshold. Defaults to 5 minutes. -icingadb\_redis\_backlog\_critical | **Optional.** Critical threshold for Redis write backlog. Applies if the value is higher than the threshold. Defaults to 15 minutes. -icingadb\_database\_backlog\_warning | **Optional.** Warning threshold for database sync backlog. Applies if the value is higher than the threshold. Defaults to 5 minutes. -icingadb\_database\_backlog\_critical | **Optional.** Critical threshold for database sync backlog. Applies if the value is higher than the threshold. Defaults to 15 minutes. +Name | Description +-----------------------------------------|----------------------------- +icingadb\_name | **Required.** The name of the Icinga DB connection object. Defaults to `icingadb`. 
+icingadb\_full\_dump\_duration\_warning | **Optional.** Warning threshold for ongoing Redis dump duration. Applies if the value is higher than the threshold. Defaults to 5 minutes. +icingadb\_full\_dump\_duration\_critical | **Optional.** Critical threshold for ongoing Redis dump duration. Applies if the value is higher than the threshold. Defaults to 10 minutes. +icingadb\_full\_sync\_duration\_warning | **Optional.** Warning threshold for ongoing database sync duration. Applies if the value is higher than the threshold. Defaults to 5 minutes. +icingadb\_full\_sync\_duration\_critical | **Optional.** Critical threshold for ongoing database sync duration. Applies if the value is higher than the threshold. Defaults to 10 minutes. +icingadb\_redis\_backlog\_warning | **Optional.** Warning threshold for Redis write backlog. Applies if the value is higher than the threshold. Defaults to 5 minutes. +icingadb\_redis\_backlog\_critical | **Optional.** Critical threshold for Redis write backlog. Applies if the value is higher than the threshold. Defaults to 15 minutes. +icingadb\_database\_backlog\_warning | **Optional.** Warning threshold for database sync backlog. Applies if the value is higher than the threshold. Defaults to 5 minutes. +icingadb\_database\_backlog\_critical | **Optional.** Critical threshold for database sync backlog. Applies if the value is higher than the threshold. Defaults to 15 minutes. 
### ido diff --git a/lib/icingadb/icingadb-itl.conf b/lib/icingadb/icingadb-itl.conf index af3b9dc00..5f3950e3d 100644 --- a/lib/icingadb/icingadb-itl.conf +++ b/lib/icingadb/icingadb-itl.conf @@ -10,10 +10,10 @@ System.assert(Internal.run_with_activation_context(function() { vars.icingadb_name = "icingadb" - vars.icingadb_redis_dump_takes_warning = 5m - vars.icingadb_redis_dump_takes_critical = 10m - vars.icingadb_database_sync_takes_warning = 5m - vars.icingadb_database_sync_takes_critical = 10m + vars.icingadb_full_dump_duration_warning = 5m + vars.icingadb_full_dump_duration_critical = 10m + vars.icingadb_full_sync_duration_warning = 5m + vars.icingadb_full_sync_duration_critical = 10m vars.icingadb_redis_backlog_warning = 5m vars.icingadb_redis_backlog_critical = 15m vars.icingadb_database_backlog_warning = 5m diff --git a/lib/icingadb/icingadbchecktask.cpp b/lib/icingadb/icingadbchecktask.cpp index 809dd00c3..5e21cd4ec 100644 --- a/lib/icingadb/icingadbchecktask.cpp +++ b/lib/icingadb/icingadbchecktask.cpp @@ -79,8 +79,8 @@ void IcingadbCheckTask::ScriptFunc(const Checkable::Ptr& checkable, const CheckR String icingadbName = resolve("$icingadb_name$"); - auto dumpTakesThresholds (resolveThresholds("$icingadb_redis_dump_takes_warning$", "$icingadb_redis_dump_takes_critical$")); - auto syncTakesThresholds (resolveThresholds("$icingadb_database_sync_takes_warning$", "$icingadb_database_sync_takes_critical$")); + auto dumpTakesThresholds (resolveThresholds("$icingadb_full_dump_duration_warning$", "$icingadb_full_dump_duration_critical$")); + auto syncTakesThresholds (resolveThresholds("$icingadb_full_sync_duration_warning$", "$icingadb_full_sync_duration_critical$")); auto icingaBacklogThresholds (resolveThresholds("$icingadb_redis_backlog_warning$", "$icingadb_redis_backlog_critical$")); auto icingadbBacklogThresholds (resolveThresholds("$icingadb_database_backlog_warning$", "$icingadb_database_backlog_critical$")); From c59d44cd8bfc9cd377dd42f161e42d515385c703 
Mon Sep 17 00:00:00 2001 From: Julian Brost Date: Mon, 27 Jun 2022 12:43:56 +0200 Subject: [PATCH 16/24] Icinga DB Check: rename perfdata values - Add icinga2_ and icingadb_ prefixes to make clear which component is responsible for the value. - Rename heartbeat_lag to heartbeat_age, describes it better in my opinion and sounds a bit less like something that should be as close to zero as possible. - Rename redis_dump/database_sync into full_dump/full_sync as this is how these operations are referred to in log messages as well. - Rename Redis backlog into Redis query backlog, makes it a bit clearer in my opinion. - Rename runtime_backlog into runtime_update_backlog, as the component in Icinga DB is called that way and this naming is also exposed in log messages. - Rename dump_config/state/history into config/state/history_dump, makes it sound more natural. --- lib/icingadb/icingadbchecktask.cpp | 38 +++++++++++++++--------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/lib/icingadb/icingadbchecktask.cpp b/lib/icingadb/icingadbchecktask.cpp index 5e21cd4ec..8c3cd9595 100644 --- a/lib/icingadb/icingadbchecktask.cpp +++ b/lib/icingadb/icingadbchecktask.cpp @@ -225,7 +225,7 @@ void IcingadbCheckTask::ScriptFunc(const Checkable::Ptr& checkable, const CheckR << ", greater than WARNING threshold (" << Utility::FormatDuration(heartbeatLagWarning) << ")."; } - perfdata->Add(new PerfdataValue("icinga_heartbeat_lag", heartbeatLag, false, "seconds", heartbeatLagWarning, Empty, 0)); + perfdata->Add(new PerfdataValue("icinga2_heartbeat_age", heartbeatLag, false, "seconds", heartbeatLagWarning, Empty, 0)); } if (weResponsible) { @@ -267,7 +267,7 @@ void IcingadbCheckTask::ScriptFunc(const Checkable::Ptr& checkable, const CheckR << ", greater than WARNING threshold (" << Utility::FormatDuration(dumpTakesThresholds.Warning) << ")."; } - perfdata->Add(new PerfdataValue("redis_dump_takes", ongoingDumpTakes, false, "seconds", + perfdata->Add(new
PerfdataValue("icinga2_full_dump_takes", ongoingDumpTakes, false, "seconds", dumpTakesThresholds.Warning, dumpTakesThresholds.Critical, 0)); } @@ -282,7 +282,7 @@ void IcingadbCheckTask::ScriptFunc(const Checkable::Ptr& checkable, const CheckR << ", greater than WARNING threshold (" << Utility::FormatDuration(syncTakesThresholds.Warning) << ")."; } - perfdata->Add(new PerfdataValue("database_sync_takes", ongoingSyncTakes, false, "seconds", + perfdata->Add(new PerfdataValue("icingadb_full_sync_takes", ongoingSyncTakes, false, "seconds", syncTakesThresholds.Warning, syncTakesThresholds.Critical, 0)); } @@ -293,14 +293,14 @@ void IcingadbCheckTask::ScriptFunc(const Checkable::Ptr& checkable, const CheckR } if (!icingaBacklogThresholds.Critical.IsEmpty() && redisBacklog > icingaBacklogThresholds.Critical) { - critmsgs << " Icinga 2 Redis backlog: " << Utility::FormatDuration(redisBacklog) + critmsgs << " Icinga 2 Redis query backlog: " << Utility::FormatDuration(redisBacklog) << ", greater than CRITICAL threshold (" << Utility::FormatDuration(icingaBacklogThresholds.Critical) << ")!"; } else if (!icingaBacklogThresholds.Warning.IsEmpty() && redisBacklog > icingaBacklogThresholds.Warning) { - warnmsgs << " Icinga 2 Redis backlog: " << Utility::FormatDuration(redisBacklog) + warnmsgs << " Icinga 2 Redis query backlog: " << Utility::FormatDuration(redisBacklog) << ", greater than WARNING threshold (" << Utility::FormatDuration(icingaBacklogThresholds.Warning) << ")."; } - perfdata->Add(new PerfdataValue("icinga2_redis_backlog", redisBacklog, false, "seconds", + perfdata->Add(new PerfdataValue("icinga2_redis_query_backlog", redisBacklog, false, "seconds", icingaBacklogThresholds.Warning, icingaBacklogThresholds.Critical, 0)); if (!down) { @@ -337,7 +337,7 @@ void IcingadbCheckTask::ScriptFunc(const Checkable::Ptr& checkable, const CheckR << ", greater than WARNING threshold (" << Utility::FormatDuration(icingadbBacklogThresholds.Warning) << ")."; } - perfdata->Add(new 
PerfdataValue("history_backlog", historyBacklog, false, "seconds", + perfdata->Add(new PerfdataValue("icingadb_history_backlog", historyBacklog, false, "seconds", icingadbBacklogThresholds.Warning, icingadbBacklogThresholds.Critical, 0)); double runtimeBacklog = 0; @@ -356,18 +356,18 @@ void IcingadbCheckTask::ScriptFunc(const Checkable::Ptr& checkable, const CheckR } // Also report the perfdata value on the other instance (as 0 in this case). - perfdata->Add(new PerfdataValue("runtime_backlog", runtimeBacklog, false, "seconds", + perfdata->Add(new PerfdataValue("icingadb_runtime_update_backlog", runtimeBacklog, false, "seconds", icingadbBacklogThresholds.Warning, icingadbBacklogThresholds.Critical, 0)); } auto dumpAgo (now - dumpWhen); if (dumpWhen) { - perfdata->Add(new PerfdataValue("redis_dump_ago", dumpAgo, false, "seconds", Empty, Empty, 0)); + perfdata->Add(new PerfdataValue("icinga2_full_dump_ago", dumpAgo, false, "seconds", Empty, Empty, 0)); } if (dumpTook) { - perfdata->Add(new PerfdataValue("redis_dump_took", dumpTook, false, "seconds", Empty, Empty, 0)); + perfdata->Add(new PerfdataValue("icinga2_full_dump_took", dumpTook, false, "seconds", Empty, Empty, 0)); } if (dumpWhen && dumpTook) { @@ -398,11 +398,11 @@ void IcingadbCheckTask::ScriptFunc(const Checkable::Ptr& checkable, const CheckR auto syncAgo (icingadbNow - syncSuccessWhen); if (syncSuccessWhen) { - perfdata->Add(new PerfdataValue("database_sync_ago", syncAgo, false, "seconds", Empty, Empty, 0)); + perfdata->Add(new PerfdataValue("icingadb_full_sync_ago", syncAgo, false, "seconds", Empty, Empty, 0)); } if (syncSuccessTook) { - perfdata->Add(new PerfdataValue("database_sync_took", syncSuccessTook, false, "seconds", Empty, Empty, 0)); + perfdata->Add(new PerfdataValue("icingadb_full_sync_took", syncSuccessTook, false, "seconds", Empty, Empty, 0)); } if (syncSuccessWhen && syncSuccessTook) { @@ -451,19 +451,19 @@ void IcingadbCheckTask::ScriptFunc(const Checkable::Ptr& checkable, const CheckR 
perfdata->Add(new PerfdataValue("icingadb_" + kv.first + "_15mins", kv.second.UpdateAndGetValues(now, 15 * 60), false, "", Empty, Empty, 0)); } - perfdata->Add(new PerfdataValue("redis_queries_1min", redis->GetQueryCount(60), false, "", Empty, Empty, 0)); - perfdata->Add(new PerfdataValue("redis_queries_5mins", redis->GetQueryCount(5 * 60), false, "", Empty, Empty, 0)); - perfdata->Add(new PerfdataValue("redis_queries_15mins", redis->GetQueryCount(15 * 60), false, "", Empty, Empty, 0)); + perfdata->Add(new PerfdataValue("icinga2_redis_queries_1min", redis->GetQueryCount(60), false, "", Empty, Empty, 0)); + perfdata->Add(new PerfdataValue("icinga2_redis_queries_5mins", redis->GetQueryCount(5 * 60), false, "", Empty, Empty, 0)); + perfdata->Add(new PerfdataValue("icinga2_redis_queries_15mins", redis->GetQueryCount(15 * 60), false, "", Empty, Empty, 0)); - perfdata->Add(new PerfdataValue("redis_pending_queries", redis->GetPendingQueryCount(), false, "", Empty, Empty, 0)); + perfdata->Add(new PerfdataValue("icinga2_redis_pending_queries", redis->GetPendingQueryCount(), false, "", Empty, Empty, 0)); struct { const char * Name; int (RedisConnection::* Getter)(RingBuffer::SizeType span, RingBuffer::SizeType tv); } const icingaWriteSubjects[] = { - {"icinga_dump_config", &RedisConnection::GetWrittenConfigFor}, - {"icinga_dump_state", &RedisConnection::GetWrittenStateFor}, - {"icinga_dump_history", &RedisConnection::GetWrittenHistoryFor} + {"icinga2_config_dump", &RedisConnection::GetWrittenConfigFor}, + {"icinga2_state_dump", &RedisConnection::GetWrittenStateFor}, + {"icinga2_history_dump", &RedisConnection::GetWrittenHistoryFor} }; for (auto subject : icingaWriteSubjects) { From 953e1134652eb452542cc29cb1121644e00cb8fa Mon Sep 17 00:00:00 2001 From: Julian Brost Date: Mon, 27 Jun 2022 13:21:46 +0200 Subject: [PATCH 17/24] Icinga DB Check: remove markdown headings from output icingadb-web shows multiple lines from the check output collapsed into a single line. 
The lines containing just minuses make this look cluttered and making it a heading provides little to no benefit. Even when rendering markdown in the check output at some point, having the lists labeled using normal paragraphs would look just fine. --- lib/icingadb/icingadbchecktask.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/icingadb/icingadbchecktask.cpp b/lib/icingadb/icingadbchecktask.cpp index 8c3cd9595..25a2084b9 100644 --- a/lib/icingadb/icingadbchecktask.cpp +++ b/lib/icingadb/icingadbchecktask.cpp @@ -497,11 +497,11 @@ void IcingadbCheckTask::ScriptFunc(const Checkable::Ptr& checkable, const CheckR } if (!i2okmsg.empty()) { - msgbuf << "\n\nIcinga 2\n--------\n" << i2okmsg; + msgbuf << "\n\nIcinga 2:\n" << i2okmsg; } if (!idbokmsg.empty()) { - msgbuf << "\n\nIcinga DB\n---------\n" << idbokmsg; + msgbuf << "\n\nIcinga DB:\n" << idbokmsg; } cr->SetPerformanceData(perfdata); From 2fafffb85fccc3d7adf44bc7c4295f0bd9057569 Mon Sep 17 00:00:00 2001 From: Julian Brost Date: Mon, 27 Jun 2022 16:33:25 +0200 Subject: [PATCH 18/24] Icinga DB Check: fix race-condition with IcingaDB::Start() IcingaDB::GetConnection() uses IcingaDB::m_Rcon which is only initialized in IcingaDB::Start(), therefore add a nullptr check to the check command. Additionally, as m_Rcon is potentially accessed concurrently, add a copy of the value that is safe for concurrent use.
--- lib/icingadb/icingadb.cpp | 3 ++- lib/icingadb/icingadb.hpp | 7 ++++++- lib/icingadb/icingadbchecktask.cpp | 2 +- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/lib/icingadb/icingadb.cpp b/lib/icingadb/icingadb.cpp index 0d80d006e..f1e538fb0 100644 --- a/lib/icingadb/icingadb.cpp +++ b/lib/icingadb/icingadb.cpp @@ -32,7 +32,7 @@ REGISTER_TYPE(IcingaDB); IcingaDB::IcingaDB() : m_Rcon(nullptr) { - m_Rcon = nullptr; + m_RconLocked.store(nullptr); m_WorkQueue.SetName("IcingaDB"); @@ -80,6 +80,7 @@ void IcingaDB::Start(bool runtimeCreated) m_Rcon = new RedisConnection(GetHost(), GetPort(), GetPath(), GetPassword(), GetDbIndex(), GetEnableTls(), GetInsecureNoverify(), GetCertPath(), GetKeyPath(), GetCaPath(), GetCrlPath(), GetTlsProtocolmin(), GetCipherList(), GetConnectTimeout(), GetDebugInfo()); + m_RconLocked.store(m_Rcon); for (const Type::Ptr& type : GetTypes()) { auto ctype (dynamic_cast(type.get())); diff --git a/lib/icingadb/icingadb.hpp b/lib/icingadb/icingadb.hpp index 075eb36ac..c08f36465 100644 --- a/lib/icingadb/icingadb.hpp +++ b/lib/icingadb/icingadb.hpp @@ -5,6 +5,7 @@ #include "icingadb/icingadb-ti.hpp" #include "icingadb/redisconnection.hpp" +#include "base/atomic.hpp" #include "base/bulker.hpp" #include "base/timer.hpp" #include "base/workqueue.hpp" @@ -46,7 +47,7 @@ public: inline RedisConnection::Ptr GetConnection() { - return m_Rcon; + return m_RconLocked.load(); } template @@ -215,6 +216,10 @@ private: bool m_ConfigDumpDone; RedisConnection::Ptr m_Rcon; + // m_RconLocked contains a copy of the value in m_Rcon where all accesses are guarded by a mutex to allow safe + // concurrent access like from the icingadb check command. It's a copy to still allow fast access without additional + // synchronization to m_Rcon within the IcingaDB feature itself.
+ Locked m_RconLocked; std::unordered_map m_Rcons; std::atomic_size_t m_PendingRcons; diff --git a/lib/icingadb/icingadbchecktask.cpp b/lib/icingadb/icingadbchecktask.cpp index 25a2084b9..63f5d9910 100644 --- a/lib/icingadb/icingadbchecktask.cpp +++ b/lib/icingadb/icingadbchecktask.cpp @@ -101,7 +101,7 @@ void IcingadbCheckTask::ScriptFunc(const Checkable::Ptr& checkable, const CheckR auto redis (conn->GetConnection()); - if (!redis->GetConnected()) { + if (!redis || !redis->GetConnected()) { ReportIcingadbCheck(checkable, commandObj, cr, "Icinga DB CRITICAL: Could not connect to Redis.", ServiceCritical); return; } From eaae7d58632137a5df770f369f26cf1906d52112 Mon Sep 17 00:00:00 2001 From: Julian Brost Date: Mon, 27 Jun 2022 16:40:34 +0200 Subject: [PATCH 19/24] Icinga DB Check: update not connected message The check makes no attempt to explicitly connect to Redis, it uses the connection of the IcingaDB feature, so this message better describes the state in this situation. --- lib/icingadb/icingadbchecktask.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/icingadb/icingadbchecktask.cpp b/lib/icingadb/icingadbchecktask.cpp index 63f5d9910..5aff5d8ba 100644 --- a/lib/icingadb/icingadbchecktask.cpp +++ b/lib/icingadb/icingadbchecktask.cpp @@ -102,7 +102,7 @@ void IcingadbCheckTask::ScriptFunc(const Checkable::Ptr& checkable, const CheckR auto redis (conn->GetConnection()); if (!redis || !redis->GetConnected()) { - ReportIcingadbCheck(checkable, commandObj, cr, "Icinga DB CRITICAL: Could not connect to Redis.", ServiceCritical); + ReportIcingadbCheck(checkable, commandObj, cr, "Icinga DB CRITICAL: Not connected to Redis.", ServiceCritical); return; } From e36bc92a2c751240c284156a36043a65080f9f97 Mon Sep 17 00:00:00 2001 From: Julian Brost Date: Tue, 28 Jun 2022 10:47:24 +0200 Subject: [PATCH 20/24] Icinga DB Check: add unit hints to all rates --- lib/icingadb/icingadbchecktask.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 
deletions(-) diff --git a/lib/icingadb/icingadbchecktask.cpp b/lib/icingadb/icingadbchecktask.cpp index 5aff5d8ba..ca185647b 100644 --- a/lib/icingadb/icingadbchecktask.cpp +++ b/lib/icingadb/icingadbchecktask.cpp @@ -446,9 +446,9 @@ void IcingadbCheckTask::ScriptFunc(const Checkable::Ptr& checkable, const CheckR } for (auto& kv : statsPerOp) { - perfdata->Add(new PerfdataValue("icingadb_" + kv.first + "_1min", kv.second.UpdateAndGetValues(now, 60), false, "", Empty, Empty, 0)); - perfdata->Add(new PerfdataValue("icingadb_" + kv.first + "_5mins", kv.second.UpdateAndGetValues(now, 5 * 60), false, "", Empty, Empty, 0)); - perfdata->Add(new PerfdataValue("icingadb_" + kv.first + "_15mins", kv.second.UpdateAndGetValues(now, 15 * 60), false, "", Empty, Empty, 0)); + perfdata->Add(new PerfdataValue("icingadb_" + kv.first + "_items_1min", kv.second.UpdateAndGetValues(now, 60), false, "", Empty, Empty, 0)); + perfdata->Add(new PerfdataValue("icingadb_" + kv.first + "_items_5mins", kv.second.UpdateAndGetValues(now, 5 * 60), false, "", Empty, Empty, 0)); + perfdata->Add(new PerfdataValue("icingadb_" + kv.first + "_items_15mins", kv.second.UpdateAndGetValues(now, 15 * 60), false, "", Empty, Empty, 0)); } perfdata->Add(new PerfdataValue("icinga2_redis_queries_1min", redis->GetQueryCount(60), false, "", Empty, Empty, 0)); @@ -461,15 +461,15 @@ void IcingadbCheckTask::ScriptFunc(const Checkable::Ptr& checkable, const CheckR const char * Name; int (RedisConnection::* Getter)(RingBuffer::SizeType span, RingBuffer::SizeType tv); } const icingaWriteSubjects[] = { - {"icinga2_config_dump", &RedisConnection::GetWrittenConfigFor}, - {"icinga2_state_dump", &RedisConnection::GetWrittenStateFor}, - {"icinga2_history_dump", &RedisConnection::GetWrittenHistoryFor} + {"config_dump", &RedisConnection::GetWrittenConfigFor}, + {"state_dump", &RedisConnection::GetWrittenStateFor}, + {"history_dump", &RedisConnection::GetWrittenHistoryFor} }; for (auto subject : icingaWriteSubjects) { - 
perfdata->Add(new PerfdataValue(String(subject.Name) + "_1min", (redis.get()->*subject.Getter)(60, now), false, "", Empty, Empty, 0)); - perfdata->Add(new PerfdataValue(String(subject.Name) + "_5mins", (redis.get()->*subject.Getter)(5 * 60, now), false, "", Empty, Empty, 0)); - perfdata->Add(new PerfdataValue(String(subject.Name) + "_15mins", (redis.get()->*subject.Getter)(15 * 60, now), false, "", Empty, Empty, 0)); + perfdata->Add(new PerfdataValue(String("icinga2_") + subject.Name + "_items_1min", (redis.get()->*subject.Getter)(60, now), false, "", Empty, Empty, 0)); + perfdata->Add(new PerfdataValue(String("icinga2_") + subject.Name + "_items_5mins", (redis.get()->*subject.Getter)(5 * 60, now), false, "", Empty, Empty, 0)); + perfdata->Add(new PerfdataValue(String("icinga2_") + subject.Name + "_items_15mins", (redis.get()->*subject.Getter)(15 * 60, now), false, "", Empty, Empty, 0)); } ServiceState state; From 3ded7a92684c797919939e2b55a03262f5908f0d Mon Sep 17 00:00:00 2001 From: Julian Brost Date: Tue, 28 Jun 2022 10:51:36 +0200 Subject: [PATCH 21/24] Icinga DB Check: rename dump/sync related perfdata values Scope all values using current/last instead of takes/took. 
--- lib/icingadb/icingadbchecktask.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/icingadb/icingadbchecktask.cpp b/lib/icingadb/icingadbchecktask.cpp index ca185647b..853aa9dcc 100644 --- a/lib/icingadb/icingadbchecktask.cpp +++ b/lib/icingadb/icingadbchecktask.cpp @@ -267,7 +267,7 @@ void IcingadbCheckTask::ScriptFunc(const Checkable::Ptr& checkable, const CheckR << ", greater than WARNING threshold (" << Utility::FormatDuration(dumpTakesThresholds.Warning) << ")."; } - perfdata->Add(new PerfdataValue("icinga2_full_dump_takes", ongoingDumpTakes, false, "seconds", + perfdata->Add(new PerfdataValue("icinga2_current_full_dump_duration", ongoingDumpTakes, false, "seconds", dumpTakesThresholds.Warning, dumpTakesThresholds.Critical, 0)); } @@ -282,7 +282,7 @@ void IcingadbCheckTask::ScriptFunc(const Checkable::Ptr& checkable, const CheckR << ", greater than WARNING threshold (" << Utility::FormatDuration(syncTakesThresholds.Warning) << ")."; } - perfdata->Add(new PerfdataValue("icingadb_full_sync_takes", ongoingSyncTakes, false, "seconds", + perfdata->Add(new PerfdataValue("icingadb_current_full_sync_duration", ongoingSyncTakes, false, "seconds", syncTakesThresholds.Warning, syncTakesThresholds.Critical, 0)); } @@ -363,11 +363,11 @@ void IcingadbCheckTask::ScriptFunc(const Checkable::Ptr& checkable, const CheckR auto dumpAgo (now - dumpWhen); if (dumpWhen) { - perfdata->Add(new PerfdataValue("icinga2_full_dump_ago", dumpAgo, false, "seconds", Empty, Empty, 0)); + perfdata->Add(new PerfdataValue("icinga2_last_full_dump_ago", dumpAgo, false, "seconds", Empty, Empty, 0)); } if (dumpTook) { - perfdata->Add(new PerfdataValue("icinga2_full_dump_took", dumpTook, false, "seconds", Empty, Empty, 0)); + perfdata->Add(new PerfdataValue("icinga2_last_full_dump_duration", dumpTook, false, "seconds", Empty, Empty, 0)); } if (dumpWhen && dumpTook) { @@ -398,11 +398,11 @@ void IcingadbCheckTask::ScriptFunc(const Checkable::Ptr& checkable, const 
CheckR auto syncAgo (icingadbNow - syncSuccessWhen); if (syncSuccessWhen) { - perfdata->Add(new PerfdataValue("icingadb_full_sync_ago", syncAgo, false, "seconds", Empty, Empty, 0)); + perfdata->Add(new PerfdataValue("icingadb_last_full_sync_ago", syncAgo, false, "seconds", Empty, Empty, 0)); } if (syncSuccessTook) { - perfdata->Add(new PerfdataValue("icingadb_full_sync_took", syncSuccessTook, false, "seconds", Empty, Empty, 0)); + perfdata->Add(new PerfdataValue("icingadb_last_full_sync_duration", syncSuccessTook, false, "seconds", Empty, Empty, 0)); } if (syncSuccessWhen && syncSuccessTook) { From 5550fb713ce192bcb061b9999afc0eb5703c2a15 Mon Sep 17 00:00:00 2001 From: Julian Brost Date: Tue, 28 Jun 2022 11:30:11 +0200 Subject: [PATCH 22/24] Icinga DB Check: include ongoing dumps in OK message Also use the "current" and "full dump/sync" terminology in the other messages. --- lib/icingadb/icingadbchecktask.cpp | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/lib/icingadb/icingadbchecktask.cpp b/lib/icingadb/icingadbchecktask.cpp index 853aa9dcc..225168ce5 100644 --- a/lib/icingadb/icingadbchecktask.cpp +++ b/lib/icingadb/icingadbchecktask.cpp @@ -260,11 +260,13 @@ void IcingadbCheckTask::ScriptFunc(const Checkable::Ptr& checkable, const CheckR auto ongoingDumpTakes (now - ongoingDumpStart); if (!dumpTakesThresholds.Critical.IsEmpty() && ongoingDumpTakes > dumpTakesThresholds.Critical) { - critmsgs << " Ongoing Icinga 2 dump already takes " << Utility::FormatDuration(ongoingDumpTakes) + critmsgs << " Current Icinga 2 full dump already takes " << Utility::FormatDuration(ongoingDumpTakes) << ", greater than CRITICAL threshold (" << Utility::FormatDuration(dumpTakesThresholds.Critical) << ")!"; } else if (!dumpTakesThresholds.Warning.IsEmpty() && ongoingDumpTakes > dumpTakesThresholds.Warning) { - warnmsgs << " Ongoing Icinga 2 dump already takes " << Utility::FormatDuration(ongoingDumpTakes) + warnmsgs << " Current Icinga 2 full dump 
already takes " << Utility::FormatDuration(ongoingDumpTakes) << ", greater than WARNING threshold (" << Utility::FormatDuration(dumpTakesThresholds.Warning) << ")."; + } else { + i2okmsgs << "\n* Current full dump running for " << Utility::FormatDuration(ongoingDumpTakes); } perfdata->Add(new PerfdataValue("icinga2_current_full_dump_duration", ongoingDumpTakes, false, "seconds", @@ -275,11 +277,13 @@ void IcingadbCheckTask::ScriptFunc(const Checkable::Ptr& checkable, const CheckR auto ongoingSyncTakes (icingadbNow - syncOngoingSince); if (!syncTakesThresholds.Critical.IsEmpty() && ongoingSyncTakes > syncTakesThresholds.Critical) { - critmsgs << " Ongoing sync already takes " << Utility::FormatDuration(ongoingSyncTakes) + critmsgs << " Current full sync already takes " << Utility::FormatDuration(ongoingSyncTakes) << ", greater than CRITICAL threshold (" << Utility::FormatDuration(syncTakesThresholds.Critical) << ")!"; } else if (!syncTakesThresholds.Warning.IsEmpty() && ongoingSyncTakes > syncTakesThresholds.Warning) { - warnmsgs << " Ongoing sync already takes " << Utility::FormatDuration(ongoingSyncTakes) + warnmsgs << " Current full sync already takes " << Utility::FormatDuration(ongoingSyncTakes) << ", greater than WARNING threshold (" << Utility::FormatDuration(syncTakesThresholds.Warning) << ")."; + } else { + idbokmsgs << "\n* Current full sync running for " << Utility::FormatDuration(ongoingSyncTakes); } perfdata->Add(new PerfdataValue("icingadb_current_full_sync_duration", ongoingSyncTakes, false, "seconds", @@ -371,7 +375,7 @@ void IcingadbCheckTask::ScriptFunc(const Checkable::Ptr& checkable, const CheckR } if (dumpWhen && dumpTook) { - i2okmsgs << "\n* Last dump: " << Utility::FormatDuration(dumpAgo) + i2okmsgs << "\n* Last full dump: " << Utility::FormatDuration(dumpAgo) << " ago, took " << Utility::FormatDuration(dumpTook); } @@ -406,7 +410,7 @@ void IcingadbCheckTask::ScriptFunc(const Checkable::Ptr& checkable, const CheckR } if (syncSuccessWhen && 
syncSuccessTook) { - idbokmsgs << "\n* Last sync: " << Utility::FormatDuration(syncAgo) + idbokmsgs << "\n* Last full sync: " << Utility::FormatDuration(syncAgo) << " ago, took " << Utility::FormatDuration(syncSuccessTook); } From 4f125753bff73771e0cd1e20549f175c7dc652a4 Mon Sep 17 00:00:00 2001 From: Julian Brost Date: Tue, 28 Jun 2022 11:30:48 +0200 Subject: [PATCH 23/24] Icinga DB Check: ignore suppressed queries in Redis backlog check If some kind of query is not supposed to be processed at the moment, there is little point in checking it. During a full dump, state updates are suppressed (i.e. delayed), so when a dump takes very long, this would have resulted in a false Redis backlog warning. --- lib/icingadb/redisconnection.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/icingadb/redisconnection.cpp b/lib/icingadb/redisconnection.cpp index f4c21a60f..d9233ca77 100644 --- a/lib/icingadb/redisconnection.cpp +++ b/lib/icingadb/redisconnection.cpp @@ -250,7 +250,7 @@ double RedisConnection::GetOldestPendingQueryTs() double oldest = 0; for (auto& queue : m_Queues.Writes) { - if (!queue.second.empty()) { + if (m_SuppressedQueryKinds.find(queue.first) == m_SuppressedQueryKinds.end() && !queue.second.empty()) { auto ctime (queue.second.front().CTime); if (ctime < oldest || oldest == 0) { From 3222fab05aa883e5d56327a761ba0bd9e0359129 Mon Sep 17 00:00:00 2001 From: Julian Brost Date: Tue, 28 Jun 2022 12:18:11 +0200 Subject: [PATCH 24/24] Icinga DB Check: don't check runtime update backlog during full sync --- lib/icingadb/icingadbchecktask.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lib/icingadb/icingadbchecktask.cpp b/lib/icingadb/icingadbchecktask.cpp index 225168ce5..c2f1a3699 100644 --- a/lib/icingadb/icingadbchecktask.cpp +++ b/lib/icingadb/icingadbchecktask.cpp @@ -346,8 +346,9 @@ void IcingadbCheckTask::ScriptFunc(const Checkable::Ptr& checkable, const CheckR double runtimeBacklog = 0; - if 
(weResponsible) { - // These streams are only processed by one instance, it's fine for the other instance to have some backlog. + if (weResponsible && !syncOngoingSince) { + // These streams are only processed by the responsible instance after the full sync finished, + // it's fine for some backlog to exist otherwise. runtimeBacklog = getBacklog(xReadRuntimeBacklog); if (!icingadbBacklogThresholds.Critical.IsEmpty() && runtimeBacklog > icingadbBacklogThresholds.Critical) { @@ -359,7 +360,7 @@ void IcingadbCheckTask::ScriptFunc(const Checkable::Ptr& checkable, const CheckR } } - // Also report the perfdata value on the other instance (as 0 in this case). + // Also report the perfdata value on the standby instance or during a full sync (as 0 in this case). perfdata->Add(new PerfdataValue("icingadb_runtime_update_backlog", runtimeBacklog, false, "seconds", icingadbBacklogThresholds.Warning, icingadbBacklogThresholds.Critical, 0)); }