From 3c15e71e195690eaec98021b81c2912fbd0ee00e Mon Sep 17 00:00:00 2001 From: "Alexander A. Klimov" Date: Tue, 1 Dec 2020 18:22:02 +0100 Subject: [PATCH 1/2] Don't fire suppressed notifications if last parent recovery >= last check result --- lib/icinga/checkable-notification.cpp | 112 ++++++++++++++++---------- 1 file changed, 68 insertions(+), 44 deletions(-) diff --git a/lib/icinga/checkable-notification.cpp b/lib/icinga/checkable-notification.cpp index 43b4589fa..59d20fda4 100644 --- a/lib/icinga/checkable-notification.cpp +++ b/lib/icinga/checkable-notification.cpp @@ -10,6 +10,7 @@ #include "base/exception.hpp" #include "base/context.hpp" #include "base/convert.hpp" +#include "base/lazy-init.hpp" #include "remote/apilistener.hpp" using namespace icinga; @@ -145,73 +146,96 @@ static void FireSuppressedNotifications(Checkable* checkable) int subtract = 0; - for (auto type : {NotificationProblem, NotificationRecovery, NotificationFlappingStart, NotificationFlappingEnd}) { - if (suppressed_types & type) { - bool still_applies; + { + LazyInit wasLastParentRecoveryRecent ([&checkable]() { auto cr (checkable->GetLastCheckResult()); - switch (type) { - case NotificationProblem: - still_applies = cr && !checkable->IsStateOK(cr->GetState()) && checkable->GetStateType() == StateTypeHard; - break; - case NotificationRecovery: - still_applies = cr && checkable->IsStateOK(cr->GetState()); - break; - case NotificationFlappingStart: - still_applies = checkable->IsFlapping(); - break; - case NotificationFlappingEnd: - still_applies = !checkable->IsFlapping(); - break; - default: - break; + if (!cr) { + return true; } - if (still_applies) { - bool still_suppressed; + auto threshold (cr->GetExecutionStart()); + + for (auto& dep : checkable->GetDependencies()) { + auto parent (dep->GetParent()); + ObjectLock oLock (parent); + + if (!parent->GetProblem() && parent->GetLastStateChange() >= threshold) { + return true; + } + } + + return false; + }); + + for (auto type : {NotificationProblem, NotificationRecovery, NotificationFlappingStart, NotificationFlappingEnd}) { + if (suppressed_types & type) { + bool still_applies; + auto cr (checkable->GetLastCheckResult()); switch (type) { case NotificationProblem: - /* Fall through. */ + still_applies = cr && !checkable->IsStateOK(cr->GetState()) && checkable->GetStateType() == StateTypeHard; + break; case NotificationRecovery: - still_suppressed = !checkable->IsReachable(DependencyNotification) || checkable->IsInDowntime() || checkable->IsAcknowledged(); + still_applies = cr && checkable->IsStateOK(cr->GetState()); break; case NotificationFlappingStart: - /* Fall through. */ + still_applies = checkable->IsFlapping(); + break; case NotificationFlappingEnd: - still_suppressed = checkable->IsInDowntime(); + still_applies = !checkable->IsFlapping(); break; default: break; } - if (!still_suppressed && checkable->GetEnableActiveChecks()) { - /* If e.g. the downtime just ended, but the service is still not ok, we would re-send the stashed problem notification. - * But if the next check result recovers the service soon, we would send a recovery notification soon after the problem one. - * This is not desired, especially for lots of services at once. - * Because of that if there's likely to be a check result soon, - * we delay the re-sending of the stashed notification until the next check. - * That check either doesn't change anything and we finally re-send the stashed problem notification - * or recovers the service and we drop the stashed notification. */ + if (still_applies) { + bool still_suppressed; - /* One minute unless the check interval is too short so the next check will always run during the next minute. */ - auto threshold (checkable->GetCheckInterval() - 10); + switch (type) { + case NotificationProblem: + /* Fall through. */ + case NotificationRecovery: + still_suppressed = !checkable->IsReachable(DependencyNotification) || checkable->IsInDowntime() || checkable->IsAcknowledged(); + break; + case NotificationFlappingStart: + /* Fall through. */ + case NotificationFlappingEnd: + still_suppressed = checkable->IsInDowntime(); + break; + default: + break; + } - if (threshold > 60) - threshold = 60; - else if (threshold < 0) - threshold = 0; + if (!still_suppressed && checkable->GetEnableActiveChecks()) { + /* If e.g. the downtime just ended, but the service is still not ok, we would re-send the stashed problem notification. + * But if the next check result recovers the service soon, we would send a recovery notification soon after the problem one. + * This is not desired, especially for lots of services at once. + * Because of that if there's likely to be a check result soon, + * we delay the re-sending of the stashed notification until the next check. + * That check either doesn't change anything and we finally re-send the stashed problem notification + * or recovers the service and we drop the stashed notification. */ - still_suppressed = checkable->GetNextCheck() <= Utility::GetTime() + threshold; - } + /* One minute unless the check interval is too short so the next check will always run during the next minute. */ + auto threshold (checkable->GetCheckInterval() - 10); - if (!still_suppressed) { - Checkable::OnNotificationsRequested(checkable, type, cr, "", "", nullptr); + if (threshold > 60) + threshold = 60; + else if (threshold < 0) + threshold = 0; + still_suppressed = checkable->GetNextCheck() <= Utility::GetTime() + threshold; + } + + if (!still_suppressed && !wasLastParentRecoveryRecent.Get()) { + Checkable::OnNotificationsRequested(checkable, type, cr, "", "", nullptr); + + subtract |= type; + } + } else { subtract |= type; } - } else { - subtract |= type; } } } From 4b0313d3f356b2838ac0b9a641d165cd7ed662c9 Mon Sep 17 00:00:00 2001 From: "Alexander A. Klimov" Date: Wed, 2 Dec 2020 12:24:40 +0100 Subject: [PATCH 2/2] On recovery: re-check children --- lib/icinga/checkable-check.cpp | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/lib/icinga/checkable-check.cpp b/lib/icinga/checkable-check.cpp index 5aac602cd..a6899d944 100644 --- a/lib/icinga/checkable-check.cpp +++ b/lib/icinga/checkable-check.cpp @@ -242,6 +242,20 @@ void Checkable::ProcessCheckResult(const CheckResult::Ptr& cr, const MessageOrig OnReachabilityChanged(this, cr, children, origin); } + if (recovery) { + for (auto& child : children) { + if (child->GetProblem() && child->GetEnableActiveChecks()) { + auto nextCheck (now + Utility::Random() % 60); + + ObjectLock oLock (child); + + if (nextCheck < child->GetNextCheck()) { + child->SetNextCheck(nextCheck); + } + } + } + } + if (!reachable) SetLastStateUnreachable(Utility::GetTime());