Merge pull request #8560 from Icinga/bugfix/children-recover-too-late

On recovery: re-check children
This commit is contained in:
Noah Hilverling 2020-12-15 13:11:46 +01:00 committed by GitHub
commit d17b4ecc4b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 82 additions and 44 deletions

View File

@ -242,6 +242,20 @@ void Checkable::ProcessCheckResult(const CheckResult::Ptr& cr, const MessageOrig
OnReachabilityChanged(this, cr, children, origin); OnReachabilityChanged(this, cr, children, origin);
} }
/* When `recovery` is set, pull forward the next check of every child that still reports
 * a problem (per the commit title: "On recovery: re-check children"). */
if (recovery) {
for (auto& child : children) {
/* Only children that currently have a problem and have active checks enabled are rescheduled. */
if (child->GetProblem() && child->GetEnableActiveChecks()) {
/* Candidate time: `now` plus Utility::Random() modulo 60.
 * NOTE(review): presumably a jitter in seconds to spread out the re-checks -- confirm. */
auto nextCheck (now + Utility::Random() % 60);
/* The child is locked before its next-check time is compared and updated. */
ObjectLock oLock (child);
/* Only ever move the next check earlier, never later. */
if (nextCheck < child->GetNextCheck()) {
child->SetNextCheck(nextCheck);
}
}
}
}
if (!reachable) if (!reachable)
SetLastStateUnreachable(Utility::GetTime()); SetLastStateUnreachable(Utility::GetTime());

View File

@ -10,6 +10,7 @@
#include "base/exception.hpp" #include "base/exception.hpp"
#include "base/context.hpp" #include "base/context.hpp"
#include "base/convert.hpp" #include "base/convert.hpp"
#include "base/lazy-init.hpp"
#include "remote/apilistener.hpp" #include "remote/apilistener.hpp"
using namespace icinga; using namespace icinga;
@ -145,73 +146,96 @@ static void FireSuppressedNotifications(Checkable* checkable)
int subtract = 0; int subtract = 0;
for (auto type : {NotificationProblem, NotificationRecovery, NotificationFlappingStart, NotificationFlappingEnd}) { {
if (suppressed_types & type) { LazyInit<bool> wasLastParentRecoveryRecent ([&checkable]() {
bool still_applies;
auto cr (checkable->GetLastCheckResult()); auto cr (checkable->GetLastCheckResult());
switch (type) { if (!cr) {
case NotificationProblem: return true;
still_applies = cr && !checkable->IsStateOK(cr->GetState()) && checkable->GetStateType() == StateTypeHard;
break;
case NotificationRecovery:
still_applies = cr && checkable->IsStateOK(cr->GetState());
break;
case NotificationFlappingStart:
still_applies = checkable->IsFlapping();
break;
case NotificationFlappingEnd:
still_applies = !checkable->IsFlapping();
break;
default:
break;
} }
if (still_applies) { auto threshold (cr->GetExecutionStart());
bool still_suppressed;
for (auto& dep : checkable->GetDependencies()) {
auto parent (dep->GetParent());
ObjectLock oLock (parent);
if (!parent->GetProblem() && parent->GetLastStateChange() >= threshold) {
return true;
}
}
return false;
});
for (auto type : {NotificationProblem, NotificationRecovery, NotificationFlappingStart, NotificationFlappingEnd}) {
if (suppressed_types & type) {
bool still_applies;
auto cr (checkable->GetLastCheckResult());
switch (type) { switch (type) {
case NotificationProblem: case NotificationProblem:
/* Fall through. */ still_applies = cr && !checkable->IsStateOK(cr->GetState()) && checkable->GetStateType() == StateTypeHard;
break;
case NotificationRecovery: case NotificationRecovery:
still_suppressed = !checkable->IsReachable(DependencyNotification) || checkable->IsInDowntime() || checkable->IsAcknowledged(); still_applies = cr && checkable->IsStateOK(cr->GetState());
break; break;
case NotificationFlappingStart: case NotificationFlappingStart:
/* Fall through. */ still_applies = checkable->IsFlapping();
break;
case NotificationFlappingEnd: case NotificationFlappingEnd:
still_suppressed = checkable->IsInDowntime(); still_applies = !checkable->IsFlapping();
break; break;
default: default:
break; break;
} }
if (!still_suppressed && checkable->GetEnableActiveChecks()) { if (still_applies) {
/* If e.g. the downtime just ended, but the service is still not ok, we would re-send the stashed problem notification. bool still_suppressed;
* But if the next check result recovers the service soon, we would send a recovery notification soon after the problem one.
* This is not desired, especially for lots of services at once.
* Because of that if there's likely to be a check result soon,
* we delay the re-sending of the stashed notification until the next check.
* That check either doesn't change anything and we finally re-send the stashed problem notification
* or recovers the service and we drop the stashed notification. */
/* One minute unless the check interval is too short so the next check will always run during the next minute. */ switch (type) {
auto threshold (checkable->GetCheckInterval() - 10); case NotificationProblem:
/* Fall through. */
case NotificationRecovery:
still_suppressed = !checkable->IsReachable(DependencyNotification) || checkable->IsInDowntime() || checkable->IsAcknowledged();
break;
case NotificationFlappingStart:
/* Fall through. */
case NotificationFlappingEnd:
still_suppressed = checkable->IsInDowntime();
break;
default:
break;
}
if (threshold > 60) if (!still_suppressed && checkable->GetEnableActiveChecks()) {
threshold = 60; /* If e.g. the downtime just ended, but the service is still not ok, we would re-send the stashed problem notification.
else if (threshold < 0) * But if the next check result recovers the service soon, we would send a recovery notification soon after the problem one.
threshold = 0; * This is not desired, especially for lots of services at once.
* Because of that if there's likely to be a check result soon,
* we delay the re-sending of the stashed notification until the next check.
* That check either doesn't change anything and we finally re-send the stashed problem notification
* or recovers the service and we drop the stashed notification. */
still_suppressed = checkable->GetNextCheck() <= Utility::GetTime() + threshold; /* One minute unless the check interval is too short so the next check will always run during the next minute. */
} auto threshold (checkable->GetCheckInterval() - 10);
if (!still_suppressed) { if (threshold > 60)
Checkable::OnNotificationsRequested(checkable, type, cr, "", "", nullptr); threshold = 60;
else if (threshold < 0)
threshold = 0;
still_suppressed = checkable->GetNextCheck() <= Utility::GetTime() + threshold;
}
if (!still_suppressed && !wasLastParentRecoveryRecent.Get()) {
Checkable::OnNotificationsRequested(checkable, type, cr, "", "", nullptr);
subtract |= type;
}
} else {
subtract |= type; subtract |= type;
} }
} else {
subtract |= type;
} }
} }
} }