mirror of
https://github.com/Icinga/icinga2.git
synced 2025-07-06 05:14:29 +02:00
Merge pull request #8560 from Icinga/bugfix/children-recover-too-late
On recovery: re-check children
This commit is contained in:
commit
d17b4ecc4b
@ -242,6 +242,20 @@ void Checkable::ProcessCheckResult(const CheckResult::Ptr& cr, const MessageOrig
|
|||||||
OnReachabilityChanged(this, cr, children, origin);
|
OnReachabilityChanged(this, cr, children, origin);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (recovery) {
|
||||||
|
for (auto& child : children) {
|
||||||
|
if (child->GetProblem() && child->GetEnableActiveChecks()) {
|
||||||
|
auto nextCheck (now + Utility::Random() % 60);
|
||||||
|
|
||||||
|
ObjectLock oLock (child);
|
||||||
|
|
||||||
|
if (nextCheck < child->GetNextCheck()) {
|
||||||
|
child->SetNextCheck(nextCheck);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (!reachable)
|
if (!reachable)
|
||||||
SetLastStateUnreachable(Utility::GetTime());
|
SetLastStateUnreachable(Utility::GetTime());
|
||||||
|
|
||||||
|
@ -10,6 +10,7 @@
|
|||||||
#include "base/exception.hpp"
|
#include "base/exception.hpp"
|
||||||
#include "base/context.hpp"
|
#include "base/context.hpp"
|
||||||
#include "base/convert.hpp"
|
#include "base/convert.hpp"
|
||||||
|
#include "base/lazy-init.hpp"
|
||||||
#include "remote/apilistener.hpp"
|
#include "remote/apilistener.hpp"
|
||||||
|
|
||||||
using namespace icinga;
|
using namespace icinga;
|
||||||
@ -145,73 +146,96 @@ static void FireSuppressedNotifications(Checkable* checkable)
|
|||||||
|
|
||||||
int subtract = 0;
|
int subtract = 0;
|
||||||
|
|
||||||
for (auto type : {NotificationProblem, NotificationRecovery, NotificationFlappingStart, NotificationFlappingEnd}) {
|
{
|
||||||
if (suppressed_types & type) {
|
LazyInit<bool> wasLastParentRecoveryRecent ([&checkable]() {
|
||||||
bool still_applies;
|
|
||||||
auto cr (checkable->GetLastCheckResult());
|
auto cr (checkable->GetLastCheckResult());
|
||||||
|
|
||||||
switch (type) {
|
if (!cr) {
|
||||||
case NotificationProblem:
|
return true;
|
||||||
still_applies = cr && !checkable->IsStateOK(cr->GetState()) && checkable->GetStateType() == StateTypeHard;
|
|
||||||
break;
|
|
||||||
case NotificationRecovery:
|
|
||||||
still_applies = cr && checkable->IsStateOK(cr->GetState());
|
|
||||||
break;
|
|
||||||
case NotificationFlappingStart:
|
|
||||||
still_applies = checkable->IsFlapping();
|
|
||||||
break;
|
|
||||||
case NotificationFlappingEnd:
|
|
||||||
still_applies = !checkable->IsFlapping();
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (still_applies) {
|
auto threshold (cr->GetExecutionStart());
|
||||||
bool still_suppressed;
|
|
||||||
|
for (auto& dep : checkable->GetDependencies()) {
|
||||||
|
auto parent (dep->GetParent());
|
||||||
|
ObjectLock oLock (parent);
|
||||||
|
|
||||||
|
if (!parent->GetProblem() && parent->GetLastStateChange() >= threshold) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
});
|
||||||
|
|
||||||
|
for (auto type : {NotificationProblem, NotificationRecovery, NotificationFlappingStart, NotificationFlappingEnd}) {
|
||||||
|
if (suppressed_types & type) {
|
||||||
|
bool still_applies;
|
||||||
|
auto cr (checkable->GetLastCheckResult());
|
||||||
|
|
||||||
switch (type) {
|
switch (type) {
|
||||||
case NotificationProblem:
|
case NotificationProblem:
|
||||||
/* Fall through. */
|
still_applies = cr && !checkable->IsStateOK(cr->GetState()) && checkable->GetStateType() == StateTypeHard;
|
||||||
|
break;
|
||||||
case NotificationRecovery:
|
case NotificationRecovery:
|
||||||
still_suppressed = !checkable->IsReachable(DependencyNotification) || checkable->IsInDowntime() || checkable->IsAcknowledged();
|
still_applies = cr && checkable->IsStateOK(cr->GetState());
|
||||||
break;
|
break;
|
||||||
case NotificationFlappingStart:
|
case NotificationFlappingStart:
|
||||||
/* Fall through. */
|
still_applies = checkable->IsFlapping();
|
||||||
|
break;
|
||||||
case NotificationFlappingEnd:
|
case NotificationFlappingEnd:
|
||||||
still_suppressed = checkable->IsInDowntime();
|
still_applies = !checkable->IsFlapping();
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!still_suppressed && checkable->GetEnableActiveChecks()) {
|
if (still_applies) {
|
||||||
/* If e.g. the downtime just ended, but the service is still not ok, we would re-send the stashed problem notification.
|
bool still_suppressed;
|
||||||
* But if the next check result recovers the service soon, we would send a recovery notification soon after the problem one.
|
|
||||||
* This is not desired, especially for lots of services at once.
|
|
||||||
* Because of that if there's likely to be a check result soon,
|
|
||||||
* we delay the re-sending of the stashed notification until the next check.
|
|
||||||
* That check either doesn't change anything and we finally re-send the stashed problem notification
|
|
||||||
* or recovers the service and we drop the stashed notification. */
|
|
||||||
|
|
||||||
/* One minute unless the check interval is too short so the next check will always run during the next minute. */
|
switch (type) {
|
||||||
auto threshold (checkable->GetCheckInterval() - 10);
|
case NotificationProblem:
|
||||||
|
/* Fall through. */
|
||||||
|
case NotificationRecovery:
|
||||||
|
still_suppressed = !checkable->IsReachable(DependencyNotification) || checkable->IsInDowntime() || checkable->IsAcknowledged();
|
||||||
|
break;
|
||||||
|
case NotificationFlappingStart:
|
||||||
|
/* Fall through. */
|
||||||
|
case NotificationFlappingEnd:
|
||||||
|
still_suppressed = checkable->IsInDowntime();
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
if (threshold > 60)
|
if (!still_suppressed && checkable->GetEnableActiveChecks()) {
|
||||||
threshold = 60;
|
/* If e.g. the downtime just ended, but the service is still not ok, we would re-send the stashed problem notification.
|
||||||
else if (threshold < 0)
|
* But if the next check result recovers the service soon, we would send a recovery notification soon after the problem one.
|
||||||
threshold = 0;
|
* This is not desired, especially for lots of services at once.
|
||||||
|
* Because of that if there's likely to be a check result soon,
|
||||||
|
* we delay the re-sending of the stashed notification until the next check.
|
||||||
|
* That check either doesn't change anything and we finally re-send the stashed problem notification
|
||||||
|
* or recovers the service and we drop the stashed notification. */
|
||||||
|
|
||||||
still_suppressed = checkable->GetNextCheck() <= Utility::GetTime() + threshold;
|
/* One minute unless the check interval is too short so the next check will always run during the next minute. */
|
||||||
}
|
auto threshold (checkable->GetCheckInterval() - 10);
|
||||||
|
|
||||||
if (!still_suppressed) {
|
if (threshold > 60)
|
||||||
Checkable::OnNotificationsRequested(checkable, type, cr, "", "", nullptr);
|
threshold = 60;
|
||||||
|
else if (threshold < 0)
|
||||||
|
threshold = 0;
|
||||||
|
|
||||||
|
still_suppressed = checkable->GetNextCheck() <= Utility::GetTime() + threshold;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!still_suppressed && !wasLastParentRecoveryRecent.Get()) {
|
||||||
|
Checkable::OnNotificationsRequested(checkable, type, cr, "", "", nullptr);
|
||||||
|
|
||||||
|
subtract |= type;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
subtract |= type;
|
subtract |= type;
|
||||||
}
|
}
|
||||||
} else {
|
|
||||||
subtract |= type;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user