Merge pull request #8560 from Icinga/bugfix/children-recover-too-late

On recovery: re-check children
2025-07-30 17:14:25 +02:00 · 2020-12-15 13:11:46 +01:00 · 2020-12-15 13:11:46 +01:00 · d17b4ecc4b
commit d17b4ecc4b
parent 7c7f549900 4b0313d3f3
2 changed files with 82 additions and 44 deletions
--- a/lib/icinga/checkable-check.cpp
+++ b/lib/icinga/checkable-check.cpp
@@ -242,6 +242,20 @@ void Checkable::ProcessCheckResult(const CheckResult::Ptr& cr, const MessageOrig
 			OnReachabilityChanged(this, cr, children, origin);
 	}
 	if (recovery) {
 		for (auto& child : children) {
 			if (child->GetProblem() && child->GetEnableActiveChecks()) {
 				auto nextCheck (now + Utility::Random() % 60);
 				ObjectLock oLock (child);
 				if (nextCheck < child->GetNextCheck()) {
 					child->SetNextCheck(nextCheck);
 				}
 			}
 		}
 	}
 	if (!reachable)
 		SetLastStateUnreachable(Utility::GetTime());
--- a/lib/icinga/checkable-notification.cpp
+++ b/lib/icinga/checkable-notification.cpp
@@ -10,6 +10,7 @@
 #include "base/exception.hpp"
 #include "base/context.hpp"
 #include "base/convert.hpp"
 #include "base/lazy-init.hpp"
 #include "remote/apilistener.hpp"
 using namespace icinga;
@@ -145,73 +146,96 @@ static void FireSuppressedNotifications(Checkable* checkable)
 	int subtract = 0;
-	for (auto type : {NotificationProblem, NotificationRecovery, NotificationFlappingStart, NotificationFlappingEnd}) {
+	{
-		if (suppressed_types & type) {
+		LazyInit<bool> wasLastParentRecoveryRecent ([&checkable]() {
 			bool still_applies;
 			auto cr (checkable->GetLastCheckResult());
-			switch (type) {
+			if (!cr) {
-				case NotificationProblem:
+				return true;
 					still_applies = cr && !checkable->IsStateOK(cr->GetState()) && checkable->GetStateType() == StateTypeHard;
 					break;
 				case NotificationRecovery:
 					still_applies = cr && checkable->IsStateOK(cr->GetState());
 					break;
 				case NotificationFlappingStart:
 					still_applies = checkable->IsFlapping();
 					break;
 				case NotificationFlappingEnd:
 					still_applies = !checkable->IsFlapping();
 					break;
 				default:
 					break;
 			}
-			if (still_applies) {
+			auto threshold (cr->GetExecutionStart());
-				bool still_suppressed;
+
 			for (auto& dep : checkable->GetDependencies()) {
 				auto parent (dep->GetParent());
 				ObjectLock oLock (parent);
 				if (!parent->GetProblem() && parent->GetLastStateChange() >= threshold) {
 					return true;
 				}
 			}
 			return false;
 		});
 		for (auto type : {NotificationProblem, NotificationRecovery, NotificationFlappingStart, NotificationFlappingEnd}) {
 			if (suppressed_types & type) {
 				bool still_applies;
 				auto cr (checkable->GetLastCheckResult());
 				switch (type) {
 					case NotificationProblem:
-						/* Fall through. */
+						still_applies = cr && !checkable->IsStateOK(cr->GetState()) && checkable->GetStateType() == StateTypeHard;
 						break;
 					case NotificationRecovery:
-						still_suppressed = !checkable->IsReachable(DependencyNotification) || checkable->IsInDowntime() || checkable->IsAcknowledged();
+						still_applies = cr && checkable->IsStateOK(cr->GetState());
 						break;
 					case NotificationFlappingStart:
-						/* Fall through. */
+						still_applies = checkable->IsFlapping();
 						break;
 					case NotificationFlappingEnd:
-						still_suppressed = checkable->IsInDowntime();
+						still_applies = !checkable->IsFlapping();
 						break;
 					default:
 						break;
 				}
-				if (!still_suppressed && checkable->GetEnableActiveChecks()) {
+				if (still_applies) {
-					/* If e.g. the downtime just ended, but the service is still not ok, we would re-send the stashed problem notification.
+					bool still_suppressed;
 					 * But if the next check result recovers the service soon, we would send a recovery notification soon after the problem one.
 					 * This is not desired, especially for lots of services at once.
 					 * Because of that if there's likely to be a check result soon,
 					 * we delay the re-sending of the stashed notification until the next check.
 					 * That check either doesn't change anything and we finally re-send the stashed problem notification
 					 * or recovers the service and we drop the stashed notification. */
-					/* One minute unless the check interval is too short so the next check will always run during the next minute. */
+					switch (type) {
-					auto threshold (checkable->GetCheckInterval() - 10);
+						case NotificationProblem:
 							/* Fall through. */
 						case NotificationRecovery:
 							still_suppressed = !checkable->IsReachable(DependencyNotification) || checkable->IsInDowntime() || checkable->IsAcknowledged();
 							break;
 						case NotificationFlappingStart:
 							/* Fall through. */
 						case NotificationFlappingEnd:
 							still_suppressed = checkable->IsInDowntime();
 							break;
 						default:
 							break;
 					}
-					if (threshold > 60)
+					if (!still_suppressed && checkable->GetEnableActiveChecks()) {
-						threshold = 60;
+						/* If e.g. the downtime just ended, but the service is still not ok, we would re-send the stashed problem notification.
-					else if (threshold < 0)
+						 * But if the next check result recovers the service soon, we would send a recovery notification soon after the problem one.
-						threshold = 0;
+						 * This is not desired, especially for lots of services at once.
 						 * Because of that if there's likely to be a check result soon,
 						 * we delay the re-sending of the stashed notification until the next check.
 						 * That check either doesn't change anything and we finally re-send the stashed problem notification
 						 * or recovers the service and we drop the stashed notification. */
-					still_suppressed = checkable->GetNextCheck() <= Utility::GetTime() + threshold;
+						/* One minute unless the check interval is too short so the next check will always run during the next minute. */
-				}
+						auto threshold (checkable->GetCheckInterval() - 10);
-				if (!still_suppressed) {
+						if (threshold > 60)
-					Checkable::OnNotificationsRequested(checkable, type, cr, "", "", nullptr);
+							threshold = 60;
 						else if (threshold < 0)
 							threshold = 0;
 						still_suppressed = checkable->GetNextCheck() <= Utility::GetTime() + threshold;
 					}
 					if (!still_suppressed && !wasLastParentRecoveryRecent.Get()) {
 						Checkable::OnNotificationsRequested(checkable, type, cr, "", "", nullptr);
 						subtract |= type;
 					}
 				} else {
 					subtract |= type;
 				}
 			} else {
 				subtract |= type;
 			}
 		}
 	}