Merge pull request #8560 from Icinga/bugfix/children-recover-too-late

On recovery: re-check children
2025-09-21 16:57:58 +02:00 · 2020-12-15 13:11:46 +01:00 · 2020-12-15 13:11:46 +01:00 · d17b4ecc4b
commit d17b4ecc4b
parent 7c7f549900 4b0313d3f3
2 changed files with 82 additions and 44 deletions
--- a/lib/icinga/checkable-check.cpp
+++ b/lib/icinga/checkable-check.cpp
@ -242,6 +242,20 @@ void Checkable::ProcessCheckResult(const CheckResult::Ptr& cr, const MessageOrig
 			OnReachabilityChanged(this, cr, children, origin);
 	}

+	if (recovery) {
+		for (auto& child : children) {
+			if (child->GetProblem() && child->GetEnableActiveChecks()) {
+				auto nextCheck (now + Utility::Random() % 60);
+
+				ObjectLock oLock (child);
+
+				if (nextCheck < child->GetNextCheck()) {
+					child->SetNextCheck(nextCheck);
+				}
+			}
+		}
+	}
+
 	if (!reachable)
 		SetLastStateUnreachable(Utility::GetTime());

--- a/lib/icinga/checkable-notification.cpp
+++ b/lib/icinga/checkable-notification.cpp
@ -10,6 +10,7 @@
 #include "base/exception.hpp"
 #include "base/context.hpp"
 #include "base/convert.hpp"
+#include "base/lazy-init.hpp"
 #include "remote/apilistener.hpp"

 using namespace icinga;
@ -145,73 +146,96 @@ static void FireSuppressedNotifications(Checkable* checkable)

 	int subtract = 0;

-	for (auto type : {NotificationProblem, NotificationRecovery, NotificationFlappingStart, NotificationFlappingEnd}) {
-		if (suppressed_types & type) {
-			bool still_applies;
+	{
+		LazyInit<bool> wasLastParentRecoveryRecent ([&checkable]() {
 			auto cr (checkable->GetLastCheckResult());

-			switch (type) {
-				case NotificationProblem:
-					still_applies = cr && !checkable->IsStateOK(cr->GetState()) && checkable->GetStateType() == StateTypeHard;
-					break;
-				case NotificationRecovery:
-					still_applies = cr && checkable->IsStateOK(cr->GetState());
-					break;
-				case NotificationFlappingStart:
-					still_applies = checkable->IsFlapping();
-					break;
-				case NotificationFlappingEnd:
-					still_applies = !checkable->IsFlapping();
-					break;
-				default:
-					break;
+			if (!cr) {
+				return true;
 			}

-			if (still_applies) {
-				bool still_suppressed;
+			auto threshold (cr->GetExecutionStart());
+
+			for (auto& dep : checkable->GetDependencies()) {
+				auto parent (dep->GetParent());
+				ObjectLock oLock (parent);
+
+				if (!parent->GetProblem() && parent->GetLastStateChange() >= threshold) {
+					return true;
+				}
+			}
+
+			return false;
+		});
+
+		for (auto type : {NotificationProblem, NotificationRecovery, NotificationFlappingStart, NotificationFlappingEnd}) {
+			if (suppressed_types & type) {
+				bool still_applies;
+				auto cr (checkable->GetLastCheckResult());

 				switch (type) {
 					case NotificationProblem:
-						/* Fall through. */
+						still_applies = cr && !checkable->IsStateOK(cr->GetState()) && checkable->GetStateType() == StateTypeHard;
+						break;
 					case NotificationRecovery:
-						still_suppressed = !checkable->IsReachable(DependencyNotification) || checkable->IsInDowntime() || checkable->IsAcknowledged();
+						still_applies = cr && checkable->IsStateOK(cr->GetState());
 						break;
 					case NotificationFlappingStart:
-						/* Fall through. */
+						still_applies = checkable->IsFlapping();
+						break;
 					case NotificationFlappingEnd:
-						still_suppressed = checkable->IsInDowntime();
+						still_applies = !checkable->IsFlapping();
 						break;
 					default:
 						break;
 				}

-				if (!still_suppressed && checkable->GetEnableActiveChecks()) {
-					/* If e.g. the downtime just ended, but the service is still not ok, we would re-send the stashed problem notification.
-					 * But if the next check result recovers the service soon, we would send a recovery notification soon after the problem one.
-					 * This is not desired, especially for lots of services at once.
-					 * Because of that if there's likely to be a check result soon,
-					 * we delay the re-sending of the stashed notification until the next check.
-					 * That check either doesn't change anything and we finally re-send the stashed problem notification
-					 * or recovers the service and we drop the stashed notification. */
+				if (still_applies) {
+					bool still_suppressed;

-					/* One minute unless the check interval is too short so the next check will always run during the next minute. */
-					auto threshold (checkable->GetCheckInterval() - 10);
+					switch (type) {
+						case NotificationProblem:
+							/* Fall through. */
+						case NotificationRecovery:
+							still_suppressed = !checkable->IsReachable(DependencyNotification) || checkable->IsInDowntime() || checkable->IsAcknowledged();
+							break;
+						case NotificationFlappingStart:
+							/* Fall through. */
+						case NotificationFlappingEnd:
+							still_suppressed = checkable->IsInDowntime();
+							break;
+						default:
+							break;
+					}

-					if (threshold > 60)
-						threshold = 60;
-					else if (threshold < 0)
-						threshold = 0;
+					if (!still_suppressed && checkable->GetEnableActiveChecks()) {
+						/* If e.g. the downtime just ended, but the service is still not ok, we would re-send the stashed problem notification.
+						 * But if the next check result recovers the service soon, we would send a recovery notification soon after the problem one.
+						 * This is not desired, especially for lots of services at once.
+						 * Because of that if there's likely to be a check result soon,
+						 * we delay the re-sending of the stashed notification until the next check.
+						 * That check either doesn't change anything and we finally re-send the stashed problem notification
+						 * or recovers the service and we drop the stashed notification. */

-					still_suppressed = checkable->GetNextCheck() <= Utility::GetTime() + threshold;
-				}
+						/* One minute unless the check interval is too short so the next check will always run during the next minute. */
+						auto threshold (checkable->GetCheckInterval() - 10);

-				if (!still_suppressed) {
-					Checkable::OnNotificationsRequested(checkable, type, cr, "", "", nullptr);
+						if (threshold > 60)
+							threshold = 60;
+						else if (threshold < 0)
+							threshold = 0;

+						still_suppressed = checkable->GetNextCheck() <= Utility::GetTime() + threshold;
+					}
+
+					if (!still_suppressed && !wasLastParentRecoveryRecent.Get()) {
+						Checkable::OnNotificationsRequested(checkable, type, cr, "", "", nullptr);
+
+						subtract |= type;
+					}
+				} else {
 					subtract |= type;
 				}
-			} else {
-				subtract |= type;
 			}
 		}
 	}