From fc8de9e087f70bb779bfb2d266d2ab5b146ba1b2 Mon Sep 17 00:00:00 2001 From: Yonas Habteab Date: Tue, 23 Sep 2025 18:10:57 +0200 Subject: [PATCH] Introduce `OnRescheduleCheck` signal & make use of it where appropriate This commit introduces a new kinda special `OnRescheduleCheck` signal that is emitted whenever we want to inform the checker to reschedule the checkable at a specific timestamp without actually changing the next check time. Previously, we called `SetNextCheck` with some random timestamp just to enforce the checker to either pick it up immediately or at a specific time. Then at some point in time, subscribing to the `OnNextCheckChanged` signal became effectively unusable for any other purpose than to inform the checker about a new next check time. Thus, it resulted in introducing a new signal that is solely responsible for informing the Icigna DB and IDO about a new next check time in places where calling `SetNextCheck` did make sense. This commit does quite the opposite: it replaces all calls to `SetNextCheck` that were only used to inform the checker about a new next check time wit `OnRescheduleCheck` calls. Only places where we actually wanted to change the next check time still call `SetNextCheck` and thus inform the checker and all other listeners about the new next check time. And as a bonus point, we now got rid of the two object locks for child and parent at the same time. --- lib/checker/checkercomponent.cpp | 13 ++++++++-- lib/checker/checkercomponent.hpp | 2 +- lib/icinga/checkable-check.cpp | 41 ++++++++++++++++++++++++-------- lib/icinga/checkable.cpp | 16 +++++++++++-- lib/icinga/checkable.hpp | 14 ++++++++++- 5 files changed, 70 insertions(+), 16 deletions(-) diff --git a/lib/checker/checkercomponent.cpp b/lib/checker/checkercomponent.cpp index 04583833a..66abdd944 100644 --- a/lib/checker/checkercomponent.cpp +++ b/lib/checker/checkercomponent.cpp @@ -55,6 +55,9 @@ void CheckerComponent::OnConfigLoaded() Checkable::OnNextCheckChanged.connect([this](const Checkable::Ptr& checkable, const Value&) { NextCheckChangedHandler(checkable); }); + Checkable::OnRescheduleCheck.connect([this](const Checkable::Ptr& checkable, double nextCheck) { + NextCheckChangedHandler(checkable, nextCheck); + }); } void CheckerComponent::Start(bool runtimeCreated) @@ -341,7 +344,7 @@ CheckableScheduleInfo CheckerComponent::GetCheckableScheduleInfo(const Checkable return csi; } -void CheckerComponent::NextCheckChangedHandler(const Checkable::Ptr& checkable) +void CheckerComponent::NextCheckChangedHandler(const Checkable::Ptr& checkable, double nextCheck) { std::unique_lock lock(m_Mutex); @@ -356,7 +359,13 @@ void CheckerComponent::NextCheckChangedHandler(const Checkable::Ptr& checkable) idx.erase(checkable); - CheckableScheduleInfo csi = GetCheckableScheduleInfo(checkable); + CheckableScheduleInfo csi; + if (nextCheck < 0) { + csi = GetCheckableScheduleInfo(checkable); + } else { + csi.NextCheck = nextCheck; + csi.Object = checkable; + } idx.insert(csi); m_CV.notify_all(); diff --git a/lib/checker/checkercomponent.hpp b/lib/checker/checkercomponent.hpp index 6f7a23715..88cb47797 100644 --- a/lib/checker/checkercomponent.hpp +++ b/lib/checker/checkercomponent.hpp @@ -93,7 +93,7 @@ private: void AdjustCheckTimer(); void ObjectHandler(const ConfigObject::Ptr& object); - void NextCheckChangedHandler(const Checkable::Ptr& checkable); + void NextCheckChangedHandler(const Checkable::Ptr& checkable, double nextCheck = -1); void RescheduleCheckTimer(); diff --git a/lib/icinga/checkable-check.cpp b/lib/icinga/checkable-check.cpp index 67d425b24..7d1fd29f5 100644 --- a/lib/icinga/checkable-check.cpp +++ b/lib/icinga/checkable-check.cpp @@ -23,6 +23,7 @@ boost::signals2::signal, const MessageOrigin::Ptr&)> Checkable::OnReachabilityChanged; boost::signals2::signal Checkable::OnNotificationsRequested; boost::signals2::signal Checkable::OnNextCheckUpdated; +boost::signals2::signal Checkable::OnRescheduleCheck; Atomic Checkable::CurrentConcurrentChecks (0); @@ -50,7 +51,17 @@ long Checkable::GetSchedulingOffset() const return m_SchedulingOffset; } -void Checkable::UpdateNextCheck(const MessageOrigin::Ptr& origin) +/** + * Update the next check time of this checkable based on its check interval and last check time. + * + * If onlyReschedule is true, the next check time is not actually updated, but the @c Checkable::OnRescheduleCheck + * signal is emitted with the new calculated next check time. Otherwise, the next check time is updated + * and the @c Checkable::OnNextCheckChanged signal is emitted accordingly. + * + * @param origin The origin of the message triggering this update, can be nullptr. + * @param onlyReschedule If true, only emit @c OnRescheduleCheck without updating the next check time. + */ +void Checkable::UpdateNextCheck(const MessageOrigin::Ptr& origin, bool onlyReschedule) { double interval; @@ -78,7 +89,14 @@ void Checkable::UpdateNextCheck(const MessageOrigin::Ptr& origin) << " (" << lastCheck << ") to next check time at " << Utility::FormatDateTime("%Y-%m-%d %H:%M:%S %z", nextCheck) << " (" << nextCheck << ")."; - SetNextCheck(nextCheck, false, origin); + if (onlyReschedule) { + // Someone requested to only reschedule the next check without actually changing it. + // So, just tell the checker about this new timestamp and return. + OnRescheduleCheck(this, nextCheck); + } else { + // Otherwise, set the next check to the newly calculated timestamp and inform all its listeners. + SetNextCheck(nextCheck, false, origin); + } } bool Checkable::HasBeenChecked() const @@ -518,12 +536,15 @@ Checkable::ProcessingResult Checkable::ProcessCheckResult(const CheckResult::Ptr if (recovery) { for (auto& child : children) { if (child->GetProblem() && child->GetEnableActiveChecks()) { - auto nextCheck (now + Utility::Random() % 60); - - ObjectLock oLock (child); - - if (nextCheck < child->GetNextCheck()) { - child->SetNextCheck(nextCheck); + if (auto nextCheck (now + Utility::Random() % 60); nextCheck < child->GetNextCheck()) { + /** + * We only want to enforce the checker to pick this up sooner, and no need to actually change + * the timesatmp. Plus, no other listeners should be informed about this other than the checker, + * so we emit the OnRescheduleCheck signal directly. In case our checker isn't responsible for + * this child object, we've already broadcasted the `CheckResult` event which will cause on the + * responsible node to enter this exact branch and do the rescheduling for its own checker. + */ + OnRescheduleCheck(child, nextCheck); } } } @@ -539,8 +560,8 @@ Checkable::ProcessingResult Checkable::ProcessCheckResult(const CheckResult::Ptr continue; if (parent->GetNextCheck() >= now + parent->GetRetryInterval()) { - ObjectLock olock(parent); - parent->SetNextCheck(now); + // See comment above for children. We want to just enforce an immediate check by our checker. + OnRescheduleCheck(parent, now); } } } diff --git a/lib/icinga/checkable.cpp b/lib/icinga/checkable.cpp index dbe73fe55..3a1262744 100644 --- a/lib/icinga/checkable.cpp +++ b/lib/icinga/checkable.cpp @@ -88,14 +88,26 @@ void Checkable::Start(bool runtimeCreated) auto cr (GetLastCheckResult()); if (GetLastCheckStarted() > (cr ? cr->GetExecutionEnd() : 0.0)) { - SetNextCheck(GetLastCheckStarted()); + /** + * Each node performs a check immediately after startup before the object authority is determined. + * Meaning, for runtime created objects we should not enter this branch and for non-runtime created + * objects we're going to call `ApiListener::UpdateObjectAuthority()` after all objects have been + * started, so we can safely assume that this timestamp is only relevant for our own checker. + * + * However, until the authority is determined, by default all objects are paused and thus triggering + * `OnRescheduleCheck` would have no effect, as the checker component ignores paused objects. Therefore, + * our only option is to set this dummy ts silently here, so that when the object is unpaused later, + * the checker component will pick it up and schedule the next check properly. + */ + SetNextCheck(GetLastCheckStarted(), true); } } if (GetNextCheck() < now + 60) { double delta = std::min(GetCheckInterval(), 60.0); delta *= (double)std::rand() / RAND_MAX; - SetNextCheck(now + delta); + // We only want to jitter the next check a bit, and inform the scheduler about it, so not setting it directly. + SetNextCheck(now + delta, true); } ObjectImpl::Start(runtimeCreated); diff --git a/lib/icinga/checkable.hpp b/lib/icinga/checkable.hpp index 9141dcf48..3ee412504 100644 --- a/lib/icinga/checkable.hpp +++ b/lib/icinga/checkable.hpp @@ -104,7 +104,7 @@ public: long GetSchedulingOffset() const; void SetSchedulingOffset(long offset); - void UpdateNextCheck(const MessageOrigin::Ptr& origin = nullptr); + void UpdateNextCheck(const MessageOrigin::Ptr& origin = nullptr, bool onlyReschedule = false); static String StateTypeToString(StateType type); @@ -148,6 +148,18 @@ public: static boost::signals2::signal OnAcknowledgementCleared; static boost::signals2::signal OnFlappingChange; static boost::signals2::signal OnNextCheckUpdated; + /** + * Think again! Are you really sure you want to subscribe to this signal? + * + * This signal is a very special and noisy one. It is emitted whenever someone wants to enforce the + * LOCAL scheduler to reschedule the next check of a checkable at the given timestamp. This can be + * due to a number of reasons, for example: Icinga 2 was interrupted while a check was being executed, + * and the checkable needs to be rescheduled on startup; or a parent host changed its state and all its + * children need to be rescheduled to reflect the new reachability state. There are other cases as well, + * but the point is: this signal is meant to only be used by the local @c CheckerComponent to update its + * internal queues. If you want to use this for something else, think twice!! + */ + static boost::signals2::signal OnRescheduleCheck; static boost::signals2::signal OnEventCommandExecuted; static Atomic CurrentConcurrentChecks;