Checkable: improve state notifications after suppression ends

This commit changes the Checkable notification suppression logic (notifications
are currently suppressed on the Checkable if it is unreachable, in a downtime,
or acknowledged) to that after the suppression reason ends, a state
notification is sent if and only if the first hard state after is different
from the last hard state from before. If the checkable is in a soft state after
the suppression ends, the notification is further suppressed until a hard state
is reached.

To achieve this behavior, a new attribute state_before_suppression is added to
Checkable. This attribute is set to the last hard state the first time either a
PROBLEM or a RECOVERY notification is suppressed. Compared to from before,
neither of these two flags in the suppressed_notification will ever be cleared
while the supression is still ongoing but only after the suppression ended and
the current state is compared with the old state stored in
state_before_suppression.
This commit is contained in:
Julian Brost 2022-01-28 15:15:38 +01:00
parent 23a4640ccd
commit 39cee3538a
3 changed files with 57 additions and 9 deletions

View File

@ -479,7 +479,12 @@ void Checkable::ProcessCheckResult(const CheckResult::Ptr& cr, const MessageOrig
if (send_notification && !is_flapping) {
if (!IsPaused()) {
if (suppress_notification) {
/* If there are still some pending suppressed state notification, keep the suppression until these are
* handled by Checkable::FireSuppressedNotifications().
*/
bool pending = GetSuppressedNotifications() & (NotificationRecovery|NotificationProblem);
if (suppress_notification || pending) {
suppressed_types |= (recovery ? NotificationRecovery : NotificationProblem);
} else {
OnNotificationsRequested(this, recovery ? NotificationRecovery : NotificationProblem, cr, "", "", nullptr);
@ -496,12 +501,21 @@ void Checkable::ProcessCheckResult(const CheckResult::Ptr& cr, const MessageOrig
int suppressed_types_before (GetSuppressedNotifications());
int suppressed_types_after (suppressed_types_before | suppressed_types);
for (int conflict : {NotificationProblem | NotificationRecovery, NotificationFlappingStart | NotificationFlappingEnd}) {
/* E.g. problem and recovery notifications neutralize each other. */
const int conflict = NotificationFlappingStart | NotificationFlappingEnd;
if ((suppressed_types_after & conflict) == conflict) {
/* Flapping start and end cancel out each other. */
suppressed_types_after &= ~conflict;
}
if ((suppressed_types_after & conflict) == conflict) {
suppressed_types_after &= ~conflict;
}
const int stateNotifications = NotificationRecovery | NotificationProblem;
if (!(suppressed_types_before & stateNotifications) && (suppressed_types & stateNotifications)) {
/* A state-related notification is suppressed for the first time, store the previous state. When
* notifications are no longer suppressed, this can be compared with the current state to determine
* if a notification must be sent. This is done differently compared to flapping notifications just above
* as for state notifications, problem and recovery don't always cancel each other. For example,
* WARNING -> OK -> CRITICAL generates both types once, but there should still be a notification.
*/
SetStateBeforeSuppression(old_stateType == StateTypeHard ? old_state : ServiceOK);
}
if (suppressed_types_after != suppressed_types_before) {

View File

@ -168,7 +168,38 @@ static void FireSuppressedNotifications(Checkable* checkable)
return false;
});
for (auto type : {NotificationProblem, NotificationRecovery, NotificationFlappingStart, NotificationFlappingEnd}) {
if (suppressed_types & (NotificationProblem|NotificationRecovery)) {
CheckResult::Ptr cr = checkable->GetLastCheckResult();
NotificationType type = cr && checkable->IsStateOK(cr->GetState()) ? NotificationRecovery : NotificationProblem;
bool state_suppressed = checkable->NotificationReasonSuppressed(NotificationProblem) || checkable->NotificationReasonSuppressed(NotificationRecovery);
/* Only process (i.e. send or dismiss) suppressed state notifications if the following conditions are met:
*
* 1. State notifications are not suppressed at the moment. State notifications must only be removed from
* the suppressed notifications bitset after the reason for the suppression is gone as these bits are
* used as a marker for when to set the state_before_suppression attribute.
* 2. The checkable is in a hard state. Soft states represent a state where we are not certain yet about
* the actual state and wait with sending notifications. If we want to immediately send a notification,
* we might send a recovery notification for something that just started failing or a problem
* notification which might be for an intermittent problem that would have never received a
* notification if there was no suppression as it still was in a soft state. Both cases aren't ideal so
* better wait until we are certain.
* 3. The checkable isn't likely checked soon. For example, if a downtime ended, give the checkable a
* chance to recover afterwards before sending a notification.
* 4. No parent recovered recently. Similar to the previous condition, give the checkable a chance to
* recover after one of its dependencies recovered before sending a notification.
*
* If any of these conditions is not met, processing the suppressed notification is further delayed.
*/
if (!state_suppressed && checkable->GetStateType() == StateTypeHard && !checkable->IsLikelyToBeCheckedSoon() && !wasLastParentRecoveryRecent.Get()) {
if (checkable->NotificationReasonApplies(type)) {
Checkable::OnNotificationsRequested(checkable, type, cr, "", "", nullptr);
}
subtract |= NotificationRecovery|NotificationProblem;
}
}
for (auto type : {NotificationFlappingStart, NotificationFlappingEnd}) {
if (suppressed_types & type) {
bool still_applies = checkable->NotificationReasonApplies(type);
@ -224,12 +255,12 @@ bool Checkable::NotificationReasonApplies(NotificationType type)
case NotificationProblem:
{
auto cr (GetLastCheckResult());
return cr && !IsStateOK(cr->GetState()) && GetStateType() == StateTypeHard;
return cr && !IsStateOK(cr->GetState()) && cr->GetState() != GetStateBeforeSuppression();
}
case NotificationRecovery:
{
auto cr (GetLastCheckResult());
return cr && IsStateOK(cr->GetState());
return cr && IsStateOK(cr->GetState()) && cr->GetState() != GetStateBeforeSuppression();
}
case NotificationFlappingStart:
return IsFlapping();

View File

@ -175,6 +175,9 @@ abstract class Checkable : CustomVarObject
[state, no_user_view, no_user_modify] int suppressed_notifications {
default {{{ return 0; }}}
};
[state, enum, no_user_view, no_user_modify] ServiceState state_before_suppression {
default {{{ return ServiceOK; }}}
};
[config, navigation] name(Endpoint) command_endpoint (CommandEndpointRaw) {
navigate {{{