mirror of https://github.com/Icinga/icinga2.git
Checkable: improve state notifications after suppression ends
This commit changes the Checkable notification suppression logic (notifications are currently suppressed on the Checkable if it is unreachable, in a downtime, or acknowledged) to that after the suppression reason ends, a state notification is sent if and only if the first hard state after is different from the last hard state from before. If the checkable is in a soft state after the suppression ends, the notification is further suppressed until a hard state is reached. To achieve this behavior, a new attribute state_before_suppression is added to Checkable. This attribute is set to the last hard state the first time either a PROBLEM or a RECOVERY notification is suppressed. Compared to from before, neither of these two flags in the suppressed_notification will ever be cleared while the supression is still ongoing but only after the suppression ended and the current state is compared with the old state stored in state_before_suppression.
This commit is contained in:
parent
23a4640ccd
commit
39cee3538a
|
@ -479,7 +479,12 @@ void Checkable::ProcessCheckResult(const CheckResult::Ptr& cr, const MessageOrig
|
||||||
|
|
||||||
if (send_notification && !is_flapping) {
|
if (send_notification && !is_flapping) {
|
||||||
if (!IsPaused()) {
|
if (!IsPaused()) {
|
||||||
if (suppress_notification) {
|
/* If there are still some pending suppressed state notification, keep the suppression until these are
|
||||||
|
* handled by Checkable::FireSuppressedNotifications().
|
||||||
|
*/
|
||||||
|
bool pending = GetSuppressedNotifications() & (NotificationRecovery|NotificationProblem);
|
||||||
|
|
||||||
|
if (suppress_notification || pending) {
|
||||||
suppressed_types |= (recovery ? NotificationRecovery : NotificationProblem);
|
suppressed_types |= (recovery ? NotificationRecovery : NotificationProblem);
|
||||||
} else {
|
} else {
|
||||||
OnNotificationsRequested(this, recovery ? NotificationRecovery : NotificationProblem, cr, "", "", nullptr);
|
OnNotificationsRequested(this, recovery ? NotificationRecovery : NotificationProblem, cr, "", "", nullptr);
|
||||||
|
@ -496,12 +501,21 @@ void Checkable::ProcessCheckResult(const CheckResult::Ptr& cr, const MessageOrig
|
||||||
int suppressed_types_before (GetSuppressedNotifications());
|
int suppressed_types_before (GetSuppressedNotifications());
|
||||||
int suppressed_types_after (suppressed_types_before | suppressed_types);
|
int suppressed_types_after (suppressed_types_before | suppressed_types);
|
||||||
|
|
||||||
for (int conflict : {NotificationProblem | NotificationRecovery, NotificationFlappingStart | NotificationFlappingEnd}) {
|
const int conflict = NotificationFlappingStart | NotificationFlappingEnd;
|
||||||
/* E.g. problem and recovery notifications neutralize each other. */
|
if ((suppressed_types_after & conflict) == conflict) {
|
||||||
|
/* Flapping start and end cancel out each other. */
|
||||||
|
suppressed_types_after &= ~conflict;
|
||||||
|
}
|
||||||
|
|
||||||
if ((suppressed_types_after & conflict) == conflict) {
|
const int stateNotifications = NotificationRecovery | NotificationProblem;
|
||||||
suppressed_types_after &= ~conflict;
|
if (!(suppressed_types_before & stateNotifications) && (suppressed_types & stateNotifications)) {
|
||||||
}
|
/* A state-related notification is suppressed for the first time, store the previous state. When
|
||||||
|
* notifications are no longer suppressed, this can be compared with the current state to determine
|
||||||
|
* if a notification must be sent. This is done differently compared to flapping notifications just above
|
||||||
|
* as for state notifications, problem and recovery don't always cancel each other. For example,
|
||||||
|
* WARNING -> OK -> CRITICAL generates both types once, but there should still be a notification.
|
||||||
|
*/
|
||||||
|
SetStateBeforeSuppression(old_stateType == StateTypeHard ? old_state : ServiceOK);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (suppressed_types_after != suppressed_types_before) {
|
if (suppressed_types_after != suppressed_types_before) {
|
||||||
|
|
|
@ -168,7 +168,38 @@ static void FireSuppressedNotifications(Checkable* checkable)
|
||||||
return false;
|
return false;
|
||||||
});
|
});
|
||||||
|
|
||||||
for (auto type : {NotificationProblem, NotificationRecovery, NotificationFlappingStart, NotificationFlappingEnd}) {
|
if (suppressed_types & (NotificationProblem|NotificationRecovery)) {
|
||||||
|
CheckResult::Ptr cr = checkable->GetLastCheckResult();
|
||||||
|
NotificationType type = cr && checkable->IsStateOK(cr->GetState()) ? NotificationRecovery : NotificationProblem;
|
||||||
|
bool state_suppressed = checkable->NotificationReasonSuppressed(NotificationProblem) || checkable->NotificationReasonSuppressed(NotificationRecovery);
|
||||||
|
|
||||||
|
/* Only process (i.e. send or dismiss) suppressed state notifications if the following conditions are met:
|
||||||
|
*
|
||||||
|
* 1. State notifications are not suppressed at the moment. State notifications must only be removed from
|
||||||
|
* the suppressed notifications bitset after the reason for the suppression is gone as these bits are
|
||||||
|
* used as a marker for when to set the state_before_suppression attribute.
|
||||||
|
* 2. The checkable is in a hard state. Soft states represent a state where we are not certain yet about
|
||||||
|
* the actual state and wait with sending notifications. If we want to immediately send a notification,
|
||||||
|
* we might send a recovery notification for something that just started failing or a problem
|
||||||
|
* notification which might be for an intermittent problem that would have never received a
|
||||||
|
* notification if there was no suppression as it still was in a soft state. Both cases aren't ideal so
|
||||||
|
* better wait until we are certain.
|
||||||
|
* 3. The checkable isn't likely checked soon. For example, if a downtime ended, give the checkable a
|
||||||
|
* chance to recover afterwards before sending a notification.
|
||||||
|
* 4. No parent recovered recently. Similar to the previous condition, give the checkable a chance to
|
||||||
|
* recover after one of its dependencies recovered before sending a notification.
|
||||||
|
*
|
||||||
|
* If any of these conditions is not met, processing the suppressed notification is further delayed.
|
||||||
|
*/
|
||||||
|
if (!state_suppressed && checkable->GetStateType() == StateTypeHard && !checkable->IsLikelyToBeCheckedSoon() && !wasLastParentRecoveryRecent.Get()) {
|
||||||
|
if (checkable->NotificationReasonApplies(type)) {
|
||||||
|
Checkable::OnNotificationsRequested(checkable, type, cr, "", "", nullptr);
|
||||||
|
}
|
||||||
|
subtract |= NotificationRecovery|NotificationProblem;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (auto type : {NotificationFlappingStart, NotificationFlappingEnd}) {
|
||||||
if (suppressed_types & type) {
|
if (suppressed_types & type) {
|
||||||
bool still_applies = checkable->NotificationReasonApplies(type);
|
bool still_applies = checkable->NotificationReasonApplies(type);
|
||||||
|
|
||||||
|
@ -224,12 +255,12 @@ bool Checkable::NotificationReasonApplies(NotificationType type)
|
||||||
case NotificationProblem:
|
case NotificationProblem:
|
||||||
{
|
{
|
||||||
auto cr (GetLastCheckResult());
|
auto cr (GetLastCheckResult());
|
||||||
return cr && !IsStateOK(cr->GetState()) && GetStateType() == StateTypeHard;
|
return cr && !IsStateOK(cr->GetState()) && cr->GetState() != GetStateBeforeSuppression();
|
||||||
}
|
}
|
||||||
case NotificationRecovery:
|
case NotificationRecovery:
|
||||||
{
|
{
|
||||||
auto cr (GetLastCheckResult());
|
auto cr (GetLastCheckResult());
|
||||||
return cr && IsStateOK(cr->GetState());
|
return cr && IsStateOK(cr->GetState()) && cr->GetState() != GetStateBeforeSuppression();
|
||||||
}
|
}
|
||||||
case NotificationFlappingStart:
|
case NotificationFlappingStart:
|
||||||
return IsFlapping();
|
return IsFlapping();
|
||||||
|
|
|
@ -175,6 +175,9 @@ abstract class Checkable : CustomVarObject
|
||||||
[state, no_user_view, no_user_modify] int suppressed_notifications {
|
[state, no_user_view, no_user_modify] int suppressed_notifications {
|
||||||
default {{{ return 0; }}}
|
default {{{ return 0; }}}
|
||||||
};
|
};
|
||||||
|
[state, enum, no_user_view, no_user_modify] ServiceState state_before_suppression {
|
||||||
|
default {{{ return ServiceOK; }}}
|
||||||
|
};
|
||||||
|
|
||||||
[config, navigation] name(Endpoint) command_endpoint (CommandEndpointRaw) {
|
[config, navigation] name(Endpoint) command_endpoint (CommandEndpointRaw) {
|
||||||
navigate {{{
|
navigate {{{
|
||||||
|
|
Loading…
Reference in New Issue