mirror of https://github.com/Icinga/icinga2.git
Checkable: improve state notifications after suppression ends
This commit changes the Checkable notification suppression logic (notifications are currently suppressed on the Checkable if it is unreachable, in a downtime, or acknowledged) to that after the suppression reason ends, a state notification is sent if and only if the first hard state after is different from the last hard state from before. If the checkable is in a soft state after the suppression ends, the notification is further suppressed until a hard state is reached. To achieve this behavior, a new attribute state_before_suppression is added to Checkable. This attribute is set to the last hard state the first time either a PROBLEM or a RECOVERY notification is suppressed. Compared to from before, neither of these two flags in the suppressed_notification will ever be cleared while the supression is still ongoing but only after the suppression ended and the current state is compared with the old state stored in state_before_suppression.
This commit is contained in:
parent
23a4640ccd
commit
39cee3538a
|
@ -479,7 +479,12 @@ void Checkable::ProcessCheckResult(const CheckResult::Ptr& cr, const MessageOrig
|
|||
|
||||
if (send_notification && !is_flapping) {
|
||||
if (!IsPaused()) {
|
||||
if (suppress_notification) {
|
||||
/* If there are still some pending suppressed state notification, keep the suppression until these are
|
||||
* handled by Checkable::FireSuppressedNotifications().
|
||||
*/
|
||||
bool pending = GetSuppressedNotifications() & (NotificationRecovery|NotificationProblem);
|
||||
|
||||
if (suppress_notification || pending) {
|
||||
suppressed_types |= (recovery ? NotificationRecovery : NotificationProblem);
|
||||
} else {
|
||||
OnNotificationsRequested(this, recovery ? NotificationRecovery : NotificationProblem, cr, "", "", nullptr);
|
||||
|
@ -496,12 +501,21 @@ void Checkable::ProcessCheckResult(const CheckResult::Ptr& cr, const MessageOrig
|
|||
int suppressed_types_before (GetSuppressedNotifications());
|
||||
int suppressed_types_after (suppressed_types_before | suppressed_types);
|
||||
|
||||
for (int conflict : {NotificationProblem | NotificationRecovery, NotificationFlappingStart | NotificationFlappingEnd}) {
|
||||
/* E.g. problem and recovery notifications neutralize each other. */
|
||||
const int conflict = NotificationFlappingStart | NotificationFlappingEnd;
|
||||
if ((suppressed_types_after & conflict) == conflict) {
|
||||
/* Flapping start and end cancel out each other. */
|
||||
suppressed_types_after &= ~conflict;
|
||||
}
|
||||
|
||||
if ((suppressed_types_after & conflict) == conflict) {
|
||||
suppressed_types_after &= ~conflict;
|
||||
}
|
||||
const int stateNotifications = NotificationRecovery | NotificationProblem;
|
||||
if (!(suppressed_types_before & stateNotifications) && (suppressed_types & stateNotifications)) {
|
||||
/* A state-related notification is suppressed for the first time, store the previous state. When
|
||||
* notifications are no longer suppressed, this can be compared with the current state to determine
|
||||
* if a notification must be sent. This is done differently compared to flapping notifications just above
|
||||
* as for state notifications, problem and recovery don't always cancel each other. For example,
|
||||
* WARNING -> OK -> CRITICAL generates both types once, but there should still be a notification.
|
||||
*/
|
||||
SetStateBeforeSuppression(old_stateType == StateTypeHard ? old_state : ServiceOK);
|
||||
}
|
||||
|
||||
if (suppressed_types_after != suppressed_types_before) {
|
||||
|
|
|
@ -168,7 +168,38 @@ static void FireSuppressedNotifications(Checkable* checkable)
|
|||
return false;
|
||||
});
|
||||
|
||||
for (auto type : {NotificationProblem, NotificationRecovery, NotificationFlappingStart, NotificationFlappingEnd}) {
|
||||
if (suppressed_types & (NotificationProblem|NotificationRecovery)) {
|
||||
CheckResult::Ptr cr = checkable->GetLastCheckResult();
|
||||
NotificationType type = cr && checkable->IsStateOK(cr->GetState()) ? NotificationRecovery : NotificationProblem;
|
||||
bool state_suppressed = checkable->NotificationReasonSuppressed(NotificationProblem) || checkable->NotificationReasonSuppressed(NotificationRecovery);
|
||||
|
||||
/* Only process (i.e. send or dismiss) suppressed state notifications if the following conditions are met:
|
||||
*
|
||||
* 1. State notifications are not suppressed at the moment. State notifications must only be removed from
|
||||
* the suppressed notifications bitset after the reason for the suppression is gone as these bits are
|
||||
* used as a marker for when to set the state_before_suppression attribute.
|
||||
* 2. The checkable is in a hard state. Soft states represent a state where we are not certain yet about
|
||||
* the actual state and wait with sending notifications. If we want to immediately send a notification,
|
||||
* we might send a recovery notification for something that just started failing or a problem
|
||||
* notification which might be for an intermittent problem that would have never received a
|
||||
* notification if there was no suppression as it still was in a soft state. Both cases aren't ideal so
|
||||
* better wait until we are certain.
|
||||
* 3. The checkable isn't likely checked soon. For example, if a downtime ended, give the checkable a
|
||||
* chance to recover afterwards before sending a notification.
|
||||
* 4. No parent recovered recently. Similar to the previous condition, give the checkable a chance to
|
||||
* recover after one of its dependencies recovered before sending a notification.
|
||||
*
|
||||
* If any of these conditions is not met, processing the suppressed notification is further delayed.
|
||||
*/
|
||||
if (!state_suppressed && checkable->GetStateType() == StateTypeHard && !checkable->IsLikelyToBeCheckedSoon() && !wasLastParentRecoveryRecent.Get()) {
|
||||
if (checkable->NotificationReasonApplies(type)) {
|
||||
Checkable::OnNotificationsRequested(checkable, type, cr, "", "", nullptr);
|
||||
}
|
||||
subtract |= NotificationRecovery|NotificationProblem;
|
||||
}
|
||||
}
|
||||
|
||||
for (auto type : {NotificationFlappingStart, NotificationFlappingEnd}) {
|
||||
if (suppressed_types & type) {
|
||||
bool still_applies = checkable->NotificationReasonApplies(type);
|
||||
|
||||
|
@ -224,12 +255,12 @@ bool Checkable::NotificationReasonApplies(NotificationType type)
|
|||
case NotificationProblem:
|
||||
{
|
||||
auto cr (GetLastCheckResult());
|
||||
return cr && !IsStateOK(cr->GetState()) && GetStateType() == StateTypeHard;
|
||||
return cr && !IsStateOK(cr->GetState()) && cr->GetState() != GetStateBeforeSuppression();
|
||||
}
|
||||
case NotificationRecovery:
|
||||
{
|
||||
auto cr (GetLastCheckResult());
|
||||
return cr && IsStateOK(cr->GetState());
|
||||
return cr && IsStateOK(cr->GetState()) && cr->GetState() != GetStateBeforeSuppression();
|
||||
}
|
||||
case NotificationFlappingStart:
|
||||
return IsFlapping();
|
||||
|
|
|
@ -175,6 +175,9 @@ abstract class Checkable : CustomVarObject
|
|||
[state, no_user_view, no_user_modify] int suppressed_notifications {
|
||||
default {{{ return 0; }}}
|
||||
};
|
||||
[state, enum, no_user_view, no_user_modify] ServiceState state_before_suppression {
|
||||
default {{{ return ServiceOK; }}}
|
||||
};
|
||||
|
||||
[config, navigation] name(Endpoint) command_endpoint (CommandEndpointRaw) {
|
||||
navigate {{{
|
||||
|
|
Loading…
Reference in New Issue