Merge pull request #7270 from Icinga/feature/notification-after-downtime-ends-5919

Re-send suppressed notifications
This commit is contained in:
Michael Friedrich 2019-07-11 10:46:59 +02:00 committed by GitHub
commit 60661eaecb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 321 additions and 16 deletions

View File

@ -53,6 +53,36 @@ and compiled into the binary as header only include. It helps our way to C++11 a
to fix additional UTF8 issues more easily. Read more about its [design goals](https://github.com/nlohmann/json#design-goals)
and [benchmarks](https://github.com/miloyip/nativejson-benchmark#parsing-time).
### Core <a id="upgrading-to-2-11-core"></a>
#### Downtime Notifications <a id="upgrading-to-2-11-core-downtime-notifications"></a>
Imagine that a host/service changes to a HARD NOT-OK state,
and its check interval is set to a high interval, e.g. 1 hour.
A maintenance downtime prevents the notification from being sent,
but once it ends and the host/service is still in a NOT-OK state,
no immediate notification is re-sent; instead you have to wait
for the next check.
Another scenario is with one-shot notifications (interval=0),
which would never notify again after the downtime ends while
the problem state is still intact. The state change logic requires
the host/service to recover and become HARD NOT-OK again before it notifies again.
In order to solve these problems with filtered/suppressed notifications
in downtimes, v2.11 changes the behaviour like this:
- If there was a notification suppressed in a downtime, the core stores that information
- Once the downtime ends and the problem state is still intact, Icinga checks whether a re-notification should be sent immediately
A new cluster message was added to keep this in sync amongst HA masters.
> **Important**
>
> In order to properly use this new feature, all involved endpoints
> must be upgraded to v2.11.
### Network Stack <a id="upgrading-to-2-11-network-stack"></a>
The core network stack has been rewritten in 2.11 (some say this could be Icinga 3).

View File

@ -1278,6 +1278,41 @@ Message updates will be dropped when:
* Checkable does not exist.
* Origin endpoint's zone is not allowed to access this checkable.
#### event::SetSuppressedNotifications <a id="technical-concepts-json-rpc-messages-event-setsupressednotifications"></a>
> Location: `clusterevents.cpp`
##### Message Body
Key | Value
----------|---------
jsonrpc | 2.0
method | event::SetSuppressedNotifications
params | Dictionary
##### Params
Key | Type | Description
-------------------------|---------------|------------------
host | String | Host name
service | String | Service name
suppressed\_notifications | Number | Bitmask for suppressed notifications.
##### Functions
Event Sender: `Checkable::OnSuppressedNotificationsChanged`
Event Receiver: `SuppressedNotificationsChangedAPIHandler`
##### Permissions
The receiver will not process messages from not configured endpoints.
Message updates will be dropped when:
* Checkable does not exist.
* Origin endpoint's zone is not allowed to access this checkable.
#### event::SetNextNotification <a id="technical-concepts-json-rpc-messages-event-setnextnotification"></a>
> Location: `clusterevents.cpp`

View File

@ -309,15 +309,14 @@ void Checkable::ProcessCheckResult(const CheckResult::Ptr& cr, const MessageOrig
bool in_downtime = IsInDowntime();
bool send_notification = false;
bool suppress_notification = !notification_reachable || in_downtime || IsAcknowledged();
if (notification_reachable && !in_downtime && !IsAcknowledged()) {
/* Send notifications whether when a hard state change occurred. */
if (hardChange && !(old_stateType == StateTypeSoft && IsStateOK(new_state)))
send_notification = true;
/* Or if the checkable is volatile and in a HARD state. */
else if (is_volatile && GetStateType() == StateTypeHard)
send_notification = true;
}
/* Send notifications whether when a hard state change occurred. */
if (hardChange && !(old_stateType == StateTypeSoft && IsStateOK(new_state)))
send_notification = true;
/* Or if the checkable is volatile and in a HARD state. */
else if (is_volatile && GetStateType() == StateTypeHard)
send_notification = true;
if (IsStateOK(old_state) && old_stateType == StateTypeSoft)
send_notification = false; /* Don't send notifications for SOFT-OK -> HARD-OK. */
@ -405,21 +404,33 @@ void Checkable::ProcessCheckResult(const CheckResult::Ptr& cr, const MessageOrig
(is_volatile && !(IsStateOK(old_state) && IsStateOK(new_state))))
ExecuteEventHandler();
int suppressed_types = 0;
/* Flapping start/end notifications */
if (!in_downtime && !was_flapping && is_flapping) {
if (!was_flapping && is_flapping) {
/* FlappingStart notifications happen on state changes, not in downtimes */
if (!IsPaused())
OnNotificationsRequested(this, NotificationFlappingStart, cr, "", "", nullptr);
if (!IsPaused()) {
if (in_downtime) {
suppressed_types |= NotificationFlappingStart;
} else {
OnNotificationsRequested(this, NotificationFlappingStart, cr, "", "", nullptr);
}
}
Log(LogNotice, "Checkable")
<< "Flapping Start: Checkable '" << GetName() << "' started flapping (Current flapping value "
<< GetFlappingCurrent() << "% > high threshold " << GetFlappingThresholdHigh() << "%).";
NotifyFlapping(origin);
} else if (!in_downtime && was_flapping && !is_flapping) {
} else if (was_flapping && !is_flapping) {
/* FlappingEnd notifications are independent from state changes, must not happen in downtime */
if (!IsPaused())
OnNotificationsRequested(this, NotificationFlappingEnd, cr, "", "", nullptr);
if (!IsPaused()) {
if (in_downtime) {
suppressed_types |= NotificationFlappingEnd;
} else {
OnNotificationsRequested(this, NotificationFlappingEnd, cr, "", "", nullptr);
}
}
Log(LogNotice, "Checkable")
<< "Flapping Stop: Checkable '" << GetName() << "' stopped flapping (Current flapping value "
@ -429,8 +440,35 @@ void Checkable::ProcessCheckResult(const CheckResult::Ptr& cr, const MessageOrig
}
if (send_notification && !is_flapping) {
if (!IsPaused())
OnNotificationsRequested(this, recovery ? NotificationRecovery : NotificationProblem, cr, "", "", nullptr);
if (!IsPaused()) {
if (suppress_notification) {
suppressed_types |= (recovery ? NotificationRecovery : NotificationProblem);
} else {
OnNotificationsRequested(this, recovery ? NotificationRecovery : NotificationProblem, cr, "", "", nullptr);
}
}
}
if (suppressed_types) {
/* If some notifications were suppressed, but just because of e.g. a downtime,
* stash them into a notification types bitmask for maybe re-sending later.
*/
ObjectLock olock (this);
int suppressed_types_before (GetSuppressedNotifications());
int suppressed_types_after (suppressed_types_before | suppressed_types);
for (int conflict : {NotificationProblem | NotificationRecovery, NotificationFlappingStart | NotificationFlappingEnd}) {
/* E.g. problem and recovery notifications neutralize each other. */
if ((suppressed_types_after & conflict) == conflict) {
suppressed_types_after &= ~conflict;
}
}
if (suppressed_types_after != suppressed_types_before) {
SetSuppressedNotifications(suppressed_types_after);
}
}
}

View File

@ -1,7 +1,9 @@
/* Icinga 2 | (c) 2012 Icinga GmbH | GPLv2+ */
#include "icinga/checkable.hpp"
#include "icinga/host.hpp"
#include "icinga/icingaapplication.hpp"
#include "icinga/service.hpp"
#include "base/objectlock.hpp"
#include "base/logger.hpp"
#include "base/exception.hpp"
@ -84,3 +86,117 @@ void Checkable::UnregisterNotification(const Notification::Ptr& notification)
boost::mutex::scoped_lock lock(m_NotificationMutex);
m_Notifications.erase(notification);
}
/**
 * Re-evaluates the notification types stashed in a checkable's suppressed notifications
 * bitmask and either re-sends or discards them.
 *
 * For every stashed type:
 * - if the reason for the notification no longer applies, the type is dropped silently;
 * - if it still applies and is not suppressed anymore, the notification is requested via
 *   Checkable::OnNotificationsRequested and the type is dropped from the bitmask;
 * - otherwise the type stays stashed for a later run.
 *
 * Nothing is done for checkables which are inactive, paused or have notifications disabled.
 *
 * @param checkable The checkable whose stashed notifications shall be processed.
 */
static void FireSuppressedNotifications(Checkable* checkable)
{
	if (!checkable->IsActive())
		return;

	if (checkable->IsPaused())
		return;

	if (!checkable->GetEnableNotifications())
		return;

	int suppressed_types (checkable->GetSuppressedNotifications());

	if (!suppressed_types)
		return;

	/* Types to be removed from the bitmask, either because they were re-sent or because they became obsolete. */
	int subtract = 0;

	for (auto type : {NotificationProblem, NotificationRecovery, NotificationFlappingStart, NotificationFlappingEnd}) {
		if (suppressed_types & type) {
			/* Initialized explicitly so that an unexpected type can never lead to reading an indeterminate value. */
			bool still_applies = false;
			auto cr (checkable->GetLastCheckResult());

			switch (type) {
				case NotificationProblem:
					still_applies = cr && !checkable->IsStateOK(cr->GetState()) && checkable->GetStateType() == StateTypeHard;
					break;
				case NotificationRecovery:
					still_applies = cr && checkable->IsStateOK(cr->GetState());
					break;
				case NotificationFlappingStart:
					still_applies = checkable->IsFlapping();
					break;
				case NotificationFlappingEnd:
					still_applies = !checkable->IsFlapping();
					break;
				default:
					break;
			}

			if (still_applies) {
				/* Conservative default: never send a notification for a type we can't classify. */
				bool still_suppressed = true;

				switch (type) {
					case NotificationProblem:
						/* Fall through. */
					case NotificationRecovery:
						still_suppressed = !checkable->IsReachable(DependencyNotification) || checkable->IsInDowntime() || checkable->IsAcknowledged();
						break;
					case NotificationFlappingStart:
						/* Fall through. */
					case NotificationFlappingEnd:
						still_suppressed = checkable->IsInDowntime();
						break;
					default:
						break;
				}

				if (!still_suppressed && checkable->GetEnableActiveChecks()) {
					/* If e.g. the downtime just ended, but the service is still not ok, we would re-send the stashed problem notification.
					 * But if the next check result recovers the service soon, we would send a recovery notification soon after the problem one.
					 * This is not desired, especially for lots of services at once.
					 * Because of that if there's likely to be a check result soon,
					 * we delay the re-sending of the stashed notification until the next check.
					 * That check either doesn't change anything and we finally re-send the stashed problem notification
					 * or recovers the service and we drop the stashed notification. */

					/* One minute unless the check interval is too short so the next check will always run during the next minute. */
					auto threshold (checkable->GetCheckInterval() - 10);

					if (threshold > 60)
						threshold = 60;
					else if (threshold < 0)
						threshold = 0;

					still_suppressed = checkable->GetNextCheck() <= Utility::GetTime() + threshold;
				}

				if (!still_suppressed) {
					Checkable::OnNotificationsRequested(checkable, type, cr, "", "", nullptr);

					subtract |= type;
				}
			} else {
				/* The reason is gone, e.g. a stashed problem notification after a recovery. */
				subtract |= type;
			}
		}
	}

	if (subtract) {
		/* Re-read the bitmask under the lock as it may have changed in the meantime. */
		ObjectLock olock (checkable);

		int suppressed_types_before (checkable->GetSuppressedNotifications());
		int suppressed_types_after (suppressed_types_before & ~subtract);

		if (suppressed_types_after != suppressed_types_before) {
			checkable->SetSuppressedNotifications(suppressed_types_after);
		}
	}
}
/**
* Re-sends all notifications previously suppressed by e.g. downtimes if the notification reason still applies.
*/
void Checkable::FireSuppressedNotifications(const Timer * const&)
{
for (auto& host : ConfigType::GetObjectsByType<Host>()) {
::FireSuppressedNotifications(host.get());
}
for (auto& service : ConfigType::GetObjectsByType<Service>()) {
::FireSuppressedNotifications(service.get());
}
}

View File

@ -7,6 +7,8 @@
#include "base/objectlock.hpp"
#include "base/utility.hpp"
#include "base/exception.hpp"
#include "base/timer.hpp"
#include <boost/thread/once.hpp>
using namespace icinga;
@ -16,6 +18,8 @@ INITIALIZE_ONCE(&Checkable::StaticInitialize);
boost::signals2::signal<void (const Checkable::Ptr&, const String&, const String&, AcknowledgementType, bool, bool, double, const MessageOrigin::Ptr&)> Checkable::OnAcknowledgementSet;
boost::signals2::signal<void (const Checkable::Ptr&, const MessageOrigin::Ptr&)> Checkable::OnAcknowledgementCleared;
/* Timer whose expiry invokes Checkable::FireSuppressedNotifications(); it is created and started exactly once in Checkable::Start(). */
static Timer::Ptr l_CheckablesFireSuppressedNotifications;
void Checkable::StaticInitialize()
{
/* fixed downtime start */
@ -65,6 +69,15 @@ void Checkable::Start(bool runtimeCreated)
}
ObjectImpl<Checkable>::Start(runtimeCreated);
static boost::once_flag once = BOOST_ONCE_INIT;
boost::call_once(once, []() {
l_CheckablesFireSuppressedNotifications = new Timer();
l_CheckablesFireSuppressedNotifications->SetInterval(5);
l_CheckablesFireSuppressedNotifications->OnTimerExpired.connect(&Checkable::FireSuppressedNotifications);
l_CheckablesFireSuppressedNotifications->Start();
});
}
void Checkable::AddGroup(const String& name)

View File

@ -3,6 +3,7 @@
#ifndef CHECKABLE_H
#define CHECKABLE_H
#include "base/timer.hpp"
#include "icinga/i2-icinga.hpp"
#include "icinga/checkable-ti.hpp"
#include "icinga/timeperiod.hpp"
@ -211,6 +212,8 @@ private:
static void NotifyDowntimeEnd(const Downtime::Ptr& downtime);
/* Timer handler which re-sends previously suppressed notifications of all hosts and services if their reason still applies. */
static void FireSuppressedNotifications(const Timer * const&);
/* Comments */
std::set<Comment::Ptr> m_Comments;
mutable boost::mutex m_CommentMutex;

View File

@ -154,6 +154,9 @@ abstract class Checkable : CustomVarObject
[state, no_user_view, no_user_modify] int flapping_buffer;
[state, no_user_view, no_user_modify] int flapping_index;
[state, protected] bool flapping;
/* Bitmask of notification types which were suppressed (e.g. during a downtime) and may be re-sent later. 0 means nothing is stashed. */
[state, no_user_view, no_user_modify] int suppressed_notifications {
default {{{ return 0; }}}
};
[config, navigation] name(Endpoint) command_endpoint (CommandEndpointRaw) {
navigate {{{

View File

@ -24,6 +24,7 @@ INITIALIZE_ONCE(&ClusterEvents::StaticInitialize);
REGISTER_APIFUNCTION(CheckResult, event, &ClusterEvents::CheckResultAPIHandler);
REGISTER_APIFUNCTION(SetNextCheck, event, &ClusterEvents::NextCheckChangedAPIHandler);
REGISTER_APIFUNCTION(SetSuppressedNotifications, event, &ClusterEvents::SuppressedNotificationsChangedAPIHandler);
REGISTER_APIFUNCTION(SetNextNotification, event, &ClusterEvents::NextNotificationChangedAPIHandler);
REGISTER_APIFUNCTION(SetForceNextCheck, event, &ClusterEvents::ForceNextCheckChangedAPIHandler);
REGISTER_APIFUNCTION(SetForceNextNotification, event, &ClusterEvents::ForceNextNotificationChangedAPIHandler);
@ -38,6 +39,7 @@ void ClusterEvents::StaticInitialize()
{
Checkable::OnNewCheckResult.connect(&ClusterEvents::CheckResultHandler);
Checkable::OnNextCheckChanged.connect(&ClusterEvents::NextCheckChangedHandler);
Checkable::OnSuppressedNotificationsChanged.connect(&ClusterEvents::SuppressedNotificationsChangedHandler);
Notification::OnNextNotificationChanged.connect(&ClusterEvents::NextNotificationChangedHandler);
Checkable::OnForceNextCheckChanged.connect(&ClusterEvents::ForceNextCheckChangedHandler);
Checkable::OnForceNextNotificationChanged.connect(&ClusterEvents::ForceNextNotificationChangedHandler);
@ -232,6 +234,68 @@ Value ClusterEvents::NextCheckChangedAPIHandler(const MessageOrigin::Ptr& origin
return Empty;
}
/**
 * Builds an "event::SetSuppressedNotifications" JSON-RPC message carrying the checkable's
 * current suppressed notifications bitmask and hands it to the API listener for relaying.
 * Does nothing if there is no API listener instance.
 *
 * @param checkable The checkable whose suppressed notifications changed.
 * @param origin The origin of the change, passed through to the relay.
 */
void ClusterEvents::SuppressedNotificationsChangedHandler(const Checkable::Ptr& checkable, const MessageOrigin::Ptr& origin)
{
	ApiListener::Ptr apiListener = ApiListener::GetInstance();

	if (!apiListener)
		return;

	Host::Ptr hostObj;
	Service::Ptr serviceObj;
	tie(hostObj, serviceObj) = GetHostService(checkable);

	/* Identify the checkable by host name and, for services only, the service's short name. */
	Dictionary::Ptr args = new Dictionary();
	args->Set("host", hostObj->GetName());

	if (serviceObj)
		args->Set("service", serviceObj->GetShortName());

	args->Set("suppressed_notifications", checkable->GetSuppressedNotifications());

	Dictionary::Ptr rpcMessage = new Dictionary();
	rpcMessage->Set("jsonrpc", "2.0");
	rpcMessage->Set("method", "event::SetSuppressedNotifications");
	rpcMessage->Set("params", args);

	apiListener->RelayMessage(origin, checkable, rpcMessage, true);
}
/**
 * Handles an incoming "suppressed notifications changed" cluster message and applies
 * the transmitted bitmask to the addressed host or service.
 *
 * The message is discarded (with a notice log entry) if the sending client has no endpoint
 * or if the origin zone is not allowed to access the checkable. It is silently ignored
 * if the host or service does not exist.
 *
 * @param origin The origin of the message.
 * @param params The message params ("host", optional "service", "suppressed_notifications").
 * @return Always Empty.
 */
Value ClusterEvents::SuppressedNotificationsChangedAPIHandler(const MessageOrigin::Ptr& origin, const Dictionary::Ptr& params)
{
	Endpoint::Ptr sender = origin->FromClient->GetEndpoint();

	if (!sender) {
		Log(LogNotice, "ClusterEvents")
			<< "Discarding 'suppressed notifications changed' message from '" << origin->FromClient->GetIdentity() << "': Invalid endpoint origin (client not allowed).";
		return Empty;
	}

	Host::Ptr host = Host::GetByName(params->Get("host"));

	if (!host)
		return Empty;

	/* Without a "service" param the message addresses the host itself. */
	Checkable::Ptr target = host;

	if (params->Contains("service"))
		target = host->GetServiceByShortName(params->Get("service"));

	if (!target)
		return Empty;

	if (origin->FromZone) {
		if (!origin->FromZone->CanAccessObject(target)) {
			Log(LogNotice, "ClusterEvents")
				<< "Discarding 'suppressed notifications changed' message for checkable '" << target->GetName()
				<< "' from '" << origin->FromClient->GetIdentity() << "': Unauthorized access.";
			return Empty;
		}
	}

	target->SetSuppressedNotifications(params->Get("suppressed_notifications"), false, origin);

	return Empty;
}
void ClusterEvents::NextNotificationChangedHandler(const Notification::Ptr& notification, const MessageOrigin::Ptr& origin)
{
ApiListener::Ptr listener = ApiListener::GetInstance();

View File

@ -26,6 +26,9 @@ public:
static void NextCheckChangedHandler(const Checkable::Ptr& checkable, const MessageOrigin::Ptr& origin);
static Value NextCheckChangedAPIHandler(const MessageOrigin::Ptr& origin, const Dictionary::Ptr& params);
static void SuppressedNotificationsChangedHandler(const Checkable::Ptr& checkable, const MessageOrigin::Ptr& origin);
static Value SuppressedNotificationsChangedAPIHandler(const MessageOrigin::Ptr& origin, const Dictionary::Ptr& params);
static void NextNotificationChangedHandler(const Notification::Ptr& notification, const MessageOrigin::Ptr& origin);
static Value NextNotificationChangedAPIHandler(const MessageOrigin::Ptr& origin, const Dictionary::Ptr& params);