diff --git a/doc/16-upgrading-icinga-2.md b/doc/16-upgrading-icinga-2.md
index bd801d934..3f5ba1017 100644
--- a/doc/16-upgrading-icinga-2.md
+++ b/doc/16-upgrading-icinga-2.md
@@ -53,6 +53,36 @@ and compiled into the binary as header only include. It helps our way to C++11 a
to fix additional UTF8 issues more easily. Read more about its [design goals](https://github.com/nlohmann/json#design-goals)
and [benchmarks](https://github.com/miloyip/nativejson-benchmark#parsing-time).
+### Core
+
+#### Downtime Notifications
+
+Imagine that a host/service changes to a HARD NOT-OK state,
+and its check interval is set to a high value, e.g. 1 hour.
+
+A maintenance downtime prevents the notification being sent,
+but once it ends and the host/service is still in a NOT-OK state,
+no immediate notification is re-sent; instead you'll have to wait
+for the next check.
+
+Another scenario is with one-shot notifications (interval=0)
+which would never notify again after the downtime ends while
+the problem state is still intact. The state change logic requires
+the host/service to recover and become HARD NOT-OK again to notify again.
+
+In order to solve these problems with filtered/suppressed notifications
+in downtimes, v2.11 changes the behaviour like this:
+
+- If there was a notification suppressed in a downtime, the core stores that information
+- Once the downtime ends and the problem state is still intact, Icinga checks whether a re-notification should be sent immediately
+
+A new cluster message was added to keep this in sync amongst HA masters.
+
+> **Important**
+>
+> In order to properly use this new feature, all involved endpoints
+> must be upgraded to v2.11.
+
### Network Stack
The core network stack has been rewritten in 2.11 (some say this could be Icinga 3).
diff --git a/doc/19-technical-concepts.md b/doc/19-technical-concepts.md
index 12ed44f4b..187d3a5c4 100644
--- a/doc/19-technical-concepts.md
+++ b/doc/19-technical-concepts.md
@@ -1278,6 +1278,41 @@ Message updates will be dropped when:
* Checkable does not exist.
* Origin endpoint's zone is not allowed to access this checkable.
+#### event::SetSuppressedNotifications
+
+> Location: `clusterevents.cpp`
+
+##### Message Body
+
+Key | Value
+----------|---------
+jsonrpc | 2.0
+method | event::SetSuppressedNotifications
+params | Dictionary
+
+##### Params
+
+Key | Type | Description
+-------------------------|---------------|------------------
+host | String | Host name
+service | String | Service name
+suppressed\_notifications | Number | Bitmask for suppressed notifications.
+
+##### Functions
+
+Event Sender: `Checkable::OnSuppressedNotificationsChanged`
+Event Receiver: `SuppressedNotificationsChangedAPIHandler`
+
+##### Permissions
+
+The receiver will not process messages from not configured endpoints.
+
+Message updates will be dropped when:
+
+* Checkable does not exist.
+* Origin endpoint's zone is not allowed to access this checkable.
+
+
#### event::SetNextNotification
> Location: `clusterevents.cpp`
diff --git a/lib/icinga/checkable-check.cpp b/lib/icinga/checkable-check.cpp
index cab1557ea..7b095353a 100644
--- a/lib/icinga/checkable-check.cpp
+++ b/lib/icinga/checkable-check.cpp
@@ -309,15 +309,14 @@ void Checkable::ProcessCheckResult(const CheckResult::Ptr& cr, const MessageOrig
bool in_downtime = IsInDowntime();
bool send_notification = false;
+ bool suppress_notification = !notification_reachable || in_downtime || IsAcknowledged();
- if (notification_reachable && !in_downtime && !IsAcknowledged()) {
- /* Send notifications whether when a hard state change occurred. */
- if (hardChange && !(old_stateType == StateTypeSoft && IsStateOK(new_state)))
- send_notification = true;
- /* Or if the checkable is volatile and in a HARD state. */
- else if (is_volatile && GetStateType() == StateTypeHard)
- send_notification = true;
- }
+ /* Send notifications when a hard state change occurred. */
+ if (hardChange && !(old_stateType == StateTypeSoft && IsStateOK(new_state)))
+ send_notification = true;
+ /* Or if the checkable is volatile and in a HARD state. */
+ else if (is_volatile && GetStateType() == StateTypeHard)
+ send_notification = true;
if (IsStateOK(old_state) && old_stateType == StateTypeSoft)
send_notification = false; /* Don't send notifications for SOFT-OK -> HARD-OK. */
@@ -405,21 +404,33 @@ void Checkable::ProcessCheckResult(const CheckResult::Ptr& cr, const MessageOrig
(is_volatile && !(IsStateOK(old_state) && IsStateOK(new_state))))
ExecuteEventHandler();
+ int suppressed_types = 0;
+
/* Flapping start/end notifications */
- if (!in_downtime && !was_flapping && is_flapping) {
+ if (!was_flapping && is_flapping) {
/* FlappingStart notifications happen on state changes, not in downtimes */
- if (!IsPaused())
- OnNotificationsRequested(this, NotificationFlappingStart, cr, "", "", nullptr);
+ if (!IsPaused()) {
+ if (in_downtime) {
+ suppressed_types |= NotificationFlappingStart;
+ } else {
+ OnNotificationsRequested(this, NotificationFlappingStart, cr, "", "", nullptr);
+ }
+ }
Log(LogNotice, "Checkable")
<< "Flapping Start: Checkable '" << GetName() << "' started flapping (Current flapping value "
<< GetFlappingCurrent() << "% > high threshold " << GetFlappingThresholdHigh() << "%).";
NotifyFlapping(origin);
- } else if (!in_downtime && was_flapping && !is_flapping) {
+ } else if (was_flapping && !is_flapping) {
/* FlappingEnd notifications are independent from state changes, must not happen in downtine */
- if (!IsPaused())
- OnNotificationsRequested(this, NotificationFlappingEnd, cr, "", "", nullptr);
+ if (!IsPaused()) {
+ if (in_downtime) {
+ suppressed_types |= NotificationFlappingEnd;
+ } else {
+ OnNotificationsRequested(this, NotificationFlappingEnd, cr, "", "", nullptr);
+ }
+ }
Log(LogNotice, "Checkable")
<< "Flapping Stop: Checkable '" << GetName() << "' stopped flapping (Current flapping value "
@@ -429,8 +440,35 @@ void Checkable::ProcessCheckResult(const CheckResult::Ptr& cr, const MessageOrig
}
if (send_notification && !is_flapping) {
- if (!IsPaused())
- OnNotificationsRequested(this, recovery ? NotificationRecovery : NotificationProblem, cr, "", "", nullptr);
+ if (!IsPaused()) {
+ if (suppress_notification) {
+ suppressed_types |= (recovery ? NotificationRecovery : NotificationProblem);
+ } else {
+ OnNotificationsRequested(this, recovery ? NotificationRecovery : NotificationProblem, cr, "", "", nullptr);
+ }
+ }
+ }
+
+ if (suppressed_types) {
+ /* If some notifications were suppressed, but just because of e.g. a downtime,
+ * stash them into a notification types bitmask for maybe re-sending later.
+ */
+
+ ObjectLock olock (this);
+ int suppressed_types_before (GetSuppressedNotifications());
+ int suppressed_types_after (suppressed_types_before | suppressed_types);
+
+ for (int conflict : {NotificationProblem | NotificationRecovery, NotificationFlappingStart | NotificationFlappingEnd}) {
+ /* E.g. problem and recovery notifications neutralize each other. */
+
+ if ((suppressed_types_after & conflict) == conflict) {
+ suppressed_types_after &= ~conflict;
+ }
+ }
+
+ if (suppressed_types_after != suppressed_types_before) {
+ SetSuppressedNotifications(suppressed_types_after);
+ }
}
}
diff --git a/lib/icinga/checkable-notification.cpp b/lib/icinga/checkable-notification.cpp
index 568ff6c52..78c488dfe 100644
--- a/lib/icinga/checkable-notification.cpp
+++ b/lib/icinga/checkable-notification.cpp
@@ -1,7 +1,9 @@
/* Icinga 2 | (c) 2012 Icinga GmbH | GPLv2+ */
#include "icinga/checkable.hpp"
+#include "icinga/host.hpp"
#include "icinga/icingaapplication.hpp"
+#include "icinga/service.hpp"
#include "base/objectlock.hpp"
#include "base/logger.hpp"
#include "base/exception.hpp"
@@ -84,3 +86,117 @@ void Checkable::UnregisterNotification(const Notification::Ptr& notification)
boost::mutex::scoped_lock lock(m_NotificationMutex);
m_Notifications.erase(notification);
}
+
+/**
+ * Re-evaluates the notification types stashed in a checkable's suppressed_notifications bitmask.
+ * Types whose reason no longer applies are dropped; types that still apply and are no longer
+ * suppressed are requested via Checkable::OnNotificationsRequested and then dropped as well.
+ * @param checkable The checkable to process.
+ */
+static void FireSuppressedNotifications(Checkable* checkable)
+{
+	if (!checkable->IsActive() || checkable->IsPaused() || !checkable->GetEnableNotifications())
+		return;
+
+	int suppressed_types (checkable->GetSuppressedNotifications());
+	if (!suppressed_types)
+		return;
+
+	/* Bits to clear from the stored bitmask after all types have been evaluated. */
+	int subtract = 0;
+
+	for (auto type : {NotificationProblem, NotificationRecovery, NotificationFlappingStart, NotificationFlappingEnd}) {
+		if (suppressed_types & type) {
+			/* Defaults to false so a type the switch doesn't handle is dropped rather than read uninitialized. */
+			bool still_applies = false;
+			auto cr (checkable->GetLastCheckResult());
+
+			switch (type) {
+				case NotificationProblem:
+					still_applies = cr && !checkable->IsStateOK(cr->GetState()) && checkable->GetStateType() == StateTypeHard;
+					break;
+				case NotificationRecovery:
+					still_applies = cr && checkable->IsStateOK(cr->GetState());
+					break;
+				case NotificationFlappingStart:
+					still_applies = checkable->IsFlapping();
+					break;
+				case NotificationFlappingEnd:
+					still_applies = !checkable->IsFlapping();
+					break;
+				default:
+					break;
+			}
+
+			if (still_applies) {
+				/* Defaults to true so an unhandled type can never be sent by accident. */
+				bool still_suppressed = true;
+
+				switch (type) {
+					case NotificationProblem:
+						/* Fall through. */
+					case NotificationRecovery:
+						still_suppressed = !checkable->IsReachable(DependencyNotification) || checkable->IsInDowntime() || checkable->IsAcknowledged();
+						break;
+					case NotificationFlappingStart:
+						/* Fall through. */
+					case NotificationFlappingEnd:
+						still_suppressed = checkable->IsInDowntime();
+						break;
+					default:
+						break;
+				}
+
+				if (!still_suppressed && checkable->GetEnableActiveChecks()) {
+					/* If e.g. the downtime just ended, but the service is still not ok, we would re-send the stashed problem notification.
+					 * But if the next check result recovers the service soon, a recovery notification would follow right after the problem one.
+					 * This is not desired, especially for lots of services at once, so if a check result is likely to arrive soon,
+					 * re-sending is delayed until that check. It either changes nothing and the stashed notification is finally re-sent,
+					 * or it recovers the service and the stashed notification is dropped. */
+
+					/* One minute unless the check interval is too short so the next check will always run during the next minute. */
+					auto threshold (checkable->GetCheckInterval() - 10);
+
+					if (threshold > 60)
+						threshold = 60;
+					else if (threshold < 0)
+						threshold = 0;
+
+					still_suppressed = checkable->GetNextCheck() <= Utility::GetTime() + threshold;
+				}
+
+				if (!still_suppressed) {
+					Checkable::OnNotificationsRequested(checkable, type, cr, "", "", nullptr);
+					subtract |= type;
+				}
+			} else {
+				subtract |= type;
+			}
+		}
+	}
+
+	if (subtract) {
+		ObjectLock olock (checkable);
+
+		int suppressed_types_before (checkable->GetSuppressedNotifications());
+		int suppressed_types_after (suppressed_types_before & ~subtract);
+
+		if (suppressed_types_after != suppressed_types_before) {
+			checkable->SetSuppressedNotifications(suppressed_types_after);
+		}
+	}
+}
+
+/**
+ * Re-sends all notifications previously suppressed by e.g. downtimes if the notification reason still applies.
+ * Visits every Host and every Service object; the Timer argument is ignored.
+ */
+void Checkable::FireSuppressedNotifications(const Timer * const&)
+{
+	for (auto& host : ConfigType::GetObjectsByType<Host>()) {
+		::FireSuppressedNotifications(host.get());
+	}
+	for (auto& service : ConfigType::GetObjectsByType<Service>()) {
+		::FireSuppressedNotifications(service.get());
+	}
+}
diff --git a/lib/icinga/checkable.cpp b/lib/icinga/checkable.cpp
index 0f1879dda..c4265d05f 100644
--- a/lib/icinga/checkable.cpp
+++ b/lib/icinga/checkable.cpp
@@ -7,6 +7,8 @@
#include "base/objectlock.hpp"
#include "base/utility.hpp"
#include "base/exception.hpp"
+#include "base/timer.hpp"
+#include <boost/thread/once.hpp>
using namespace icinga;
@@ -16,6 +18,8 @@ INITIALIZE_ONCE(&Checkable::StaticInitialize);
boost::signals2::signal Checkable::OnAcknowledgementSet;
boost::signals2::signal Checkable::OnAcknowledgementCleared;
+static Timer::Ptr l_CheckablesFireSuppressedNotifications;
+
void Checkable::StaticInitialize()
{
/* fixed downtime start */
@@ -65,6 +69,15 @@ void Checkable::Start(bool runtimeCreated)
}
ObjectImpl::Start(runtimeCreated);
+
+ static boost::once_flag once = BOOST_ONCE_INIT;
+
+ boost::call_once(once, []() {
+ l_CheckablesFireSuppressedNotifications = new Timer();
+ l_CheckablesFireSuppressedNotifications->SetInterval(5);
+ l_CheckablesFireSuppressedNotifications->OnTimerExpired.connect(&Checkable::FireSuppressedNotifications);
+ l_CheckablesFireSuppressedNotifications->Start();
+ });
}
void Checkable::AddGroup(const String& name)
diff --git a/lib/icinga/checkable.hpp b/lib/icinga/checkable.hpp
index fcfb3f74b..ee7212860 100644
--- a/lib/icinga/checkable.hpp
+++ b/lib/icinga/checkable.hpp
@@ -3,6 +3,7 @@
#ifndef CHECKABLE_H
#define CHECKABLE_H
+#include "base/timer.hpp"
#include "icinga/i2-icinga.hpp"
#include "icinga/checkable-ti.hpp"
#include "icinga/timeperiod.hpp"
@@ -211,6 +212,8 @@ private:
static void NotifyDowntimeEnd(const Downtime::Ptr& downtime);
+ static void FireSuppressedNotifications(const Timer * const&);
+
/* Comments */
std::set m_Comments;
mutable boost::mutex m_CommentMutex;
diff --git a/lib/icinga/checkable.ti b/lib/icinga/checkable.ti
index 418236316..7969d6f46 100644
--- a/lib/icinga/checkable.ti
+++ b/lib/icinga/checkable.ti
@@ -154,6 +154,9 @@ abstract class Checkable : CustomVarObject
[state, no_user_view, no_user_modify] int flapping_buffer;
[state, no_user_view, no_user_modify] int flapping_index;
[state, protected] bool flapping;
+ [state, no_user_view, no_user_modify] int suppressed_notifications {
+ default {{{ return 0; }}}
+ };
[config, navigation] name(Endpoint) command_endpoint (CommandEndpointRaw) {
navigate {{{
diff --git a/lib/icinga/clusterevents.cpp b/lib/icinga/clusterevents.cpp
index 2c14a3550..313adb1eb 100644
--- a/lib/icinga/clusterevents.cpp
+++ b/lib/icinga/clusterevents.cpp
@@ -24,6 +24,7 @@ INITIALIZE_ONCE(&ClusterEvents::StaticInitialize);
REGISTER_APIFUNCTION(CheckResult, event, &ClusterEvents::CheckResultAPIHandler);
REGISTER_APIFUNCTION(SetNextCheck, event, &ClusterEvents::NextCheckChangedAPIHandler);
+REGISTER_APIFUNCTION(SetSuppressedNotifications, event, &ClusterEvents::SuppressedNotificationsChangedAPIHandler);
REGISTER_APIFUNCTION(SetNextNotification, event, &ClusterEvents::NextNotificationChangedAPIHandler);
REGISTER_APIFUNCTION(SetForceNextCheck, event, &ClusterEvents::ForceNextCheckChangedAPIHandler);
REGISTER_APIFUNCTION(SetForceNextNotification, event, &ClusterEvents::ForceNextNotificationChangedAPIHandler);
@@ -38,6 +39,7 @@ void ClusterEvents::StaticInitialize()
{
Checkable::OnNewCheckResult.connect(&ClusterEvents::CheckResultHandler);
Checkable::OnNextCheckChanged.connect(&ClusterEvents::NextCheckChangedHandler);
+ Checkable::OnSuppressedNotificationsChanged.connect(&ClusterEvents::SuppressedNotificationsChangedHandler);
Notification::OnNextNotificationChanged.connect(&ClusterEvents::NextNotificationChangedHandler);
Checkable::OnForceNextCheckChanged.connect(&ClusterEvents::ForceNextCheckChangedHandler);
Checkable::OnForceNextNotificationChanged.connect(&ClusterEvents::ForceNextNotificationChangedHandler);
@@ -232,6 +234,68 @@ Value ClusterEvents::NextCheckChangedAPIHandler(const MessageOrigin::Ptr& origin
return Empty;
}
+/* Packs the checkable's current suppressed-notifications bitmask into an event::SetSuppressedNotifications message and relays it. */
+void ClusterEvents::SuppressedNotificationsChangedHandler(const Checkable::Ptr& checkable, const MessageOrigin::Ptr& origin)
+{
+	ApiListener::Ptr api = ApiListener::GetInstance();
+	if (!api)
+		return;
+
+	Host::Ptr hostObj;
+	Service::Ptr serviceObj;
+	tie(hostObj, serviceObj) = GetHostService(checkable);
+
+	Dictionary::Ptr payload = new Dictionary();
+	payload->Set("host", hostObj->GetName());
+	if (serviceObj) /* the "service" key is only set when a service object was resolved */
+		payload->Set("service", serviceObj->GetShortName());
+	payload->Set("suppressed_notifications", checkable->GetSuppressedNotifications());
+
+	Dictionary::Ptr envelope = new Dictionary();
+	envelope->Set("jsonrpc", "2.0");
+	envelope->Set("method", "event::SetSuppressedNotifications");
+	envelope->Set("params", payload);
+
+	api->RelayMessage(origin, checkable, envelope, true);
+}
+
+/* Applies a received suppressed-notifications bitmask to the named host or service; returns Empty in every case. */
+Value ClusterEvents::SuppressedNotificationsChangedAPIHandler(const MessageOrigin::Ptr& origin, const Dictionary::Ptr& params)
+{
+	Endpoint::Ptr sender = origin->FromClient->GetEndpoint();
+
+	if (!sender) {
+		Log(LogNotice, "ClusterEvents")
+			<< "Discarding 'suppressed notifications changed' message from '" << origin->FromClient->GetIdentity() << "': Invalid endpoint origin (client not allowed).";
+		return Empty;
+	}
+
+	Host::Ptr host = Host::GetByName(params->Get("host"));
+	if (!host)
+		return Empty;
+
+	/* Target the host itself unless the message names one of its services. */
+	Checkable::Ptr target = host;
+	if (params->Contains("service"))
+		target = host->GetServiceByShortName(params->Get("service"));
+
+	if (!target)
+		return Empty;
+
+	/* When the origin carries a zone, that zone must be allowed to access the target. */
+	if (origin->FromZone && !origin->FromZone->CanAccessObject(target)) {
+		Log(LogNotice, "ClusterEvents")
+			<< "Discarding 'suppressed notifications changed' message for checkable '" << target->GetName()
+			<< "' from '" << origin->FromClient->GetIdentity() << "': Unauthorized access.";
+		return Empty;
+	}
+
+	target->SetSuppressedNotifications(params->Get("suppressed_notifications"), false, origin);
+
+	return Empty;
+}
+
void ClusterEvents::NextNotificationChangedHandler(const Notification::Ptr& notification, const MessageOrigin::Ptr& origin)
{
ApiListener::Ptr listener = ApiListener::GetInstance();
diff --git a/lib/icinga/clusterevents.hpp b/lib/icinga/clusterevents.hpp
index 144155cc5..8dc6f48b9 100644
--- a/lib/icinga/clusterevents.hpp
+++ b/lib/icinga/clusterevents.hpp
@@ -26,6 +26,9 @@ public:
static void NextCheckChangedHandler(const Checkable::Ptr& checkable, const MessageOrigin::Ptr& origin);
static Value NextCheckChangedAPIHandler(const MessageOrigin::Ptr& origin, const Dictionary::Ptr& params);
+ static void SuppressedNotificationsChangedHandler(const Checkable::Ptr& checkable, const MessageOrigin::Ptr& origin);
+ static Value SuppressedNotificationsChangedAPIHandler(const MessageOrigin::Ptr& origin, const Dictionary::Ptr& params);
+
static void NextNotificationChangedHandler(const Notification::Ptr& notification, const MessageOrigin::Ptr& origin);
static Value NextNotificationChangedAPIHandler(const MessageOrigin::Ptr& origin, const Dictionary::Ptr& params);