diff --git a/doc/16-upgrading-icinga-2.md b/doc/16-upgrading-icinga-2.md index bd801d934..3f5ba1017 100644 --- a/doc/16-upgrading-icinga-2.md +++ b/doc/16-upgrading-icinga-2.md @@ -53,6 +53,36 @@ and compiled into the binary as header only include. It helps our way to C++11 a to fix additional UTF8 issues more easily. Read more about its [design goals](https://github.com/nlohmann/json#design-goals) and [benchmarks](https://github.com/miloyip/nativejson-benchmark#parsing-time). +### Core + +#### Downtime Notifications + +Imagine that a host/service changes to a HARD NOT-OK state, +and its check interval is set to a high value, e.g. 1 hour. + +A maintenance downtime prevents the notification from being sent, +but once it ends and the host/service is still in a problem state, +no notification is sent immediately; you have to wait +for the next check. + +Another scenario is with one-shot notifications (interval=0) +which would never notify again after the downtime ends while +the problem state is still intact. The state change logic requires the object +to recover and become HARD NOT-OK again before it notifies again. + +In order to solve these problems with filtered/suppressed notifications +in downtimes, v2.11 changes the behaviour like this: + +- If there was a notification suppressed in a downtime, the core stores that information +- Once the downtime ends and the problem state is still intact, Icinga checks whether a re-notification should be sent immediately + +A new cluster message was added to keep this in sync amongst HA masters. + +> **Important** +> +> In order to properly use this new feature, all involved endpoints +> must be upgraded to v2.11. + ### Network Stack The core network stack has been rewritten in 2.11 (some say this could be Icinga 3). 
diff --git a/doc/19-technical-concepts.md b/doc/19-technical-concepts.md index 12ed44f4b..187d3a5c4 100644 --- a/doc/19-technical-concepts.md +++ b/doc/19-technical-concepts.md @@ -1278,6 +1278,41 @@ Message updates will be dropped when: * Checkable does not exist. * Origin endpoint's zone is not allowed to access this checkable. +#### event::SetSuppressedNotifications + +> Location: `clusterevents.cpp` + +##### Message Body + +Key | Value +----------|--------- +jsonrpc | 2.0 +method | event::SetSuppressedNotifications +params | Dictionary + +##### Params + +Key | Type | Description +-------------------------|---------------|------------------ +host | String | Host name +service | String | Service name +suppressed\_notifications | Number | Bitmask for suppressed notifications. + +##### Functions + +Event Sender: `Checkable::OnSuppressedNotificationsChanged` +Event Receiver: `SuppressedNotificationsChangedAPIHandler` + +##### Permissions + +The receiver will not process messages from not configured endpoints. + +Message updates will be dropped when: + +* Checkable does not exist. +* Origin endpoint's zone is not allowed to access this checkable. + + #### event::SetNextNotification > Location: `clusterevents.cpp` diff --git a/lib/icinga/checkable-check.cpp b/lib/icinga/checkable-check.cpp index cab1557ea..7b095353a 100644 --- a/lib/icinga/checkable-check.cpp +++ b/lib/icinga/checkable-check.cpp @@ -309,15 +309,14 @@ void Checkable::ProcessCheckResult(const CheckResult::Ptr& cr, const MessageOrig bool in_downtime = IsInDowntime(); bool send_notification = false; + bool suppress_notification = !notification_reachable || in_downtime || IsAcknowledged(); - if (notification_reachable && !in_downtime && !IsAcknowledged()) { - /* Send notifications whether when a hard state change occurred. */ - if (hardChange && !(old_stateType == StateTypeSoft && IsStateOK(new_state))) - send_notification = true; - /* Or if the checkable is volatile and in a HARD state. 
*/ - else if (is_volatile && GetStateType() == StateTypeHard) - send_notification = true; - } + /* Send notifications whether when a hard state change occurred. */ + if (hardChange && !(old_stateType == StateTypeSoft && IsStateOK(new_state))) + send_notification = true; + /* Or if the checkable is volatile and in a HARD state. */ + else if (is_volatile && GetStateType() == StateTypeHard) + send_notification = true; if (IsStateOK(old_state) && old_stateType == StateTypeSoft) send_notification = false; /* Don't send notifications for SOFT-OK -> HARD-OK. */ @@ -405,21 +404,33 @@ void Checkable::ProcessCheckResult(const CheckResult::Ptr& cr, const MessageOrig (is_volatile && !(IsStateOK(old_state) && IsStateOK(new_state)))) ExecuteEventHandler(); + int suppressed_types = 0; + /* Flapping start/end notifications */ - if (!in_downtime && !was_flapping && is_flapping) { + if (!was_flapping && is_flapping) { /* FlappingStart notifications happen on state changes, not in downtimes */ - if (!IsPaused()) - OnNotificationsRequested(this, NotificationFlappingStart, cr, "", "", nullptr); + if (!IsPaused()) { + if (in_downtime) { + suppressed_types |= NotificationFlappingStart; + } else { + OnNotificationsRequested(this, NotificationFlappingStart, cr, "", "", nullptr); + } + } Log(LogNotice, "Checkable") << "Flapping Start: Checkable '" << GetName() << "' started flapping (Current flapping value " << GetFlappingCurrent() << "% > high threshold " << GetFlappingThresholdHigh() << "%)."; NotifyFlapping(origin); - } else if (!in_downtime && was_flapping && !is_flapping) { + } else if (was_flapping && !is_flapping) { /* FlappingEnd notifications are independent from state changes, must not happen in downtine */ - if (!IsPaused()) - OnNotificationsRequested(this, NotificationFlappingEnd, cr, "", "", nullptr); + if (!IsPaused()) { + if (in_downtime) { + suppressed_types |= NotificationFlappingEnd; + } else { + OnNotificationsRequested(this, NotificationFlappingEnd, cr, "", "", nullptr); 
+ } + } Log(LogNotice, "Checkable") << "Flapping Stop: Checkable '" << GetName() << "' stopped flapping (Current flapping value " @@ -429,8 +440,35 @@ void Checkable::ProcessCheckResult(const CheckResult::Ptr& cr, const MessageOrig } if (send_notification && !is_flapping) { - if (!IsPaused()) - OnNotificationsRequested(this, recovery ? NotificationRecovery : NotificationProblem, cr, "", "", nullptr); + if (!IsPaused()) { + if (suppress_notification) { + suppressed_types |= (recovery ? NotificationRecovery : NotificationProblem); + } else { + OnNotificationsRequested(this, recovery ? NotificationRecovery : NotificationProblem, cr, "", "", nullptr); + } + } + } + + if (suppressed_types) { + /* If some notifications were suppressed, but just because of e.g. a downtime, + * stash them into a notification types bitmask for maybe re-sending later. + */ + + ObjectLock olock (this); + int suppressed_types_before (GetSuppressedNotifications()); + int suppressed_types_after (suppressed_types_before | suppressed_types); + + for (int conflict : {NotificationProblem | NotificationRecovery, NotificationFlappingStart | NotificationFlappingEnd}) { + /* E.g. problem and recovery notifications neutralize each other. 
*/ + + if ((suppressed_types_after & conflict) == conflict) { + suppressed_types_after &= ~conflict; + } + } + + if (suppressed_types_after != suppressed_types_before) { + SetSuppressedNotifications(suppressed_types_after); + } } } diff --git a/lib/icinga/checkable-notification.cpp b/lib/icinga/checkable-notification.cpp index 568ff6c52..78c488dfe 100644 --- a/lib/icinga/checkable-notification.cpp +++ b/lib/icinga/checkable-notification.cpp @@ -1,7 +1,9 @@ /* Icinga 2 | (c) 2012 Icinga GmbH | GPLv2+ */ #include "icinga/checkable.hpp" +#include "icinga/host.hpp" #include "icinga/icingaapplication.hpp" +#include "icinga/service.hpp" #include "base/objectlock.hpp" #include "base/logger.hpp" #include "base/exception.hpp" @@ -84,3 +86,117 @@ void Checkable::UnregisterNotification(const Notification::Ptr& notification) boost::mutex::scoped_lock lock(m_NotificationMutex); m_Notifications.erase(notification); } + +static void FireSuppressedNotifications(Checkable* checkable) +{ + if (!checkable->IsActive()) + return; + + if (checkable->IsPaused()) + return; + + if (!checkable->GetEnableNotifications()) + return; + + int suppressed_types (checkable->GetSuppressedNotifications()); + if (!suppressed_types) + return; + + int subtract = 0; + + for (auto type : {NotificationProblem, NotificationRecovery, NotificationFlappingStart, NotificationFlappingEnd}) { + if (suppressed_types & type) { + bool still_applies; + auto cr (checkable->GetLastCheckResult()); + + switch (type) { + case NotificationProblem: + still_applies = cr && !checkable->IsStateOK(cr->GetState()) && checkable->GetStateType() == StateTypeHard; + break; + case NotificationRecovery: + still_applies = cr && checkable->IsStateOK(cr->GetState()); + break; + case NotificationFlappingStart: + still_applies = checkable->IsFlapping(); + break; + case NotificationFlappingEnd: + still_applies = !checkable->IsFlapping(); + break; + default: + break; + } + + if (still_applies) { + bool still_suppressed; + + switch (type) { + 
case NotificationProblem: + /* Fall through. */ + case NotificationRecovery: + still_suppressed = !checkable->IsReachable(DependencyNotification) || checkable->IsInDowntime() || checkable->IsAcknowledged(); + break; + case NotificationFlappingStart: + /* Fall through. */ + case NotificationFlappingEnd: + still_suppressed = checkable->IsInDowntime(); + break; + default: + break; + } + + if (!still_suppressed && checkable->GetEnableActiveChecks()) { + /* If e.g. the downtime just ended, but the service is still not ok, we would re-send the stashed problem notification. + * But if the next check result recovers the service soon, we would send a recovery notification soon after the problem one. + * This is not desired, especially for lots of services at once. + * Because of that if there's likely to be a check result soon, + * we delay the re-sending of the stashed notification until the next check. + * That check either doesn't change anything and we finally re-send the stashed problem notification + * or recovers the service and we drop the stashed notification. */ + + /* One minute unless the check interval is too short so the next check will always run during the next minute. 
*/ + auto threshold (checkable->GetCheckInterval() - 10); + + if (threshold > 60) + threshold = 60; + else if (threshold < 0) + threshold = 0; + + still_suppressed = checkable->GetNextCheck() <= Utility::GetTime() + threshold; + } + + if (!still_suppressed) { + Checkable::OnNotificationsRequested(checkable, type, cr, "", "", nullptr); + + subtract |= type; + } + } else { + subtract |= type; + } + } + } + + if (subtract) { + ObjectLock olock (checkable); + + int suppressed_types_before (checkable->GetSuppressedNotifications()); + int suppressed_types_after (suppressed_types_before & ~subtract); + + if (suppressed_types_after != suppressed_types_before) { + checkable->SetSuppressedNotifications(suppressed_types_after); + } + } +} + +/** + * Re-sends all notifications previously suppressed by e.g. downtimes if the notification reason still applies. + */ +void Checkable::FireSuppressedNotifications(const Timer * const&) +{ + for (auto& host : ConfigType::GetObjectsByType()) { + ::FireSuppressedNotifications(host.get()); + } + + for (auto& service : ConfigType::GetObjectsByType()) { + ::FireSuppressedNotifications(service.get()); + } +} diff --git a/lib/icinga/checkable.cpp b/lib/icinga/checkable.cpp index 0f1879dda..c4265d05f 100644 --- a/lib/icinga/checkable.cpp +++ b/lib/icinga/checkable.cpp @@ -7,6 +7,8 @@ #include "base/objectlock.hpp" #include "base/utility.hpp" #include "base/exception.hpp" +#include "base/timer.hpp" +#include using namespace icinga; @@ -16,6 +18,8 @@ INITIALIZE_ONCE(&Checkable::StaticInitialize); boost::signals2::signal Checkable::OnAcknowledgementSet; boost::signals2::signal Checkable::OnAcknowledgementCleared; +static Timer::Ptr l_CheckablesFireSuppressedNotifications; + void Checkable::StaticInitialize() { /* fixed downtime start */ @@ -65,6 +69,15 @@ void Checkable::Start(bool runtimeCreated) } ObjectImpl::Start(runtimeCreated); + + static boost::once_flag once = BOOST_ONCE_INIT; + + boost::call_once(once, []() { + 
l_CheckablesFireSuppressedNotifications = new Timer(); + l_CheckablesFireSuppressedNotifications->SetInterval(5); + l_CheckablesFireSuppressedNotifications->OnTimerExpired.connect(&Checkable::FireSuppressedNotifications); + l_CheckablesFireSuppressedNotifications->Start(); + }); } void Checkable::AddGroup(const String& name) diff --git a/lib/icinga/checkable.hpp b/lib/icinga/checkable.hpp index fcfb3f74b..ee7212860 100644 --- a/lib/icinga/checkable.hpp +++ b/lib/icinga/checkable.hpp @@ -3,6 +3,7 @@ #ifndef CHECKABLE_H #define CHECKABLE_H +#include "base/timer.hpp" #include "icinga/i2-icinga.hpp" #include "icinga/checkable-ti.hpp" #include "icinga/timeperiod.hpp" @@ -211,6 +212,8 @@ private: static void NotifyDowntimeEnd(const Downtime::Ptr& downtime); + static void FireSuppressedNotifications(const Timer * const&); + /* Comments */ std::set m_Comments; mutable boost::mutex m_CommentMutex; diff --git a/lib/icinga/checkable.ti b/lib/icinga/checkable.ti index 418236316..7969d6f46 100644 --- a/lib/icinga/checkable.ti +++ b/lib/icinga/checkable.ti @@ -154,6 +154,9 @@ abstract class Checkable : CustomVarObject [state, no_user_view, no_user_modify] int flapping_buffer; [state, no_user_view, no_user_modify] int flapping_index; [state, protected] bool flapping; + [state, no_user_view, no_user_modify] int suppressed_notifications { + default {{{ return 0; }}} + }; [config, navigation] name(Endpoint) command_endpoint (CommandEndpointRaw) { navigate {{{ diff --git a/lib/icinga/clusterevents.cpp b/lib/icinga/clusterevents.cpp index 2c14a3550..313adb1eb 100644 --- a/lib/icinga/clusterevents.cpp +++ b/lib/icinga/clusterevents.cpp @@ -24,6 +24,7 @@ INITIALIZE_ONCE(&ClusterEvents::StaticInitialize); REGISTER_APIFUNCTION(CheckResult, event, &ClusterEvents::CheckResultAPIHandler); REGISTER_APIFUNCTION(SetNextCheck, event, &ClusterEvents::NextCheckChangedAPIHandler); +REGISTER_APIFUNCTION(SetSuppressedNotifications, event, &ClusterEvents::SuppressedNotificationsChangedAPIHandler); 
REGISTER_APIFUNCTION(SetNextNotification, event, &ClusterEvents::NextNotificationChangedAPIHandler); REGISTER_APIFUNCTION(SetForceNextCheck, event, &ClusterEvents::ForceNextCheckChangedAPIHandler); REGISTER_APIFUNCTION(SetForceNextNotification, event, &ClusterEvents::ForceNextNotificationChangedAPIHandler); @@ -38,6 +39,7 @@ void ClusterEvents::StaticInitialize() { Checkable::OnNewCheckResult.connect(&ClusterEvents::CheckResultHandler); Checkable::OnNextCheckChanged.connect(&ClusterEvents::NextCheckChangedHandler); + Checkable::OnSuppressedNotificationsChanged.connect(&ClusterEvents::SuppressedNotificationsChangedHandler); Notification::OnNextNotificationChanged.connect(&ClusterEvents::NextNotificationChangedHandler); Checkable::OnForceNextCheckChanged.connect(&ClusterEvents::ForceNextCheckChangedHandler); Checkable::OnForceNextNotificationChanged.connect(&ClusterEvents::ForceNextNotificationChangedHandler); @@ -232,6 +234,68 @@ Value ClusterEvents::NextCheckChangedAPIHandler(const MessageOrigin::Ptr& origin return Empty; } +void ClusterEvents::SuppressedNotificationsChangedHandler(const Checkable::Ptr& checkable, const MessageOrigin::Ptr& origin) +{ + ApiListener::Ptr listener = ApiListener::GetInstance(); + + if (!listener) + return; + + Host::Ptr host; + Service::Ptr service; + tie(host, service) = GetHostService(checkable); + + Dictionary::Ptr params = new Dictionary(); + params->Set("host", host->GetName()); + if (service) + params->Set("service", service->GetShortName()); + params->Set("suppressed_notifications", checkable->GetSuppressedNotifications()); + + Dictionary::Ptr message = new Dictionary(); + message->Set("jsonrpc", "2.0"); + message->Set("method", "event::SetSuppressedNotifications"); + message->Set("params", params); + + listener->RelayMessage(origin, checkable, message, true); +} + +Value ClusterEvents::SuppressedNotificationsChangedAPIHandler(const MessageOrigin::Ptr& origin, const Dictionary::Ptr& params) +{ + Endpoint::Ptr endpoint = 
origin->FromClient->GetEndpoint(); + + if (!endpoint) { + Log(LogNotice, "ClusterEvents") + << "Discarding 'suppressed notifications changed' message from '" << origin->FromClient->GetIdentity() << "': Invalid endpoint origin (client not allowed)."; + return Empty; + } + + Host::Ptr host = Host::GetByName(params->Get("host")); + + if (!host) + return Empty; + + Checkable::Ptr checkable; + + if (params->Contains("service")) + checkable = host->GetServiceByShortName(params->Get("service")); + else + checkable = host; + + if (!checkable) + return Empty; + + if (origin->FromZone && !origin->FromZone->CanAccessObject(checkable)) { + Log(LogNotice, "ClusterEvents") + << "Discarding 'suppressed notifications changed' message for checkable '" << checkable->GetName() + << "' from '" << origin->FromClient->GetIdentity() << "': Unauthorized access."; + return Empty; + } + + checkable->SetSuppressedNotifications(params->Get("suppressed_notifications"), false, origin); + + return Empty; +} + void ClusterEvents::NextNotificationChangedHandler(const Notification::Ptr& notification, const MessageOrigin::Ptr& origin) { ApiListener::Ptr listener = ApiListener::GetInstance(); diff --git a/lib/icinga/clusterevents.hpp b/lib/icinga/clusterevents.hpp index 144155cc5..8dc6f48b9 100644 --- a/lib/icinga/clusterevents.hpp +++ b/lib/icinga/clusterevents.hpp @@ -26,6 +26,9 @@ public: static void NextCheckChangedHandler(const Checkable::Ptr& checkable, const MessageOrigin::Ptr& origin); static Value NextCheckChangedAPIHandler(const MessageOrigin::Ptr& origin, const Dictionary::Ptr& params); + static void SuppressedNotificationsChangedHandler(const Checkable::Ptr& checkable, const MessageOrigin::Ptr& origin); + static Value SuppressedNotificationsChangedAPIHandler(const MessageOrigin::Ptr& origin, const Dictionary::Ptr& params); + static void NextNotificationChangedHandler(const Notification::Ptr& notification, const MessageOrigin::Ptr& origin); static Value 
NextNotificationChangedAPIHandler(const MessageOrigin::Ptr& origin, const Dictionary::Ptr& params);