diff --git a/doc/06-distributed-monitoring.md b/doc/06-distributed-monitoring.md index 48cfd1b26..da44f7717 100644 --- a/doc/06-distributed-monitoring.md +++ b/doc/06-distributed-monitoring.md @@ -2434,6 +2434,12 @@ By default, the following features provide advanced HA functionality: * [Checks](06-distributed-monitoring.md#distributed-monitoring-high-availability-checks) (load balanced, automated failover). * [Notifications](06-distributed-monitoring.md#distributed-monitoring-high-availability-notifications) (load balanced, automated failover). * [DB IDO](06-distributed-monitoring.md#distributed-monitoring-high-availability-db-ido) (Run-Once, automated failover). +* [Elasticsearch](09-object-types.md#objecttype-elasticsearchwriter) +* [Gelf](09-object-types.md#objecttype-gelfwriter) +* [Graphite](09-object-types.md#objecttype-graphitewriter) +* [InfluxDB](09-object-types.md#objecttype-influxdbwriter) +* [OpenTsdb](09-object-types.md#objecttype-opentsdbwriter) +* [Perfdata](09-object-types.md#objecttype-perfdatawriter) (for PNP) #### High-Availability with Checks diff --git a/doc/09-object-types.md b/doc/09-object-types.md index bd6572c15..562e75a6d 100644 --- a/doc/09-object-types.md +++ b/doc/09-object-types.md @@ -517,6 +517,7 @@ Configuration Attributes: ca\_path | String | **Optional.** Path to CA certificate to validate the remote host. Requires `enable_tls` set to `true`. cert\_path | String | **Optional.** Path to host certificate to present to the remote host for mutual verification. Requires `enable_tls` set to `true`. key\_path | String | **Optional.** Path to host key to accompany the cert\_path. Requires `enable_tls` set to `true`. + enable\_ha | Boolean | **Optional.** Enable the high availability functionality. Only valid in a [cluster setup](06-distributed-monitoring.md#distributed-monitoring-high-availability-features). Defaults to `true`. Note: If `flush_threshold` is set too low, this will force the feature to flush all data to Elasticsearch too often. Experiment with the setting, if you are processing more than 1024 metrics per second or similar. @@ -655,6 +656,7 @@ Configuration Attributes: port | Number | **Optional.** GELF receiver port. Defaults to `12201`. source | String | **Optional.** Source name for this instance. Defaults to `icinga2`. enable\_send\_perfdata | Boolean | **Optional.** Enable performance data for 'CHECK RESULT' events. + enable\_ha | Boolean | **Optional.** Enable the high availability functionality. Only valid in a [cluster setup](06-distributed-monitoring.md#distributed-monitoring-high-availability-features). Defaults to `true`. ## GraphiteWriter @@ -682,6 +684,7 @@ Configuration Attributes: service\_name\_template | String | **Optional.** Metric prefix for service name. Defaults to `icinga2.$host.name$.services.$service.name$.$service.check_command$`. enable\_send\_thresholds | Boolean | **Optional.** Send additional threshold metrics. Defaults to `false`. enable\_send\_metadata | Boolean | **Optional.** Send additional metadata metrics. Defaults to `false`. + enable\_ha | Boolean | **Optional.** Enable the high availability functionality. Only valid in a [cluster setup](06-distributed-monitoring.md#distributed-monitoring-high-availability-features). Defaults to `true`. Additional usage examples can be found [here](14-features.md#graphite-carbon-cache-writer). @@ -868,7 +871,7 @@ Configuration Attributes: table\_prefix | String | **Optional.** MySQL database table prefix. Defaults to `icinga_`. instance\_name | String | **Optional.** Unique identifier for the local Icinga 2 instance. Defaults to `default`. instance\_description | String | **Optional.** Description for the Icinga 2 instance. - enable\_ha | Boolean | **Optional.** Enable the high availability functionality. Only valid in a [cluster setup](06-distributed-monitoring.md#distributed-monitoring-high-availability-db-ido). Defaults to "true". + enable\_ha | Boolean | **Optional.** Enable the high availability functionality. Only valid in a [cluster setup](06-distributed-monitoring.md#distributed-monitoring-high-availability-db-ido). Defaults to `true`. failover\_timeout | Duration | **Optional.** Set the failover timeout in a [HA cluster](06-distributed-monitoring.md#distributed-monitoring-high-availability-db-ido). Must not be lower than 60s. Defaults to `60s`. cleanup | Dictionary | **Optional.** Dictionary with items for historical table cleanup. categories | Array | **Optional.** Array of information types that should be written to the database. @@ -1057,6 +1060,7 @@ Configuration Attributes: enable\_send\_metadata | Boolean | **Optional.** Whether to send check metadata e.g. states, execution time, latency etc. flush\_interval | Duration | **Optional.** How long to buffer data points before transferring to InfluxDB. Defaults to `10s`. flush\_threshold | Number | **Optional.** How many data points to buffer before forcing a transfer to InfluxDB. Defaults to `1024`. + enable\_ha | Boolean | **Optional.** Enable the high availability functionality. Only valid in a [cluster setup](06-distributed-monitoring.md#distributed-monitoring-high-availability-features). Defaults to `true`. Note: If `flush_threshold` is set too low, this will always force the feature to flush all data to InfluxDB. Experiment with the setting, if you are processing more than 1024 metrics per second @@ -1303,7 +1307,7 @@ Example: object OpenTsdbWriter "opentsdb" { host = "127.0.0.1" port = 4242 - +} ``` Configuration Attributes: @@ -1312,6 +1316,7 @@ Configuration Attributes: --------------------------|-----------------------|---------------------------------- host | String | **Optional.** OpenTSDB host address. Defaults to `127.0.0.1`. port | Number | **Optional.** OpenTSDB port. Defaults to `4242`. + enable\_ha | Boolean | **Optional.** Enable the high availability functionality. Only valid in a [cluster setup](06-distributed-monitoring.md#distributed-monitoring-high-availability-features). Defaults to `true`. ## PerfdataWriter @@ -1346,6 +1351,7 @@ Configuration Attributes: host\_format\_template | String | **Optional.** Host Format template for the performance data file. Defaults to a template that's suitable for use with PNP4Nagios. service\_format\_template | String | **Optional.** Service Format template for the performance data file. Defaults to a template that's suitable for use with PNP4Nagios. rotation\_interval | Duration | **Optional.** Rotation interval for the files specified in `{host,service}_perfdata_path`. Defaults to `30s`. + enable\_ha | Boolean | **Optional.** Enable the high availability functionality. Only valid in a [cluster setup](06-distributed-monitoring.md#distributed-monitoring-high-availability-features). Defaults to `true`. When rotating the performance data file the current UNIX timestamp is appended to the path specified in `host_perfdata_path` and `service_perfdata_path` to generate a unique filename. diff --git a/doc/16-upgrading-icinga-2.md b/doc/16-upgrading-icinga-2.md index 3c46c9f86..d5a03233f 100644 --- a/doc/16-upgrading-icinga-2.md +++ b/doc/16-upgrading-icinga-2.md @@ -7,6 +7,26 @@ Specific version upgrades are described below. Please note that version updates are incremental. An upgrade from v2.6 to v2.8 requires to follow the instructions for v2.7 too. +## Upgrading to v2.11 + +### HA-aware Features + +v2.11 introduces additional HA functionality similar to the DB IDO feature. +This enables the feature being active only on one endpoint while the other +endpoint is paused. When one endpoint is shut down, automatic failover happens. + +This feature is turned on by default. If you need one of the features twice, +please use `enable_ha = false` to restore the old behaviour. + +This affects the following features: + +* [Elasticsearch](09-object-types.md#objecttype-elasticsearchwriter) +* [Gelf](09-object-types.md#objecttype-gelfwriter) +* [Graphite](09-object-types.md#objecttype-graphitewriter) +* [InfluxDB](09-object-types.md#objecttype-influxdbwriter) +* [OpenTsdb](09-object-types.md#objecttype-opentsdbwriter) +* [Perfdata](09-object-types.md#objecttype-perfdatawriter) (for PNP) + ## Upgrading to v2.10 ### Path Constant Changes diff --git a/doc/19-technical-concepts.md b/doc/19-technical-concepts.md index 278b2231f..f9820c316 100644 --- a/doc/19-technical-concepts.md +++ b/doc/19-technical-concepts.md @@ -190,10 +190,12 @@ The GraphiteWriter feature calls the registered function and processes the received data. Features which connect Icinga 2 to external interfaces normally parse and reformat the received data into an applicable format. +Since this check result signal is blocking, many of the features include a work queue +with asynchronous task handling. + The GraphiteWriter uses a TCP socket to communicate with the carbon cache daemon of Graphite. The InfluxDBWriter is instead writing bulk metric messages -to InfluxDB's HTTP API. - +to InfluxDB's HTTP API, similar to Elasticsearch. ## Cluster @@ -322,10 +324,10 @@ The update procedure works the same way as above. ### High Availability -High availability is automatically enabled between two nodes in the same +General high availability is automatically enabled between two endpoints in the same cluster zone. -This requires the same configuration and enabled features on both nodes. +**This requires the same configuration and enabled features on both nodes.** HA zone members trust each other and share event updates as cluster messages. This includes for example check results, next check timestamp updates, acknowledgements @@ -334,18 +336,52 @@ or notifications. This ensures that both nodes are synchronized. If one node goes away, the remaining node takes over and continues as normal. +#### High Availability: Object Authority Cluster nodes automatically determine the authority for configuration -objects. This results in activated but paused objects. You can verify +objects. By default, all config objects are set to `HARunEverywhere` and +as such the object authority is true for any config object on any instance. + +Specific objects can override and influence this setting, e.g. with `HARunOnce` +instead prior to config object activation. + +This is done when the daemon starts and in a regular interval inside +the ApiListener class, specifically calling `ApiListener::UpdateObjectAuthority()`. + +The algorithm works like this: + +* Determine whether this instance is assigned to a local zone and endpoint. +* Collects all endpoints in this zone if they are connected. +* If there's two endpoints, but only us seeing ourselves and the application start is less than 60 seconds in the past, do nothing (wait for cluster reconnect to take place, grace period). +* Sort the collected endpoints by name. +* Iterate over all config types and their respective objects + * Ignore !active objects + * Ignore objects which are !HARunOnce. This means, they can run multiple times in a zone and don't need an authority update. + * If this instance doesn't have a local zone, set authority to true. This is for non-clustered standalone environments where everything belongs to this instance. + * Calculate the object authority based on the connected endpoint names. + * Set the authority (true or false) + +The object authority calculation works "offline" without any message exchange. +Each instance alculates the SDBM hash of the config object name, puts that in contrast +modulo the connected endpoints size. +This index is used to lookup the corresponding endpoint in the connected endpoints array, +including the local endpoint. Whether the local endpoint is equal to the selected endpoint, +or not, this sets the authority to `true` or `false`. + +``` +authority = endpoints[Utility::SDBM(object->GetName()) % endpoints.size()] == my_endpoint; +``` + +`ConfigObject::SetAuthority(bool authority)` triggers the following events: + +* Authority is true and object now paused: Resume the object and set `paused` to `false`. +* Authority is false, object not paused: Pause the object and set `paused` to true. + +**This results in activated but paused objects on one endpoint.** You can verify that by querying the `paused` attribute for all objects via REST API -or debug console. - -Nodes inside a HA zone calculate the object authority independent from each other. - -The number of endpoints in a zone is defined through the configuration. This number -is used inside a local modulo calculation to determine whether the node feels -responsible for this object or not. +or debug console on both endpoints. +Endpoints inside a HA zone calculate the object authority independent from each other. This object authority is important for selected features explained below. Since features are configuration objects too, you must ensure that all nodes @@ -354,6 +390,36 @@ one might have a checker feature on the left node, nothing on the right node. This leads to late check results because one half is not executed by the right node which holds half of the object authorities. +By default, features are enabled to "Run-Everywhere". Specific features which +support HA awareness, provide the `enable_ha` configuration attribute. When `enable_ha` +is set to `true` (usually the default), "Run-Once" is set and the feature pauses on one side. + +``` +vim /etc/icinga2/features-enabled/graphite.conf + +object GraphiteWriter "graphite" { + ... + enable_ha = true +} +``` + +Once such a feature is paused, there won't be any more event handling, e.g. the Elasticsearch +feature won't process any checkresults nor write to the Elasticsearch REST API. + +When the cluster connection drops, the feature configuration object is updated with +the new object authority by the ApiListener timer and resumes its operation. You can see +that by grepping the log file for `resumed` and `paused`. + +``` +[2018-10-24 13:28:28 +0200] information/GraphiteWriter: 'g-ha' paused. +``` + +``` +[2018-10-24 13:28:28 +0200] information/GraphiteWriter: 'g-ha' resumed. +``` + +Specific features with HA capabilities are explained below. + ### High Availability: Checker The `checker` feature only executes checks for `Checkable` objects (Host, Service) diff --git a/lib/perfdata/elasticsearchwriter.cpp b/lib/perfdata/elasticsearchwriter.cpp index 029857094..ec89bed01 100644 --- a/lib/perfdata/elasticsearchwriter.cpp +++ b/lib/perfdata/elasticsearchwriter.cpp @@ -49,6 +49,15 @@ void ElasticsearchWriter::OnConfigLoaded() ObjectImpl::OnConfigLoaded(); m_WorkQueue.SetName("ElasticsearchWriter, " + GetName()); + + if (!GetEnableHa()) { + Log(LogDebug, "ElasticsearchWriter") + << "HA functionality disabled. Won't pause connection: " << GetName(); + + SetHAMode(HARunEverywhere); + } else { + SetHAMode(HARunOnce); + } } void ElasticsearchWriter::StatsFunc(const Dictionary::Ptr& status, const Array::Ptr& perfdata) @@ -71,14 +80,14 @@ void ElasticsearchWriter::StatsFunc(const Dictionary::Ptr& status, const Array:: status->Set("elasticsearchwriter", new Dictionary(std::move(nodes))); } -void ElasticsearchWriter::Start(bool runtimeCreated) +void ElasticsearchWriter::Resume() { - ObjectImpl::Start(runtimeCreated); + ObjectImpl::Resume(); m_EventPrefix = "icinga2.event."; Log(LogInformation, "ElasticsearchWriter") - << "'" << GetName() << "' started."; + << "'" << GetName() << "' resumed."; m_WorkQueue.SetExceptionCallback(std::bind(&ElasticsearchWriter::ExceptionHandler, this, _1)); @@ -95,14 +104,14 @@ void ElasticsearchWriter::Start(bool runtimeCreated) Checkable::OnNotificationSentToAllUsers.connect(std::bind(&ElasticsearchWriter::NotificationSentToAllUsersHandler, this, _1, _2, _3, _4, _5, _6, _7)); } -void ElasticsearchWriter::Stop(bool runtimeRemoved) +void ElasticsearchWriter::Pause() { Log(LogInformation, "ElasticsearchWriter") - << "'" << GetName() << "' stopped."; + << "'" << GetName() << "' paused."; m_WorkQueue.Join(); - ObjectImpl::Stop(runtimeRemoved); + ObjectImpl::Pause(); } void ElasticsearchWriter::AddCheckResult(const Dictionary::Ptr& fields, const Checkable::Ptr& checkable, const CheckResult::Ptr& cr) @@ -176,6 +185,9 @@ void ElasticsearchWriter::AddCheckResult(const Dictionary::Ptr& fields, const Ch void ElasticsearchWriter::CheckResultHandler(const Checkable::Ptr& checkable, const CheckResult::Ptr& cr) { + if (IsPaused()) + return; + m_WorkQueue.Enqueue(std::bind(&ElasticsearchWriter::InternalCheckResultHandler, this, checkable, cr)); } @@ -230,6 +242,9 @@ void ElasticsearchWriter::InternalCheckResultHandler(const Checkable::Ptr& check void ElasticsearchWriter::StateChangeHandler(const Checkable::Ptr& checkable, const CheckResult::Ptr& cr, StateType type) { + if (IsPaused()) + return; + m_WorkQueue.Enqueue(std::bind(&ElasticsearchWriter::StateChangeHandlerInternal, this, checkable, cr, type)); } @@ -279,6 +294,9 @@ void ElasticsearchWriter::NotificationSentToAllUsersHandler(const Notification:: const Checkable::Ptr& checkable, const std::set& users, NotificationType type, const CheckResult::Ptr& cr, const String& author, const String& text) { + if (IsPaused()) + return; + m_WorkQueue.Enqueue(std::bind(&ElasticsearchWriter::NotificationSentToAllUsersHandlerInternal, this, notification, checkable, users, type, cr, author, text)); } diff --git a/lib/perfdata/elasticsearchwriter.hpp b/lib/perfdata/elasticsearchwriter.hpp index a22552ad6..7220e8ea3 100644 --- a/lib/perfdata/elasticsearchwriter.hpp +++ b/lib/perfdata/elasticsearchwriter.hpp @@ -41,8 +41,8 @@ public: protected: void OnConfigLoaded() override; - void Start(bool runtimeCreated) override; - void Stop(bool runtimeRemoved) override; + void Resume() override; + void Pause() override; private: String m_EventPrefix; diff --git a/lib/perfdata/elasticsearchwriter.ti b/lib/perfdata/elasticsearchwriter.ti index 2bbdd9d6a..7cf604fb9 100644 --- a/lib/perfdata/elasticsearchwriter.ti +++ b/lib/perfdata/elasticsearchwriter.ti @@ -37,6 +37,9 @@ class ElasticsearchWriter : ConfigObject [config] int flush_threshold { default {{{ return 1024; }}} }; + [config] bool enable_ha { + default {{{ return true; }}} + }; }; } diff --git a/lib/perfdata/gelfwriter.cpp b/lib/perfdata/gelfwriter.cpp index 78e15ff77..c4c0912d2 100644 --- a/lib/perfdata/gelfwriter.cpp +++ b/lib/perfdata/gelfwriter.cpp @@ -51,6 +51,15 @@ void GelfWriter::OnConfigLoaded() ObjectImpl::OnConfigLoaded(); m_WorkQueue.SetName("GelfWriter, " + GetName()); + + if (!GetEnableHa()) { + Log(LogDebug, "GelfWriter") + << "HA functionality disabled. Won't pause connection: " << GetName(); + + SetHAMode(HARunEverywhere); + } else { + SetHAMode(HARunOnce); + } } void GelfWriter::StatsFunc(const Dictionary::Ptr& status, const Array::Ptr& perfdata) @@ -75,12 +84,12 @@ void GelfWriter::StatsFunc(const Dictionary::Ptr& status, const Array::Ptr& perf status->Set("gelfwriter", new Dictionary(std::move(nodes))); } -void GelfWriter::Start(bool runtimeCreated) +void GelfWriter::Resume() { - ObjectImpl::Start(runtimeCreated); + ObjectImpl::Resume(); Log(LogInformation, "GelfWriter") - << "'" << GetName() << "' started."; + << "'" << GetName() << "' resumed."; /* Register exception handler for WQ tasks. */ m_WorkQueue.SetExceptionCallback(std::bind(&GelfWriter::ExceptionHandler, this, _1)); @@ -98,14 +107,14 @@ void GelfWriter::Start(bool runtimeCreated) Checkable::OnStateChange.connect(std::bind(&GelfWriter::StateChangeHandler, this, _1, _2, _3)); } -void GelfWriter::Stop(bool runtimeRemoved) +void GelfWriter::Pause() { Log(LogInformation, "GelfWriter") - << "'" << GetName() << "' stopped."; + << "'" << GetName() << "' paused."; m_WorkQueue.Join(); - ObjectImpl::Stop(runtimeRemoved); + ObjectImpl::Pause(); } void GelfWriter::AssertOnWorkQueue() @@ -131,6 +140,11 @@ void GelfWriter::Reconnect() { AssertOnWorkQueue(); + if (IsPaused()) { + SetConnected(false); + return; + } + double startTime = Utility::GetTime(); CONTEXT("Reconnecting to Graylog Gelf '" + GetName() + "'"); @@ -180,6 +194,9 @@ void GelfWriter::Disconnect() void GelfWriter::CheckResultHandler(const Checkable::Ptr& checkable, const CheckResult::Ptr& cr) { + if (IsPaused()) + return; + m_WorkQueue.Enqueue(std::bind(&GelfWriter::CheckResultHandlerInternal, this, checkable, cr)); } @@ -284,6 +301,9 @@ void GelfWriter::NotificationToUserHandler(const Notification::Ptr& notification const User::Ptr& user, NotificationType notificationType, CheckResult::Ptr const& cr, const String& author, const String& commentText, const String& commandName) { + if (IsPaused()) + return; + m_WorkQueue.Enqueue(std::bind(&GelfWriter::NotificationToUserHandlerInternal, this, notification, checkable, user, notificationType, cr, author, commentText, commandName)); } @@ -348,6 +368,9 @@ void GelfWriter::NotificationToUserHandlerInternal(const Notification::Ptr& noti void GelfWriter::StateChangeHandler(const Checkable::Ptr& checkable, const CheckResult::Ptr& cr, StateType type) { + if (IsPaused()) + return; + m_WorkQueue.Enqueue(std::bind(&GelfWriter::StateChangeHandlerInternal, this, checkable, cr, type)); } diff --git a/lib/perfdata/gelfwriter.hpp b/lib/perfdata/gelfwriter.hpp index 4e27a3c92..32e35da57 100644 --- a/lib/perfdata/gelfwriter.hpp +++ b/lib/perfdata/gelfwriter.hpp @@ -46,8 +46,8 @@ public: protected: void OnConfigLoaded() override; - void Start(bool runtimeCreated) override; - void Stop(bool runtimeRemoved) override; + void Resume() override; + void Pause() override; private: Stream::Ptr m_Stream; diff --git a/lib/perfdata/gelfwriter.ti b/lib/perfdata/gelfwriter.ti index 6382541bb..d13ee55d2 100644 --- a/lib/perfdata/gelfwriter.ti +++ b/lib/perfdata/gelfwriter.ti @@ -45,6 +45,9 @@ class GelfWriter : ConfigObject [no_user_modify] bool should_connect { default {{{ return true; }}} }; + [config] bool enable_ha { + default {{{ return true; }}} + }; }; } diff --git a/lib/perfdata/graphitewriter.cpp b/lib/perfdata/graphitewriter.cpp index d58e2c53b..a01f6c48b 100644 --- a/lib/perfdata/graphitewriter.cpp +++ b/lib/perfdata/graphitewriter.cpp @@ -49,6 +49,15 @@ void GraphiteWriter::OnConfigLoaded() ObjectImpl::OnConfigLoaded(); m_WorkQueue.SetName("GraphiteWriter, " + GetName()); + + if (!GetEnableHa()) { + Log(LogDebug, "GraphiteWriter") + << "HA functionality disabled. Won't pause connection: " << GetName(); + + SetHAMode(HARunEverywhere); + } else { + SetHAMode(HARunOnce); + } } void GraphiteWriter::StatsFunc(const Dictionary::Ptr& status, const Array::Ptr& perfdata) @@ -72,12 +81,12 @@ void GraphiteWriter::StatsFunc(const Dictionary::Ptr& status, const Array::Ptr& status->Set("graphitewriter", new Dictionary(std::move(nodes))); } -void GraphiteWriter::Start(bool runtimeCreated) +void GraphiteWriter::Resume() { - ObjectImpl::Start(runtimeCreated); + ObjectImpl::Resume(); Log(LogInformation, "GraphiteWriter") - << "'" << GetName() << "' started."; + << "'" << GetName() << "' resumed."; /* Register exception handler for WQ tasks. */ m_WorkQueue.SetExceptionCallback(std::bind(&GraphiteWriter::ExceptionHandler, this, _1)); @@ -93,14 +102,14 @@ void GraphiteWriter::Start(bool runtimeCreated) Checkable::OnNewCheckResult.connect(std::bind(&GraphiteWriter::CheckResultHandler, this, _1, _2)); } -void GraphiteWriter::Stop(bool runtimeRemoved) +void GraphiteWriter::Pause() { Log(LogInformation, "GraphiteWriter") - << "'" << GetName() << "' stopped."; + << "'" << GetName() << "' paused."; m_WorkQueue.Join(); - ObjectImpl::Stop(runtimeRemoved); + ObjectImpl::Pause(); } void GraphiteWriter::AssertOnWorkQueue() @@ -126,6 +135,11 @@ void GraphiteWriter::Reconnect() { AssertOnWorkQueue(); + if (IsPaused()) { + SetConnected(false); + return; + } + double startTime = Utility::GetTime(); CONTEXT("Reconnecting to Graphite '" + GetName() + "'"); @@ -175,6 +189,9 @@ void GraphiteWriter::Disconnect() void GraphiteWriter::CheckResultHandler(const Checkable::Ptr& checkable, const CheckResult::Ptr& cr) { + if (IsPaused()) + return; + m_WorkQueue.Enqueue(std::bind(&GraphiteWriter::CheckResultHandlerInternal, this, checkable, cr)); } diff --git a/lib/perfdata/graphitewriter.hpp b/lib/perfdata/graphitewriter.hpp index 48a2f4203..6934d13b3 100644 --- a/lib/perfdata/graphitewriter.hpp +++ b/lib/perfdata/graphitewriter.hpp @@ -49,8 +49,8 @@ public: protected: void OnConfigLoaded() override; - void Start(bool runtimeCreated) override; - void Stop(bool runtimeRemoved) override; + void Resume() override; + void Pause() override; private: Stream::Ptr m_Stream; diff --git a/lib/perfdata/graphitewriter.ti b/lib/perfdata/graphitewriter.ti index dfe62a14b..b28ba87a9 100644 --- a/lib/perfdata/graphitewriter.ti +++ b/lib/perfdata/graphitewriter.ti @@ -47,6 +47,9 @@ class GraphiteWriter : ConfigObject [no_user_modify] bool should_connect { default {{{ return true; }}} }; + [config] bool enable_ha { + default {{{ return true; }}} + }; }; } diff --git a/lib/perfdata/influxdbwriter.cpp b/lib/perfdata/influxdbwriter.cpp index 437fe542a..91fb1be10 100644 --- a/lib/perfdata/influxdbwriter.cpp +++ b/lib/perfdata/influxdbwriter.cpp @@ -75,6 +75,15 @@ void InfluxdbWriter::OnConfigLoaded() ObjectImpl::OnConfigLoaded(); m_WorkQueue.SetName("InfluxdbWriter, " + GetName()); + + if (!GetEnableHa()) { + Log(LogDebug, "InfluxdbWriter") + << "HA functionality disabled. Won't pause connection: " << GetName(); + + SetHAMode(HARunEverywhere); + } else { + SetHAMode(HARunOnce); + } } void InfluxdbWriter::StatsFunc(const Dictionary::Ptr& status, const Array::Ptr& perfdata) @@ -100,12 +109,12 @@ void InfluxdbWriter::StatsFunc(const Dictionary::Ptr& status, const Array::Ptr& status->Set("influxdbwriter", new Dictionary(std::move(nodes))); } -void InfluxdbWriter::Start(bool runtimeCreated) +void InfluxdbWriter::Resume() { - ObjectImpl::Start(runtimeCreated); + ObjectImpl::Resume(); Log(LogInformation, "InfluxdbWriter") - << "'" << GetName() << "' started."; + << "'" << GetName() << "' resumed."; /* Register exception handler for WQ tasks. */ m_WorkQueue.SetExceptionCallback(std::bind(&InfluxdbWriter::ExceptionHandler, this, _1)); @@ -121,14 +130,14 @@ void InfluxdbWriter::Start(bool runtimeCreated) Checkable::OnNewCheckResult.connect(std::bind(&InfluxdbWriter::CheckResultHandler, this, _1, _2)); } -void InfluxdbWriter::Stop(bool runtimeRemoved) +void InfluxdbWriter::Pause() { Log(LogInformation, "InfluxdbWriter") - << "'" << GetName() << "' stopped."; + << "'" << GetName() << "' paused."; m_WorkQueue.Join(); - ObjectImpl::Stop(runtimeRemoved); + ObjectImpl::Pause(); } void InfluxdbWriter::AssertOnWorkQueue() @@ -188,6 +197,9 @@ Stream::Ptr InfluxdbWriter::Connect() void InfluxdbWriter::CheckResultHandler(const Checkable::Ptr& checkable, const CheckResult::Ptr& cr) { + if (IsPaused()) + return; + m_WorkQueue.Enqueue(std::bind(&InfluxdbWriter::CheckResultHandlerWQ, this, checkable, cr), PriorityLow); } diff --git a/lib/perfdata/influxdbwriter.hpp b/lib/perfdata/influxdbwriter.hpp index 109e3a872..face44f69 100644 --- a/lib/perfdata/influxdbwriter.hpp +++ b/lib/perfdata/influxdbwriter.hpp @@ -49,8 +49,8 @@ public: protected: void OnConfigLoaded() override; - void Start(bool runtimeCreated) override; - void Stop(bool runtimeRemoved) override; + void Resume() override; + void Pause() override; private: WorkQueue m_WorkQueue{10000000, 1}; diff --git a/lib/perfdata/influxdbwriter.ti b/lib/perfdata/influxdbwriter.ti index 58962d5b5..d2f1ba6d2 100644 --- a/lib/perfdata/influxdbwriter.ti +++ b/lib/perfdata/influxdbwriter.ti @@ -88,6 +88,9 @@ class InfluxdbWriter : ConfigObject [config] int flush_threshold { default {{{ return 1024; }}} }; + [config] bool enable_ha { + default {{{ return true; }}} + }; }; validator InfluxdbWriter { diff --git a/lib/perfdata/opentsdbwriter.cpp b/lib/perfdata/opentsdbwriter.cpp index 1099f4449..78feea8f2 100644 --- a/lib/perfdata/opentsdbwriter.cpp +++ b/lib/perfdata/opentsdbwriter.cpp @@ -44,6 +44,20 @@ REGISTER_TYPE(OpenTsdbWriter); REGISTER_STATSFUNCTION(OpenTsdbWriter, &OpenTsdbWriter::StatsFunc); +void OpenTsdbWriter::OnConfigLoaded() +{ + ObjectImpl::OnConfigLoaded(); + + if (!GetEnableHa()) { + Log(LogDebug, "OpenTsdbWriter") + << "HA functionality disabled. Won't pause connection: " << GetName(); + + SetHAMode(HARunEverywhere); + } else { + SetHAMode(HARunOnce); + } +} + void OpenTsdbWriter::StatsFunc(const Dictionary::Ptr& status, const Array::Ptr&) { DictionaryData nodes; @@ -55,12 +69,12 @@ void OpenTsdbWriter::StatsFunc(const Dictionary::Ptr& status, const Array::Ptr&) status->Set("opentsdbwriter", new Dictionary(std::move(nodes))); } -void OpenTsdbWriter::Start(bool runtimeCreated) +void OpenTsdbWriter::Resume() { - ObjectImpl::Start(runtimeCreated); + ObjectImpl::Resume(); Log(LogInformation, "OpentsdbWriter") - << "'" << GetName() << "' started."; + << "'" << GetName() << "' resumed."; m_ReconnectTimer = new Timer(); m_ReconnectTimer->SetInterval(10); @@ -71,16 +85,19 @@ void OpenTsdbWriter::Start(bool runtimeCreated) Service::OnNewCheckResult.connect(std::bind(&OpenTsdbWriter::CheckResultHandler, this, _1, _2)); } -void OpenTsdbWriter::Stop(bool runtimeRemoved) +void OpenTsdbWriter::Pause() { Log(LogInformation, "OpentsdbWriter") - << "'" << GetName() << "' stopped."; + << "'" << GetName() << "' paused."; - ObjectImpl::Stop(runtimeRemoved); + ObjectImpl::Pause(); } void OpenTsdbWriter::ReconnectTimerHandler() { + if (IsPaused()) + return; + if (m_Stream) return; @@ -102,6 +119,9 @@ void OpenTsdbWriter::ReconnectTimerHandler() void OpenTsdbWriter::CheckResultHandler(const Checkable::Ptr& checkable, const CheckResult::Ptr& cr) { + if (IsPaused()) + return; + CONTEXT("Processing check result for '" + checkable->GetName() + "'"); if (!IcingaApplication::GetInstance()->GetEnablePerfdata() || !checkable->GetEnablePerfdata()) diff --git a/lib/perfdata/opentsdbwriter.hpp b/lib/perfdata/opentsdbwriter.hpp index 4d40387dd..193d6b31a 100644 --- a/lib/perfdata/opentsdbwriter.hpp +++ b/lib/perfdata/opentsdbwriter.hpp @@ -44,8 +44,9 @@ public: static void StatsFunc(const Dictionary::Ptr& status, const Array::Ptr& perfdata); protected: - void Start(bool runtimeCreated) override; - void Stop(bool runtimeRemoved) override; + void OnConfigLoaded() override; + void Resume() override; + void Pause() override; private: Stream::Ptr m_Stream; diff --git a/lib/perfdata/opentsdbwriter.ti b/lib/perfdata/opentsdbwriter.ti index fbec3a491..3418e1606 100644 --- a/lib/perfdata/opentsdbwriter.ti +++ b/lib/perfdata/opentsdbwriter.ti @@ -34,6 +34,9 @@ class OpenTsdbWriter : ConfigObject [config] String port { default {{{ return "4242"; }}} }; + [config] bool enable_ha { + default {{{ return true; }}} + }; }; } diff --git a/lib/perfdata/perfdatawriter.cpp b/lib/perfdata/perfdatawriter.cpp index a00dc45d7..c560ef256 100644 --- a/lib/perfdata/perfdatawriter.cpp +++ b/lib/perfdata/perfdatawriter.cpp @@ -38,6 +38,20 @@ REGISTER_TYPE(PerfdataWriter); REGISTER_STATSFUNCTION(PerfdataWriter, &PerfdataWriter::StatsFunc); +void PerfdataWriter::OnConfigLoaded() +{ + ObjectImpl::OnConfigLoaded(); + + if (!GetEnableHa()) { + Log(LogDebug, "PerfdataWriter") + << "HA functionality disabled. Won't pause connection: " << GetName(); + + SetHAMode(HARunEverywhere); + } else { + SetHAMode(HARunOnce); + } +} + void PerfdataWriter::StatsFunc(const Dictionary::Ptr& status, const Array::Ptr&) { DictionaryData nodes; @@ -49,12 +63,12 @@ void PerfdataWriter::StatsFunc(const Dictionary::Ptr& status, const Array::Ptr&) status->Set("perfdatawriter", new Dictionary(std::move(nodes))); } -void PerfdataWriter::Start(bool runtimeCreated) +void PerfdataWriter::Resume() { - ObjectImpl::Start(runtimeCreated); + ObjectImpl::Resume(); Log(LogInformation, "PerfdataWriter") - << "'" << GetName() << "' started."; + << "'" << GetName() << "' resumed."; Checkable::OnNewCheckResult.connect(std::bind(&PerfdataWriter::CheckResultHandler, this, _1, _2)); @@ -67,12 +81,12 @@ void PerfdataWriter::Start(bool runtimeCreated) RotateFile(m_HostOutputFile, GetHostTempPath(), GetHostPerfdataPath()); } -void PerfdataWriter::Stop(bool runtimeRemoved) +void PerfdataWriter::Pause() { Log(LogInformation, "PerfdataWriter") - << "'" << GetName() << "' stopped."; + << "'" << GetName() << "' paused."; - ObjectImpl::Stop(runtimeRemoved); + ObjectImpl::Pause(); } Value PerfdataWriter::EscapeMacroMetric(const Value& value) @@ -85,6 +99,9 @@ Value PerfdataWriter::EscapeMacroMetric(const Value& value) void PerfdataWriter::CheckResultHandler(const Checkable::Ptr& checkable, const CheckResult::Ptr& cr) { + if (IsPaused()) + return; + CONTEXT("Writing performance data for object '" + checkable->GetName() + "'"); if (!IcingaApplication::GetInstance()->GetEnablePerfdata() || !checkable->GetEnablePerfdata()) @@ -154,6 +171,9 @@ void PerfdataWriter::RotateFile(std::ofstream& output, const String& temp_path, void PerfdataWriter::RotationTimerHandler() { + if (IsPaused()) + return; + RotateFile(m_ServiceOutputFile, GetServiceTempPath(), GetServicePerfdataPath()); RotateFile(m_HostOutputFile, GetHostTempPath(), GetHostPerfdataPath()); } diff --git a/lib/perfdata/perfdatawriter.hpp b/lib/perfdata/perfdatawriter.hpp index 10fe7a2dd..56d01d382 100644 --- a/lib/perfdata/perfdatawriter.hpp +++ b/lib/perfdata/perfdatawriter.hpp @@ -46,8 +46,9 @@ public: void ValidateServiceFormatTemplate(const Lazy& lvalue, const ValidationUtils& utils) override; protected: - void Start(bool runtimeCreated) override; - void Stop(bool runtimeRemoved) override; + void OnConfigLoaded() override; + void Resume() override; + void Pause() override; private: void CheckResultHandler(const Checkable::Ptr& checkable, const CheckResult::Ptr& cr); diff --git a/lib/perfdata/perfdatawriter.ti b/lib/perfdata/perfdatawriter.ti index 8381c6c9e..d2ceb6899 100644 --- a/lib/perfdata/perfdatawriter.ti +++ b/lib/perfdata/perfdatawriter.ti @@ -70,6 +70,9 @@ class PerfdataWriter : ConfigObject [config] double rotation_interval { default {{{ return 30; }}} }; + [config] bool enable_ha { + default {{{ return true; }}} + }; }; }