From f5d40bab2d01730d48460f8350e31fe3997a4988 Mon Sep 17 00:00:00 2001 From: Michael Friedrich Date: Thu, 13 Feb 2014 15:15:16 +0100 Subject: [PATCH 1/2] Cluster: Periodically dump status json. Refs #5444 --- components/cluster/CMakeLists.txt | 1 + components/cluster/cluster-type.conf | 5 +- components/cluster/clusterlistener.cpp | 145 +++++++++++++++++++++++++ components/cluster/clusterlistener.h | 6 + components/cluster/clusterlistener.ti | 7 ++ doc/4.3-object-types.md | 20 ++-- 6 files changed, 174 insertions(+), 10 deletions(-) diff --git a/components/cluster/CMakeLists.txt b/components/cluster/CMakeLists.txt index 8695356de..f04e9d11b 100644 --- a/components/cluster/CMakeLists.txt +++ b/components/cluster/CMakeLists.txt @@ -38,4 +38,5 @@ install(TARGETS cluster RUNTIME DESTINATION ${CMAKE_INSTALL_SBINDIR} LIBRARY DES #install(CODE "file(MAKE_DIRECTORY \"\$ENV{DESTDIR}${CMAKE_INSTALL_FULL_LOCALSTATEDIR}/lib/icinga2/cluster\")") install(CODE "file(MAKE_DIRECTORY \"\$ENV{DESTDIR}${CMAKE_INSTALL_FULL_LOCALSTATEDIR}/lib/icinga2/cluster/config\")") install(CODE "file(MAKE_DIRECTORY \"\$ENV{DESTDIR}${CMAKE_INSTALL_FULL_LOCALSTATEDIR}/lib/icinga2/cluster/log\")") +install(CODE "file(MAKE_DIRECTORY \"\$ENV{DESTDIR}${CMAKE_INSTALL_FULL_LOCALSTATEDIR}/cache/icinga2/cluster\")") diff --git a/components/cluster/cluster-type.conf b/components/cluster/cluster-type.conf index b15a817d5..e36dc58bd 100644 --- a/components/cluster/cluster-type.conf +++ b/components/cluster/cluster-type.conf @@ -34,7 +34,10 @@ type ClusterListener { %attribute array "peers" { %attribute name(Endpoint) "*" - } + }, + + %attribute string "status_path", + %attribute number "status_update_interval" } type Endpoint { diff --git a/components/cluster/clusterlistener.cpp b/components/cluster/clusterlistener.cpp index 3b51c5ee3..a27b0db8e 100644 --- a/components/cluster/clusterlistener.cpp +++ b/components/cluster/clusterlistener.cpp @@ -19,6 +19,7 @@ #include "cluster/clusterlistener.h" #include 
"cluster/endpoint.h"
+#include "icinga/cib.h"
 #include "icinga/domain.h"
 #include "icinga/icingaapplication.h"
 #include "base/netstring.h"
@@ -119,6 +120,12 @@ void ClusterListener::Start(void)
 			}
 		}
 	}
+
+	m_StatusTimer = make_shared<Timer>();
+	m_StatusTimer->SetInterval(GetStatusUpdateInterval());
+	m_StatusTimer->OnTimerExpired.connect(boost::bind(&ClusterListener::StatusTimerHandler, this));
+	m_StatusTimer->Start();
+	m_StatusTimer->Reschedule(0);
 }
 
 /**
@@ -1558,3 +1565,141 @@ bool ClusterListener::SupportsNotifications(void)
 
 	return !type->GetObjects().empty() && IcingaApplication::GetInstance()->GetEnableNotifications();
 }
+
+bool ClusterListener::SupportsFeature(const String& name)
+{
+	DynamicType::Ptr type = DynamicType::GetByName(name);
+
+	if (!type)
+		return false;
+
+	return !type->GetObjects().empty();
+}
+
+void ClusterListener::StatusTimerHandler(void)
+{
+	Log(LogInformation, "cluster", "Writing cluster.json file");
+
+	String statuspath = GetStatusPath();
+	String statuspathtmp = statuspath + ".tmp"; /* XXX make this a global definition */
+
+	std::ofstream statusfp;
+	statusfp.open(statuspathtmp.CStr(), std::ofstream::out | std::ofstream::trunc);
+
+	statusfp << std::fixed;
+
+	statusfp << JsonSerialize(GetClusterStatus());
+
+	statusfp.close();
+
+#ifdef _WIN32
+	_unlink(statuspath.CStr());
+#endif /* _WIN32 */
+
+	if (rename(statuspathtmp.CStr(), statuspath.CStr()) < 0) {
+		BOOST_THROW_EXCEPTION(posix_error()
+		    << boost::errinfo_api_function("rename")
+		    << boost::errinfo_errno(errno)
+		    << boost::errinfo_file_name(statuspathtmp));
+	}
+
+	Log(LogInformation, "cluster", "Finished writing cluster.json file");
+}
+
+Dictionary::Ptr ClusterListener::GetClusterStatus(void)
+{
+	Dictionary::Ptr bag = make_shared<Dictionary>();
+
+	/* cluster stats */
+	bag->Set("node", IcingaApplication::GetInstance()->GetNodeName());
+	bag->Set("identity", GetIdentity());
+
+	double count_endpoints = 0;
+	Array::Ptr not_connected_endpoints = make_shared<Array>();
+	Array::Ptr connected_endpoints = make_shared<Array>();
+
+	BOOST_FOREACH(const Endpoint::Ptr& endpoint, DynamicType::GetObjects<Endpoint>()) {
+		count_endpoints++;
+
+		if(!endpoint->IsConnected() && endpoint->GetName() != GetIdentity())
+			not_connected_endpoints->Add(endpoint->GetName());
+		else if(endpoint->IsConnected() && endpoint->GetName() != GetIdentity())
+			connected_endpoints->Add(endpoint->GetName());
+	}
+
+	std::sort(not_connected_endpoints->Begin(), not_connected_endpoints->End());
+	std::sort(connected_endpoints->Begin(), connected_endpoints->End());
+
+	bag->Set("num_endpoints", count_endpoints);
+	bag->Set("num_conn_endpoints", connected_endpoints->GetLength());
+	bag->Set("num_not_conn_endpoints", not_connected_endpoints->GetLength());
+	bag->Set("conn_endpoints", connected_endpoints);
+	bag->Set("not_conn_endpoints", not_connected_endpoints);
+
+	/* features */
+	bag->Set("feature_CheckerComponent", SupportsChecks() ? 1 : 0);
+	bag->Set("feature_NotificationComponent", SupportsNotifications() ? 1 : 0);
+
+	/* XXX find a more generic way of getting features as a list */
+	bag->Set("feature_IdoMysqlConnection", SupportsFeature("IdoMysqlConnection") ? 1 : 0);
+	bag->Set("feature_IdoPgsqlConnection", SupportsFeature("IdoPgsqlConnection") ? 1 : 0);
+	bag->Set("feature_StatusDataWriter", SupportsFeature("StatusDataWriter") ? 1 : 0);
+	bag->Set("feature_CompatLogger", SupportsFeature("CompatLogger") ? 1 : 0);
+	bag->Set("feature_ExternalCommandListener", SupportsFeature("ExternalCommandListener") ? 1 : 0);
+	bag->Set("feature_CheckResultReader", SupportsFeature("CheckResultReader") ? 1 : 0);
+	bag->Set("feature_LivestatusListener", SupportsFeature("LivestatusListener") ? 1 : 0);
+	bag->Set("feature_GraphiteWriter", SupportsFeature("GraphiteWriter") ? 1 : 0);
+	bag->Set("feature_PerfdataWriter", SupportsFeature("PerfdataWriter") ? 1 : 0);
+	bag->Set("feature_FileLogger", SupportsFeature("FileLogger") ? 1 : 0);
+	bag->Set("feature_SyslogLogger", SupportsFeature("SyslogLogger") ? 1 : 0);
+
+
+	/* icinga stats */
+	double interval = Utility::GetTime() - Application::GetStartTime();
+
+	if (interval > 60)
+		interval = 60;
+
+	bag->Set("active_checks", CIB::GetActiveChecksStatistics(interval) / interval);
+	bag->Set("passive_checks", CIB::GetPassiveChecksStatistics(interval) / interval);
+
+	bag->Set("active_checks_1min", CIB::GetActiveChecksStatistics(60));
+	bag->Set("passive_checks_1min", CIB::GetPassiveChecksStatistics(60));
+	bag->Set("active_checks_5min", CIB::GetActiveChecksStatistics(60 * 5));
+	bag->Set("passive_checks_5min", CIB::GetPassiveChecksStatistics(60 * 5));
+	bag->Set("active_checks_15min", CIB::GetActiveChecksStatistics(60 * 15));
+	bag->Set("passive_checks_15min", CIB::GetPassiveChecksStatistics(60 * 15));
+
+	ServiceCheckStatistics scs = CIB::CalculateServiceCheckStats();
+
+	bag->Set("min_latency", scs.min_latency);
+	bag->Set("max_latency", scs.max_latency);
+	bag->Set("avg_latency", scs.avg_latency);
+	bag->Set("min_execution_time", scs.min_execution_time);
+	bag->Set("max_execution_time", scs.max_execution_time);
+	bag->Set("avg_execution_time", scs.avg_execution_time);
+
+	ServiceStatistics ss = CIB::CalculateServiceStats();
+
+	bag->Set("num_services_ok", ss.services_ok);
+	bag->Set("num_services_warning", ss.services_warning);
+	bag->Set("num_services_critical", ss.services_critical);
+	bag->Set("num_services_unknown", ss.services_unknown);
+	bag->Set("num_services_pending", ss.services_pending);
+	bag->Set("num_services_unreachable", ss.services_unreachable);
+	bag->Set("num_services_flapping", ss.services_flapping);
+	bag->Set("num_services_in_downtime", ss.services_in_downtime);
+	bag->Set("num_services_acknowledged", ss.services_acknowledged);
+
+	HostStatistics hs = CIB::CalculateHostStats();
+
+	bag->Set("num_hosts_up", hs.hosts_up);
+	bag->Set("num_hosts_down", hs.hosts_down);
+	bag->Set("num_hosts_unreachable", hs.hosts_unreachable);
+	bag->Set("num_hosts_flapping", hs.hosts_flapping);
+	bag->Set("num_hosts_in_downtime", hs.hosts_in_downtime);
+	bag->Set("num_hosts_acknowledged", hs.hosts_acknowledged);
+
+	return bag;
+}
+
diff --git a/components/cluster/clusterlistener.h b/components/cluster/clusterlistener.h
index 776ce94ea..5f8e3bf40 100644
--- a/components/cluster/clusterlistener.h
+++ b/components/cluster/clusterlistener.h
@@ -61,6 +61,9 @@ private:
 	Timer::Ptr m_ClusterTimer;
 	void ClusterTimerHandler(void);
 
+	Timer::Ptr m_StatusTimer;
+	void StatusTimerHandler(void);
+
 	std::set m_Servers;
 
 	void AddListener(const String& service);
@@ -107,12 +110,15 @@ private:
 
 	static bool SupportsChecks(void);
 	static bool SupportsNotifications(void);
+	static bool SupportsFeature(const String& name);
 
 	void SetSecurityInfo(const Dictionary::Ptr& message, const DynamicObject::Ptr& object, int privs);
 
 	void PersistMessage(const Endpoint::Ptr& source, const Dictionary::Ptr& message);
 
 	static void MessageExceptionHandler(boost::exception_ptr exp);
+
+	Dictionary::Ptr GetClusterStatus(void);
 };
 
 }
diff --git a/components/cluster/clusterlistener.ti b/components/cluster/clusterlistener.ti
index ac54697b1..78ddf2706 100644
--- a/components/cluster/clusterlistener.ti
+++ b/components/cluster/clusterlistener.ti
@@ -1,4 +1,5 @@
 #include "base/dynamicobject.h"
+#include "base/application.h"
 
 namespace icinga
 {
@@ -14,6 +15,12 @@ class ClusterListener : DynamicObject
 	[config] Array::Ptr peers;
 	[state] double log_message_timestamp;
 	String identity;
+	[config] String status_path {
+		default {{{ return Application::GetLocalStateDir() + "/cache/icinga2/cluster/cluster.json"; }}}
+	};
+	[config] double status_update_interval {
+		default {{{ return 15; }}}
+	};
 };
 
 }
diff --git a/doc/4.3-object-types.md b/doc/4.3-object-types.md
index 4128f5c29..63a76668f 100644
--- a/doc/4.3-object-types.md
+++ b/doc/4.3-object-types.md
@@ -851,15 +851,17 @@ Example:
 
 Attributes:
 
-  Name            |Description
-  ----------------|----------------
-  cert\_path      |**Required.** Path to the public key.
- key\_path |**Required.** Path to the private key. - ca\_path |**Required.** Path to the CA certificate file. - crl\_path |**Optional.** Path to the CRL file. - bind\_host |**Optional.** The IP address the cluster listener should be bound to. - bind\_port |**Optional.** The port the cluster listener should be bound to. - peers |**Optional.** A list of + Name |Description + --------------------------|-------------------------- + cert\_path |**Required.** Path to the public key. + key\_path |**Required.** Path to the private key. + ca\_path |**Required.** Path to the CA certificate file. + crl\_path |**Optional.** Path to the CRL file. + bind\_host |**Optional.** The IP address the cluster listener should be bound to. + bind\_port |**Optional.** The port the cluster listener should be bound to. + peers |**Optional.** A list of + status\_path |**Optional.** Path to cluster status file. Defaults to IcingaLocalStateDir + "/cache/icinga2/cluster/cluster.json" + status\_update\_interval |**Optional.** The interval in which the status files are updated. Defaults to 15 seconds. ### Endpoint From 66aa874f7dac652130d70515c36b8c9599a1a1a4 Mon Sep 17 00:00:00 2001 From: Michael Friedrich Date: Thu, 13 Feb 2014 19:23:38 +0100 Subject: [PATCH 2/2] Refactor ClusterCheckTask based on cluster status. 
Refs #5444
---
 components/cluster/clusterchecktask.cpp | 67 +++++++++++++------------
 components/cluster/clusterchecktask.h   |  1 +
 components/cluster/clusterlistener.h    |  4 +-
 3 files changed, 38 insertions(+), 34 deletions(-)

diff --git a/components/cluster/clusterchecktask.cpp b/components/cluster/clusterchecktask.cpp
index 43c57315e..ff135a182 100644
--- a/components/cluster/clusterchecktask.cpp
+++ b/components/cluster/clusterchecktask.cpp
@@ -37,52 +37,55 @@ REGISTER_SCRIPTFUNCTION(ClusterCheck, &ClusterCheckTask::ScriptFunc);
 
 CheckResult::Ptr ClusterCheckTask::ScriptFunc(const Service::Ptr&)
 {
-	double interval = Utility::GetTime() - Application::GetStartTime();
-
-	if (interval > 60)
-		interval = 60;
-
-	double count_endpoints = 0;
-	std::vector<String> not_connected_endpoints;
-	std::vector<String> connected_endpoints;
-
+	Dictionary::Ptr status = make_shared<Dictionary>();
 	BOOST_FOREACH(const ClusterListener::Ptr& cluster_listener, DynamicType::GetObjects<ClusterListener>()) {
-		String identity = cluster_listener->GetIdentity();
-
-		BOOST_FOREACH(const Endpoint::Ptr& endpoint, DynamicType::GetObjects<Endpoint>()) {
-			count_endpoints++;
-
-			if(!endpoint->IsConnected() && endpoint->GetName() != identity)
-				not_connected_endpoints.push_back(endpoint->GetName());
-			else if(endpoint->IsConnected() && endpoint->GetName() != identity)
-				connected_endpoints.push_back(endpoint->GetName());
-		}
+		/* XXX there's only one cluster listener */
+		status = cluster_listener->GetClusterStatus();
 	}
 
-	std::sort(not_connected_endpoints.begin(), not_connected_endpoints.end());
-	std::sort(connected_endpoints.begin(), connected_endpoints.end());
+	String connected_endpoints = FormatArray(status->Get("conn_endpoints"));
+	String not_connected_endpoints = FormatArray(status->Get("not_conn_endpoints"));
+
+	/* remove unneeded perfdata */
+	status->Set("conn_endpoints", Empty);
+	status->Set("not_conn_endpoints", Empty);
 
 	ServiceState state = StateOK;
-	String output = "Icinga 2 Cluster is running: Connected Endpoints: "+ Convert::ToString(connected_endpoints.size()) + " (" +
-	    boost::algorithm::join(connected_endpoints, ",") + ").";
+	String output = "Icinga 2 Cluster is running: Connected Endpoints: "+ Convert::ToString(status->Get("num_conn_endpoints")) + " (" +
+	    connected_endpoints + ").";
 
-	if (not_connected_endpoints.size() > 0) {
+	if (status->Get("num_not_conn_endpoints") > 0) {
 		state = StateCritical;
-		output = "Icinga 2 Cluster Problem: " + Convert::ToString(not_connected_endpoints.size()) +
-		    " Endpoints (" + boost::algorithm::join(not_connected_endpoints, ",") + ") not connected.";
+		output = "Icinga 2 Cluster Problem: " + Convert::ToString(status->Get("num_not_conn_endpoints")) +
+		    " Endpoints (" + not_connected_endpoints + ") not connected.";
 	}
 
-	Dictionary::Ptr perfdata = make_shared<Dictionary>();
-	perfdata->Set("num_endpoints", count_endpoints);
-	perfdata->Set("num_conn_endpoints", connected_endpoints.size());
-	perfdata->Set("num_not_conn_endpoints", not_connected_endpoints.size());
-
 	CheckResult::Ptr cr = make_shared<CheckResult>();
 	cr->SetOutput(output);
-	cr->SetPerformanceData(perfdata);
+	cr->SetPerformanceData(status);
 	cr->SetState(state);
 	cr->SetCheckSource(IcingaApplication::GetInstance()->GetNodeName());
 
 	return cr;
 }
 
+String ClusterCheckTask::FormatArray(const Array::Ptr& arr)
+{
+	bool first = true;
+	String str;
+
+	if (arr) {
+		ObjectLock olock(arr);
+		BOOST_FOREACH(const Value& value, arr) {
+			if (first)
+				first = false;
+			else
+				str += ",";
+
+			str += Convert::ToString(value);
+		}
+	}
+
+	return str;
+}
+
diff --git a/components/cluster/clusterchecktask.h b/components/cluster/clusterchecktask.h
index 30fc6842d..d34ddd3a5 100644
--- a/components/cluster/clusterchecktask.h
+++ b/components/cluster/clusterchecktask.h
@@ -37,6 +37,7 @@ public:
 
 private:
 	ClusterCheckTask(void);
+	static String FormatArray(const Array::Ptr& arr);
 };
 
 }
diff --git a/components/cluster/clusterlistener.h b/components/cluster/clusterlistener.h
index 5f8e3bf40..cbcdd36b6 100644
---
a/components/cluster/clusterlistener.h +++ b/components/cluster/clusterlistener.h @@ -51,6 +51,8 @@ public: shared_ptr GetSSLContext(void) const; String GetClusterDir(void) const; + Dictionary::Ptr GetClusterStatus(void); + private: shared_ptr m_SSLContext; @@ -117,8 +119,6 @@ private: void PersistMessage(const Endpoint::Ptr& source, const Dictionary::Ptr& message); static void MessageExceptionHandler(boost::exception_ptr exp); - - Dictionary::Ptr GetClusterStatus(void); }; }