diff --git a/components/db_ido_mysql/idomysqlconnection.cpp b/components/db_ido_mysql/idomysqlconnection.cpp index 661609f57..fb6d61fa6 100644 --- a/components/db_ido_mysql/idomysqlconnection.cpp +++ b/components/db_ido_mysql/idomysqlconnection.cpp @@ -33,7 +33,7 @@ using namespace icinga; -#define SCHEMA_VERSION "1.11.6" +#define SCHEMA_VERSION "1.11.7" REGISTER_TYPE(IdoMysqlConnection); REGISTER_STATSFUNCTION(IdoMysqlConnectionStats, &IdoMysqlConnection::StatsFunc); @@ -188,6 +188,7 @@ void IdoMysqlConnection::Reconnect(void) passwd = (!ipasswd.IsEmpty()) ? ipasswd.CStr() : NULL; db = (!idb.IsEmpty()) ? idb.CStr() : NULL; + /* connection */ if (!mysql_init(&m_Connection)) { std::ostringstream msgbuf; msgbuf << "mysql_init() failed: \"" << mysql_error(&m_Connection) << "\""; @@ -210,9 +211,9 @@ void IdoMysqlConnection::Reconnect(void) String dbVersionName = "idoutils"; IdoMysqlResult result = Query("SELECT version FROM " + GetTablePrefix() + "dbversion WHERE name='" + Escape(dbVersionName) + "'"); - Dictionary::Ptr version_row = FetchRow(result); + Dictionary::Ptr row = FetchRow(result); - if (!version_row) { + if (!row) { Log(LogCritical, "IdoMysqlConnection", "Schema does not provide any valid version! 
Verify your schema installation."); Application::Exit(EXIT_FAILURE); @@ -220,7 +221,7 @@ void IdoMysqlConnection::Reconnect(void) DiscardRows(result); - String version = version_row->Get("version"); + String version = row->Get("version"); if (Utility::CompareVersion(SCHEMA_VERSION, version) < 0) { Log(LogCritical, "IdoMysqlConnection", "Schema version '" + version + "' does not match the required version '" + @@ -232,16 +233,68 @@ void IdoMysqlConnection::Reconnect(void) String instanceName = GetInstanceName(); result = Query("SELECT instance_id FROM " + GetTablePrefix() + "instances WHERE instance_name = '" + Escape(instanceName) + "'"); - - Dictionary::Ptr row = FetchRow(result); + row = FetchRow(result); if (!row) { Query("INSERT INTO " + GetTablePrefix() + "instances (instance_name, instance_description) VALUES ('" + Escape(instanceName) + "', '" + Escape(GetInstanceDescription()) + "')"); m_InstanceID = GetLastInsertID(); } else { + m_InstanceID = DbReference(row->Get("instance_id")); + } + + DiscardRows(result); + + Endpoint::Ptr my_endpoint = Endpoint::GetLocalEndpoint(); + + /* we have an endpoint in a cluster setup, so decide if we can proceed here */ + if (my_endpoint && GetHAMode() == HARunOnce) { + /* get the current endpoint writing to programstatus table */ + result = Query("SELECT UNIX_TIMESTAMP(status_update_time) AS status_update_time, endpoint_name FROM " + + GetTablePrefix() + "programstatus WHERE instance_id = " + Convert::ToString(m_InstanceID)); + row = FetchRow(result); DiscardRows(result); - m_InstanceID = DbReference(row->Get("instance_id")); + String endpoint_name; + + if (row) + endpoint_name = row->Get("endpoint_name"); + else + Log(LogNotice, "IdoMysqlConnection", "Empty program status table"); + + /* if we did not write into the database earlier, another instance is active */ + if (endpoint_name != my_endpoint->GetName()) { + double status_update_time; + + if (row) + status_update_time = row->Get("status_update_time"); + else + 
status_update_time = 0; + + double status_update_age = Utility::GetTime() - status_update_time; + + Log(LogNotice, "IdoMysqlConnection", "Last update by '" + + endpoint_name + "' was " + Convert::ToString(status_update_age) + "s ago."); + + if (status_update_age < GetFailoverTimeout()) { + mysql_close(&m_Connection); + m_Connected = false; + + return; + } + + /* activate the IDO only, if we're authoritative in this zone */ + if (IsPaused()) { + Log(LogNotice, "IdoMysqlConnection", "Local endpoint '" + + my_endpoint->GetName() + "' is not authoritative, bailing out."); + + mysql_close(&m_Connection); + m_Connected = false; + + return; + } + } + + Log(LogNotice, "IdoMysqlConnection", "Enabling IDO connection."); } std::ostringstream msgbuf; diff --git a/components/db_ido_mysql/schema/mysql.sql b/components/db_ido_mysql/schema/mysql.sql index e8d7698bb..44c52df35 100644 --- a/components/db_ido_mysql/schema/mysql.sql +++ b/components/db_ido_mysql/schema/mysql.sql @@ -900,6 +900,7 @@ CREATE TABLE IF NOT EXISTS icinga_programstatus ( status_update_time timestamp default '0000-00-00 00:00:00', program_start_time timestamp default '0000-00-00 00:00:00', program_end_time timestamp default '0000-00-00 00:00:00', + endpoint_name varchar(255) character set latin1 collate latin1_general_cs default NULL, is_currently_running smallint default 0, process_id bigint unsigned default 0, daemon_mode smallint default 0, diff --git a/components/db_ido_mysql/schema/upgrade/2.1.0.sql b/components/db_ido_mysql/schema/upgrade/2.1.0.sql new file mode 100644 index 000000000..68d732eee --- /dev/null +++ b/components/db_ido_mysql/schema/upgrade/2.1.0.sql @@ -0,0 +1,17 @@ +-- ----------------------------------------- +-- upgrade path for Icinga 2.1.0 +-- +-- ----------------------------------------- +-- Copyright (c) 2014 Icinga Development Team (http://www.icinga.org) +-- +-- Please check http://docs.icinga.org for upgrading information! 
+-- ----------------------------------------- + +ALTER TABLE `icinga_programstatus` ADD COLUMN `endpoint_name` varchar(255) character set latin1 collate latin1_general_cs default NULL; + +-- ----------------------------------------- +-- update dbversion +-- ----------------------------------------- + +INSERT INTO icinga_dbversion (name, version, create_time, modify_time) VALUES ('idoutils', '1.11.7', NOW(), NOW()) ON DUPLICATE KEY UPDATE version='1.11.7', modify_time=NOW(); + diff --git a/components/db_ido_pgsql/idopgsqlconnection.cpp b/components/db_ido_pgsql/idopgsqlconnection.cpp index ec85d89c4..9fe413dfb 100644 --- a/components/db_ido_pgsql/idopgsqlconnection.cpp +++ b/components/db_ido_pgsql/idopgsqlconnection.cpp @@ -34,7 +34,7 @@ using namespace icinga; -#define SCHEMA_VERSION "1.11.6" +#define SCHEMA_VERSION "1.11.7" REGISTER_TYPE(IdoPgsqlConnection); @@ -212,15 +212,15 @@ void IdoPgsqlConnection::Reconnect(void) String dbVersionName = "idoutils"; IdoPgsqlResult result = Query("SELECT version FROM " + GetTablePrefix() + "dbversion WHERE name=E'" + Escape(dbVersionName) + "'"); - Dictionary::Ptr version_row = FetchRow(result, 0); + Dictionary::Ptr row = FetchRow(result, 0); - if (!version_row) { + if (!row) { Log(LogCritical, "IdoPgsqlConnection", "Schema does not provide any valid version! 
Verify your schema installation."); Application::Exit(EXIT_FAILURE); } - String version = version_row->Get("version"); + String version = row->Get("version"); if (Utility::CompareVersion(SCHEMA_VERSION, version) < 0) { Log(LogCritical, "IdoPgsqlConnection", "Schema version '" + version + "' does not match the required version '" + @@ -232,8 +232,7 @@ void IdoPgsqlConnection::Reconnect(void) String instanceName = GetInstanceName(); result = Query("SELECT instance_id FROM " + GetTablePrefix() + "instances WHERE instance_name = E'" + Escape(instanceName) + "'"); - - Dictionary::Ptr row = FetchRow(result, 0); + row = FetchRow(result, 0); if (!row) { Query("INSERT INTO " + GetTablePrefix() + "instances (instance_name, instance_description) VALUES (E'" + Escape(instanceName) + "', E'" + Escape(GetInstanceDescription()) + "')"); @@ -242,6 +241,58 @@ void IdoPgsqlConnection::Reconnect(void) m_InstanceID = DbReference(row->Get("instance_id")); } + Endpoint::Ptr my_endpoint = Endpoint::GetLocalEndpoint(); + + /* we have an endpoint in a cluster setup, so decide if we can proceed here */ + if (my_endpoint && GetHAMode() == HARunOnce) { + /* get the current endpoint writing to programstatus table */ + result = Query("SELECT UNIX_TIMESTAMP(status_update_time) AS status_update_time, endpoint_name FROM " + + GetTablePrefix() + "programstatus WHERE instance_id = " + Convert::ToString(m_InstanceID)); + row = FetchRow(result, 0); + + String endpoint_name; + + if (row) + endpoint_name = row->Get("endpoint_name"); + else + Log(LogNotice, "IdoPgsqlConnection", "Empty program status table"); + + /* if we did not write into the database earlier, another instance is active */ + if (endpoint_name != my_endpoint->GetName()) { + double status_update_time; + + if (row) + status_update_time = row->Get("status_update_time"); + else + status_update_time = 0; + + double status_update_age = Utility::GetTime() - status_update_time; + + Log(LogNotice, "IdoPgsqlConnection", "Last update by '" + + 
endpoint_name + "' was " + Convert::ToString(status_update_age) + "s ago."); + + if (status_update_age < GetFailoverTimeout()) { + PQfinish(m_Connection); + m_Connection = NULL; + + return; + } + + /* activate the IDO only, if we're authoritative in this zone */ + if (IsPaused()) { + Log(LogNotice, "IdoPgsqlConnection", "Local endpoint '" + + my_endpoint->GetName() + "' is not authoritative, bailing out."); + + PQfinish(m_Connection); + m_Connection = NULL; + + return; + } + } + + Log(LogNotice, "IdoPgsqlConnection", "Enabling IDO connection."); + } + std::ostringstream msgbuf; msgbuf << "pgSQL IDO instance id: " << static_cast(m_InstanceID) << " (schema version: '" + version + "')"; Log(LogInformation, "IdoPgsqlConnection", msgbuf.str()); diff --git a/components/db_ido_pgsql/schema/pgsql.sql b/components/db_ido_pgsql/schema/pgsql.sql index 516c67132..6a3533be5 100644 --- a/components/db_ido_pgsql/schema/pgsql.sql +++ b/components/db_ido_pgsql/schema/pgsql.sql @@ -928,6 +928,7 @@ CREATE TABLE icinga_programstatus ( program_start_time timestamp with time zone default '1970-01-01 00:00:00', program_end_time timestamp with time zone default '1970-01-01 00:00:00', is_currently_running INTEGER default 0, + endpoint_name TEXT default '', process_id bigint default 0, daemon_mode INTEGER default 0, last_command_check timestamp with time zone default '1970-01-01 00:00:00', diff --git a/components/db_ido_pgsql/schema/upgrade/2.1.0.sql b/components/db_ido_pgsql/schema/upgrade/2.1.0.sql new file mode 100644 index 000000000..8a749e8ee --- /dev/null +++ b/components/db_ido_pgsql/schema/upgrade/2.1.0.sql @@ -0,0 +1,17 @@ +-- ----------------------------------------- +-- upgrade path for Icinga 2.1.0 +-- +-- ----------------------------------------- +-- Copyright (c) 2014 Icinga Development Team (http://www.icinga.org) +-- +-- Please check http://docs.icinga.org for upgrading information! 
+-- ----------------------------------------- + +ALTER TABLE icinga_programstatus ADD COLUMN endpoint_name TEXT default NULL; + +-- ----------------------------------------- +-- update dbversion +-- ----------------------------------------- + +SELECT updatedbversion('1.11.7'); + diff --git a/doc/3-monitoring-basics.md b/doc/3-monitoring-basics.md index 656fd2728..9df7b496a 100644 --- a/doc/3-monitoring-basics.md +++ b/doc/3-monitoring-basics.md @@ -1784,6 +1784,8 @@ chapter. Details on the configuration can be found in the [IdoMysqlConnection](#objecttype-idomysqlconnection) and [IdoPgsqlConnection](#objecttype-idoPgsqlconnection) object configuration documentation. +The DB IDO feature supports [High Availability](#high-availability-db-ido) in +the Icinga 2 cluster. The following example query checks the health of the current Icinga 2 instance writing its current status to the DB IDO backend table `icinga_programstatus` diff --git a/doc/4-monitoring-remote-systems.md b/doc/4-monitoring-remote-systems.md index b4f6ebce0..e95bd1c0f 100644 --- a/doc/4-monitoring-remote-systems.md +++ b/doc/4-monitoring-remote-systems.md @@ -479,6 +479,46 @@ the Icinga 2 daemon. # icinga2 -c /etc/icinga2/node1/icinga2.conf -DLocalStateDir=/opt/node1/var +### High Availability with DB IDO + +All instances within the same zone (e.g. the `master` zone as HA cluster) must +have the DB IDO feature enabled. + +Example DB IDO MySQL: + + # icinga2-enable-feature ido-mysql + The feature 'ido-mysql' is already enabled. + +By default the DB IDO feature only runs on the elected zone master. All other nodes +disable the active IDO database connection at runtime. + +> **Note** +> +> The DB IDO HA feature can be disabled by setting the `enable_ha` attribute to `false` +> for the [IdoMysqlConnection](#objecttype-idomysqlconnection) or +> [IdoPgsqlConnection](#objecttype-idopgsqlconnection) object on all nodes in the +> same zone.
+> +> All endpoints will then enable the DB IDO feature, connect to the configured +> database and dump configuration, status and historical data on their own. + +If the instance with the active DB IDO connection dies, the HA functionality will +re-enable the DB IDO connection on the newly elected zone master. + +The DB IDO feature will try to determine which cluster endpoint is currently writing +to the database and bail out if another endpoint is active. You can manually verify that +by running the following query: + + icinga=> SELECT status_update_time, endpoint_name FROM icinga_programstatus; + status_update_time | endpoint_name + ------------------------+--------------- + 2014-08-15 15:52:26+02 | icinga2a + (1 row) + +This is useful when the cluster connection between endpoints breaks, and prevents +data duplication in split-brain scenarios. The failover timeout can be set using the +`failover_timeout` attribute, but must not be lower than 60 seconds. + ### Cluster Scenarios @@ -630,7 +670,8 @@ High availability with Icinga 2 is possible by putting multiple nodes into a dedicated `Zone`. All nodes will elect their active master, and retry an election once the current active master failed. -Selected features (such as DB IDO) will only be active on the current active master. +Selected features (such as [DB IDO](#high-availability-db-ido)) will only be +active on the current active master. All other passive nodes will pause the features without reload/restart. Connections from other zones will be accepted by all active and passive nodes diff --git a/doc/6-configuring-icinga-2.md b/doc/6-configuring-icinga-2.md index 922de5e5a..97f18a40c 100644 --- a/doc/6-configuring-icinga-2.md +++ b/doc/6-configuring-icinga-2.md @@ -1272,6 +1272,8 @@ Attributes: table\_prefix |**Optional.** MySQL database table prefix. Defaults to "icinga\_". instance\_name |**Optional.** Unique identifier for the local Icinga 2 instance. Defaults to "default".
instance\_description|**Optional.** Description for the Icinga 2 instance. + enable\_ha |**Optional.** Enable the high availability functionality. Only valid in a [cluster setup](#high-availability-db-ido). Defaults to "true". + failover\_timeout |**Optional.** Set the failover timeout in a [HA cluster](#high-availability-db-ido). Must not be lower than 60s. Defaults to "60s". cleanup |**Optional.** Dictionary with items for historical table cleanup. categories |**Optional.** The types of information that should be written to the database. @@ -1359,6 +1361,8 @@ Attributes: table\_prefix |**Optional.** PostgreSQL database table prefix. Defaults to "icinga\_". instance\_name |**Optional.** Unique identifier for the local Icinga 2 instance. Defaults to "default". instance\_description|**Optional.** Description for the Icinga 2 instance. + enable\_ha |**Optional.** Enable the high availability functionality. Only valid in a [cluster setup](#high-availability-db-ido). Defaults to "true". + failover\_timeout |**Optional.** Set the failover timeout in a [HA cluster](#high-availability-db-ido). Must not be lower than 60s. Defaults to "60s". cleanup |**Optional.** Dictionary with items for historical table cleanup. categories |**Optional.** The types of information that should be written to the database.
diff --git a/lib/base/dynamicobject.ti b/lib/base/dynamicobject.ti index b887d0b9b..25ee6d74e 100644 --- a/lib/base/dynamicobject.ti +++ b/lib/base/dynamicobject.ti @@ -2,6 +2,12 @@ namespace icinga { code {{{ +enum HAMode +{ + HARunOnce, /* the authority timer calls SetAuthority() only for objects in this mode */ + HARunEverywhere /* skipped by the authority timer; DbConnection selects it when enable_ha is false */ +}; + class NameComposer { public: virtual String MakeName(const String& shortName, const Dictionary::Ptr props) const = 0; @@ -32,6 +38,7 @@ abstract class DynamicObject [get_protected] bool stop_called; [get_protected] bool pause_called; [get_protected] bool resume_called; + [enum] HAMode ha_mode (HAMode); Dictionary::Ptr authority_info; [protected] Dictionary::Ptr extensions; diff --git a/lib/db_ido/db_ido-type.conf b/lib/db_ido/db_ido-type.conf index b85608b75..b9e6847f1 100644 --- a/lib/db_ido/db_ido-type.conf +++ b/lib/db_ido/db_ido-type.conf @@ -18,6 +18,8 @@ ******************************************************************************/ %type DbConnection { + %validator "ValidateFailoverTimeout" + %attribute %string "table_prefix", %attribute %dictionary "cleanup" { @@ -38,5 +40,9 @@ %attribute %number "systemcommands_age", }, - %attribute %number "categories" + %attribute %number "categories", + + %attribute %number "enable_ha", + + %attribute %number "failover_timeout", } diff --git a/lib/db_ido/dbconnection.cpp b/lib/db_ido/dbconnection.cpp index 0678e75b0..4c641ab80 100644 --- a/lib/db_ido/dbconnection.cpp +++ b/lib/db_ido/dbconnection.cpp @@ -22,22 +22,35 @@ #include "icinga/icingaapplication.hpp" #include "icinga/host.hpp" #include "icinga/service.hpp" +#include "config/configcompilercontext.hpp" #include "base/dynamictype.hpp" #include "base/convert.hpp" #include "base/objectlock.hpp" #include "base/utility.hpp" #include "base/initialize.hpp" #include "base/logger_fwd.hpp" +#include "base/scriptfunction.hpp" #include using namespace icinga; REGISTER_TYPE(DbConnection); +REGISTER_SCRIPTFUNCTION(ValidateFailoverTimeout, &DbConnection::ValidateFailoverTimeout); Timer::Ptr
DbConnection::m_ProgramStatusTimer; INITIALIZE_ONCE(&DbConnection::StaticInitialize); +void DbConnection::OnConfigLoaded(void) +{ + DynamicObject::OnConfigLoaded(); + + if (!GetEnableHa()) { + Log(LogDebug, "DbConnection", "HA functionality disabled. Won't pause IDO connection: " + GetName()); + SetHAMode(HARunEverywhere); + } +} + void DbConnection::Start(void) { DynamicObject::Start(); @@ -108,6 +121,7 @@ void DbConnection::ProgramStatusHandler(void) query2.Fields->Set("status_update_time", DbValue::FromTimestamp(Utility::GetTime())); query2.Fields->Set("program_start_time", DbValue::FromTimestamp(Application::GetStartTime())); query2.Fields->Set("is_currently_running", 1); + query2.Fields->Set("endpoint_name", IcingaApplication::GetInstance()->GetNodeName()); query2.Fields->Set("process_id", Utility::GetPid()); query2.Fields->Set("daemon_mode", 1); query2.Fields->Set("last_command_check", DbValue::FromTimestamp(Utility::GetTime())); @@ -358,7 +372,7 @@ void DbConnection::UpdateAllObjects(void) void DbConnection::PrepareDatabase(void) { - /* + /* * only clear tables on reconnect which * cannot be updated by their existing ids * for details check https://dev.icinga.org/issues/5565 @@ -384,7 +398,6 @@ void DbConnection::PrepareDatabase(void) //ClearConfigTable("hostgroups"); //ClearConfigTable("hosts"); //ClearConfigTable("hoststatus"); - ClearConfigTable("programstatus"); ClearConfigTable("scheduleddowntime"); ClearConfigTable("service_contactgroups"); ClearConfigTable("service_contacts"); @@ -400,3 +413,14 @@ void DbConnection::PrepareDatabase(void) FillIDCache(type); } } + +/* Config validator: reports an error when an explicitly configured failover_timeout is below 60 seconds. */ +void DbConnection::ValidateFailoverTimeout(const String& location, const Dictionary::Ptr& attrs) +{ + Value failover_timeout = attrs->Get("failover_timeout"); + /* Skip the check when the attribute is not configured (presumably an empty Value), so objects relying on the 60s default are not rejected. */ + if (!failover_timeout.IsEmpty() && failover_timeout < 60) { + ConfigCompilerContext::GetInstance()->AddMessage(true, "Validation failed for " + + location + ": Failover timeout minimum is 60s."); + } +} diff --git a/lib/db_ido/dbconnection.hpp
b/lib/db_ido/dbconnection.hpp index 1a0e14951..e800bfdcb 100644 --- a/lib/db_ido/dbconnection.hpp +++ b/lib/db_ido/dbconnection.hpp @@ -63,7 +63,10 @@ public: void SetStatusUpdate(const DbObject::Ptr& dbobj, bool hasupdate); bool GetStatusUpdate(const DbObject::Ptr& dbobj) const; + static void ValidateFailoverTimeout(const String& location, const Dictionary::Ptr& attrs); + protected: + virtual void OnConfigLoaded(void); virtual void Start(void); virtual void Resume(void); virtual void Pause(void); diff --git a/lib/db_ido/dbconnection.ti b/lib/db_ido/dbconnection.ti index d71af8517..33a1c6821 100644 --- a/lib/db_ido/dbconnection.ti +++ b/lib/db_ido/dbconnection.ti @@ -21,6 +21,14 @@ abstract class DbConnection : DynamicObject DbCatLog | DbCatNotification | DbCatProgramStatus | DbCatRetention | DbCatStateHistory; }}} }; + + [config] bool enable_ha { + default {{{ return true; }}} + }; + + [config] double failover_timeout { + default {{{ return 60; }}} + }; }; } diff --git a/lib/remote/authority.cpp b/lib/remote/authority.cpp index 2c7159020..6b4b60d9d 100644 --- a/lib/remote/authority.cpp +++ b/lib/remote/authority.cpp @@ -61,7 +61,8 @@ static void AuthorityTimerHandler(void) BOOST_FOREACH(const DynamicObject::Ptr& object, type->GetObjects()) { Endpoint::Ptr endpoint = endpoints[Utility::SDBM(object->GetName()) % endpoints.size()]; - object->SetAuthority(endpoint == my_endpoint); + if (object->GetHAMode() == HARunOnce) + object->SetAuthority(endpoint == my_endpoint); } } }