Merge branch 'feature/ido-ha-6827'

fixes #6827
refs #6203
This commit is contained in:
Michael Friedrich 2014-08-15 17:24:05 +02:00
commit 60f219450f
15 changed files with 252 additions and 18 deletions

View File

@ -33,7 +33,7 @@
using namespace icinga;
#define SCHEMA_VERSION "1.11.6"
#define SCHEMA_VERSION "1.11.7"
REGISTER_TYPE(IdoMysqlConnection);
REGISTER_STATSFUNCTION(IdoMysqlConnectionStats, &IdoMysqlConnection::StatsFunc);
@ -188,6 +188,7 @@ void IdoMysqlConnection::Reconnect(void)
passwd = (!ipasswd.IsEmpty()) ? ipasswd.CStr() : NULL;
db = (!idb.IsEmpty()) ? idb.CStr() : NULL;
/* connection */
if (!mysql_init(&m_Connection)) {
std::ostringstream msgbuf;
msgbuf << "mysql_init() failed: \"" << mysql_error(&m_Connection) << "\"";
@ -210,9 +211,9 @@ void IdoMysqlConnection::Reconnect(void)
String dbVersionName = "idoutils";
IdoMysqlResult result = Query("SELECT version FROM " + GetTablePrefix() + "dbversion WHERE name='" + Escape(dbVersionName) + "'");
Dictionary::Ptr version_row = FetchRow(result);
Dictionary::Ptr row = FetchRow(result);
if (!version_row) {
if (!row) {
Log(LogCritical, "IdoMysqlConnection", "Schema does not provide any valid version! Verify your schema installation.");
Application::Exit(EXIT_FAILURE);
@ -220,7 +221,7 @@ void IdoMysqlConnection::Reconnect(void)
DiscardRows(result);
String version = version_row->Get("version");
String version = row->Get("version");
if (Utility::CompareVersion(SCHEMA_VERSION, version) < 0) {
Log(LogCritical, "IdoMysqlConnection", "Schema version '" + version + "' does not match the required version '" +
@ -232,16 +233,68 @@ void IdoMysqlConnection::Reconnect(void)
String instanceName = GetInstanceName();
result = Query("SELECT instance_id FROM " + GetTablePrefix() + "instances WHERE instance_name = '" + Escape(instanceName) + "'");
Dictionary::Ptr row = FetchRow(result);
row = FetchRow(result);
if (!row) {
Query("INSERT INTO " + GetTablePrefix() + "instances (instance_name, instance_description) VALUES ('" + Escape(instanceName) + "', '" + Escape(GetInstanceDescription()) + "')");
m_InstanceID = GetLastInsertID();
} else {
m_InstanceID = DbReference(row->Get("instance_id"));
}
DiscardRows(result);
Endpoint::Ptr my_endpoint = Endpoint::GetLocalEndpoint();
/* we have an endpoint in a cluster setup, so decide if we can proceed here */
if (my_endpoint && GetHAMode() == HARunOnce) {
/* get the current endpoint writing to programstatus table */
result = Query("SELECT UNIX_TIMESTAMP(status_update_time) AS status_update_time, endpoint_name FROM " +
GetTablePrefix() + "programstatus WHERE instance_id = " + Convert::ToString(m_InstanceID));
row = FetchRow(result);
DiscardRows(result);
m_InstanceID = DbReference(row->Get("instance_id"));
String endpoint_name;
if (row)
endpoint_name = row->Get("endpoint_name");
else
Log(LogNotice, "IdoMysqlConnection", "Empty program status table");
/* if we did not write into the database earlier, another instance is active */
if (endpoint_name != my_endpoint->GetName()) {
double status_update_time;
if (row)
status_update_time = row->Get("status_update_time");
else
status_update_time = 0;
double status_update_age = Utility::GetTime() - status_update_time;
Log(LogNotice, "IdoMysqlConnection", "Last update by '" +
endpoint_name + "' was " + Convert::ToString(status_update_age) + "s ago.");
if (status_update_age < GetFailoverTimeout()) {
mysql_close(&m_Connection);
m_Connected = false;
return;
}
/* activate the IDO only, if we're authoritative in this zone */
if (IsPaused()) {
Log(LogNotice, "IdoMysqlConnection", "Local endpoint '" +
my_endpoint->GetName() + "' is not authoritative, bailing out.");
mysql_close(&m_Connection);
m_Connected = false;
return;
}
}
Log(LogNotice, "IdoMysqlConnection", "Enabling IDO connection.");
}
std::ostringstream msgbuf;

View File

@ -900,6 +900,7 @@ CREATE TABLE IF NOT EXISTS icinga_programstatus (
status_update_time timestamp default '0000-00-00 00:00:00',
program_start_time timestamp default '0000-00-00 00:00:00',
program_end_time timestamp default '0000-00-00 00:00:00',
endpoint_name varchar(255) character set latin1 collate latin1_general_cs default NULL,
is_currently_running smallint default 0,
process_id bigint unsigned default 0,
daemon_mode smallint default 0,

View File

@ -0,0 +1,17 @@
-- -----------------------------------------
-- upgrade path for Icinga 2.1.0
--
-- -----------------------------------------
-- Copyright (c) 2014 Icinga Development Team (http://www.icinga.org)
--
-- Please check http://docs.icinga.org for upgrading information!
-- -----------------------------------------
-- Add the nullable `endpoint_name` column to the program status table.
-- The column uses a case-sensitive latin1 collation and defaults to NULL,
-- so rows written before this upgrade carry no endpoint name.
ALTER TABLE `icinga_programstatus` ADD COLUMN `endpoint_name` varchar(255) character set latin1 collate latin1_general_cs default NULL;
-- -----------------------------------------
-- update dbversion
-- (inserts the 'idoutils' row, or bumps its version and modify_time
--  if the row already exists)
-- -----------------------------------------
INSERT INTO icinga_dbversion (name, version, create_time, modify_time) VALUES ('idoutils', '1.11.7', NOW(), NOW()) ON DUPLICATE KEY UPDATE version='1.11.7', modify_time=NOW();

View File

@ -34,7 +34,7 @@
using namespace icinga;
#define SCHEMA_VERSION "1.11.6"
#define SCHEMA_VERSION "1.11.7"
REGISTER_TYPE(IdoPgsqlConnection);
@ -212,15 +212,15 @@ void IdoPgsqlConnection::Reconnect(void)
String dbVersionName = "idoutils";
IdoPgsqlResult result = Query("SELECT version FROM " + GetTablePrefix() + "dbversion WHERE name=E'" + Escape(dbVersionName) + "'");
Dictionary::Ptr version_row = FetchRow(result, 0);
Dictionary::Ptr row = FetchRow(result, 0);
if (!version_row) {
if (!row) {
Log(LogCritical, "IdoPgsqlConnection", "Schema does not provide any valid version! Verify your schema installation.");
Application::Exit(EXIT_FAILURE);
}
String version = version_row->Get("version");
String version = row->Get("version");
if (Utility::CompareVersion(SCHEMA_VERSION, version) < 0) {
Log(LogCritical, "IdoPgsqlConnection", "Schema version '" + version + "' does not match the required version '" +
@ -232,8 +232,7 @@ void IdoPgsqlConnection::Reconnect(void)
String instanceName = GetInstanceName();
result = Query("SELECT instance_id FROM " + GetTablePrefix() + "instances WHERE instance_name = E'" + Escape(instanceName) + "'");
Dictionary::Ptr row = FetchRow(result, 0);
row = FetchRow(result, 0);
if (!row) {
Query("INSERT INTO " + GetTablePrefix() + "instances (instance_name, instance_description) VALUES (E'" + Escape(instanceName) + "', E'" + Escape(GetInstanceDescription()) + "')");
@ -242,6 +241,58 @@ void IdoPgsqlConnection::Reconnect(void)
m_InstanceID = DbReference(row->Get("instance_id"));
}
Endpoint::Ptr my_endpoint = Endpoint::GetLocalEndpoint();
/* we have an endpoint in a cluster setup, so decide if we can proceed here */
if (my_endpoint && GetHAMode() == HARunOnce) {
/* get the current endpoint writing to programstatus table */
result = Query("SELECT UNIX_TIMESTAMP(status_update_time) AS status_update_time, endpoint_name FROM " +
GetTablePrefix() + "programstatus WHERE instance_id = " + Convert::ToString(m_InstanceID));
row = FetchRow(result, 0);
String endpoint_name;
if (row)
endpoint_name = row->Get("endpoint_name");
else
Log(LogNotice, "IdoPgsqlConnection", "Empty program status table");
/* if we did not write into the database earlier, another instance is active */
if (endpoint_name != my_endpoint->GetName()) {
double status_update_time;
if (row)
status_update_time = row->Get("status_update_time");
else
status_update_time = 0;
double status_update_age = Utility::GetTime() - status_update_time;
Log(LogNotice, "IdoPgsqlConnection", "Last update by '" +
endpoint_name + "' was " + Convert::ToString(status_update_age) + "s ago.");
if (status_update_age < GetFailoverTimeout()) {
PQfinish(m_Connection);
m_Connection = NULL;
return;
}
/* activate the IDO only, if we're authoritative in this zone */
if (IsPaused()) {
Log(LogNotice, "IdoPgsqlConnection", "Local endpoint '" +
my_endpoint->GetName() + "' is not authoritative, bailing out.");
PQfinish(m_Connection);
m_Connection = NULL;
return;
}
}
Log(LogNotice, "IdoPgsqlConnection", "Enabling IDO connection.");
}
std::ostringstream msgbuf;
msgbuf << "pgSQL IDO instance id: " << static_cast<long>(m_InstanceID) << " (schema version: '" + version + "')";
Log(LogInformation, "IdoPgsqlConnection", msgbuf.str());

View File

@ -928,6 +928,7 @@ CREATE TABLE icinga_programstatus (
program_start_time timestamp with time zone default '1970-01-01 00:00:00',
program_end_time timestamp with time zone default '1970-01-01 00:00:00',
is_currently_running INTEGER default 0,
endpoint_name TEXT default '',
process_id bigint default 0,
daemon_mode INTEGER default 0,
last_command_check timestamp with time zone default '1970-01-01 00:00:00',

View File

@ -0,0 +1,17 @@
-- -----------------------------------------
-- upgrade path for Icinga 2.1.0
--
-- -----------------------------------------
-- Copyright (c) 2014 Icinga Development Team (http://www.icinga.org)
--
-- Please check http://docs.icinga.org for upgrading information!
-- -----------------------------------------
-- Add the nullable endpoint_name column to the program status table.
-- NOTE(review): this upgrade path uses "default NULL", while the fresh-install
-- schema declares the same column with "default ''" -- confirm which default
-- is intended so upgraded and newly created databases agree.
ALTER TABLE icinga_programstatus ADD COLUMN endpoint_name TEXT default NULL;
-- -----------------------------------------
-- update dbversion
-- (records schema version 1.11.7 via the updatedbversion() helper)
-- -----------------------------------------
SELECT updatedbversion('1.11.7');

View File

@ -1784,6 +1784,8 @@ chapter. Details on the configuration can be found in the
[IdoMysqlConnection](#objecttype-idomysqlconnection) and
[IdoPgsqlConnection](#objecttype-idopgsqlconnection)
object configuration documentation.
The DB IDO feature supports [High Availability](#high-availability-db-ido) in
the Icinga 2 cluster.
The following example query checks the health of the current Icinga 2 instance
writing its current status to the DB IDO backend table `icinga_programstatus`

View File

@ -479,6 +479,46 @@ the Icinga 2 daemon.
# icinga2 -c /etc/icinga2/node1/icinga2.conf -DLocalStateDir=/opt/node1/var
### <a id="high-availability-db-ido"></a> High Availability with DB IDO
All instances within the same zone (e.g. the `master` zone as HA cluster) must
have the DB IDO feature enabled.
Example DB IDO MySQL:
# icinga2-enable-feature ido-mysql
The feature 'ido-mysql' is already enabled.
By default the DB IDO feature only runs on the elected zone master. All other nodes
disable the active IDO database connection at runtime.
> **Note**
>
> The DB IDO HA feature can be disabled by setting the `enable_ha` attribute to `false`
> for the [IdoMysqlConnection](#objecttype-idomysqlconnection) or
> [IdoPgsqlConnection](#objecttype-idopgsqlconnection) object on all nodes in the
> same zone.
>
> All endpoints will then enable the DB IDO feature, connect to the configured
> database and dump configuration, status and historical data on their own.
If the instance with the active DB IDO connection dies, the HA functionality will
re-enable the DB IDO connection on the newly elected zone master.
The DB IDO feature will try to determine which cluster endpoint is currently writing
to the database and bail out if another endpoint is active. You can manually verify that
by running the following query:
icinga=> SELECT status_update_time, endpoint_name FROM icinga_programstatus;
status_update_time | endpoint_name
------------------------+---------------
2014-08-15 15:52:26+02 | icinga2a
(1 row)
This is useful when the cluster connection between endpoints breaks, and prevents
data duplication in split-brain scenarios. The failover timeout can be configured
using the `failover_timeout` attribute, but must not be lower than 60 seconds.
### <a id="cluster-scenarios"></a> Cluster Scenarios
@ -630,7 +670,8 @@ High availability with Icinga 2 is possible by putting multiple nodes into
a dedicated `Zone`. All nodes will elect their active master, and retry an
election once the current active master failed.
Selected features (such as DB IDO) will only be active on the current active master.
Selected features (such as [DB IDO](#high-availability-db-ido)) will only be
active on the current active master.
All other passive nodes will pause the features without reload/restart.
Connections from other zones will be accepted by all active and passive nodes

View File

@ -1272,6 +1272,8 @@ Attributes:
table\_prefix |**Optional.** MySQL database table prefix. Defaults to "icinga\_".
instance\_name |**Optional.** Unique identifier for the local Icinga 2 instance. Defaults to "default".
instance\_description|**Optional.** Description for the Icinga 2 instance.
enable\_ha |**Optional.** Enable the high availability functionality. Only valid in a [cluster setup](#high-availability-db-ido). Defaults to "true".
failover\_timeout |**Optional.** Set the failover timeout in a [HA cluster](#high-availability-db-ido). Must not be lower than 60s. Defaults to "60s".
cleanup |**Optional.** Dictionary with items for historical table cleanup.
categories |**Optional.** The types of information that should be written to the database.
@ -1359,6 +1361,8 @@ Attributes:
table\_prefix |**Optional.** PostgreSQL database table prefix. Defaults to "icinga\_".
instance\_name |**Optional.** Unique identifier for the local Icinga 2 instance. Defaults to "default".
instance\_description|**Optional.** Description for the Icinga 2 instance.
enable\_ha |**Optional.** Enable the high availability functionality. Only valid in a [cluster setup](#high-availability-db-ido). Defaults to "true".
failover\_timeout |**Optional.** Set the failover timeout in a [HA cluster](#high-availability-db-ido). Must not be lower than 60s. Defaults to "60s".
cleanup |**Optional.** Dictionary with items for historical table cleanup.
categories |**Optional.** The types of information that should be written to the database.

View File

@ -2,6 +2,12 @@ namespace icinga
{
code {{{
/**
 * High-availability mode of a dynamic object.
 *
 * The authority timer only assigns authority to objects whose mode is
 * HARunOnce; DbConnection switches itself to HARunEverywhere when its
 * HA functionality is disabled.
 */
enum HAMode
{
	HARunOnce = 0,       /* subject to authority assignment */
	HARunEverywhere = 1  /* not touched by authority assignment */
};
class NameComposer {
public:
virtual String MakeName(const String& shortName, const Dictionary::Ptr props) const = 0;
@ -32,6 +38,7 @@ abstract class DynamicObject
[get_protected] bool stop_called;
[get_protected] bool pause_called;
[get_protected] bool resume_called;
[enum] HAMode ha_mode (HAMode);
Dictionary::Ptr authority_info;
[protected] Dictionary::Ptr extensions;

View File

@ -18,6 +18,8 @@
******************************************************************************/
%type DbConnection {
%validator "ValidateFailoverTimeout"
%attribute %string "table_prefix",
%attribute %dictionary "cleanup" {
@ -38,5 +40,9 @@
%attribute %number "systemcommands_age",
},
%attribute %number "categories"
%attribute %number "categories",
%attribute %number "enable_ha",
%attribute %number "failover_timeout",
}

View File

@ -22,22 +22,35 @@
#include "icinga/icingaapplication.hpp"
#include "icinga/host.hpp"
#include "icinga/service.hpp"
#include "config/configcompilercontext.hpp"
#include "base/dynamictype.hpp"
#include "base/convert.hpp"
#include "base/objectlock.hpp"
#include "base/utility.hpp"
#include "base/initialize.hpp"
#include "base/logger_fwd.hpp"
#include "base/scriptfunction.hpp"
#include <boost/foreach.hpp>
using namespace icinga;
REGISTER_TYPE(DbConnection);
REGISTER_SCRIPTFUNCTION(ValidateFailoverTimeout, &DbConnection::ValidateFailoverTimeout);
Timer::Ptr DbConnection::m_ProgramStatusTimer;
INITIALIZE_ONCE(&DbConnection::StaticInitialize);
/**
 * Called after the object's configuration has been loaded.
 *
 * Runs the base class handler first. If the `enable_ha` attribute is
 * false, the object is switched to HARunEverywhere and a debug message
 * naming the connection is logged.
 */
void DbConnection::OnConfigLoaded(void)
{
	DynamicObject::OnConfigLoaded();

	/* HA enabled: keep the HA mode as it is. */
	if (GetEnableHa())
		return;

	Log(LogDebug, "DbConnection", "HA functionality disabled. Won't pause IDO connection: " + GetName());

	SetHAMode(HARunEverywhere);
}
void DbConnection::Start(void)
{
DynamicObject::Start();
@ -108,6 +121,7 @@ void DbConnection::ProgramStatusHandler(void)
query2.Fields->Set("status_update_time", DbValue::FromTimestamp(Utility::GetTime()));
query2.Fields->Set("program_start_time", DbValue::FromTimestamp(Application::GetStartTime()));
query2.Fields->Set("is_currently_running", 1);
query2.Fields->Set("endpoint_name", IcingaApplication::GetInstance()->GetNodeName());
query2.Fields->Set("process_id", Utility::GetPid());
query2.Fields->Set("daemon_mode", 1);
query2.Fields->Set("last_command_check", DbValue::FromTimestamp(Utility::GetTime()));
@ -358,7 +372,7 @@ void DbConnection::UpdateAllObjects(void)
void DbConnection::PrepareDatabase(void)
{
/*
/*
* only clear tables on reconnect which
* cannot be updated by their existing ids
* for details check https://dev.icinga.org/issues/5565
@ -384,7 +398,6 @@ void DbConnection::PrepareDatabase(void)
//ClearConfigTable("hostgroups");
//ClearConfigTable("hosts");
//ClearConfigTable("hoststatus");
ClearConfigTable("programstatus");
ClearConfigTable("scheduleddowntime");
ClearConfigTable("service_contactgroups");
ClearConfigTable("service_contacts");
@ -400,3 +413,12 @@ void DbConnection::PrepareDatabase(void)
FillIDCache(type);
}
}
/**
 * Config validator for the "failover_timeout" attribute of DbConnection
 * objects.
 *
 * Adds a validation error to the config compiler context when an
 * explicitly configured failover timeout is lower than 60 seconds.
 *
 * @param location Description of the config location, used in the message.
 * @param attrs The attributes of the object being validated.
 */
void DbConnection::ValidateFailoverTimeout(const String& location, const Dictionary::Ptr& attrs)
{
	Value failover_timeout = attrs->Get("failover_timeout");

	/*
	 * The attribute is optional. When it is not set, Get() yields an empty
	 * value; comparing that against 60 would reject every object relying
	 * on the class default, so there is nothing to validate here.
	 */
	if (failover_timeout.IsEmpty())
		return;

	if (failover_timeout < 60) {
		ConfigCompilerContext::GetInstance()->AddMessage(true, "Validation failed for " +
		    location + ": Failover timeout minimum is 60s.");
	}
}

View File

@ -63,7 +63,10 @@ public:
void SetStatusUpdate(const DbObject::Ptr& dbobj, bool hasupdate);
bool GetStatusUpdate(const DbObject::Ptr& dbobj) const;
static void ValidateFailoverTimeout(const String& location, const Dictionary::Ptr& attrs);
protected:
virtual void OnConfigLoaded(void);
virtual void Start(void);
virtual void Resume(void);
virtual void Pause(void);

View File

@ -21,6 +21,14 @@ abstract class DbConnection : DynamicObject
DbCatLog | DbCatNotification | DbCatProgramStatus | DbCatRetention | DbCatStateHistory;
}}}
};
[config] bool enable_ha {
default {{{ return true; }}}
};
[config] double failover_timeout {
default {{{ return 60; }}}
};
};
}

View File

@ -61,7 +61,8 @@ static void AuthorityTimerHandler(void)
BOOST_FOREACH(const DynamicObject::Ptr& object, type->GetObjects()) {
Endpoint::Ptr endpoint = endpoints[Utility::SDBM(object->GetName()) % endpoints.size()];
object->SetAuthority(endpoint == my_endpoint);
if (object->GetHAMode() == HARunOnce)
object->SetAuthority(endpoint == my_endpoint);
}
}
}