From 7e37609b1ebb5dea3444060ee6db9ae8853cfd4b Mon Sep 17 00:00:00 2001 From: Michael Friedrich Date: Mon, 15 Jun 2015 20:10:58 +0200 Subject: [PATCH] Documentation: Enhance cluster troubleshooting; add HA command_endpoint fixes #9419 fixes #9420 --- doc/10-icinga2-client.md | 6 +-- doc/12-distributed-monitoring-ha.md | 38 +++++++++++++++++-- doc/16-troubleshooting.md | 58 ++++++++++++++++++++++++++++- 3 files changed, 94 insertions(+), 8 deletions(-) diff --git a/doc/10-icinga2-client.md b/doc/10-icinga2-client.md index 89b075cac..0d0c22f89 100644 --- a/doc/10-icinga2-client.md +++ b/doc/10-icinga2-client.md @@ -155,7 +155,7 @@ graphical installer for Windows based client setup. Your client setup requires the following * A ready configured and installed [master node](10-icinga2-client.md#icinga2-client-installation-master-setup) -* SSL signed certificate for communication with the master (Use [CSR auto-signing](certifiates-csr-autosigning)). +* SSL signed certificate for communication with the master (Use [CSR auto-signing](10-icinga2-client.md#csr-autosigning-requirements)). * Enabled API feature, and a local Endpoint and Zone object configuration * Firewall ACLs for the communication port (default 5665) @@ -600,8 +600,8 @@ defined endpoint. The check result is then received asynchronously through the c vars.users_wgreater = 10 vars.users_cgreater = 20 - /* assign where a remote client is set */ - assign where host.vars.remote_client + /* assign where a remote client pattern is matched */ + assign where match("*-remote", host.name) } diff --git a/doc/12-distributed-monitoring-ha.md b/doc/12-distributed-monitoring-ha.md index fd5bc9489..91690ed09 100644 --- a/doc/12-distributed-monitoring-ha.md +++ b/doc/12-distributed-monitoring-ha.md @@ -391,12 +391,19 @@ master instances anymore. 
## Cluster Health Check -The Icinga 2 [ITL](7-icinga-template-library.md#icinga-template-library) ships an internal check command checking all configured -`EndPoints` in the cluster setup. The check result will become critical if -one or more configured nodes are not connected. +The Icinga 2 [ITL](7-icinga-template-library.md#icinga-template-library) provides +an internal check command checking all configured `EndPoints` in the cluster setup. +The check result will become critical if one or more configured nodes are not connected. Example: + object Host "icinga2a" { + display_name = "Health Checks on icinga2a" + + address = "192.168.33.10" + check_command = "hostalive" + } + object Service "cluster" { check_command = "cluster" check_interval = 5s @@ -423,6 +430,31 @@ Example for the `checker` zone checking the connection to the `master` zone: host_name = "icinga2b" } +## Cluster Health Check with Command Endpoints + +If you are planning to sync the zone configuration inside a [High-Availability](12-distributed-monitoring-ha.md#distributed-monitoring-high-availability) +cluster zone, you can also use the `command_endpoint` object attribute to +pin host/service checks to a specific endpoint inside the same zone. + +This requires the `accept_commands` setting inside the [ApiListener](12-distributed-monitoring-ha.md#configure-apilistener-object) +object set to `true` similar to the [remote client command execution bridge](10-icinga2-client.md#icinga2-client-configuration-command-bridge) +setup. + +Make sure to set `command_endpoint` to the correct endpoint instance. +The example below assumes that the endpoint name is the same as the +host name configured for health checks. If it differs, define a host +custom attribute providing [this information](10-icinga2-client.md#icinga2-client-configuration-command-bridge-master-config).
+ + apply Service "cluster-ha" { + check_command = "cluster" + check_interval = 5s + retry_interval = 1s + /* make sure host.name is the same as endpoint name */ + command_endpoint = host.name + + assign where regex("^icinga2[ab]", host.name) + } + ## Cluster Scenarios diff --git a/doc/16-troubleshooting.md b/doc/16-troubleshooting.md index 3a3682020..f72a50315 100644 --- a/doc/16-troubleshooting.md +++ b/doc/16-troubleshooting.md @@ -169,6 +169,11 @@ or modify these attributes in the current object. ## Cluster Troubleshooting +This applies to anything using the cluster protocol: + +* [Distributed and High-Availability](12-distributed-monitoring-ha.md#distributed-monitoring-high-availability) scenarios +* [Remote client](10-icinga2-client.md#icinga2-client-scenarios) scenarios + You should configure the [cluster health checks](12-distributed-monitoring-ha.md#cluster-health-check) if you haven't done so already. @@ -196,16 +201,50 @@ happens (default port is `5665`). ### Cluster Troubleshooting SSL Errors -If the cluster communication fails with cryptic SSL error messages, make sure to check +If the cluster communication fails with SSL error messages, make sure to check the following * File permissions on the SSL certificate files * Does the used CA match for all cluster endpoints?
+ * Verify that the `Issuer` is your trusted CA + * Verify that the `Subject` contains your endpoint's common name (CN) + * Check the validity of the certificate itself -Examples: +Steps: # ls -la /etc/icinga2/pki + # cd /etc/icinga2/pki/ + # openssl x509 -in icinga2a.crt -text + Certificate: + Data: + Version: 1 (0x0) + Serial Number: 2 (0x2) + Signature Algorithm: sha1WithRSAEncryption + Issuer: C=DE, ST=Bavaria, L=Nuremberg, O=NETWAYS GmbH, OU=Monitoring, CN=Icinga CA + Validity + Not Before: Jan 7 13:17:38 2014 GMT + Not After : Jan 5 13:17:38 2024 GMT + Subject: C=DE, ST=Bavaria, L=Nuremberg, O=NETWAYS GmbH, OU=Monitoring, CN=icinga2a + Subject Public Key Info: + Public Key Algorithm: rsaEncryption + Public-Key: (4096 bit) + Modulus: + ... + +Try to manually connect to the cluster node: + + # openssl s_client -connect 192.168.33.10:5665 + + +Unauthenticated nodes are able to connect, as required by the +[CSR auto-signing](10-icinga2-client.md#csr-autosigning-requirements) functionality. + + [2015-06-10 03:28:11 +0200] information/ApiListener: New client connection for identity 'icinga-client' (unauthenticated) + +If this message does not go away, make sure to verify the client's certificate and +its received `ca.crt` in `/etc/icinga2/pki`. + ### Cluster Troubleshooting Message Errors @@ -216,6 +255,21 @@ they remain in a Split-Brain-mode and history may differ. Although the Icinga 2 cluster protocol stores historical events in a replay log for later synchronisation, you should make sure to check why the network connection failed. +### Cluster Troubleshooting Command Endpoint Errors + +Command endpoints can be used for clients acting as [remote command execution bridge](10-icinga2-client.md#icinga2-client-configuration-command-bridge) +as well as inside a [High-Availability cluster](12-distributed-monitoring-ha.md#distributed-monitoring-high-availability). + +There is no cli command for manually executing the check, but you can verify +the following (e.g.
by invoking a forced check from the web interface): + +* `icinga2.log` contains connection and execution errors + * `CheckCommand` definition not found on the remote client + * Referenced check plugin not found on the remote client + * Runtime warnings and errors, e.g. unresolved runtime macros or configuration problems +* Specific error messages are also populated into `UNKNOWN` check results including a detailed error message in their output + + ### Cluster Troubleshooting Config Sync If the cluster zones do not sync their configuration, make sure to check the following: