From c418a9611e82dd6694f09b46a88ae478fab3c161 Mon Sep 17 00:00:00 2001 From: Jean Flach Date: Wed, 17 Jan 2018 13:52:23 +0100 Subject: [PATCH] Add systemd watchdog and adjust reload behaviour --- CMakeLists.txt | 10 ++++++ config.h.cmake | 1 + doc/02-getting-started.md | 8 +++++ etc/initsystem/CMakeLists.txt | 3 -- etc/initsystem/icinga2.service.cmake | 4 +-- lib/base/application.cpp | 41 +++++++++++++++++++++++-- lib/base/application.hpp | 1 + lib/cli/daemoncommand.cpp | 46 +++++----------------------- 8 files changed, 69 insertions(+), 45 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9f06d6095..2fdf380c9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -38,6 +38,11 @@ option(ICINGA2_WITH_PERFDATA "Build the perfdata module" ON) option(ICINGA2_WITH_STUDIO "Build the Icinga Studio application" OFF) option(ICINGA2_WITH_TESTS "Run unit tests" ON) +option (USE_SYSTEMD + "Configure icinga as native systemd service instead of a SysV initscript" OFF) + +set(HAVE_SYSTEMD ${USE_SYSTEMD}) + file(STRINGS icinga2.spec VERSION_LINE REGEX "^Version: ") string(REPLACE "Version: " "" ICINGA2_VERSION ${VERSION_LINE}) @@ -155,6 +160,11 @@ if(UNIX OR CYGWIN) list(APPEND base_OBJS $) endif() +if(HAVE_SYSTEMD) + list(APPEND base_DEPS systemd) +endif() + + if(EDITLINE_FOUND) list(APPEND base_DEPS ${EDITLINE_LIBRARIES}) include_directories(${EDITLINE_INCLUDE_DIR}) diff --git a/config.h.cmake b/config.h.cmake index 1903b287c..e8a7bdcba 100644 --- a/config.h.cmake +++ b/config.h.cmake @@ -9,6 +9,7 @@ #cmakedefine HAVE_CXXABI_H #cmakedefine HAVE_NICE #cmakedefine HAVE_EDITLINE +#cmakedefine HAVE_SYSTEMD #cmakedefine ICINGA2_UNITY_BUILD diff --git a/doc/02-getting-started.md b/doc/02-getting-started.md index b9000a398..8177c6dcf 100644 --- a/doc/02-getting-started.md +++ b/doc/02-getting-started.md @@ -360,6 +360,14 @@ content: StartLimitInterval=10 StartLimitBurst=3 +Using the watchdog can also help with monitoring Icinga 2, to activate and use it add the following to 
the override: + + WatchdogSec=30s + +This way Systemd will kill Icinga 2 if it does not notify for over 30 seconds, a timeout of less than 10 seconds is not +recommended. When the watchdog is activated, `Restart=` can be set to `watchdog` to restart Icinga 2 in the case of a +watchdog timeout. + Run `systemctl daemon-reload && systemctl restart icinga2` to apply the changes. Now Systemd will always try to restart Icinga 2 (except if you run `systemctl stop icinga2`). After three failures in ten seconds it will stop diff --git a/etc/initsystem/CMakeLists.txt b/etc/initsystem/CMakeLists.txt index b2dd63558..e51bb3ef5 100644 --- a/etc/initsystem/CMakeLists.txt +++ b/etc/initsystem/CMakeLists.txt @@ -34,9 +34,6 @@ if(NOT WIN32) PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE ) - option (USE_SYSTEMD - "Configure icinga as native systemd service instead of a SysV initscript" OFF) - # required for packaging on Gentoo, see Bug #6498 option (INSTALL_SYSTEMD_SERVICE_AND_INITSCRIPT "Force install both the systemd service definition file and the SysV initscript in parallel, regardless of how USE_SYSTEMD is set. 
Only use this for special packaging purposes and if you know what you are doing" OFF) diff --git a/etc/initsystem/icinga2.service.cmake b/etc/initsystem/icinga2.service.cmake index 895db7149..771c98f7b 100644 --- a/etc/initsystem/icinga2.service.cmake +++ b/etc/initsystem/icinga2.service.cmake @@ -3,10 +3,10 @@ Description=Icinga host/service/network monitoring system After=syslog.target network-online.target postgresql.service mariadb.service carbon-cache.service carbon-relay.service [Service] -Type=forking +Type=notify EnvironmentFile=@ICINGA2_SYSCONFIGFILE@ ExecStartPre=@CMAKE_INSTALL_PREFIX@/lib/icinga2/prepare-dirs @ICINGA2_SYSCONFIGFILE@ -ExecStart=@CMAKE_INSTALL_FULL_SBINDIR@/icinga2 daemon -d -e ${ICINGA2_ERROR_LOG} +ExecStart=@CMAKE_INSTALL_FULL_SBINDIR@/icinga2 daemon -e ${ICINGA2_ERROR_LOG} PIDFile=@ICINGA2_RUNDIR@/icinga2/icinga2.pid ExecReload=@CMAKE_INSTALL_PREFIX@/lib/icinga2/safe-reload @ICINGA2_SYSCONFIGFILE@ TimeoutStartSec=30m diff --git a/lib/base/application.cpp b/lib/base/application.cpp index a2b7918dc..b06709d3e 100644 --- a/lib/base/application.cpp +++ b/lib/base/application.cpp @@ -44,10 +44,12 @@ #ifdef __linux__ #include #endif /* __linux__ */ - #ifdef _WIN32 #include -#endif /* _win32 */ +#endif /* _WIN32 */ +#ifdef HAVE_SYSTEMD +#include +#endif /* HAVE_SYSTEMD */ using namespace icinga; @@ -315,6 +317,11 @@ void Application::SetArgV(char **argv) */ void Application::RunEventLoop() { + +#ifdef HAVE_SYSTEMD + sd_notify(0, "READY=1"); +#endif /* HAVE_SYSTEMD */ + double lastLoop = Utility::GetTime(); mainloop: @@ -331,6 +338,10 @@ mainloop: double now = Utility::GetTime(); double timeDiff = lastLoop - now; +#ifdef HAVE_SYSTEMD + sd_notify(0, "WATCHDOG=1"); +#endif /* HAVE_SYSTEMD */ + if (std::fabs(timeDiff) > 15) { /* We made a significant jump in time. 
*/ Log(LogInformation, "Application") @@ -347,6 +358,10 @@ mainloop: if (m_RequestRestart) { m_RequestRestart = false; // we are now handling the request, once is enough +#ifdef HAVE_SYSTEMD + sd_notify(0, "RELOADING=1"); +#endif /* HAVE_SYSTEMD */ + // are we already restarting? ignore request if we already are if (l_Restarting) goto mainloop; @@ -357,6 +372,10 @@ mainloop: goto mainloop; } +#ifdef HAVE_SYSTEMD + sd_notify(0, "STOPPING=1"); +#endif /* HAVE_SYSTEMD */ + Log(LogInformation, "Application", "Shutting down..."); ConfigObject::StopObjects(); @@ -712,6 +731,21 @@ void Application::SigUsr1Handler(int) RequestReopenLogs(); } +/** + * Signal handler for SIGUSR2. Hands over PID to child and commits suicide + * + * @param - The signal number. + */ +void Application::SigUsr2Handler(int) +{ + Log(LogInformation, "Application", "Reload requested, letting new process take over."); +#ifdef HAVE_SYSTEMD + sd_notifyf(0, "MAINPID=%lu", (unsigned long) m_ReloadProcess); +#endif /* HAVE_SYSTEMD */ + + Exit(0); +} + /** * Signal handler for SIGABRT. Helps with debugging ASSERT()s. 
* @@ -964,6 +998,9 @@ int Application::Run() sa.sa_handler = &Application::SigUsr1Handler; sigaction(SIGUSR1, &sa, nullptr); + + sa.sa_handler = &Application::SigUsr2Handler; + sigaction(SIGUSR2, &sa, nullptr); #else /* _WIN32 */ SetConsoleCtrlHandler(&Application::CtrlHandler, TRUE); #endif /* _WIN32 */ diff --git a/lib/base/application.hpp b/lib/base/application.hpp index 1a16f7c9d..0ff6ce383 100644 --- a/lib/base/application.hpp +++ b/lib/base/application.hpp @@ -205,6 +205,7 @@ private: static void SigAbrtHandler(int signum); static void SigUsr1Handler(int signum); + static void SigUsr2Handler(int signum); static void ExceptionHandler(); static String GetCrashReportFilename(); diff --git a/lib/cli/daemoncommand.cpp b/lib/cli/daemoncommand.cpp index 859febdc3..56582ec64 100644 --- a/lib/cli/daemoncommand.cpp +++ b/lib/cli/daemoncommand.cpp @@ -136,39 +136,6 @@ static bool SetDaemonIO(const String& stderrFile) return true; } -/** - * Terminate another process and wait till it has ended - * - * @params target PID of the process to end - */ -static void TerminateAndWaitForEnd(pid_t target) -{ -#ifndef _WIN32 - // allow 30 seconds timeout - double timeout = Utility::GetTime() + 30; - - int ret = kill(target, SIGTERM); - - while (Utility::GetTime() < timeout && (ret == 0 || errno != ESRCH)) { - Utility::Sleep(0.1); - ret = kill(target, 0); - } - - // timeout and the process still seems to live: update pid and kill it - if (ret == 0 || errno != ESRCH) { - String pidFile = Application::GetPidPath(); - std::ofstream fp(pidFile.CStr()); - fp << Utility::GetPid(); - fp.close(); - - kill(target, SIGKILL); - } - -#else - // TODO: implement this for Win32 -#endif /* _WIN32 */ -} - String DaemonCommand::GetDescription() const { return "Starts Icinga 2."; @@ -251,11 +218,14 @@ int DaemonCommand::Run(const po::variables_map& vm, const std::vector(); - Log(LogInformation, "cli") - << "Terminating previous instance of Icinga (PID " << parentpid << ")"; - 
TerminateAndWaitForEnd(parentpid); - Log(LogInformation, "cli", "Previous instance has ended, taking over now."); + /* We went through validation and now ask the old process kindly to die */ + Log(LogInformation, "cli", "Requesting to take over."); + int rc = kill(vm["reload-internal"].as(), SIGUSR2); + if (rc) { + Log(LogCritical, "Application") + << "Failed to send signal to \"" << vm["reload-internal"].as() << "\" with " << strerror(errno); + return EXIT_FAILURE; + } } if (vm.count("daemonize")) {