Add systemd watchdog and adjust reload behaviour

This commit is contained in:
Jean Flach 2018-01-17 13:52:23 +01:00
parent 627fddf12b
commit c418a9611e
8 changed files with 69 additions and 45 deletions

View File

@ -38,6 +38,11 @@ option(ICINGA2_WITH_PERFDATA "Build the perfdata module" ON)
option(ICINGA2_WITH_STUDIO "Build the Icinga Studio application" OFF) option(ICINGA2_WITH_STUDIO "Build the Icinga Studio application" OFF)
option(ICINGA2_WITH_TESTS "Run unit tests" ON) option(ICINGA2_WITH_TESTS "Run unit tests" ON)
option (USE_SYSTEMD
"Configure icinga as native systemd service instead of a SysV initscript" OFF)
set(HAVE_SYSTEMD ${USE_SYSTEMD})
file(STRINGS icinga2.spec VERSION_LINE REGEX "^Version: ") file(STRINGS icinga2.spec VERSION_LINE REGEX "^Version: ")
string(REPLACE "Version: " "" ICINGA2_VERSION ${VERSION_LINE}) string(REPLACE "Version: " "" ICINGA2_VERSION ${VERSION_LINE})
@ -155,6 +160,11 @@ if(UNIX OR CYGWIN)
list(APPEND base_OBJS $<TARGET_OBJECTS:execvpe>) list(APPEND base_OBJS $<TARGET_OBJECTS:execvpe>)
endif() endif()
if(HAVE_SYSTEMD)
list(APPEND base_DEPS systemd)
endif()
if(EDITLINE_FOUND) if(EDITLINE_FOUND)
list(APPEND base_DEPS ${EDITLINE_LIBRARIES}) list(APPEND base_DEPS ${EDITLINE_LIBRARIES})
include_directories(${EDITLINE_INCLUDE_DIR}) include_directories(${EDITLINE_INCLUDE_DIR})

View File

@ -9,6 +9,7 @@
#cmakedefine HAVE_CXXABI_H #cmakedefine HAVE_CXXABI_H
#cmakedefine HAVE_NICE #cmakedefine HAVE_NICE
#cmakedefine HAVE_EDITLINE #cmakedefine HAVE_EDITLINE
#cmakedefine HAVE_SYSTEMD
#cmakedefine ICINGA2_UNITY_BUILD #cmakedefine ICINGA2_UNITY_BUILD

View File

@ -360,6 +360,14 @@ content:
StartLimitInterval=10 StartLimitInterval=10
StartLimitBurst=3 StartLimitBurst=3
Using the watchdog can also help with monitoring Icinga 2. To activate and use it, add the following to the override:
WatchdogSec=30s
This way Systemd will kill Icinga 2 if it does not notify for over 30 seconds. A timeout of less than 10 seconds is not
recommended. When the watchdog is activated, `Restart=` can be set to `watchdog` to restart Icinga 2 in the case of a
watchdog timeout.
Run `systemctl daemon-reload && systemctl restart icinga2` to apply the changes. Run `systemctl daemon-reload && systemctl restart icinga2` to apply the changes.
Now Systemd will always try to restart Icinga 2 (except if you run Now Systemd will always try to restart Icinga 2 (except if you run
`systemctl stop icinga2`). After three failures in ten seconds it will stop `systemctl stop icinga2`). After three failures in ten seconds it will stop

View File

@ -34,9 +34,6 @@ if(NOT WIN32)
PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE
) )
option (USE_SYSTEMD
"Configure icinga as native systemd service instead of a SysV initscript" OFF)
# required for packaging on Gentoo, see Bug #6498 # required for packaging on Gentoo, see Bug #6498
option (INSTALL_SYSTEMD_SERVICE_AND_INITSCRIPT option (INSTALL_SYSTEMD_SERVICE_AND_INITSCRIPT
"Force install both the systemd service definition file and the SysV initscript in parallel, regardless of how USE_SYSTEMD is set. Only use this for special packaging purposes and if you know what you are doing" OFF) "Force install both the systemd service definition file and the SysV initscript in parallel, regardless of how USE_SYSTEMD is set. Only use this for special packaging purposes and if you know what you are doing" OFF)

View File

@ -3,10 +3,10 @@ Description=Icinga host/service/network monitoring system
After=syslog.target network-online.target postgresql.service mariadb.service carbon-cache.service carbon-relay.service After=syslog.target network-online.target postgresql.service mariadb.service carbon-cache.service carbon-relay.service
[Service] [Service]
Type=forking Type=notify
EnvironmentFile=@ICINGA2_SYSCONFIGFILE@ EnvironmentFile=@ICINGA2_SYSCONFIGFILE@
ExecStartPre=@CMAKE_INSTALL_PREFIX@/lib/icinga2/prepare-dirs @ICINGA2_SYSCONFIGFILE@ ExecStartPre=@CMAKE_INSTALL_PREFIX@/lib/icinga2/prepare-dirs @ICINGA2_SYSCONFIGFILE@
ExecStart=@CMAKE_INSTALL_FULL_SBINDIR@/icinga2 daemon -d -e ${ICINGA2_ERROR_LOG} ExecStart=@CMAKE_INSTALL_FULL_SBINDIR@/icinga2 daemon -e ${ICINGA2_ERROR_LOG}
PIDFile=@ICINGA2_RUNDIR@/icinga2/icinga2.pid PIDFile=@ICINGA2_RUNDIR@/icinga2/icinga2.pid
ExecReload=@CMAKE_INSTALL_PREFIX@/lib/icinga2/safe-reload @ICINGA2_SYSCONFIGFILE@ ExecReload=@CMAKE_INSTALL_PREFIX@/lib/icinga2/safe-reload @ICINGA2_SYSCONFIGFILE@
TimeoutStartSec=30m TimeoutStartSec=30m

View File

@ -44,10 +44,12 @@
#ifdef __linux__ #ifdef __linux__
#include <sys/prctl.h> #include <sys/prctl.h>
#endif /* __linux__ */ #endif /* __linux__ */
#ifdef _WIN32 #ifdef _WIN32
#include <windows.h> #include <windows.h>
#endif /* _win32 */ #endif /* _WIN32 */
#ifdef HAVE_SYSTEMD
#include <systemd/sd-daemon.h>
#endif /* HAVE_SYSTEMD */
using namespace icinga; using namespace icinga;
@ -315,6 +317,11 @@ void Application::SetArgV(char **argv)
*/ */
void Application::RunEventLoop() void Application::RunEventLoop()
{ {
#ifdef HAVE_SYSTEMD
sd_notify(0, "READY=1");
#endif /* HAVE_SYSTEMD */
double lastLoop = Utility::GetTime(); double lastLoop = Utility::GetTime();
mainloop: mainloop:
@ -331,6 +338,10 @@ mainloop:
double now = Utility::GetTime(); double now = Utility::GetTime();
double timeDiff = lastLoop - now; double timeDiff = lastLoop - now;
#ifdef HAVE_SYSTEMD
sd_notify(0, "WATCHDOG=1");
#endif /* HAVE_SYSTEMD */
if (std::fabs(timeDiff) > 15) { if (std::fabs(timeDiff) > 15) {
/* We made a significant jump in time. */ /* We made a significant jump in time. */
Log(LogInformation, "Application") Log(LogInformation, "Application")
@ -347,6 +358,10 @@ mainloop:
if (m_RequestRestart) { if (m_RequestRestart) {
m_RequestRestart = false; // we are now handling the request, once is enough m_RequestRestart = false; // we are now handling the request, once is enough
#ifdef HAVE_SYSTEMD
sd_notify(0, "RELOADING=1");
#endif /* HAVE_SYSTEMD */
// are we already restarting? ignore request if we already are // are we already restarting? ignore request if we already are
if (l_Restarting) if (l_Restarting)
goto mainloop; goto mainloop;
@ -357,6 +372,10 @@ mainloop:
goto mainloop; goto mainloop;
} }
#ifdef HAVE_SYSTEMD
sd_notify(0, "STOPPING=1");
#endif /* HAVE_SYSTEMD */
Log(LogInformation, "Application", "Shutting down..."); Log(LogInformation, "Application", "Shutting down...");
ConfigObject::StopObjects(); ConfigObject::StopObjects();
@ -712,6 +731,21 @@ void Application::SigUsr1Handler(int)
RequestReopenLogs(); RequestReopenLogs();
} }
/**
* Signal handler for SIGUSR2. Logs that a reload was requested, reports the
* PID stored in m_ReloadProcess to systemd as the new main PID (only when
* built with HAVE_SYSTEMD) and then ends this process via Exit(0).
*
* NOTE(review): Log() and sd_notifyf() are called from signal-handler
* context; confirm that this is acceptable for the logging backend in use.
*
* @param - The signal number.
*/
void Application::SigUsr2Handler(int)
{
Log(LogInformation, "Application", "Reload requested, letting new process take over.");
#ifdef HAVE_SYSTEMD
/* Tell the service manager which PID to track as the main process from now on;
* this has to happen before this process exits. */
sd_notifyf(0, "MAINPID=%lu", (unsigned long) m_ReloadProcess);
#endif /* HAVE_SYSTEMD */
/* Leave with status 0 so the exit is not reported as a failure. */
Exit(0);
}
/** /**
* Signal handler for SIGABRT. Helps with debugging ASSERT()s. * Signal handler for SIGABRT. Helps with debugging ASSERT()s.
* *
@ -964,6 +998,9 @@ int Application::Run()
sa.sa_handler = &Application::SigUsr1Handler; sa.sa_handler = &Application::SigUsr1Handler;
sigaction(SIGUSR1, &sa, nullptr); sigaction(SIGUSR1, &sa, nullptr);
sa.sa_handler = &Application::SigUsr2Handler;
sigaction(SIGUSR2, &sa, nullptr);
#else /* _WIN32 */ #else /* _WIN32 */
SetConsoleCtrlHandler(&Application::CtrlHandler, TRUE); SetConsoleCtrlHandler(&Application::CtrlHandler, TRUE);
#endif /* _WIN32 */ #endif /* _WIN32 */

View File

@ -205,6 +205,7 @@ private:
static void SigAbrtHandler(int signum); static void SigAbrtHandler(int signum);
static void SigUsr1Handler(int signum); static void SigUsr1Handler(int signum);
static void SigUsr2Handler(int signum);
static void ExceptionHandler(); static void ExceptionHandler();
static String GetCrashReportFilename(); static String GetCrashReportFilename();

View File

@ -136,39 +136,6 @@ static bool SetDaemonIO(const String& stderrFile)
return true; return true;
} }
/**
* Terminate another process and wait till it has ended.
*
* Sends SIGTERM to the target and then polls it with kill(target, 0) until
* the target is reported as gone (ESRCH) or the deadline computed from
* Utility::GetTime() + 30 has passed. If the target still appears to be
* alive afterwards, the PID file at Application::GetPidPath() is overwritten
* with this process' PID and the target is sent SIGKILL. On Windows the
* function currently does nothing.
*
* @param target PID of the process to end
*/
static void TerminateAndWaitForEnd(pid_t target)
{
#ifndef _WIN32
// allow 30 seconds timeout (assumes Utility::GetTime() returns seconds - TODO confirm)
double timeout = Utility::GetTime() + 30;
// ask the target to shut down
int ret = kill(target, SIGTERM);
// keep polling while the target can still be signalled (ret == 0) or
// kill() failed for a reason other than "no such process"
while (Utility::GetTime() < timeout && (ret == 0 || errno != ESRCH)) {
Utility::Sleep(0.1);
// signal 0 delivers nothing; it only checks whether the target still exists
ret = kill(target, 0);
}
// timeout and the process still seems to live: update pid and kill it
if (ret == 0 || errno != ESRCH) {
// write our own PID into the PID file before force-killing the old process
String pidFile = Application::GetPidPath();
std::ofstream fp(pidFile.CStr());
fp << Utility::GetPid();
fp.close();
kill(target, SIGKILL);
}
#else
// TODO: implement this for Win32
#endif /* _WIN32 */
}
String DaemonCommand::GetDescription() const String DaemonCommand::GetDescription() const
{ {
return "Starts Icinga 2."; return "Starts Icinga 2.";
@ -251,11 +218,14 @@ int DaemonCommand::Run(const po::variables_map& vm, const std::vector<std::strin
} }
if (vm.count("reload-internal")) { if (vm.count("reload-internal")) {
int parentpid = vm["reload-internal"].as<int>(); /* We went through validation and now ask the old process kindly to die */
Log(LogInformation, "cli") Log(LogInformation, "cli", "Requesting to take over.");
<< "Terminating previous instance of Icinga (PID " << parentpid << ")"; int rc = kill(vm["reload-internal"].as<int>(), SIGUSR2);
TerminateAndWaitForEnd(parentpid); if (rc) {
Log(LogInformation, "cli", "Previous instance has ended, taking over now."); Log(LogCritical, "Application")
<< "Failed to send signal to \"" << vm["reload-internal"].as<int>() << "\" with " << strerror(errno);
return EXIT_FAILURE;
}
} }
if (vm.count("daemonize")) { if (vm.count("daemonize")) {