Add systemd watchdog and adjust reload behaviour

This commit is contained in:
Jean Flach 2018-01-17 13:52:23 +01:00
parent 627fddf12b
commit c418a9611e
8 changed files with 69 additions and 45 deletions

View File

@ -38,6 +38,11 @@ option(ICINGA2_WITH_PERFDATA "Build the perfdata module" ON)
option(ICINGA2_WITH_STUDIO "Build the Icinga Studio application" OFF)
option(ICINGA2_WITH_TESTS "Run unit tests" ON)
option (USE_SYSTEMD
"Configure icinga as native systemd service instead of a SysV initscript" OFF)
set(HAVE_SYSTEMD ${USE_SYSTEMD})
file(STRINGS icinga2.spec VERSION_LINE REGEX "^Version: ")
string(REPLACE "Version: " "" ICINGA2_VERSION ${VERSION_LINE})
@ -155,6 +160,11 @@ if(UNIX OR CYGWIN)
list(APPEND base_OBJS $<TARGET_OBJECTS:execvpe>)
endif()
if(HAVE_SYSTEMD)
list(APPEND base_DEPS systemd)
endif()
if(EDITLINE_FOUND)
list(APPEND base_DEPS ${EDITLINE_LIBRARIES})
include_directories(${EDITLINE_INCLUDE_DIR})

View File

@ -9,6 +9,7 @@
#cmakedefine HAVE_CXXABI_H
#cmakedefine HAVE_NICE
#cmakedefine HAVE_EDITLINE
#cmakedefine HAVE_SYSTEMD
#cmakedefine ICINGA2_UNITY_BUILD

View File

@ -360,6 +360,14 @@ content:
StartLimitInterval=10
StartLimitBurst=3
Using the watchdog can also help with monitoring Icinga 2. To activate and use it, add the following to the override:
WatchdogSec=30s
This way Systemd will kill Icinga 2 if it does not notify for over 30 seconds. A timeout of less than 10 seconds is not
recommended. When the watchdog is activated, `Restart=` can be set to `watchdog` to restart Icinga 2 in the case of a
watchdog timeout.
Run `systemctl daemon-reload && systemctl restart icinga2` to apply the changes.
Now Systemd will always try to restart Icinga 2 (except if you run
`systemctl stop icinga2`). After three failures in ten seconds it will stop

View File

@ -34,9 +34,6 @@ if(NOT WIN32)
PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE
)
option (USE_SYSTEMD
"Configure icinga as native systemd service instead of a SysV initscript" OFF)
# required for packaging on Gentoo, see Bug #6498
option (INSTALL_SYSTEMD_SERVICE_AND_INITSCRIPT
"Force install both the systemd service definition file and the SysV initscript in parallel, regardless of how USE_SYSTEMD is set. Only use this for special packaging purposes and if you know what you are doing" OFF)

View File

@ -3,10 +3,10 @@ Description=Icinga host/service/network monitoring system
After=syslog.target network-online.target postgresql.service mariadb.service carbon-cache.service carbon-relay.service
[Service]
Type=forking
Type=notify
EnvironmentFile=@ICINGA2_SYSCONFIGFILE@
ExecStartPre=@CMAKE_INSTALL_PREFIX@/lib/icinga2/prepare-dirs @ICINGA2_SYSCONFIGFILE@
ExecStart=@CMAKE_INSTALL_FULL_SBINDIR@/icinga2 daemon -d -e ${ICINGA2_ERROR_LOG}
ExecStart=@CMAKE_INSTALL_FULL_SBINDIR@/icinga2 daemon -e ${ICINGA2_ERROR_LOG}
PIDFile=@ICINGA2_RUNDIR@/icinga2/icinga2.pid
ExecReload=@CMAKE_INSTALL_PREFIX@/lib/icinga2/safe-reload @ICINGA2_SYSCONFIGFILE@
TimeoutStartSec=30m

View File

@ -44,10 +44,12 @@
#ifdef __linux__
#include <sys/prctl.h>
#endif /* __linux__ */
#ifdef _WIN32
#include <windows.h>
#endif /* _win32 */
#endif /* _WIN32 */
#ifdef HAVE_SYSTEMD
#include <systemd/sd-daemon.h>
#endif /* HAVE_SYSTEMD */
using namespace icinga;
@ -315,6 +317,11 @@ void Application::SetArgV(char **argv)
*/
void Application::RunEventLoop()
{
#ifdef HAVE_SYSTEMD
sd_notify(0, "READY=1");
#endif /* HAVE_SYSTEMD */
double lastLoop = Utility::GetTime();
mainloop:
@ -331,6 +338,10 @@ mainloop:
double now = Utility::GetTime();
double timeDiff = lastLoop - now;
#ifdef HAVE_SYSTEMD
sd_notify(0, "WATCHDOG=1");
#endif /* HAVE_SYSTEMD */
if (std::fabs(timeDiff) > 15) {
/* We made a significant jump in time. */
Log(LogInformation, "Application")
@ -347,6 +358,10 @@ mainloop:
if (m_RequestRestart) {
m_RequestRestart = false; // we are now handling the request, once is enough
#ifdef HAVE_SYSTEMD
sd_notify(0, "RELOADING=1");
#endif /* HAVE_SYSTEMD */
// are we already restarting? ignore request if we already are
if (l_Restarting)
goto mainloop;
@ -357,6 +372,10 @@ mainloop:
goto mainloop;
}
#ifdef HAVE_SYSTEMD
sd_notify(0, "STOPPING=1");
#endif /* HAVE_SYSTEMD */
Log(LogInformation, "Application", "Shutting down...");
ConfigObject::StopObjects();
@ -712,6 +731,21 @@ void Application::SigUsr1Handler(int)
RequestReopenLogs();
}
/**
 * Signal handler for SIGUSR2. Logs the reload request, reports the value of
 * m_ReloadProcess to systemd as the new main PID (only when built with
 * HAVE_SYSTEMD) and then calls Exit(0) so that the new process can take over.
 *
 * NOTE(review): this is installed via sigaction() and therefore runs in signal
 * context; confirm that logging and sd_notifyf() are safe to call from here.
 *
 * @param - The signal number (unused).
 */
void Application::SigUsr2Handler(int)
{
	Log(LogInformation, "Application", "Reload requested, letting new process take over.");

#ifdef HAVE_SYSTEMD
	// Tell the service manager which PID to track from now on.
	const unsigned long reloadPid = static_cast<unsigned long>(m_ReloadProcess);
	sd_notifyf(0, "MAINPID=%lu", reloadPid);
#endif /* HAVE_SYSTEMD */

	Exit(0);
}
/**
* Signal handler for SIGABRT. Helps with debugging ASSERT()s.
*
@ -964,6 +998,9 @@ int Application::Run()
sa.sa_handler = &Application::SigUsr1Handler;
sigaction(SIGUSR1, &sa, nullptr);
sa.sa_handler = &Application::SigUsr2Handler;
sigaction(SIGUSR2, &sa, nullptr);
#else /* _WIN32 */
SetConsoleCtrlHandler(&Application::CtrlHandler, TRUE);
#endif /* _WIN32 */

View File

@ -205,6 +205,7 @@ private:
static void SigAbrtHandler(int signum);
static void SigUsr1Handler(int signum);
static void SigUsr2Handler(int signum);
static void ExceptionHandler();
static String GetCrashReportFilename();

View File

@ -136,39 +136,6 @@ static bool SetDaemonIO(const String& stderrFile)
return true;
}
/**
* Terminate another process and wait till it has ended.
*
* Sends SIGTERM to the target and then polls it with kill(target, 0) until the
* call fails with ESRCH or the timeout expires. If the process still seems to
* exist after the timeout, the value of Utility::GetPid() is written to the
* file at Application::GetPidPath() and the target is sent SIGKILL.
* On Windows this function currently does nothing.
*
* @param target PID of the process to end
*/
static void TerminateAndWaitForEnd(pid_t target)
{
#ifndef _WIN32
// allow 30 seconds timeout
double timeout = Utility::GetTime() + 30;
int ret = kill(target, SIGTERM);
// keep polling while the deadline has not passed and the last kill() either
// succeeded or failed with an error other than ESRCH
// NOTE(review): errno is read after Utility::Sleep()/Utility::GetTime() have run;
// confirm that neither of them can overwrite errno
while (Utility::GetTime() < timeout && (ret == 0 || errno != ESRCH)) {
// pause between polls (presumably 0.1 seconds -- confirm Utility::Sleep()'s unit)
Utility::Sleep(0.1);
// signal 0 delivers nothing; the call only checks whether the PID can be signalled
ret = kill(target, 0);
}
// timeout and the process still seems to live: update pid and kill it
if (ret == 0 || errno != ESRCH) {
String pidFile = Application::GetPidPath();
// overwrite the PID file with the value of Utility::GetPid()
// (presumably this process's own PID) before killing the target
std::ofstream fp(pidFile.CStr());
fp << Utility::GetPid();
fp.close();
kill(target, SIGKILL);
}
#else
// TODO: implement this for Win32
#endif /* _WIN32 */
}
String DaemonCommand::GetDescription() const
{
return "Starts Icinga 2.";
@ -251,11 +218,14 @@ int DaemonCommand::Run(const po::variables_map& vm, const std::vector<std::strin
}
if (vm.count("reload-internal")) {
int parentpid = vm["reload-internal"].as<int>();
Log(LogInformation, "cli")
<< "Terminating previous instance of Icinga (PID " << parentpid << ")";
TerminateAndWaitForEnd(parentpid);
Log(LogInformation, "cli", "Previous instance has ended, taking over now.");
/* We went through validation and now ask the old process kindly to die */
Log(LogInformation, "cli", "Requesting to take over.");
int rc = kill(vm["reload-internal"].as<int>(), SIGUSR2);
if (rc) {
Log(LogCritical, "Application")
<< "Failed to send signal to \"" << vm["reload-internal"].as<int>() << "\" with " << strerror(errno);
return EXIT_FAILURE;
}
}
if (vm.count("daemonize")) {