mirror of https://github.com/Icinga/icinga2.git
Add systemd watchdog and adjust reload behaviour
This commit is contained in:
parent
627fddf12b
commit
c418a9611e
|
@ -38,6 +38,11 @@ option(ICINGA2_WITH_PERFDATA "Build the perfdata module" ON)
|
|||
option(ICINGA2_WITH_STUDIO "Build the Icinga Studio application" OFF)
|
||||
option(ICINGA2_WITH_TESTS "Run unit tests" ON)
|
||||
|
||||
option (USE_SYSTEMD
|
||||
"Configure icinga as native systemd service instead of a SysV initscript" OFF)
|
||||
|
||||
set(HAVE_SYSTEMD ${USE_SYSTEMD})
|
||||
|
||||
file(STRINGS icinga2.spec VERSION_LINE REGEX "^Version: ")
|
||||
string(REPLACE "Version: " "" ICINGA2_VERSION ${VERSION_LINE})
|
||||
|
||||
|
@ -155,6 +160,11 @@ if(UNIX OR CYGWIN)
|
|||
list(APPEND base_OBJS $<TARGET_OBJECTS:execvpe>)
|
||||
endif()
|
||||
|
||||
if(HAVE_SYSTEMD)
|
||||
list(APPEND base_DEPS systemd)
|
||||
endif()
|
||||
|
||||
|
||||
if(EDITLINE_FOUND)
|
||||
list(APPEND base_DEPS ${EDITLINE_LIBRARIES})
|
||||
include_directories(${EDITLINE_INCLUDE_DIR})
|
||||
|
|
|
@ -9,6 +9,7 @@
|
|||
#cmakedefine HAVE_CXXABI_H
|
||||
#cmakedefine HAVE_NICE
|
||||
#cmakedefine HAVE_EDITLINE
|
||||
#cmakedefine HAVE_SYSTEMD
|
||||
|
||||
#cmakedefine ICINGA2_UNITY_BUILD
|
||||
|
||||
|
|
|
@ -360,6 +360,14 @@ content:
|
|||
StartLimitInterval=10
|
||||
StartLimitBurst=3
|
||||
|
||||
Using the watchdog can also help with monitoring Icinga 2. To activate and use it, add the following to the override:
|
||||
|
||||
WatchdogSec=30s
|
||||
|
||||
This way Systemd will kill Icinga 2 if it does not notify for over 30 seconds. A timeout of less than 10 seconds is not
|
||||
recommended. When the watchdog is activated, `Restart=` can be set to `watchdog` to restart Icinga 2 in the case of a
|
||||
watchdog timeout.
|
||||
|
||||
Run `systemctl daemon-reload && systemctl restart icinga2` to apply the changes.
|
||||
Now Systemd will always try to restart Icinga 2 (except if you run
|
||||
`systemctl stop icinga2`). After three failures in ten seconds it will stop
|
||||
|
|
|
@ -34,9 +34,6 @@ if(NOT WIN32)
|
|||
PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE
|
||||
)
|
||||
|
||||
option (USE_SYSTEMD
|
||||
"Configure icinga as native systemd service instead of a SysV initscript" OFF)
|
||||
|
||||
# required for packaging on Gentoo, see Bug #6498
|
||||
option (INSTALL_SYSTEMD_SERVICE_AND_INITSCRIPT
|
||||
"Force install both the systemd service definition file and the SysV initscript in parallel, regardless of how USE_SYSTEMD is set. Only use this for special packaging purposes and if you know what you are doing" OFF)
|
||||
|
|
|
@ -3,10 +3,10 @@ Description=Icinga host/service/network monitoring system
|
|||
After=syslog.target network-online.target postgresql.service mariadb.service carbon-cache.service carbon-relay.service
|
||||
|
||||
[Service]
|
||||
Type=forking
|
||||
Type=notify
|
||||
EnvironmentFile=@ICINGA2_SYSCONFIGFILE@
|
||||
ExecStartPre=@CMAKE_INSTALL_PREFIX@/lib/icinga2/prepare-dirs @ICINGA2_SYSCONFIGFILE@
|
||||
ExecStart=@CMAKE_INSTALL_FULL_SBINDIR@/icinga2 daemon -d -e ${ICINGA2_ERROR_LOG}
|
||||
ExecStart=@CMAKE_INSTALL_FULL_SBINDIR@/icinga2 daemon -e ${ICINGA2_ERROR_LOG}
|
||||
PIDFile=@ICINGA2_RUNDIR@/icinga2/icinga2.pid
|
||||
ExecReload=@CMAKE_INSTALL_PREFIX@/lib/icinga2/safe-reload @ICINGA2_SYSCONFIGFILE@
|
||||
TimeoutStartSec=30m
|
||||
|
|
|
@ -44,10 +44,12 @@
|
|||
#ifdef __linux__
|
||||
#include <sys/prctl.h>
|
||||
#endif /* __linux__ */
|
||||
|
||||
#ifdef _WIN32
|
||||
#include <windows.h>
|
||||
#endif /* _win32 */
|
||||
#endif /* _WIN32 */
|
||||
#ifdef HAVE_SYSTEMD
|
||||
#include <systemd/sd-daemon.h>
|
||||
#endif /* HAVE_SYSTEMD */
|
||||
|
||||
using namespace icinga;
|
||||
|
||||
|
@ -315,6 +317,11 @@ void Application::SetArgV(char **argv)
|
|||
*/
|
||||
void Application::RunEventLoop()
|
||||
{
|
||||
|
||||
#ifdef HAVE_SYSTEMD
|
||||
sd_notify(0, "READY=1");
|
||||
#endif /* HAVE_SYSTEMD */
|
||||
|
||||
double lastLoop = Utility::GetTime();
|
||||
|
||||
mainloop:
|
||||
|
@ -331,6 +338,10 @@ mainloop:
|
|||
double now = Utility::GetTime();
|
||||
double timeDiff = lastLoop - now;
|
||||
|
||||
#ifdef HAVE_SYSTEMD
|
||||
sd_notify(0, "WATCHDOG=1");
|
||||
#endif /* HAVE_SYSTEMD */
|
||||
|
||||
if (std::fabs(timeDiff) > 15) {
|
||||
/* We made a significant jump in time. */
|
||||
Log(LogInformation, "Application")
|
||||
|
@ -347,6 +358,10 @@ mainloop:
|
|||
if (m_RequestRestart) {
|
||||
m_RequestRestart = false; // we are now handling the request, once is enough
|
||||
|
||||
#ifdef HAVE_SYSTEMD
|
||||
sd_notify(0, "RELOADING=1");
|
||||
#endif /* HAVE_SYSTEMD */
|
||||
|
||||
// are we already restarting? ignore request if we already are
|
||||
if (l_Restarting)
|
||||
goto mainloop;
|
||||
|
@ -357,6 +372,10 @@ mainloop:
|
|||
goto mainloop;
|
||||
}
|
||||
|
||||
#ifdef HAVE_SYSTEMD
|
||||
sd_notify(0, "STOPPING=1");
|
||||
#endif /* HAVE_SYSTEMD */
|
||||
|
||||
Log(LogInformation, "Application", "Shutting down...");
|
||||
|
||||
ConfigObject::StopObjects();
|
||||
|
@ -712,6 +731,21 @@ void Application::SigUsr1Handler(int)
|
|||
RequestReopenLogs();
|
||||
}
|
||||
|
||||
/**
|
||||
* Signal handler for SIGUSR2. Hands the main PID over to the reload process and then exits.
|
||||
*
|
||||
* @param - The signal number.
|
||||
*/
|
||||
void Application::SigUsr2Handler(int)
|
||||
{
|
||||
Log(LogInformation, "Application", "Reload requested, letting new process take over.");
|
||||
#ifdef HAVE_SYSTEMD
|
||||
sd_notifyf(0, "MAINPID=%lu", (unsigned long) m_ReloadProcess);
|
||||
#endif /* HAVE_SYSTEMD */
|
||||
|
||||
Exit(0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Signal handler for SIGABRT. Helps with debugging ASSERT()s.
|
||||
*
|
||||
|
@ -964,6 +998,9 @@ int Application::Run()
|
|||
|
||||
sa.sa_handler = &Application::SigUsr1Handler;
|
||||
sigaction(SIGUSR1, &sa, nullptr);
|
||||
|
||||
sa.sa_handler = &Application::SigUsr2Handler;
|
||||
sigaction(SIGUSR2, &sa, nullptr);
|
||||
#else /* _WIN32 */
|
||||
SetConsoleCtrlHandler(&Application::CtrlHandler, TRUE);
|
||||
#endif /* _WIN32 */
|
||||
|
|
|
@ -205,6 +205,7 @@ private:
|
|||
|
||||
static void SigAbrtHandler(int signum);
|
||||
static void SigUsr1Handler(int signum);
|
||||
static void SigUsr2Handler(int signum);
|
||||
static void ExceptionHandler();
|
||||
|
||||
static String GetCrashReportFilename();
|
||||
|
|
|
@ -136,39 +136,6 @@ static bool SetDaemonIO(const String& stderrFile)
|
|||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Terminate another process and wait till it has ended
|
||||
*
|
||||
* @param target PID of the process to end
|
||||
*/
|
||||
static void TerminateAndWaitForEnd(pid_t target)
|
||||
{
|
||||
#ifndef _WIN32
|
||||
// allow 30 seconds timeout
|
||||
double timeout = Utility::GetTime() + 30;
|
||||
|
||||
int ret = kill(target, SIGTERM);
|
||||
|
||||
while (Utility::GetTime() < timeout && (ret == 0 || errno != ESRCH)) {
|
||||
Utility::Sleep(0.1);
|
||||
ret = kill(target, 0);
|
||||
}
|
||||
|
||||
// timeout and the process still seems to live: update pid and kill it
|
||||
if (ret == 0 || errno != ESRCH) {
|
||||
String pidFile = Application::GetPidPath();
|
||||
std::ofstream fp(pidFile.CStr());
|
||||
fp << Utility::GetPid();
|
||||
fp.close();
|
||||
|
||||
kill(target, SIGKILL);
|
||||
}
|
||||
|
||||
#else
|
||||
// TODO: implement this for Win32
|
||||
#endif /* _WIN32 */
|
||||
}
|
||||
|
||||
String DaemonCommand::GetDescription() const
|
||||
{
|
||||
return "Starts Icinga 2.";
|
||||
|
@ -251,11 +218,14 @@ int DaemonCommand::Run(const po::variables_map& vm, const std::vector<std::strin
|
|||
}
|
||||
|
||||
if (vm.count("reload-internal")) {
|
||||
int parentpid = vm["reload-internal"].as<int>();
|
||||
Log(LogInformation, "cli")
|
||||
<< "Terminating previous instance of Icinga (PID " << parentpid << ")";
|
||||
TerminateAndWaitForEnd(parentpid);
|
||||
Log(LogInformation, "cli", "Previous instance has ended, taking over now.");
|
||||
/* We went through validation and now ask the old process kindly to die */
|
||||
Log(LogInformation, "cli", "Requesting to take over.");
|
||||
int rc = kill(vm["reload-internal"].as<int>(), SIGUSR2);
|
||||
if (rc) {
|
||||
Log(LogCritical, "Application")
|
||||
<< "Failed to send signal to \"" << vm["reload-internal"].as<int>() << "\" with " << strerror(errno);
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
}
|
||||
|
||||
if (vm.count("daemonize")) {
|
||||
|
|
Loading…
Reference in New Issue