diff --git a/doc/16-upgrading-icinga-2.md b/doc/16-upgrading-icinga-2.md index 3f5ba1017..ada066a4b 100644 --- a/doc/16-upgrading-icinga-2.md +++ b/doc/16-upgrading-icinga-2.md @@ -55,6 +55,17 @@ and [benchmarks](https://github.com/miloyip/nativejson-benchmark#parsing-time). ### Core +#### Reload Handling + +2.11 provides fixes for unwanted notifications during restarts. +The updated systemd service file now uses the `KillMode=mixed` setting. + +The reload handling was improved with an umbrella process, which means +that normal runtime operations include **3 processes**. You may need to +adjust the local instance monitoring of the [procs](08-advanced-topics.md#monitoring-icinga) check. + +More details can be found in the [technical concepts](19-technical-concepts.md#technical-concepts-core-reload) chapter. + #### Downtime Notifications Imagine that a host/service changes to a HARD NOT-OK state, diff --git a/doc/19-technical-concepts.md b/doc/19-technical-concepts.md index 187d3a5c4..dbf975e3f 100644 --- a/doc/19-technical-concepts.md +++ b/doc/19-technical-concepts.md @@ -176,6 +176,49 @@ The following signals are triggered in the stages: * [Flex](https://github.com/westes/flex) * [GNU Bison](https://www.gnu.org/software/bison/) +## Core + +### Core: Reload Handling + +The initial design of the reload state machine looks like this: + +* receive reload signal SIGHUP +* fork a child process, start configuration validation in parallel work queues +* parent process continues with old configuration objects and the event scheduling +(doing checks, replicating cluster events, triggering alert notifications, etc.) 
+* validation NOT ok: child process terminates, parent process continues with old configuration state +* validation ok: child process signals parent process to terminate and save its current state (all events until now) into the icinga2 state file +* parent process shuts down writing icinga2.state file +* child process waits until the parent process is gone, reads the icinga2 state file and synchronizes all historical and status data +* child becomes the new session leader + +Since Icinga 2.6, there are two processes when checked with `ps aux | grep icinga2` or `pidof icinga2`. +This was to ensure that feature file descriptors don't leak into the plugin process (e.g. DB IDO MySQL sockets). + +Icinga 2.9 changed the reload handling a bit with SIGUSR2 signals +and systemd notifications. + +With systemd, it could occur that the tree was broken thus resulting +in killing all remaining processes on stop, instead of a clean exit. +You can read the full story [here](https://github.com/Icinga/icinga2/issues/7309). + +With 2.11 you'll now see 3 processes: + +- The umbrella process which takes care of signal handling and process spawning/stopping +- The main process with the check scheduler, notifications, etc. +- The execution helper process + +During reload, the umbrella process spawns a new reload process which validates the configuration. +Once successful, the new reload process signals the umbrella process that it is finished. +The umbrella process forwards the signal and tells the old main process to shut down. +The old main process writes the icinga2.state file. The umbrella process signals +the reload process that the main process terminated. + +The reload process was in idle wait before, and now continues to read the written +state file and run the event loop (checks, notifications, "events", ...). The reload +process itself also spawns the execution helper process again. 
+ + ## Features Features are implemented in specific libraries and can be enabled diff --git a/lib/base/CMakeLists.txt b/lib/base/CMakeLists.txt index fb3de3029..100ca27a7 100644 --- a/lib/base/CMakeLists.txt +++ b/lib/base/CMakeLists.txt @@ -15,6 +15,7 @@ set(base_SOURCES i2-base.hpp application.cpp application.hpp application-ti.hpp application-version.cpp application-environment.cpp array.cpp array.hpp array-script.cpp + atomic.hpp base64.cpp base64.hpp boolean.cpp boolean.hpp boolean-script.cpp configobject.cpp configobject.hpp configobject-ti.hpp configobject-script.cpp diff --git a/lib/base/application.cpp b/lib/base/application.cpp index 880af34a2..4707f0d23 100644 --- a/lib/base/application.cpp +++ b/lib/base/application.cpp @@ -27,10 +27,9 @@ #endif /* __linux__ */ #ifdef _WIN32 #include +#else /* _WIN32 */ +#include #endif /* _WIN32 */ -#ifdef HAVE_SYSTEMD -#include -#endif /* HAVE_SYSTEMD */ using namespace icinga; @@ -42,6 +41,11 @@ bool Application::m_ShuttingDown = false; bool Application::m_RequestRestart = false; bool Application::m_RequestReopenLogs = false; pid_t Application::m_ReloadProcess = 0; + +#ifndef _WIN32 +pid_t Application::m_UmbrellaProcess = 0; +#endif /* _WIN32 */ + static bool l_Restarting = false; static bool l_InExceptionHandler = false; int Application::m_ArgC; @@ -73,7 +77,9 @@ void Application::Stop(bool runtimeRemoved) WSACleanup(); #endif /* _WIN32 */ +#ifdef _WIN32 ClosePidFile(true); +#endif /* _WIN32 */ ObjectImpl::Stop(runtimeRemoved); } @@ -286,25 +292,24 @@ void Application::SetArgV(char **argv) */ void Application::RunEventLoop() { -#ifdef HAVE_SYSTEMD - sd_notify(0, "READY=1"); -#endif /* HAVE_SYSTEMD */ - double lastLoop = Utility::GetTime(); while (!m_ShuttingDown) { if (m_RequestRestart) { m_RequestRestart = false; // we are now handling the request, once is enough -#ifdef HAVE_SYSTEMD - sd_notify(0, "RELOADING=1"); -#endif /* HAVE_SYSTEMD */ - +#ifdef _WIN32 // are we already restarting? 
ignore request if we already are if (!l_Restarting) { l_Restarting = true; m_ReloadProcess = StartReloadProcess(); } +#else /* _WIN32 */ + Log(LogNotice, "Application") + << "Got reload command, forwarding to umbrella process (PID " << m_UmbrellaProcess << ")"; + + (void)kill(m_UmbrellaProcess, SIGHUP); +#endif /* _WIN32 */ } else { /* Watches for changes to the system time. Adjusts timers if necessary. */ Utility::Sleep(2.5); @@ -318,10 +323,6 @@ void Application::RunEventLoop() double now = Utility::GetTime(); double timeDiff = lastLoop - now; -#ifdef HAVE_SYSTEMD - sd_notify(0, "WATCHDOG=1"); -#endif /* HAVE_SYSTEMD */ - if (std::fabs(timeDiff) > 15) { /* We made a significant jump in time. */ Log(LogInformation, "Application") @@ -336,10 +337,6 @@ void Application::RunEventLoop() } } -#ifdef HAVE_SYSTEMD - sd_notify(0, "STOPPING=1"); -#endif /* HAVE_SYSTEMD */ - Log(LogInformation, "Application", "Shutting down..."); ConfigObject::StopObjects(); @@ -446,6 +443,18 @@ void Application::RequestReopenLogs() m_RequestReopenLogs = true; } +#ifndef _WIN32 +/** + * Sets the PID of the Icinga umbrella process. + * + * @param pid The PID of the Icinga umbrella process. + */ +void Application::SetUmbrellaProcess(pid_t pid) +{ + m_UmbrellaProcess = pid; +} +#endif /* _WIN32 */ + /** * Retrieves the full path of the executable. * @@ -680,29 +689,6 @@ void Application::AttachDebugger(const String& filename, bool interactive) #endif /* _WIN32 */ } -#ifndef _WIN32 -/** - * Signal handler for SIGINT and SIGTERM. Prepares the application for cleanly - * shutting down during the next execution of the event loop. - * - * @param - The signal number. 
- */ -void Application::SigIntTermHandler(int signum) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_handler = SIG_DFL; - sigaction(signum, &sa, nullptr); - - Application::Ptr instance = Application::GetInstance(); - - if (!instance) - return; - - instance->RequestShutdown(); -} -#endif /* _WIN32 */ - /** * Signal handler for SIGUSR1. This signal causes Icinga to re-open * its log files and is mainly for use by logrotate. @@ -717,42 +703,6 @@ void Application::SigUsr1Handler(int) RequestReopenLogs(); } -/** - * Signal handler for SIGUSR2. Hands over PID to child and commits suicide - * - * @param - The signal number. - */ -void Application::SigUsr2Handler(int) -{ - Log(LogInformation, "Application", "Reload requested, letting new process take over."); -#ifdef HAVE_SYSTEMD - sd_notifyf(0, "MAINPID=%lu", (unsigned long) m_ReloadProcess); -#endif /* HAVE_SYSTEMD */ - - /* Write the PID of the new process to the pidfile before this - * process exits to keep systemd happy. - */ - Application::Ptr instance = GetInstance(); - try { - instance->UpdatePidFile(Configuration::PidPath, m_ReloadProcess); - } catch (const std::exception&) { - /* abort restart */ - Log(LogCritical, "Application", "Cannot update PID file. Aborting restart operation."); - return; - } - - instance->ClosePidFile(false); - - /* Ensure to dump the program state on reload. */ - ConfigObject::StopObjects(); - instance->OnShutdown(); - - Log(LogInformation, "Application") - << "Reload done, parent process shutting down. Child process with PID '" << m_ReloadProcess << "' is taking over."; - - Exit(0); -} - /** * Signal handler for SIGABRT. Helps with debugging ASSERT()s. 
* @@ -999,19 +949,13 @@ int Application::Run() #ifndef _WIN32 struct sigaction sa; memset(&sa, 0, sizeof(sa)); - sa.sa_handler = &Application::SigIntTermHandler; - sigaction(SIGINT, &sa, nullptr); - sigaction(SIGTERM, &sa, nullptr); - sa.sa_handler = &Application::SigUsr1Handler; sigaction(SIGUSR1, &sa, nullptr); - - sa.sa_handler = &Application::SigUsr2Handler; - sigaction(SIGUSR2, &sa, nullptr); #else /* _WIN32 */ SetConsoleCtrlHandler(&Application::CtrlHandler, TRUE); #endif /* _WIN32 */ +#ifdef _WIN32 try { UpdatePidFile(Configuration::PidPath); } catch (const std::exception&) { @@ -1019,6 +963,7 @@ int Application::Run() << "Cannot update PID file '" << Configuration::PidPath << "'. Aborting."; return EXIT_FAILURE; } +#endif /* _WIN32 */ SetMainTime(Utility::GetTime()); diff --git a/lib/base/application.hpp b/lib/base/application.hpp index 92f627e75..a6b78ff09 100644 --- a/lib/base/application.hpp +++ b/lib/base/application.hpp @@ -57,6 +57,10 @@ public: static void RequestRestart(); static void RequestReopenLogs(); +#ifndef _WIN32 + static void SetUmbrellaProcess(pid_t pid); +#endif /* _WIN32 */ + static bool IsShuttingDown(); static bool IsRestarting(); @@ -122,9 +126,13 @@ private: static pid_t m_ReloadProcess; /**< The PID of a subprocess doing a reload, only valid when l_Restarting==true */ static bool m_RequestReopenLogs; /**< Whether we should re-open log files. */ +#ifndef _WIN32 + static pid_t m_UmbrellaProcess; /**< The PID of the Icinga umbrella process */ +#endif /* _WIN32 */ + static int m_ArgC; /**< The number of command-line arguments. */ static char **m_ArgV; /**< Command-line arguments. */ - FILE *m_PidFile; /**< The PID file */ + FILE *m_PidFile = nullptr; /**< The PID file */ static bool m_Debugging; /**< Whether debugging is enabled. */ static LogSeverity m_DebuggingSeverity; /**< Whether debugging severity is set. 
*/ static double m_StartTime; @@ -132,9 +140,7 @@ private: static bool m_ScriptDebuggerEnabled; static double m_LastReloadFailed; -#ifndef _WIN32 - static void SigIntTermHandler(int signum); -#else /* _WIN32 */ +#ifdef _WIN32 static BOOL WINAPI CtrlHandler(DWORD type); static LONG WINAPI SEHUnhandledExceptionFilter(PEXCEPTION_POINTERS exi); #endif /* _WIN32 */ @@ -143,7 +149,6 @@ private: static void SigAbrtHandler(int signum); static void SigUsr1Handler(int signum); - static void SigUsr2Handler(int signum); static void ExceptionHandler(); static String GetCrashReportFilename(); diff --git a/lib/base/atomic.hpp b/lib/base/atomic.hpp new file mode 100644 index 000000000..0ebcddefb --- /dev/null +++ b/lib/base/atomic.hpp @@ -0,0 +1,43 @@ +/* Icinga 2 | (c) 2019 Icinga GmbH | GPLv2+ */ + +#ifndef ATOMIC_H +#define ATOMIC_H + +#include + +namespace icinga +{ + +/** + * Extends std::atomic with an atomic constructor. + * + * @ingroup base + */ +template +class Atomic : public std::atomic { +public: + /** + * Like std::atomic#atomic, but operates atomically + * + * @param desired Initial value + */ + inline Atomic(T desired) + { + this->store(desired); + } + + /** + * Like std::atomic#atomic, but operates atomically + * + * @param desired Initial value + * @param order Initial store operation's memory order + */ + inline Atomic(T desired, std::memory_order order) + { + this->store(desired, order); + } +}; + +} + +#endif /* ATOMIC_H */ diff --git a/lib/cli/daemoncommand.cpp b/lib/cli/daemoncommand.cpp index 9fbe486ba..8091c403e 100644 --- a/lib/cli/daemoncommand.cpp +++ b/lib/cli/daemoncommand.cpp @@ -7,6 +7,8 @@ #include "config/configcompiler.hpp" #include "config/configcompilercontext.hpp" #include "config/configitembuilder.hpp" +#include "base/atomic.hpp" +#include "base/defer.hpp" #include "base/logger.hpp" #include "base/application.hpp" #include "base/timer.hpp" @@ -16,10 +18,23 @@ #include "base/scriptglobal.hpp" #include "base/context.hpp" #include "config.h" 
+#include +#include #include #include #include +#ifndef _WIN32 +#include +#include +#include +#include +#endif /* _WIN32 */ + +#ifdef HAVE_SYSTEMD +#include +#endif /* HAVE_SYSTEMD */ + using namespace icinga; namespace po = boost::program_options; @@ -27,13 +42,6 @@ static po::variables_map g_AppParams; REGISTER_CLICOMMAND("daemon", DaemonCommand); -#ifndef _WIN32 -static void SigHupHandler(int) -{ - Application::RequestRestart(); -} -#endif /* _WIN32 */ - /* * Daemonize(). On error, this function logs by itself and exits (i.e. does not return). * @@ -163,11 +171,6 @@ void DaemonCommand::InitParameters(boost::program_options::options_description& ("close-stdio", "do not log to stdout (or stderr) after startup") #endif /* _WIN32 */ ; - -#ifndef _WIN32 - hiddenDesc.add_options() - ("reload-internal", po::value(), "used internally to implement config reload: do not call manually, send SIGHUP instead"); -#endif /* _WIN32 */ } std::vector DaemonCommand::GetArgumentSuggestions(const String& argument, const String& word) const @@ -178,6 +181,357 @@ std::vector DaemonCommand::GetArgumentSuggestions(const String& argument return CLICommand::GetArgumentSuggestions(argument, word); } +#ifndef _WIN32 +// The PID of the Icinga umbrella process +pid_t l_UmbrellaPid = 0; + +// Whether the umbrella process allowed us to continue working beyond config validation +static Atomic l_AllowedToWork (false); +#endif /* _WIN32 */ + +/** + * Do the actual work (config loading, ...) 
+ * + * @param configs Files to read config from + * + * @return Exit code + */ +static inline +int RunWorker(const std::vector& configs) +{ + Log(LogInformation, "cli", "Loading configuration file(s)."); + + { + std::vector newItems; + + if (!DaemonUtility::LoadConfigFiles(configs, newItems, Configuration::ObjectsPath, Configuration::VarsPath)) + return EXIT_FAILURE; + +#ifndef _WIN32 + Log(LogNotice, "cli") + << "Notifying umbrella process (PID " << l_UmbrellaPid << ") about the config loading success"; + + (void)kill(l_UmbrellaPid, SIGUSR2); + + Log(LogNotice, "cli") + << "Waiting for the umbrella process to let us doing the actual work"; + + while (!l_AllowedToWork.load()) { + Utility::Sleep(0.2); + } + + Log(LogNotice, "cli") + << "The umbrella process let us continuing"; +#endif /* _WIN32 */ + + /* restore the previous program state */ + try { + ConfigObject::RestoreObjects(Configuration::StatePath); + } catch (const std::exception& ex) { + Log(LogCritical, "cli") + << "Failed to restore state file: " << DiagnosticInformation(ex); + return EXIT_FAILURE; + } + + WorkQueue upq(25000, Configuration::Concurrency); + upq.SetName("DaemonCommand::Run"); + + // activate config only after daemonization: it starts threads and that is not compatible with fork() + if (!ConfigItem::ActivateItems(upq, newItems, false, false, true)) { + Log(LogCritical, "cli", "Error activating configuration."); + return EXIT_FAILURE; + } + } + + /* Create the internal API object storage. Do this here too with setups without API. */ + ConfigObjectUtility::CreateStorage(); + + /* Remove ignored Downtime/Comment objects. 
*/ + try { + String configDir = ConfigObjectUtility::GetConfigDir(); + ConfigItem::RemoveIgnoredItems(configDir); + } catch (const std::exception& ex) { + Log(LogNotice, "cli") + << "Cannot clean ignored downtimes/comments: " << ex.what(); + } + + ApiListener::UpdateObjectAuthority(); + + return Application::GetInstance()->Run(); +} + +#ifndef _WIN32 +/** + * The possible states of a seamless worker being started by StartUnixWorker(). + */ +enum class UnixWorkerState : uint_fast8_t +{ + Pending, + LoadedConfig, + Failed +}; + +// The signals to block temporarily in StartUnixWorker(). +static const sigset_t l_UnixWorkerSignals = ([]() -> sigset_t { + sigset_t s; + + (void)sigemptyset(&s); + (void)sigaddset(&s, SIGCHLD); + (void)sigaddset(&s, SIGUSR1); + (void)sigaddset(&s, SIGUSR2); + (void)sigaddset(&s, SIGINT); + (void)sigaddset(&s, SIGTERM); + (void)sigaddset(&s, SIGHUP); + + return s; +})(); + +// The PID of the seamless worker currently being started by StartUnixWorker() +static Atomic l_CurrentlyStartingUnixWorkerPid (-1); + +// The state of the seamless worker currently being started by StartUnixWorker() +static Atomic l_CurrentlyStartingUnixWorkerState (UnixWorkerState::Pending); + +// The last termination signal we received +static Atomic l_TermSignal (-1); + +// Whether someone requested to re-load config (and we didn't handle that request, yet) +static Atomic l_RequestedReload (false); + +// Whether someone requested to re-open logs (and we didn't handle that request, yet) +static Atomic l_RequestedReopenLogs (false); + +/** + * Umbrella process' signal handlers + */ +static void UmbrellaSignalHandler(int num, siginfo_t *info, void*) +{ + switch (num) { + case SIGUSR1: + // Someone requested to re-open logs + l_RequestedReopenLogs.store(true); + break; + case SIGUSR2: + if (l_CurrentlyStartingUnixWorkerState.load() == UnixWorkerState::Pending + && info->si_pid == l_CurrentlyStartingUnixWorkerPid.load()) { + // The seamless worker currently being started by 
StartUnixWorker() successfully loaded its config + l_CurrentlyStartingUnixWorkerState.store(UnixWorkerState::LoadedConfig); + } + break; + case SIGCHLD: + if (l_CurrentlyStartingUnixWorkerState.load() == UnixWorkerState::Pending + && info->si_pid == l_CurrentlyStartingUnixWorkerPid.load()) { + // The seamless worker currently being started by StartUnixWorker() failed + l_CurrentlyStartingUnixWorkerState.store(UnixWorkerState::Failed); + } + break; + case SIGINT: + case SIGTERM: + // Someone requested our termination + + { + struct sigaction sa; + memset(&sa, 0, sizeof(sa)); + + sa.sa_handler = SIG_DFL; + + (void)sigaction(num, &sa, nullptr); + } + + l_TermSignal.store(num); + break; + case SIGHUP: + // Someone requested to re-load config + l_RequestedReload.store(true); + break; + default: + // Programming error (or someone has broken the userspace) + VERIFY(!"Caught unexpected signal"); + } +} + +/** + * Seamless worker's signal handlers + */ +static void WorkerSignalHandler(int num, siginfo_t *info, void*) +{ + switch (num) { + case SIGUSR2: + if (info->si_pid == l_UmbrellaPid) { + // The umbrella process allowed us to continue working beyond config validation + l_AllowedToWork.store(true); + } + break; + case SIGINT: + case SIGTERM: + if (info->si_pid == l_UmbrellaPid) { + // The umbrella process requested our termination + Application::RequestShutdown(); + } + break; + default: + // Programming error (or someone has broken the userspace) + VERIFY(!"Caught unexpected signal"); + } +} + +#ifdef HAVE_SYSTEMD +// When we last notified the watchdog. +static Atomic l_LastNotifiedWatchdog (0); + +/** + * Notify the watchdog if not notified during the last 2.5s. 
*/ +static void NotifyWatchdog() +{ + double now = Utility::GetTime(); + + if (now - l_LastNotifiedWatchdog.load() >= 2.5) { + sd_notify(0, "WATCHDOG=1"); + l_LastNotifiedWatchdog.store(now); + } +} +#endif /* HAVE_SYSTEMD */ + +/** + * Starts a seamless worker process doing the actual work (config loading, ...) + * + * @param configs Files to read config from + * + * @return The worker's PID on success, -1 on failure (if the worker couldn't load its config) + */ +static pid_t StartUnixWorker(const std::vector& configs) +{ + Log(LogNotice, "cli") + << "Spawning seemless worker process doing the actual work"; + + try { + Application::UninitializeBase(); + } catch (const std::exception& ex) { + Log(LogCritical, "cli") + << "Failed to stop thread pool before forking, unexpected error: " << DiagnosticInformation(ex); + exit(EXIT_FAILURE); + } + + /* Block the signal handlers we'd like to change in the child process until we changed them. + * Block SIGUSR2 and SIGCHLD handlers until we've set l_CurrentlyStartingUnixWorkerPid. 
+ */ + (void)sigprocmask(SIG_BLOCK, &l_UnixWorkerSignals, nullptr); + + pid_t pid = fork(); + + switch (pid) { + case -1: + Log(LogCritical, "cli") + << "fork() failed with error code " << errno << ", \"" << Utility::FormatErrorNumber(errno) << "\""; + exit(EXIT_FAILURE); + + case 0: + try { + { + struct sigaction sa; + memset(&sa, 0, sizeof(sa)); + + sa.sa_handler = SIG_DFL; + + (void)sigaction(SIGCHLD, &sa, nullptr); + (void)sigaction(SIGUSR1, &sa, nullptr); + (void)sigaction(SIGHUP, &sa, nullptr); + } + + { + struct sigaction sa; + memset(&sa, 0, sizeof(sa)); + + sa.sa_sigaction = &WorkerSignalHandler; + sa.sa_flags = SA_RESTART | SA_SIGINFO; + + (void)sigaction(SIGUSR2, &sa, nullptr); + (void)sigaction(SIGINT, &sa, nullptr); + (void)sigaction(SIGTERM, &sa, nullptr); + } + + (void)sigprocmask(SIG_UNBLOCK, &l_UnixWorkerSignals, nullptr); + + try { + Application::InitializeBase(); + } catch (const std::exception& ex) { + Log(LogCritical, "cli") + << "Failed to re-initialize thread pool after forking (child): " << DiagnosticInformation(ex); + _exit(EXIT_FAILURE); + } + + _exit(RunWorker(configs)); + } catch (...) { + _exit(EXIT_FAILURE); + } + + default: + l_CurrentlyStartingUnixWorkerPid.store(pid); + (void)sigprocmask(SIG_UNBLOCK, &l_UnixWorkerSignals, nullptr); + + Log(LogNotice, "cli") + << "Spawned worker process (PID " << pid << "), waiting for it to load its config"; + + // Wait for the newly spawned process to either load its config or fail. 
+ for (;;) { +#ifdef HAVE_SYSTEMD + NotifyWatchdog(); +#endif /* HAVE_SYSTEMD */ + + switch (l_CurrentlyStartingUnixWorkerState.load()) { + case UnixWorkerState::LoadedConfig: + Log(LogNotice, "cli") + << "Worker process successfully loaded its config"; + break; + case UnixWorkerState::Failed: + Log(LogNotice, "cli") + << "Worker process couldn't load its config"; + + while (waitpid(pid, nullptr, 0) == -1 && errno == EINTR) { +#ifdef HAVE_SYSTEMD + NotifyWatchdog(); +#endif /* HAVE_SYSTEMD */ + } + pid = -1; + break; + default: + Utility::Sleep(0.2); + continue; + } + + break; + } + + // Reset flags for the next time + l_CurrentlyStartingUnixWorkerPid.store(-1); + l_CurrentlyStartingUnixWorkerState.store(UnixWorkerState::Pending); + + try { + Application::InitializeBase(); + } catch (const std::exception& ex) { + Log(LogCritical, "cli") + << "Failed to re-initialize thread pool after forking (parent): " << DiagnosticInformation(ex); + exit(EXIT_FAILURE); + } + } + + return pid; +} + +/** + * Workaround to instantiate Application (which is abstract) in DaemonCommand#Run() + */ +class PidFileManagementApp : public Application +{ +public: + inline int Main() override + { + return EXIT_FAILURE; + } +}; +#endif /* _WIN32 */ + /** * The entry point for the "daemon" CLI command. 
* @@ -194,15 +548,6 @@ int DaemonCommand::Run(const po::variables_map& vm, const std::vector 0) { - Log(LogCritical, "cli") - << "Another instance of Icinga already running with PID " << runningpid; - return EXIT_FAILURE; - } - } - std::vector configs; if (vm.count("config") > 0) configs = vm["config"].as >(); @@ -212,67 +557,52 @@ int DaemonCommand::Run(const po::variables_map& vm, const std::vector newItems; - - if (!DaemonUtility::LoadConfigFiles(configs, newItems, Configuration::ObjectsPath, Configuration::VarsPath)) - return EXIT_FAILURE; - if (vm.count("validate")) { + Log(LogInformation, "cli", "Loading configuration file(s)."); + + std::vector newItems; + + if (!DaemonUtility::LoadConfigFiles(configs, newItems, Configuration::ObjectsPath, Configuration::VarsPath)) + return EXIT_FAILURE; + Log(LogInformation, "cli", "Finished validating the configuration file(s)."); return EXIT_SUCCESS; } -#ifndef _WIN32 - if (vm.count("reload-internal")) { - /* We went through validation and now ask the old process kindly to die */ - Log(LogInformation, "cli", "Requesting to take over."); - int rc = kill(vm["reload-internal"].as(), SIGUSR2); - if (rc) { + { + pid_t runningpid = Application::ReadPidFile(Configuration::PidPath); + if (runningpid > 0) { Log(LogCritical, "cli") - << "Failed to send signal to \"" << vm["reload-internal"].as() << "\" with " << strerror(errno); + << "Another instance of Icinga already running with PID " << runningpid; return EXIT_FAILURE; } - - double start = Utility::GetTime(); - while (kill(vm["reload-internal"].as(), SIGCHLD) == 0) - Utility::Sleep(0.2); - - Log(LogNotice, "cli") - << "Waited for " << Utility::FormatDuration(Utility::GetTime() - start) << " on old process to exit."; } -#endif /* _WIN32 */ if (vm.count("daemonize")) { - if (!vm.count("reload-internal")) { - // no additional fork neccessary on reload - - // this subroutine either succeeds, or logs an error - // and terminates the process (does not return). 
- Daemonize(); - } + // this subroutine either succeeds, or logs an error + // and terminates the process (does not return). + Daemonize(); } - /* restore the previous program state */ +#ifndef _WIN32 + /* The Application manages the PID file, + * but on *nix this process doesn't load any config + * so there's no central Application instance. + */ + PidFileManagementApp app; + try { - ConfigObject::RestoreObjects(Configuration::StatePath); - } catch (const std::exception& ex) { - Log(LogCritical, "cli") - << "Failed to restore state file: " << DiagnosticInformation(ex); + app.UpdatePidFile(Configuration::PidPath); + } catch (const std::exception&) { + Log(LogCritical, "Application") + << "Cannot update PID file '" << Configuration::PidPath << "'. Aborting."; return EXIT_FAILURE; } - { - WorkQueue upq(25000, Configuration::Concurrency); - upq.SetName("DaemonCommand::Run"); - - // activate config only after daemonization: it starts threads and that is not compatible with fork() - if (!ConfigItem::ActivateItems(upq, newItems, false, false, true)) { - Log(LogCritical, "cli", "Error activating configuration."); - return EXIT_FAILURE; - } - } + Defer closePidFile ([&app]() { + app.ClosePidFile(true); + }); +#endif /* _WIN32 */ if (vm.count("daemonize") || vm.count("close-stdio")) { // After disabling the console log, any further errors will go to the configured log only. @@ -287,26 +617,139 @@ int DaemonCommand::Run(const po::variables_map& vm, const std::vector 0) { + Log(LogNotice, "cli") + << "Seemless worker (PID " << currentWorker << ") stopped, stopping as well"; + +#ifdef HAVE_SYSTEMD + if (!notifiedTermination) { + notifiedTermination = true; + sd_notify(0, "STOPPING=1"); + } +#endif /* HAVE_SYSTEMD */ + + // If killed by signal, forward it via the exit code (to be as seemless as possible) + return WIFSIGNALED(status) ? 
128 + WTERMSIG(status) : WEXITSTATUS(status); + } + } + + Utility::Sleep(0.2); + } #endif /* _WIN32 */ - - ApiListener::UpdateObjectAuthority(); - - return Application::GetInstance()->Run(); }