diff --git a/doc/16-upgrading-icinga-2.md b/doc/16-upgrading-icinga-2.md
index 3f5ba1017..ada066a4b 100644
--- a/doc/16-upgrading-icinga-2.md
+++ b/doc/16-upgrading-icinga-2.md
@@ -55,6 +55,17 @@ and [benchmarks](https://github.com/miloyip/nativejson-benchmark#parsing-time).
### Core
+#### Reload Handling
+
+2.11 provides fixes for unwanted notifications during restarts.
+The updated systemd service file now uses the `KillMode=mixed` setting.
+
+The reload handling was improved with an umbrella process, which means
+that **3 processes** are running during normal operation. You may need to
+adjust the [procs](08-advanced-topics.md#monitoring-icinga) check used for local instance monitoring accordingly.
+
+More details can be found in the [technical concepts](19-technical-concepts.md#technical-concepts-core-reload) chapter.
+
#### Downtime Notifications
Imagine that a host/service changes to a HARD NOT-OK state,
diff --git a/doc/19-technical-concepts.md b/doc/19-technical-concepts.md
index 187d3a5c4..dbf975e3f 100644
--- a/doc/19-technical-concepts.md
+++ b/doc/19-technical-concepts.md
@@ -176,6 +176,49 @@ The following signals are triggered in the stages:
* [Flex](https://github.com/westes/flex)
* [GNU Bison](https://www.gnu.org/software/bison/)
+## Core
+
+### Core: Reload Handling <a id="technical-concepts-core-reload"></a>
+
+The initial design of the reload state machine looks like this:
+
+* receive reload signal SIGHUP
+* fork a child process, start configuration validation in parallel work queues
+* parent process continues with old configuration objects and the event scheduling
+(doing checks, replicating cluster events, triggering alert notifications, etc.)
+* validation NOT ok: child process terminates, parent process continues with old configuration state
+* validation ok: child process signals parent process to terminate and save its current state (all events until now) into the icinga2 state file
+* parent process shuts down writing icinga2.state file
+* child process waits until the parent process is gone, reads the icinga2 state file and synchronizes all historical and status data
+* child becomes the new session leader
+
+Since Icinga 2.6, there are two processes when checked with `ps aux | grep icinga2` or `pidof icinga2`.
+This was to ensure that feature file descriptors don't leak into the plugin process (e.g. DB IDO MySQL sockets).
+
+Icinga 2.9 changed the reload handling a bit with SIGUSR2 signals
+and systemd notifies.
+
+With systemd, it could occur that the process tree was broken, resulting
+in all remaining processes being killed on stop instead of exiting cleanly.
+You can read the full story [here](https://github.com/Icinga/icinga2/issues/7309).
+
+With 2.11 you'll now see 3 processes:
+
+- The umbrella process which takes care of signal handling and process spawning/stopping
+- The main process with the check scheduler, notifications, etc.
+- The execution helper process
+
+During reload, the umbrella process spawns a new reload process which validates the configuration.
+Once successful, the new reload process signals the umbrella process that it is finished.
+The umbrella process forwards the signal and tells the old main process to shut down.
+The old main process writes the icinga2.state file. The umbrella process signals
+the reload process that the main process terminated.
+
+The reload process was in idle wait before, and now continues to read the written
+state file and run the event loop (checks, notifications, "events", ...). The reload
+process itself also spawns the execution helper process again.
+
+
## Features
Features are implemented in specific libraries and can be enabled
diff --git a/lib/base/CMakeLists.txt b/lib/base/CMakeLists.txt
index fb3de3029..100ca27a7 100644
--- a/lib/base/CMakeLists.txt
+++ b/lib/base/CMakeLists.txt
@@ -15,6 +15,7 @@ set(base_SOURCES
i2-base.hpp
application.cpp application.hpp application-ti.hpp application-version.cpp application-environment.cpp
array.cpp array.hpp array-script.cpp
+ atomic.hpp
base64.cpp base64.hpp
boolean.cpp boolean.hpp boolean-script.cpp
configobject.cpp configobject.hpp configobject-ti.hpp configobject-script.cpp
diff --git a/lib/base/application.cpp b/lib/base/application.cpp
index 880af34a2..4707f0d23 100644
--- a/lib/base/application.cpp
+++ b/lib/base/application.cpp
@@ -27,10 +27,9 @@
#endif /* __linux__ */
#ifdef _WIN32
#include
+#else /* _WIN32 */
+#include
#endif /* _WIN32 */
-#ifdef HAVE_SYSTEMD
-#include
-#endif /* HAVE_SYSTEMD */
using namespace icinga;
@@ -42,6 +41,11 @@ bool Application::m_ShuttingDown = false;
bool Application::m_RequestRestart = false;
bool Application::m_RequestReopenLogs = false;
pid_t Application::m_ReloadProcess = 0;
+
+#ifndef _WIN32
+pid_t Application::m_UmbrellaProcess = 0;
+#endif /* _WIN32 */
+
static bool l_Restarting = false;
static bool l_InExceptionHandler = false;
int Application::m_ArgC;
@@ -73,7 +77,9 @@ void Application::Stop(bool runtimeRemoved)
WSACleanup();
#endif /* _WIN32 */
+#ifdef _WIN32
ClosePidFile(true);
+#endif /* _WIN32 */
ObjectImpl::Stop(runtimeRemoved);
}
@@ -286,25 +292,24 @@ void Application::SetArgV(char **argv)
*/
void Application::RunEventLoop()
{
-#ifdef HAVE_SYSTEMD
- sd_notify(0, "READY=1");
-#endif /* HAVE_SYSTEMD */
-
double lastLoop = Utility::GetTime();
while (!m_ShuttingDown) {
if (m_RequestRestart) {
m_RequestRestart = false; // we are now handling the request, once is enough
-#ifdef HAVE_SYSTEMD
- sd_notify(0, "RELOADING=1");
-#endif /* HAVE_SYSTEMD */
-
+#ifdef _WIN32
// are we already restarting? ignore request if we already are
if (!l_Restarting) {
l_Restarting = true;
m_ReloadProcess = StartReloadProcess();
}
+#else /* _WIN32 */
+ Log(LogNotice, "Application")
+ << "Got reload command, forwarding to umbrella process (PID " << m_UmbrellaProcess << ")";
+
+ (void)kill(m_UmbrellaProcess, SIGHUP);
+#endif /* _WIN32 */
} else {
/* Watches for changes to the system time. Adjusts timers if necessary. */
Utility::Sleep(2.5);
@@ -318,10 +323,6 @@ void Application::RunEventLoop()
double now = Utility::GetTime();
double timeDiff = lastLoop - now;
-#ifdef HAVE_SYSTEMD
- sd_notify(0, "WATCHDOG=1");
-#endif /* HAVE_SYSTEMD */
-
if (std::fabs(timeDiff) > 15) {
/* We made a significant jump in time. */
Log(LogInformation, "Application")
@@ -336,10 +337,6 @@ void Application::RunEventLoop()
}
}
-#ifdef HAVE_SYSTEMD
- sd_notify(0, "STOPPING=1");
-#endif /* HAVE_SYSTEMD */
-
Log(LogInformation, "Application", "Shutting down...");
ConfigObject::StopObjects();
@@ -446,6 +443,18 @@ void Application::RequestReopenLogs()
m_RequestReopenLogs = true;
}
+#ifndef _WIN32
+/**
+ * Sets the PID of the Icinga umbrella process, i.e. the process RunEventLoop() forwards reload requests to via SIGHUP.
+ *
+ * @param pid The PID of the Icinga umbrella process.
+ */
+void Application::SetUmbrellaProcess(pid_t pid)
+{
+ m_UmbrellaProcess = pid;
+}
+#endif /* _WIN32 */
+
/**
* Retrieves the full path of the executable.
*
@@ -680,29 +689,6 @@ void Application::AttachDebugger(const String& filename, bool interactive)
#endif /* _WIN32 */
}
-#ifndef _WIN32
-/**
- * Signal handler for SIGINT and SIGTERM. Prepares the application for cleanly
- * shutting down during the next execution of the event loop.
- *
- * @param - The signal number.
- */
-void Application::SigIntTermHandler(int signum)
-{
- struct sigaction sa;
- memset(&sa, 0, sizeof(sa));
- sa.sa_handler = SIG_DFL;
- sigaction(signum, &sa, nullptr);
-
- Application::Ptr instance = Application::GetInstance();
-
- if (!instance)
- return;
-
- instance->RequestShutdown();
-}
-#endif /* _WIN32 */
-
/**
* Signal handler for SIGUSR1. This signal causes Icinga to re-open
* its log files and is mainly for use by logrotate.
@@ -717,42 +703,6 @@ void Application::SigUsr1Handler(int)
RequestReopenLogs();
}
-/**
- * Signal handler for SIGUSR2. Hands over PID to child and commits suicide
- *
- * @param - The signal number.
- */
-void Application::SigUsr2Handler(int)
-{
- Log(LogInformation, "Application", "Reload requested, letting new process take over.");
-#ifdef HAVE_SYSTEMD
- sd_notifyf(0, "MAINPID=%lu", (unsigned long) m_ReloadProcess);
-#endif /* HAVE_SYSTEMD */
-
- /* Write the PID of the new process to the pidfile before this
- * process exits to keep systemd happy.
- */
- Application::Ptr instance = GetInstance();
- try {
- instance->UpdatePidFile(Configuration::PidPath, m_ReloadProcess);
- } catch (const std::exception&) {
- /* abort restart */
- Log(LogCritical, "Application", "Cannot update PID file. Aborting restart operation.");
- return;
- }
-
- instance->ClosePidFile(false);
-
- /* Ensure to dump the program state on reload. */
- ConfigObject::StopObjects();
- instance->OnShutdown();
-
- Log(LogInformation, "Application")
- << "Reload done, parent process shutting down. Child process with PID '" << m_ReloadProcess << "' is taking over.";
-
- Exit(0);
-}
-
/**
* Signal handler for SIGABRT. Helps with debugging ASSERT()s.
*
@@ -999,19 +949,13 @@ int Application::Run()
#ifndef _WIN32
struct sigaction sa;
memset(&sa, 0, sizeof(sa));
- sa.sa_handler = &Application::SigIntTermHandler;
- sigaction(SIGINT, &sa, nullptr);
- sigaction(SIGTERM, &sa, nullptr);
-
sa.sa_handler = &Application::SigUsr1Handler;
sigaction(SIGUSR1, &sa, nullptr);
-
- sa.sa_handler = &Application::SigUsr2Handler;
- sigaction(SIGUSR2, &sa, nullptr);
#else /* _WIN32 */
SetConsoleCtrlHandler(&Application::CtrlHandler, TRUE);
#endif /* _WIN32 */
+#ifdef _WIN32
try {
UpdatePidFile(Configuration::PidPath);
} catch (const std::exception&) {
@@ -1019,6 +963,7 @@ int Application::Run()
<< "Cannot update PID file '" << Configuration::PidPath << "'. Aborting.";
return EXIT_FAILURE;
}
+#endif /* _WIN32 */
SetMainTime(Utility::GetTime());
diff --git a/lib/base/application.hpp b/lib/base/application.hpp
index 92f627e75..a6b78ff09 100644
--- a/lib/base/application.hpp
+++ b/lib/base/application.hpp
@@ -57,6 +57,10 @@ public:
static void RequestRestart();
static void RequestReopenLogs();
+#ifndef _WIN32
+ static void SetUmbrellaProcess(pid_t pid);
+#endif /* _WIN32 */
+
static bool IsShuttingDown();
static bool IsRestarting();
@@ -122,9 +126,13 @@ private:
static pid_t m_ReloadProcess; /**< The PID of a subprocess doing a reload, only valid when l_Restarting==true */
static bool m_RequestReopenLogs; /**< Whether we should re-open log files. */
+#ifndef _WIN32
+ static pid_t m_UmbrellaProcess; /**< The PID of the Icinga umbrella process */
+#endif /* _WIN32 */
+
static int m_ArgC; /**< The number of command-line arguments. */
static char **m_ArgV; /**< Command-line arguments. */
- FILE *m_PidFile; /**< The PID file */
+ FILE *m_PidFile = nullptr; /**< The PID file */
static bool m_Debugging; /**< Whether debugging is enabled. */
static LogSeverity m_DebuggingSeverity; /**< Whether debugging severity is set. */
static double m_StartTime;
@@ -132,9 +140,7 @@ private:
static bool m_ScriptDebuggerEnabled;
static double m_LastReloadFailed;
-#ifndef _WIN32
- static void SigIntTermHandler(int signum);
-#else /* _WIN32 */
+#ifdef _WIN32
static BOOL WINAPI CtrlHandler(DWORD type);
static LONG WINAPI SEHUnhandledExceptionFilter(PEXCEPTION_POINTERS exi);
#endif /* _WIN32 */
@@ -143,7 +149,6 @@ private:
static void SigAbrtHandler(int signum);
static void SigUsr1Handler(int signum);
- static void SigUsr2Handler(int signum);
static void ExceptionHandler();
static String GetCrashReportFilename();
diff --git a/lib/base/atomic.hpp b/lib/base/atomic.hpp
new file mode 100644
index 000000000..0ebcddefb
--- /dev/null
+++ b/lib/base/atomic.hpp
@@ -0,0 +1,43 @@
+/* Icinga 2 | (c) 2019 Icinga GmbH | GPLv2+ */
+
+#ifndef ATOMIC_H
+#define ATOMIC_H
+
+#include <atomic>
+
+namespace icinga
+{
+
+/**
+ * Extends std::atomic with constructors that initialize the value via an atomic store().
+ *
+ * @tparam T The value type, forwarded to std::atomic.
+ *
+ * @ingroup base
+ */
+template<class T>
+class Atomic : public std::atomic<T> {
+public:
+ /**
+ * Like std::atomic#atomic, but operates atomically (the value is set via store())
+ *
+ * @param desired Initial value
+ */
+ inline Atomic(T desired)
+ {
+ this->store(desired);
+ }
+
+ /**
+ * Like std::atomic#atomic, but operates atomically (the value is set via store())
+ *
+ * @param desired Initial value
+ * @param order Initial store operation's memory order
+ */
+ inline Atomic(T desired, std::memory_order order)
+ {
+ this->store(desired, order);
+ }
+};
+
+}
+
+#endif /* ATOMIC_H */
diff --git a/lib/cli/daemoncommand.cpp b/lib/cli/daemoncommand.cpp
index 9fbe486ba..8091c403e 100644
--- a/lib/cli/daemoncommand.cpp
+++ b/lib/cli/daemoncommand.cpp
@@ -7,6 +7,8 @@
#include "config/configcompiler.hpp"
#include "config/configcompilercontext.hpp"
#include "config/configitembuilder.hpp"
+#include "base/atomic.hpp"
+#include "base/defer.hpp"
#include "base/logger.hpp"
#include "base/application.hpp"
#include "base/timer.hpp"
@@ -16,10 +18,23 @@
#include "base/scriptglobal.hpp"
#include "base/context.hpp"
#include "config.h"
+#include
+#include
#include
#include
#include
+#ifndef _WIN32
+#include
+#include
+#include
+#include
+#endif /* _WIN32 */
+
+#ifdef HAVE_SYSTEMD
+#include
+#endif /* HAVE_SYSTEMD */
+
using namespace icinga;
namespace po = boost::program_options;
@@ -27,13 +42,6 @@ static po::variables_map g_AppParams;
REGISTER_CLICOMMAND("daemon", DaemonCommand);
-#ifndef _WIN32
-static void SigHupHandler(int)
-{
- Application::RequestRestart();
-}
-#endif /* _WIN32 */
-
/*
* Daemonize(). On error, this function logs by itself and exits (i.e. does not return).
*
@@ -163,11 +171,6 @@ void DaemonCommand::InitParameters(boost::program_options::options_description&
("close-stdio", "do not log to stdout (or stderr) after startup")
#endif /* _WIN32 */
;
-
-#ifndef _WIN32
- hiddenDesc.add_options()
- ("reload-internal", po::value(), "used internally to implement config reload: do not call manually, send SIGHUP instead");
-#endif /* _WIN32 */
}
std::vector DaemonCommand::GetArgumentSuggestions(const String& argument, const String& word) const
@@ -178,6 +181,357 @@ std::vector DaemonCommand::GetArgumentSuggestions(const String& argument
return CLICommand::GetArgumentSuggestions(argument, word);
}
+#ifndef _WIN32
+// The PID of the Icinga umbrella process (RunWorker() sends SIGUSR2 to it, WorkerSignalHandler() only accepts signals from it)
+pid_t l_UmbrellaPid = 0;
+
+// Whether the umbrella process allowed us to continue working beyond config validation (set by WorkerSignalHandler(), polled by RunWorker())
+static Atomic l_AllowedToWork (false);
+#endif /* _WIN32 */
+
+/**
+ * Do the actual work: load the config, (non-Windows) wait for the umbrella process' go-ahead, restore state, activate the config, run the application
+ *
+ * @param configs Files to read config from
+ *
+ * @return Exit code: EXIT_FAILURE if loading the config, restoring the state or activating the config fails, otherwise the result of Application::Run()
+ */
+static inline
+int RunWorker(const std::vector& configs)
+{
+ Log(LogInformation, "cli", "Loading configuration file(s).");
+
+ {
+ std::vector newItems;
+
+ if (!DaemonUtility::LoadConfigFiles(configs, newItems, Configuration::ObjectsPath, Configuration::VarsPath))
+ return EXIT_FAILURE;
+
+#ifndef _WIN32
+ Log(LogNotice, "cli")
+ << "Notifying umbrella process (PID " << l_UmbrellaPid << ") about the config loading success";
+
+ (void)kill(l_UmbrellaPid, SIGUSR2); // result deliberately ignored
+
+ Log(LogNotice, "cli")
+ << "Waiting for the umbrella process to let us doing the actual work";
+
+ while (!l_AllowedToWork.load()) { // flag is set by WorkerSignalHandler() on SIGUSR2 from the umbrella process
+ Utility::Sleep(0.2);
+ }
+
+ Log(LogNotice, "cli")
+ << "The umbrella process let us continuing";
+#endif /* _WIN32 */
+
+ /* restore the previous program state from the state file */
+ try {
+ ConfigObject::RestoreObjects(Configuration::StatePath);
+ } catch (const std::exception& ex) {
+ Log(LogCritical, "cli")
+ << "Failed to restore state file: " << DiagnosticInformation(ex);
+ return EXIT_FAILURE;
+ }
+
+ WorkQueue upq(25000, Configuration::Concurrency);
+ upq.SetName("DaemonCommand::Run");
+
+ // activate config only after daemonization: it starts threads and that is not compatible with fork()
+ if (!ConfigItem::ActivateItems(upq, newItems, false, false, true)) {
+ Log(LogCritical, "cli", "Error activating configuration.");
+ return EXIT_FAILURE;
+ }
+ }
+
+ /* Create the internal API object storage. Do this here too with setups without API. */
+ ConfigObjectUtility::CreateStorage();
+
+ /* Remove ignored Downtime/Comment objects. Failures are only logged (best effort). */
+ try {
+ String configDir = ConfigObjectUtility::GetConfigDir();
+ ConfigItem::RemoveIgnoredItems(configDir);
+ } catch (const std::exception& ex) {
+ Log(LogNotice, "cli")
+ << "Cannot clean ignored downtimes/comments: " << ex.what();
+ }
+
+ ApiListener::UpdateObjectAuthority();
+
+ return Application::GetInstance()->Run();
+}
+
+#ifndef _WIN32
+/**
+ * The possible states of a seamless worker being started by StartUnixWorker().
+ */
+enum class UnixWorkerState : uint_fast8_t
+{
+ Pending, // initial state: neither SIGUSR2 nor SIGCHLD from the worker was handled yet
+ LoadedConfig, // set by UmbrellaSignalHandler() on SIGUSR2 from the worker
+ Failed // set by UmbrellaSignalHandler() on SIGCHLD for the worker while still Pending
+};
+
+// The signals to block temporarily in StartUnixWorker(): those whose handlers the worker replaces after fork().
+static const sigset_t l_UnixWorkerSignals = ([]() -> sigset_t {
+ sigset_t s;
+
+ (void)sigemptyset(&s);
+ (void)sigaddset(&s, SIGCHLD);
+ (void)sigaddset(&s, SIGUSR1);
+ (void)sigaddset(&s, SIGUSR2);
+ (void)sigaddset(&s, SIGINT);
+ (void)sigaddset(&s, SIGTERM);
+ (void)sigaddset(&s, SIGHUP);
+
+ return s;
+})();
+
+// The PID of the seamless worker currently being started by StartUnixWorker() (-1 if none)
+static Atomic l_CurrentlyStartingUnixWorkerPid (-1);
+
+// The state of the seamless worker currently being started by StartUnixWorker()
+static Atomic l_CurrentlyStartingUnixWorkerState (UnixWorkerState::Pending);
+
+// The last termination signal we received (-1 if none)
+static Atomic l_TermSignal (-1);
+
+// Whether someone requested to re-load config (and we didn't handle that request, yet)
+static Atomic l_RequestedReload (false);
+
+// Whether someone requested to re-open logs (and we didn't handle that request, yet)
+static Atomic l_RequestedReopenLogs (false);
+
+/**
+ * Umbrella process' signal handler: records the event in the l_* atomics (and resets SIGINT/SIGTERM to their default action)
+ */
+static void UmbrellaSignalHandler(int num, siginfo_t *info, void*)
+{
+ switch (num) {
+ case SIGUSR1:
+ // Someone requested to re-open logs
+ l_RequestedReopenLogs.store(true);
+ break;
+ case SIGUSR2:
+ if (l_CurrentlyStartingUnixWorkerState.load() == UnixWorkerState::Pending
+ && info->si_pid == l_CurrentlyStartingUnixWorkerPid.load()) {
+ // The seamless worker currently being started by StartUnixWorker() successfully loaded its config
+ l_CurrentlyStartingUnixWorkerState.store(UnixWorkerState::LoadedConfig);
+ }
+ break;
+ case SIGCHLD:
+ if (l_CurrentlyStartingUnixWorkerState.load() == UnixWorkerState::Pending
+ && info->si_pid == l_CurrentlyStartingUnixWorkerPid.load()) {
+ // The seamless worker currently being started by StartUnixWorker() failed
+ l_CurrentlyStartingUnixWorkerState.store(UnixWorkerState::Failed);
+ }
+ break;
+ case SIGINT:
+ case SIGTERM:
+ // Someone requested our termination: restore the signal's default action, then remember the signal
+
+ {
+ struct sigaction sa;
+ memset(&sa, 0, sizeof(sa));
+
+ sa.sa_handler = SIG_DFL;
+
+ (void)sigaction(num, &sa, nullptr);
+ }
+
+ l_TermSignal.store(num);
+ break;
+ case SIGHUP:
+ // Someone requested to re-load config
+ l_RequestedReload.store(true);
+ break;
+ default:
+ // Programming error (or someone has broken the userspace)
+ VERIFY(!"Caught unexpected signal");
+ }
+}
+
+/**
+ * Seamless worker's signal handler: SIGUSR2/SIGINT/SIGTERM are only acted upon if sent by the umbrella process
+ */
+static void WorkerSignalHandler(int num, siginfo_t *info, void*)
+{
+ switch (num) {
+ case SIGUSR2:
+ if (info->si_pid == l_UmbrellaPid) {
+ // The umbrella process allowed us to continue working beyond config validation
+ l_AllowedToWork.store(true);
+ }
+ break;
+ case SIGINT:
+ case SIGTERM:
+ if (info->si_pid == l_UmbrellaPid) {
+ // The umbrella process requested our termination
+ Application::RequestShutdown();
+ }
+ break;
+ default:
+ // Programming error (or someone has broken the userspace)
+ VERIFY(!"Caught unexpected signal");
+ }
+}
+
+#ifdef HAVE_SYSTEMD
+// When we last notified the watchdog (a Utility::GetTime() timestamp, initially 0).
+static Atomic l_LastNotifiedWatchdog (0);
+
+/**
+ * Notify the systemd watchdog via sd_notify("WATCHDOG=1") if not notified during the last 2.5s.
+ */
+static void NotifyWatchdog()
+{
+ double now = Utility::GetTime();
+
+ if (now - l_LastNotifiedWatchdog.load() >= 2.5) {
+ sd_notify(0, "WATCHDOG=1");
+ l_LastNotifiedWatchdog.store(now);
+ }
+}
+#endif /* HAVE_SYSTEMD */
+
+/**
+ * Starts seamless worker process doing the actual work (config loading, ...) and waits until it loaded its config or failed
+ *
+ * @param configs Files to read config from
+ *
+ * @return The worker's PID on success, -1 on failure (if the worker couldn't load its config)
+ */
+static pid_t StartUnixWorker(const std::vector& configs)
+{
+ Log(LogNotice, "cli")
+ << "Spawning seemless worker process doing the actual work";
+
+ try {
+ Application::UninitializeBase();
+ } catch (const std::exception& ex) {
+ Log(LogCritical, "cli")
+ << "Failed to stop thread pool before forking, unexpected error: " << DiagnosticInformation(ex);
+ exit(EXIT_FAILURE);
+ }
+
+ /* Block the signals whose handlers we'd like to change in the child process until we have changed them.
+ * In the parent this also blocks SIGUSR2 and SIGCHLD until we've set l_CurrentlyStartingUnixWorkerPid.
+ */
+ (void)sigprocmask(SIG_BLOCK, &l_UnixWorkerSignals, nullptr);
+
+ pid_t pid = fork();
+
+ switch (pid) {
+ case -1:
+ Log(LogCritical, "cli")
+ << "fork() failed with error code " << errno << ", \"" << Utility::FormatErrorNumber(errno) << "\"";
+ exit(EXIT_FAILURE);
+
+ case 0:
+ try {
+ {
+ struct sigaction sa;
+ memset(&sa, 0, sizeof(sa));
+
+ sa.sa_handler = SIG_DFL;
+
+ (void)sigaction(SIGCHLD, &sa, nullptr);
+ (void)sigaction(SIGUSR1, &sa, nullptr);
+ (void)sigaction(SIGHUP, &sa, nullptr);
+ }
+
+ {
+ struct sigaction sa;
+ memset(&sa, 0, sizeof(sa));
+
+ sa.sa_sigaction = &WorkerSignalHandler;
+ sa.sa_flags = SA_RESTART | SA_SIGINFO;
+
+ (void)sigaction(SIGUSR2, &sa, nullptr);
+ (void)sigaction(SIGINT, &sa, nullptr);
+ (void)sigaction(SIGTERM, &sa, nullptr);
+ }
+
+ (void)sigprocmask(SIG_UNBLOCK, &l_UnixWorkerSignals, nullptr);
+
+ try {
+ Application::InitializeBase();
+ } catch (const std::exception& ex) {
+ Log(LogCritical, "cli")
+ << "Failed to re-initialize thread pool after forking (child): " << DiagnosticInformation(ex);
+ _exit(EXIT_FAILURE);
+ }
+
+ _exit(RunWorker(configs));
+ } catch (...) {
+ _exit(EXIT_FAILURE);
+ }
+
+ default:
+ l_CurrentlyStartingUnixWorkerPid.store(pid);
+ (void)sigprocmask(SIG_UNBLOCK, &l_UnixWorkerSignals, nullptr);
+
+ Log(LogNotice, "cli")
+ << "Spawned worker process (PID " << pid << "), waiting for it to load its config";
+
+ // Wait for the newly spawned process to either load its config or fail (state is set by UmbrellaSignalHandler()).
+ for (;;) {
+#ifdef HAVE_SYSTEMD
+ NotifyWatchdog();
+#endif /* HAVE_SYSTEMD */
+
+ switch (l_CurrentlyStartingUnixWorkerState.load()) {
+ case UnixWorkerState::LoadedConfig:
+ Log(LogNotice, "cli")
+ << "Worker process successfully loaded its config";
+ break;
+ case UnixWorkerState::Failed:
+ Log(LogNotice, "cli")
+ << "Worker process couldn't load its config";
+
+ while (waitpid(pid, nullptr, 0) == -1 && errno == EINTR) { // reap the failed worker, retry if interrupted
+#ifdef HAVE_SYSTEMD
+ NotifyWatchdog();
+#endif /* HAVE_SYSTEMD */
+ }
+ pid = -1; // report the failure to the caller
+ break;
+ default:
+ Utility::Sleep(0.2);
+ continue;
+ }
+
+ break; // leaves the for loop; only reached from LoadedConfig and Failed
+ }
+
+ // Reset flags for the next time
+ l_CurrentlyStartingUnixWorkerPid.store(-1);
+ l_CurrentlyStartingUnixWorkerState.store(UnixWorkerState::Pending);
+
+ try {
+ Application::InitializeBase();
+ } catch (const std::exception& ex) {
+ Log(LogCritical, "cli")
+ << "Failed to re-initialize thread pool after forking (parent): " << DiagnosticInformation(ex);
+ exit(EXIT_FAILURE);
+ }
+ }
+
+ return pid;
+}
+
+/**
+ * Workaround to instantiate Application (which is abstract) in DaemonCommand#Run(), where it manages the PID file
+ */
+class PidFileManagementApp : public Application
+{
+public:
+ inline int Main() override
+ {
+ return EXIT_FAILURE; // placeholder implementation, unconditionally reports failure
+ }
+};
+#endif /* _WIN32 */
+
/**
* The entry point for the "daemon" CLI command.
*
@@ -194,15 +548,6 @@ int DaemonCommand::Run(const po::variables_map& vm, const std::vector 0) {
- Log(LogCritical, "cli")
- << "Another instance of Icinga already running with PID " << runningpid;
- return EXIT_FAILURE;
- }
- }
-
std::vector configs;
if (vm.count("config") > 0)
configs = vm["config"].as >();
@@ -212,67 +557,52 @@ int DaemonCommand::Run(const po::variables_map& vm, const std::vector newItems;
-
- if (!DaemonUtility::LoadConfigFiles(configs, newItems, Configuration::ObjectsPath, Configuration::VarsPath))
- return EXIT_FAILURE;
-
if (vm.count("validate")) {
+ Log(LogInformation, "cli", "Loading configuration file(s).");
+
+ std::vector newItems;
+
+ if (!DaemonUtility::LoadConfigFiles(configs, newItems, Configuration::ObjectsPath, Configuration::VarsPath))
+ return EXIT_FAILURE;
+
Log(LogInformation, "cli", "Finished validating the configuration file(s).");
return EXIT_SUCCESS;
}
-#ifndef _WIN32
- if (vm.count("reload-internal")) {
- /* We went through validation and now ask the old process kindly to die */
- Log(LogInformation, "cli", "Requesting to take over.");
- int rc = kill(vm["reload-internal"].as(), SIGUSR2);
- if (rc) {
+ {
+ pid_t runningpid = Application::ReadPidFile(Configuration::PidPath);
+ if (runningpid > 0) {
Log(LogCritical, "cli")
- << "Failed to send signal to \"" << vm["reload-internal"].as() << "\" with " << strerror(errno);
+ << "Another instance of Icinga already running with PID " << runningpid;
return EXIT_FAILURE;
}
-
- double start = Utility::GetTime();
- while (kill(vm["reload-internal"].as(), SIGCHLD) == 0)
- Utility::Sleep(0.2);
-
- Log(LogNotice, "cli")
- << "Waited for " << Utility::FormatDuration(Utility::GetTime() - start) << " on old process to exit.";
}
-#endif /* _WIN32 */
if (vm.count("daemonize")) {
- if (!vm.count("reload-internal")) {
- // no additional fork neccessary on reload
-
- // this subroutine either succeeds, or logs an error
- // and terminates the process (does not return).
- Daemonize();
- }
+ // this subroutine either succeeds, or logs an error
+ // and terminates the process (does not return).
+ Daemonize();
}
- /* restore the previous program state */
+#ifndef _WIN32
+ /* The Application manages the PID file,
+ * but on *nix this process doesn't load any config
+ * so there's no central Application instance.
+ */
+ PidFileManagementApp app;
+
try {
- ConfigObject::RestoreObjects(Configuration::StatePath);
- } catch (const std::exception& ex) {
- Log(LogCritical, "cli")
- << "Failed to restore state file: " << DiagnosticInformation(ex);
+ app.UpdatePidFile(Configuration::PidPath);
+ } catch (const std::exception&) {
+ Log(LogCritical, "Application")
+ << "Cannot update PID file '" << Configuration::PidPath << "'. Aborting.";
return EXIT_FAILURE;
}
- {
- WorkQueue upq(25000, Configuration::Concurrency);
- upq.SetName("DaemonCommand::Run");
-
- // activate config only after daemonization: it starts threads and that is not compatible with fork()
- if (!ConfigItem::ActivateItems(upq, newItems, false, false, true)) {
- Log(LogCritical, "cli", "Error activating configuration.");
- return EXIT_FAILURE;
- }
- }
+ Defer closePidFile ([&app]() {
+ app.ClosePidFile(true);
+ });
+#endif /* _WIN32 */
if (vm.count("daemonize") || vm.count("close-stdio")) {
// After disabling the console log, any further errors will go to the configured log only.
@@ -287,26 +617,139 @@ int DaemonCommand::Run(const po::variables_map& vm, const std::vector 0) {
+ Log(LogNotice, "cli")
+ << "Seemless worker (PID " << currentWorker << ") stopped, stopping as well";
+
+#ifdef HAVE_SYSTEMD
+ if (!notifiedTermination) {
+ notifiedTermination = true;
+ sd_notify(0, "STOPPING=1");
+ }
+#endif /* HAVE_SYSTEMD */
+
+ // If killed by signal, forward it via the exit code (to be as seemless as possible)
+ return WIFSIGNALED(status) ? 128 + WTERMSIG(status) : WEXITSTATUS(status);
+ }
+ }
+
+ Utility::Sleep(0.2);
+ }
#endif /* _WIN32 */
-
- ApiListener::UpdateObjectAuthority();
-
- return Application::GetInstance()->Run();
}