From 35f42fa5a38027b1c454f92bc14377bbacfbc1e1 Mon Sep 17 00:00:00 2001 From: Yonas Habteab Date: Wed, 18 Jun 2025 11:58:18 +0200 Subject: [PATCH 1/3] Handle concurrent config package updates gracefully Previously, we used a simple boolean to track the state of the package updates, and didn't reset it back when the config validation was successful because it was assumed that if we successfully validated the config beforehand, then the worker would also successfully reload the config afterwards, and that the old worker would be terminated. However, this assumption is not always true due to a number of reasons that I can't even think of right now, but the most obvious one is that after we successfully validated the config, the config might have changed again before the worker was able to reload it. If that happens, then the new worker might fail to successfully validate the config due to the recent changes, in which case the old worker would remain active, and this flag would still be set to true, causing any subsequent requests to fail with a `423` until you manually restart the Icinga 2 service. So, in order to prevent such a situation, we are additionally tracking the last time a reload failed and allow to bypass the `m_RunningPackageUpdates` flag only if the last reload failed time was changed since the previous request. --- lib/remote/configstageshandler.cpp | 37 ++++++++++++++++++++++++++---- lib/remote/configstageshandler.hpp | 3 --- 2 files changed, 32 insertions(+), 8 deletions(-) diff --git a/lib/remote/configstageshandler.cpp b/lib/remote/configstageshandler.cpp index 0dee5f25f..95d1b1b6e 100644 --- a/lib/remote/configstageshandler.cpp +++ b/lib/remote/configstageshandler.cpp @@ -12,7 +12,10 @@ using namespace icinga; REGISTER_URLHANDLER("/v1/config/stages", ConfigStagesHandler); -std::atomic ConfigStagesHandler::m_RunningPackageUpdates (false); +static bool l_RunningPackageUpdates(false); +// A timestamp that indicates the last time an Icinga 2 reload failed. +static double l_LastReloadFailedTime(0); +static std::mutex l_RunningPackageUpdatesMutex; // Protects the above two variables. bool ConfigStagesHandler::HandleRequest( const WaitGroup::Ptr&, @@ -132,12 +135,36 @@ void ConfigStagesHandler::HandlePost( if (reload && !activate) BOOST_THROW_EXCEPTION(std::invalid_argument("Parameter 'reload' must be false when 'activate' is false.")); - if (m_RunningPackageUpdates.exchange(true)) { - return HttpUtility::SendJsonError(response, params, 423, - "Conflicting request, there is already an ongoing package update in progress. Please try it again later."); + { + std::lock_guard runningPackageUpdatesLock(l_RunningPackageUpdatesMutex); + double currentReloadFailedTime = Application::GetLastReloadFailed(); + + /** + * Once the m_RunningPackageUpdates flag is set, it typically remains set until the current worker process is + * terminated, in which case the new worker will have its own m_RunningPackageUpdates flag set to false. + * However, if the reload fails for any reason, the m_RunningPackageUpdates flag will remain set to true + * in the current worker process, which will prevent any further package updates from being processed until + * the next Icinga 2 restart. + * + * So, in order to prevent such a situation, we are additionally tracking the last time a reload failed + * and allow to bypass the m_RunningPackageUpdates flag only if the last reload failed time was changed + * since the previous request. + */ + if (l_RunningPackageUpdates && l_LastReloadFailedTime == currentReloadFailedTime) { + return HttpUtility::SendJsonError( + response, params, 423, + "Conflicting request, there is already an ongoing package update in progress. Please try it again later." + ); + } + + l_RunningPackageUpdates = true; + l_LastReloadFailedTime = currentReloadFailedTime; } - auto resetPackageUpdates (Shared::Make([]() { ConfigStagesHandler::m_RunningPackageUpdates.store(false); })); + auto resetPackageUpdates (Shared::Make([]() { + std::lock_guard lock(l_RunningPackageUpdatesMutex); + l_RunningPackageUpdates = false; + })); std::unique_lock lock(ConfigPackageUtility::GetStaticPackageMutex()); diff --git a/lib/remote/configstageshandler.hpp b/lib/remote/configstageshandler.hpp index a26ddc49c..a6abb726d 100644 --- a/lib/remote/configstageshandler.hpp +++ b/lib/remote/configstageshandler.hpp @@ -4,7 +4,6 @@ #define CONFIGSTAGESHANDLER_H #include "remote/httphandler.hpp" -#include namespace icinga { @@ -48,8 +47,6 @@ private: boost::beast::http::response& response, const Dictionary::Ptr& params ); - - static std::atomic m_RunningPackageUpdates; }; } From 1ac4d8396321976f58da743990af8829b9d3da4d Mon Sep 17 00:00:00 2001 From: Yonas Habteab Date: Tue, 1 Jul 2025 12:00:34 +0200 Subject: [PATCH 2/3] Use `AtomicFile` where applicable in `ConfigPackageUtility` --- lib/remote/configpackageutility.cpp | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/lib/remote/configpackageutility.cpp b/lib/remote/configpackageutility.cpp index e79540147..949e69b00 100644 --- a/lib/remote/configpackageutility.cpp +++ b/lib/remote/configpackageutility.cpp @@ -3,6 +3,7 @@ #include "remote/configpackageutility.hpp" #include "remote/apilistener.hpp" #include "base/application.hpp" +#include "base/atomic-file.hpp" #include "base/exception.hpp" #include "base/utility.hpp" #include @@ -119,14 +120,9 @@ String ConfigPackageUtility::CreateStage(const String& packageName, const Dictio void ConfigPackageUtility::WritePackageConfig(const String& packageName) { String stageName = GetActiveStage(packageName); + AtomicFile::Write(GetPackageDir() + "/" + packageName + "/include.conf", 0644, "include \"*/include.conf\"\n"); - String includePath = GetPackageDir() + "/" + packageName + "/include.conf"; - std::ofstream fpInclude(includePath.CStr(), std::ofstream::out | std::ostream::binary | std::ostream::trunc); - fpInclude << "include \"*/include.conf\"\n"; - fpInclude.close(); - - String activePath = GetPackageDir() + "/" + packageName + "/active.conf"; - std::ofstream fpActive(activePath.CStr(), std::ofstream::out | std::ostream::binary | std::ostream::trunc); + AtomicFile fpActive(GetPackageDir() + "/" + packageName + "/active.conf", 0644); fpActive << "if (!globals.contains(\"ActiveStages\")) {\n" << " globals.ActiveStages = {}\n" << "}\n" @@ -145,19 +141,18 @@ void ConfigPackageUtility::WritePackageConfig(const String& packageName) << "if (!ActiveStages.contains(\"" << packageName << "\")) {\n" << " ActiveStages[\"" << packageName << "\"] = \"" << stageName << "\"\n" << "}\n"; - fpActive.close(); + fpActive.Commit(); } void ConfigPackageUtility::WriteStageConfig(const String& packageName, const String& stageName) { - String path = GetPackageDir() + "/" + packageName + "/" + stageName + "/include.conf"; - std::ofstream fp(path.CStr(), std::ofstream::out | std::ostream::binary | std::ostream::trunc); + AtomicFile fp(GetPackageDir() + "/" + packageName + "/" + stageName + "/include.conf", 0644); fp << "include \"../active.conf\"\n" << "if (ActiveStages[\"" << packageName << "\"] == \"" << stageName << "\") {\n" << " include_recursive \"conf.d\"\n" << " include_zones \"" << packageName << "\", \"zones.d\"\n" << "}\n"; - fp.close(); + fp.Commit(); } void ConfigPackageUtility::ActivateStage(const String& packageName, const String& stageName) @@ -284,12 +279,7 @@ String ConfigPackageUtility::GetActiveStageFromFile(const String& packageName) void ConfigPackageUtility::SetActiveStageToFile(const String& packageName, const String& stageName) { std::unique_lock lock(GetStaticActiveStageMutex()); - - String activeStagePath = GetPackageDir() + "/" + packageName + "/active-stage"; - - std::ofstream fpActiveStage(activeStagePath.CStr(), std::ofstream::out | std::ostream::binary | std::ostream::trunc); //TODO: fstream exceptions - fpActiveStage << stageName; - fpActiveStage.close(); + AtomicFile::Write(GetPackageDir() + "/" + packageName + "/active-stage", 0644, stageName); } String ConfigPackageUtility::GetActiveStage(const String& packageName) From ce3275d27f710bc239b914204e15ecae5540637f Mon Sep 17 00:00:00 2001 From: Yonas Habteab Date: Tue, 22 Jul 2025 11:25:03 +0200 Subject: [PATCH 3/3] Disallow stage deletions during reload Once the new worker process has read the config, it also includes a `include */include.conf` statement within the config packages root directory, and from there on we must not allow to delete any stage directory from the config package. Otherwise, when the worker actually evaluates that include statement, it will fail to find the directory where the include file is located, or the `active.conf` file, which is included from each stage's `include.conf` file, thus causing the worker fail. Co-Authored-By: Johannes Schmidt --- lib/remote/configpackageshandler.cpp | 13 +++++++++++++ lib/remote/configstageshandler.cpp | 13 +++++++++++++ 2 files changed, 26 insertions(+) diff --git a/lib/remote/configpackageshandler.cpp b/lib/remote/configpackageshandler.cpp index 7987092bc..f24f5b1d2 100644 --- a/lib/remote/configpackageshandler.cpp +++ b/lib/remote/configpackageshandler.cpp @@ -2,6 +2,7 @@ #include "remote/configpackageshandler.hpp" #include "remote/configpackageutility.hpp" +#include "remote/configobjectslock.hpp" #include "remote/httputility.hpp" #include "remote/filterutility.hpp" #include "base/exception.hpp" @@ -111,6 +112,12 @@ void ConfigPackagesHandler::HandlePost( return; } + ConfigObjectsSharedLock configObjectsSharedLock(std::try_to_lock); + if (!configObjectsSharedLock) { + HttpUtility::SendJsonError(response, params, 503, "Icinga is reloading"); + return; + } + try { std::unique_lock lock(ConfigPackageUtility::GetStaticPackageMutex()); @@ -157,6 +164,12 @@ void ConfigPackagesHandler::HandleDelete( return; } + ConfigObjectsSharedLock lock(std::try_to_lock); + if (!lock) { + HttpUtility::SendJsonError(response, params, 503, "Icinga is reloading"); + return; + } + try { ConfigPackageUtility::DeletePackage(packageName); } catch (const std::exception& ex) { diff --git a/lib/remote/configstageshandler.cpp b/lib/remote/configstageshandler.cpp index 95d1b1b6e..451ba1dbf 100644 --- a/lib/remote/configstageshandler.cpp +++ b/lib/remote/configstageshandler.cpp @@ -2,6 +2,7 @@ #include "remote/configstageshandler.hpp" #include "remote/configpackageutility.hpp" +#include "remote/configobjectslock.hpp" #include "remote/httputility.hpp" #include "remote/filterutility.hpp" #include "base/application.hpp" @@ -135,6 +136,12 @@ void ConfigStagesHandler::HandlePost( if (reload && !activate) BOOST_THROW_EXCEPTION(std::invalid_argument("Parameter 'reload' must be false when 'activate' is false.")); + ConfigObjectsSharedLock configObjectsSharedLock(std::try_to_lock); + if (!configObjectsSharedLock) { + HttpUtility::SendJsonError(response, params, 503, "Icinga is reloading"); + return; + } + { std::lock_guard runningPackageUpdatesLock(l_RunningPackageUpdatesMutex); double currentReloadFailedTime = Application::GetLastReloadFailed(); @@ -228,6 +235,12 @@ void ConfigStagesHandler::HandleDelete( if (!ConfigPackageUtility::ValidateStageName(stageName)) return HttpUtility::SendJsonError(response, params, 400, "Invalid stage name '" + stageName + "'."); + ConfigObjectsSharedLock lock(std::try_to_lock); + if (!lock) { + HttpUtility::SendJsonError(response, params, 503, "Icinga is reloading"); + return; + } + try { ConfigPackageUtility::DeleteStage(packageName, stageName); } catch (const std::exception& ex) {