/* File: icinga2/lib/icinga/clusterevents-check.cpp */

/* Icinga 2 | (c) 2012 Icinga GmbH | GPLv2+ */
#include "icinga/clusterevents.hpp"
#include "icinga/icingaapplication.hpp"
#include "remote/apilistener.hpp"
#include "base/configuration.hpp"
#include "base/defer.hpp"
#include "base/serializer.hpp"
#include "base/exception.hpp"
#include <boost/thread/once.hpp>
#include <thread>
using namespace icinga;
/* Definitions of the static state used by the remote check queue in this file. */

/* Locked by EnqueueCheck() and RemoteCheckThreadProc() while they touch the members below. */
std::mutex ClusterEvents::m_Mutex;

/* Pending requests; EnqueueCheck() appends, RemoteCheckThreadProc() pops from the front. */
std::deque<std::function<void ()>> ClusterEvents::m_CheckRequestQueue;

/* True while a detached scheduler thread started by EnqueueCheck() is draining the queue. */
bool ClusterEvents::m_CheckSchedulerRunning = false;

/* Counters reported and reset by LogRemoteCheckQueueInformation(). */
int ClusterEvents::m_ChecksExecutedDuringInterval = 0;
int ClusterEvents::m_ChecksDroppedDuringInterval = 0;

/* Timer created on the first EnqueueCheck() call; triggers LogRemoteCheckQueueInformation(). */
Timer::Ptr ClusterEvents::m_LogTimer;
void ClusterEvents::RemoteCheckThreadProc()
{
Utility::SetThreadName("Remote Check Scheduler");
int maxConcurrentChecks = IcingaApplication::GetInstance()->GetMaxConcurrentChecks();
2021-02-02 10:16:04 +01:00
std::unique_lock<std::mutex> lock(m_Mutex);
for(;;) {
if (m_CheckRequestQueue.empty())
break;
lock.unlock();
Checkable::AquirePendingCheckSlot(maxConcurrentChecks);
lock.lock();
auto callback = m_CheckRequestQueue.front();
m_CheckRequestQueue.pop_front();
m_ChecksExecutedDuringInterval++;
lock.unlock();
callback();
Checkable::DecreasePendingChecks();
lock.lock();
}
m_CheckSchedulerRunning = false;
}
/**
 * Queues an 'execute command' request for asynchronous processing.
 *
 * The first call creates and starts the timer that periodically logs queue statistics.
 * A request is counted as dropped and discarded when the queue already holds 25000 entries.
 * If no scheduler thread is currently running, a detached one is started.
 *
 * @param origin Origin of the request, passed on to ExecuteCheckFromQueue().
 * @param params Request parameters, passed on to ExecuteCheckFromQueue().
 */
void ClusterEvents::EnqueueCheck(const MessageOrigin::Ptr& origin, const Dictionary::Ptr& params)
{
	/* Maximum number of requests that may wait in the queue at the same time. */
	const size_t maxQueuedRequests = 25000;

	static boost::once_flag logTimerOnce = BOOST_ONCE_INIT;

	boost::call_once(logTimerOnce, []() {
		m_LogTimer = Timer::Create();
		m_LogTimer->SetInterval(10);
		m_LogTimer->OnTimerExpired.connect([](const Timer * const&) { LogRemoteCheckQueueInformation(); });
		m_LogTimer->Start();
	});

	std::unique_lock<std::mutex> guard(m_Mutex);

	if (m_CheckRequestQueue.size() < maxQueuedRequests) {
		m_CheckRequestQueue.emplace_back([origin, params]() { ExecuteCheckFromQueue(origin, params); });

		if (!m_CheckSchedulerRunning) {
			/* The new thread cannot inspect the queue before this function releases the mutex. */
			std::thread scheduler(ClusterEvents::RemoteCheckThreadProc);
			scheduler.detach();

			m_CheckSchedulerRunning = true;
		}
	} else {
		m_ChecksDroppedDuringInterval++;
	}
}
/**
 * Reports the outcome of an executed command back to whoever requested it.
 *
 * Builds the event::ExecutedCommand parameters from the request and the given result.
 * For a local origin they are handed directly to ClusterEvents::ExecutedCommandAPIHandler();
 * otherwise they are wrapped in a JSON-RPC message and sent to sourceEndpoint.
 *
 * @param params         Original request; "source", "host" and (optionally) "service" are copied.
 * @param exitStatus     Exit status to report.
 * @param output         Output to report.
 * @param start          Value reported as "start".
 * @param end            Value reported as "end".
 * @param listener       API listener used for sending in the non-local case.
 * @param origin         Origin of the request.
 * @param sourceEndpoint Endpoint the message is sent to in the non-local case.
 */
static void SendEventExecutedCommand(const Dictionary::Ptr& params, long exitStatus, const String& output,
	double start, double end, const ApiListener::Ptr& listener, const MessageOrigin::Ptr& origin,
	const Endpoint::Ptr& sourceEndpoint)
{
	Dictionary::Ptr result = new Dictionary();

	result->Set("execution", params->Get("source"));
	result->Set("host", params->Get("host"));

	if (params->Contains("service")) {
		result->Set("service", params->Get("service"));
	}

	result->Set("exit", exitStatus);
	result->Set("output", output);
	result->Set("start", start);
	result->Set("end", end);

	/* Local requests are handled in-process, no cluster message needed. */
	if (origin->IsLocal()) {
		ClusterEvents::ExecutedCommandAPIHandler(origin, result);
		return;
	}

	Dictionary::Ptr message = new Dictionary();

	message->Set("jsonrpc", "2.0");
	message->Set("method", "event::ExecutedCommand");
	message->Set("params", result);

	listener->SyncSendMessage(sourceEndpoint, message);
}
/**
 * Processes one queued 'execute command' request.
 *
 * Validates the origin of the request, installs (for requests carrying a "source" execution id)
 * a process-finished handler that reports the outcome via SendEventExecutedCommand(), and then
 * runs the requested check, event or notification command on a virtual host object.
 *
 * @param origin Origin of the request (client connection and zone, or local).
 * @param params Request parameters. Keys read here: "host", "service", "source", "deadline",
 *               "command", "command_type", "check_timeout", "macros", "user", "notification".
 */
void ClusterEvents::ExecuteCheckFromQueue(const MessageOrigin::Ptr& origin, const Dictionary::Ptr& params) {
	Endpoint::Ptr sourceEndpoint;

	if (origin->FromClient) {
		sourceEndpoint = origin->FromClient->GetEndpoint();
	} else if (origin->IsLocal()){
		sourceEndpoint = Endpoint::GetLocalEndpoint();
	}

	/*
	 * Describes the sender for log messages. origin->FromClient is null for requests that did not
	 * arrive via a client connection, so it must not be dereferenced unconditionally.
	 */
	auto describeOrigin = [&origin, &sourceEndpoint]() {
		String identity = "<unknown>";

		if (origin->FromClient) {
			identity = origin->FromClient->GetIdentity();
		} else if (sourceEndpoint) {
			identity = sourceEndpoint->GetName();
		}

		return identity;
	};

	if (!sourceEndpoint || (origin->FromZone && !Zone::GetLocalZone()->IsChildOf(origin->FromZone))) {
		Log(LogNotice, "ClusterEvents")
			<< "Discarding 'execute command' message from '" << describeOrigin() << "': Invalid endpoint origin (client not allowed).";
		return;
	}

	ApiListener::Ptr listener = ApiListener::GetInstance();

	if (!listener) {
		Log(LogCritical, "ApiListener") << "No instance available.";
		return;
	}

	/* Whatever handler gets installed below is removed again when this function returns. */
	Defer resetExecuteCommandProcessFinishedHandler ([]() {
		Checkable::ExecuteCommandProcessFinishedHandler = nullptr;
	});

	if (params->Contains("source")) {
		String uuid = params->Get("source");

		String checkableName = params->Get("host");

		if (params->Contains("service"))
			checkableName += "!" + params->Get("service");

		/* Check deadline */
		double deadline = params->Get("deadline");

		if (Utility::GetTime() > deadline) {
			Log(LogNotice, "ApiListener")
				<< "Discarding 'ExecuteCheckFromQueue' event for checkable '" << checkableName
				<< "' from '" << describeOrigin() << "': Deadline has expired.";
			return;
		}

		/* Reports the result of the spawned process back to the requester. */
		Checkable::ExecuteCommandProcessFinishedHandler = [checkableName, listener, sourceEndpoint, origin, params] (const Value& commandLine, const ProcessResult& pr) {
			if (params->Get("command_type") == "check_command") {
				Checkable::CurrentConcurrentChecks.fetch_sub(1);
				Checkable::DecreasePendingChecks();
			}

			if (pr.ExitStatus > 3) {
				Process::Arguments parguments = Process::PrepareCommand(commandLine);
				Log(LogWarning, "ApiListener")
					<< "Command for object '" << checkableName << "' (PID: " << pr.PID
					<< ", arguments: " << Process::PrettyPrintArguments(parguments) << ") terminated with exit code "
					<< pr.ExitStatus << ", output: " << pr.Output;
			}

			SendEventExecutedCommand(params, pr.ExitStatus, pr.Output, pr.ExecutionStart, pr.ExecutionEnd, listener,
				origin, sourceEndpoint);
		};
	}

	if (!listener->GetAcceptCommands() && !origin->IsLocal()) {
		Log(LogWarning, "ApiListener")
			<< "Ignoring command. '" << listener->GetName() << "' does not accept commands.";

		String output = "Endpoint '" + Endpoint::GetLocalEndpoint()->GetName() + "' does not accept commands.";

		if (params->Contains("source")) {
			double now = Utility::GetTime();
			SendEventExecutedCommand(params, 126, output, now, now, listener, origin, sourceEndpoint);
		} else {
			/* No execution id: answer with an UNKNOWN check result for a virtual host object. */
			Host::Ptr host = new Host();
			Dictionary::Ptr attrs = new Dictionary();

			attrs->Set("__name", params->Get("host"));
			attrs->Set("type", "Host");
			attrs->Set("enable_active_checks", false);

			Deserialize(host, attrs, false, FAConfig);

			if (params->Contains("service"))
				host->SetExtension("agent_service_name", params->Get("service"));

			CheckResult::Ptr cr = new CheckResult();
			cr->SetState(ServiceUnknown);
			cr->SetOutput(output);

			Dictionary::Ptr message = MakeCheckResultMessage(host, cr);
			listener->SyncSendMessage(sourceEndpoint, message);
		}

		return;
	}

	/* use a virtual host object for executing the command */
	Host::Ptr host = new Host();
	Dictionary::Ptr attrs = new Dictionary();

	attrs->Set("__name", params->Get("host"));
	attrs->Set("type", "Host");

	/*
	 * Override the check timeout if the parent caller provided the value. Compatible with older versions not
	 * passing this inside the cluster message.
	 * This happens with host/service command_endpoint agents and the 'check_timeout' attribute being specified.
	 */
	if (params->Contains("check_timeout"))
		attrs->Set("check_timeout", params->Get("check_timeout"));

	Deserialize(host, attrs, false, FAConfig);

	if (params->Contains("service"))
		host->SetExtension("agent_service_name", params->Get("service"));

	String command = params->Get("command");
	String command_type = params->Get("command_type");

	/* Refuse commands that are not known on this node before touching the virtual host any further. */
	if (command_type == "check_command") {
		if (!CheckCommand::GetByName(command)) {
			ServiceState state = ServiceUnknown;
			String output = "Check command '" + command + "' does not exist.";
			double now = Utility::GetTime();

			if (params->Contains("source")) {
				SendEventExecutedCommand(params, state, output, now, now, listener, origin, sourceEndpoint);
			} else {
				CheckResult::Ptr cr = new CheckResult();
				cr->SetState(state);
				cr->SetOutput(output);
				Dictionary::Ptr message = MakeCheckResultMessage(host, cr);
				listener->SyncSendMessage(sourceEndpoint, message);
			}

			return;
		}
	} else if (command_type == "event_command") {
		if (!EventCommand::GetByName(command)) {
			String output = "Event command '" + command + "' does not exist.";
			Log(LogWarning, "ClusterEvents") << output;

			if (params->Contains("source")) {
				double now = Utility::GetTime();
				SendEventExecutedCommand(params, ServiceUnknown, output, now, now, listener, origin, sourceEndpoint);
			}

			return;
		}
	} else if (command_type == "notification_command") {
		if (!NotificationCommand::GetByName(command)) {
			String output = "Notification command '" + command + "' does not exist.";
			Log(LogWarning, "ClusterEvents") << output;

			if (params->Contains("source")) {
				double now = Utility::GetTime();
				SendEventExecutedCommand(params, ServiceUnknown, output, now, now, listener, origin, sourceEndpoint);
			}

			return;
		}
	}

	attrs->Set(command_type, params->Get("command"));
	attrs->Set("command_endpoint", sourceEndpoint->GetName());

	Deserialize(host, attrs, false, FAConfig);

	host->SetExtension("agent_check", true);

	Dictionary::Ptr macros = params->Get("macros");

	if (command_type == "check_command") {
		try {
			host->ExecuteRemoteCheck(macros);
		} catch (const std::exception& ex) {
			String output = "Exception occurred while checking '" + host->GetName() + "': " + DiagnosticInformation(ex);
			ServiceState state = ServiceUnknown;
			double now = Utility::GetTime();

			if (params->Contains("source")) {
				SendEventExecutedCommand(params, state, output, now, now, listener, origin, sourceEndpoint);
			} else {
				CheckResult::Ptr cr = new CheckResult();
				cr->SetState(state);
				cr->SetOutput(output);
				cr->SetScheduleStart(now);
				cr->SetScheduleEnd(now);
				cr->SetExecutionStart(now);
				cr->SetExecutionEnd(now);

				Dictionary::Ptr message = MakeCheckResultMessage(host, cr);
				listener->SyncSendMessage(sourceEndpoint, message);
			}

			Log(LogCritical, "checker", output);
		}
	} else if (command_type == "event_command") {
		try {
			host->ExecuteEventHandler(macros, true);
		} catch (const std::exception& ex) {
			if (params->Contains("source")) {
				String output = "Exception occurred while executing event command '" + command + "' for '" +
					host->GetName() + "': " + DiagnosticInformation(ex);

				double now = Utility::GetTime();
				SendEventExecutedCommand(params, ServiceUnknown, output, now, now, listener, origin, sourceEndpoint);
			} else {
				/* Nobody to report to: let the caller deal with the error. */
				throw;
			}
		}
	} else if (command_type == "notification_command" && params->Contains("source")) {
		/* Get user */
		User::Ptr user = new User();
		Dictionary::Ptr userAttrs = new Dictionary();
		userAttrs->Set("__name", params->Get("user"));
		userAttrs->Set("type", User::GetTypeName());

		Deserialize(user, userAttrs, false, FAConfig);

		/* Get notification */
		Notification::Ptr notification = new Notification();
		Dictionary::Ptr notificationAttrs = new Dictionary();
		notificationAttrs->Set("__name", params->Get("notification"));
		notificationAttrs->Set("type", Notification::GetTypeName());
		notificationAttrs->Set("command", command);

		Deserialize(notification, notificationAttrs, false, FAConfig);

		try {
			CheckResult::Ptr cr = new CheckResult();
			/* NOTE(review): assumes the request always carries "macros" - confirm against the sender. */
			String author = macros->Get("notification_author");
			NotificationCommand::Ptr notificationCommand = NotificationCommand::GetByName(command);

			notificationCommand->Execute(notification, user, cr, NotificationType::NotificationCustom,
				author, "");
		} catch (const std::exception& ex) {
			/* NOTE(review): assumes GetCheckable() is non-null for this virtual notification - confirm. */
			String output = "Exception occurred during notification '" + notification->GetName()
				+ "' for checkable '" + notification->GetCheckable()->GetName()
				+ "' and user '" + user->GetName() + "' using command '" + command + "': "
				+ DiagnosticInformation(ex, false);
			double now = Utility::GetTime();
			SendEventExecutedCommand(params, ServiceUnknown, output, now, now, listener, origin, sourceEndpoint);
		}
	}
}
int ClusterEvents::GetCheckRequestQueueSize()
{
return m_CheckRequestQueue.size();
}
void ClusterEvents::LogRemoteCheckQueueInformation() {
if (m_ChecksDroppedDuringInterval > 0) {
Log(LogCritical, "ClusterEvents")
<< "Remote check queue ran out of slots. "
<< m_ChecksDroppedDuringInterval << " checks dropped.";
m_ChecksDroppedDuringInterval = 0;
}
if (m_ChecksExecutedDuringInterval == 0)
return;
Log(LogInformation, "RemoteCheckQueue")
<< "items: " << m_CheckRequestQueue.size()
<< ", rate: " << m_ChecksExecutedDuringInterval / 10 << "/s "
<< "(" << m_ChecksExecutedDuringInterval * 6 << "/min "
<< m_ChecksExecutedDuringInterval * 6 * 5 << "/5min "
<< m_ChecksExecutedDuringInterval * 6 * 15 << "/15min" << ");";
m_ChecksExecutedDuringInterval = 0;
}