Mirror of https://github.com/Icinga/icinga2.git (synced 2025-09-26 19:18:48 +02:00)
This commit changes the ordering of CheckableScheduleInfo in the multi-index container so that checkables with a running check are pushed to the end of the ordering. This prevents them from being prioritized for scheduling ahead of other checkables, which could otherwise cause unnecessary CPU load from repeated scheduling attempts. By using a very large value as the ordering key for checkables with running checks, they are effectively deprioritized until their current check completes and they are reinserted with their actual next check time.
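The following minimal sketch illustrates the ordering idea described in the commit message. It is not the actual Icinga 2 implementation (the real code keeps CheckableScheduleInfo entries in a multi-index container inside the checker component); the type names, the key function and the use of std::multiset below are illustrative assumptions only.

// Sketch only: running checks sort behind everything else in the schedule index.
// Names and container choice are assumptions, not the Icinga 2 code.
#include <limits>
#include <set>
#include <string>

struct ScheduleInfo
{
	std::string Name;          // checkable name
	double NextCheck = 0;      // planned next check time (Unix timestamp)
	bool CheckRunning = false; // true while a check is currently in flight
};

// Ordering key: entries with a running check get an effectively infinite
// timestamp, so they sort to the very end and are not picked again until
// the check result arrives and the entry is re-inserted with its real
// next check time.
inline double SchedulingKey(const ScheduleInfo& info)
{
	return info.CheckRunning ? std::numeric_limits<double>::max() : info.NextCheck;
}

struct BySchedulingKey
{
	bool operator()(const ScheduleInfo& a, const ScheduleInfo& b) const
	{
		return SchedulingKey(a) < SchedulingKey(b);
	}
};

// The scheduler always takes *begin() (the most overdue idle checkable);
// running checkables stay out of the way at the end of the ordering.
using ScheduleIndex = std::multiset<ScheduleInfo, BySchedulingKey>;

With this ordering, the scheduler can keep polling the front of the index for the most overdue idle checkable; entries whose check is still running sort behind everything else and only move forward again once their result has been processed and they are re-inserted with their real next check time.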
731 lines · 24 KiB · C++
/* Icinga 2 | (c) 2012 Icinga GmbH | GPLv2+ */

#include "icinga/checkable.hpp"
#include "icinga/service.hpp"
#include "icinga/host.hpp"
#include "icinga/checkcommand.hpp"
#include "icinga/icingaapplication.hpp"
#include "icinga/cib.hpp"
#include "icinga/clusterevents.hpp"
#include "remote/messageorigin.hpp"
#include "remote/apilistener.hpp"
#include "base/objectlock.hpp"
#include "base/logger.hpp"
#include "base/convert.hpp"
#include "base/utility.hpp"
#include "base/context.hpp"
#include <shared_mutex>

using namespace icinga;

boost::signals2::signal<void (const Checkable::Ptr&, const CheckResult::Ptr&, const MessageOrigin::Ptr&)> Checkable::OnNewCheckResult;
boost::signals2::signal<void (const Checkable::Ptr&, const CheckResult::Ptr&, StateType, const MessageOrigin::Ptr&)> Checkable::OnStateChange;
boost::signals2::signal<void (const Checkable::Ptr&, const CheckResult::Ptr&, std::set<Checkable::Ptr>, const MessageOrigin::Ptr&)> Checkable::OnReachabilityChanged;
boost::signals2::signal<void (const Checkable::Ptr&, NotificationType, const CheckResult::Ptr&, const String&, const String&, const MessageOrigin::Ptr&)> Checkable::OnNotificationsRequested;
boost::signals2::signal<void (const Checkable::Ptr&)> Checkable::OnNextCheckUpdated;

Atomic<uint_fast64_t> Checkable::CurrentConcurrentChecks (0);

std::mutex Checkable::m_StatsMutex;
int Checkable::m_PendingChecks = 0;
std::condition_variable Checkable::m_PendingChecksCV;

CheckCommand::Ptr Checkable::GetCheckCommand() const
{
	return dynamic_pointer_cast<CheckCommand>(NavigateCheckCommandRaw());
}

TimePeriod::Ptr Checkable::GetCheckPeriod() const
{
	return TimePeriod::GetByName(GetCheckPeriodRaw());
}

void Checkable::SetSchedulingOffset(long offset)
{
	m_SchedulingOffset = offset;
}

long Checkable::GetSchedulingOffset() const
{
	return m_SchedulingOffset;
}

void Checkable::UpdateNextCheck(const MessageOrigin::Ptr& origin)
{
	double interval;

	if (GetStateType() == StateTypeSoft && GetLastCheckResult() != nullptr)
		interval = GetRetryInterval();
	else
		interval = GetCheckInterval();

	double now = Utility::GetTime();
	double adj = 0;

	if (interval > 1)
		adj = fmod(now * 100 + GetSchedulingOffset(), interval * 100) / 100.0;

	if (adj != 0.0)
		adj = std::min(0.5 + fmod(GetSchedulingOffset(), interval * 5) / 100.0, adj);

	double nextCheck = now - adj + interval;
	double lastCheck = GetLastCheck();

	Log(LogDebug, "Checkable")
		<< std::fixed << std::setprecision(0)
		<< "Update checkable '" << GetName() << "' with check interval '" << GetCheckInterval()
		<< "' from last check time at " << Utility::FormatDateTime("%Y-%m-%d %H:%M:%S %z", (lastCheck < 0 ? 0 : lastCheck))
		<< " (" << lastCheck << ") to next check time at "
		<< Utility::FormatDateTime("%Y-%m-%d %H:%M:%S %z", nextCheck) << " (" << nextCheck << ").";

	SetNextCheck(nextCheck, false, origin);
}

bool Checkable::HasBeenChecked() const
{
	return GetLastCheckResult() != nullptr;
}

bool Checkable::HasRunningCheck() const
{
	return m_CheckRunning;
}

double Checkable::GetLastCheck() const
{
	CheckResult::Ptr cr = GetLastCheckResult();
	double schedule_end = -1;

	if (cr)
		schedule_end = cr->GetScheduleEnd();

	return schedule_end;
}

Checkable::ProcessingResult Checkable::ProcessCheckResult(const CheckResult::Ptr& cr, const WaitGroup::Ptr& producer, const MessageOrigin::Ptr& origin)
{
	using Result = Checkable::ProcessingResult;

	VERIFY(cr);
	VERIFY(producer);

	ObjectLock olock(this);
	m_CheckRunning.store(false);

	double now = Utility::GetTime();

	if (cr->GetScheduleStart() == 0)
		cr->SetScheduleStart(now);

	if (cr->GetScheduleEnd() == 0)
		cr->SetScheduleEnd(now);

	if (cr->GetExecutionStart() == 0)
		cr->SetExecutionStart(now);

	if (cr->GetExecutionEnd() == 0)
		cr->SetExecutionEnd(now);

	if (!origin || origin->IsLocal())
		cr->SetSchedulingSource(IcingaApplication::GetInstance()->GetNodeName());

	Endpoint::Ptr command_endpoint = GetCommandEndpoint();

	if (cr->GetCheckSource().IsEmpty()) {
		if ((!origin || origin->IsLocal()))
			cr->SetCheckSource(IcingaApplication::GetInstance()->GetNodeName());

		/* override check source if command_endpoint was defined */
		if (command_endpoint && !GetExtension("agent_check"))
			cr->SetCheckSource(command_endpoint->GetName());
	}

	std::shared_lock producerLock (*producer, std::try_to_lock);

	if (!producerLock) {
		// Discard the check result to not delay the current reload.
		// We'll re-run the check immediately after the reload.
		return Result::CheckableInactive;
	}

	/* agent checks go through the api */
	if (command_endpoint && GetExtension("agent_check")) {
		ApiListener::Ptr listener = ApiListener::GetInstance();

		if (listener) {
			/* send message back to its origin */
			Dictionary::Ptr message = ClusterEvents::MakeCheckResultMessage(this, cr);
			listener->SyncSendMessage(command_endpoint, message);
		}

		return Result::Ok;

	}

	if (!IsActive())
		return Result::CheckableInactive;

	bool reachable = IsReachable();
	bool notification_reachable = IsReachable(DependencyNotification);

	// Cache whether the previous state of this Checkable affects its children before overwriting the last check result.
	// This will be used to determine whether the on reachability changed event should be triggered.
	bool affectsPreviousStateChildren(reachable && AffectsChildren());

	CheckResult::Ptr old_cr = GetLastCheckResult();
	ServiceState old_state = GetStateRaw();
	StateType old_stateType = GetStateType();
	long old_attempt = GetCheckAttempt();
	bool recovery = false;

	/* When we already have a check result (not after a fresh start),
	 * prevent accepting old check results and allow overrides for
	 * CRs that happened in the future.
	 */
	if (old_cr) {
		double currentCRTimestamp = old_cr->GetExecutionStart();
		double newCRTimestamp = cr->GetExecutionStart();

		/* Our current timestamp may be from the future (wrong server time adjusted again). Allow overrides here. */
		if (currentCRTimestamp > now) {
			/* our current CR is from the future, let the new CR override it. */
			Log(LogDebug, "Checkable")
				<< std::fixed << std::setprecision(6) << "Processing check result for checkable '" << GetName() << "' from "
				<< Utility::FormatDateTime("%Y-%m-%d %H:%M:%S %z", newCRTimestamp) << " (" << newCRTimestamp
				<< "). Overriding since ours is from the future at "
				<< Utility::FormatDateTime("%Y-%m-%d %H:%M:%S %z", currentCRTimestamp) << " (" << currentCRTimestamp << ").";
		} else {
			/* Current timestamp is from the past, but the new timestamp is even more in the past. Skip it. */
			if (newCRTimestamp < currentCRTimestamp) {
				Log(LogDebug, "Checkable")
					<< std::fixed << std::setprecision(6) << "Skipping check result for checkable '" << GetName() << "' from "
					<< Utility::FormatDateTime("%Y-%m-%d %H:%M:%S %z", newCRTimestamp) << " (" << newCRTimestamp
					<< "). It is in the past compared to ours at "
					<< Utility::FormatDateTime("%Y-%m-%d %H:%M:%S %z", currentCRTimestamp) << " (" << currentCRTimestamp << ").";
				return Result::NewerCheckResultPresent;
			}
		}
	}

	/* The ExecuteCheck function already sets the old state, but we need to do it again
	 * in case this was a passive check result. */
	SetLastStateRaw(old_state);
	SetLastStateType(old_stateType);
	SetLastReachable(reachable);

	Host::Ptr host;
	Service::Ptr service;
	tie(host, service) = GetHostService(this);

	CheckableType checkableType = CheckableHost;
	if (service)
		checkableType = CheckableService;

	long attempt = 1;

	std::set<Checkable::Ptr> children = GetChildren();

	if (IsStateOK(cr->GetState())) {
		SetStateType(StateTypeHard); // NOT-OK -> HARD OK

		if (!IsStateOK(old_state))
			recovery = true;

		ResetNotificationNumbers();
		SaveLastState(ServiceOK, cr->GetExecutionEnd());
	} else {
		/* OK -> NOT-OK change, first SOFT state. Reset attempt counter. */
		if (IsStateOK(old_state)) {
			SetStateType(StateTypeSoft);
			attempt = 1;
		}

		/* SOFT state change, increase attempt counter. */
		if (old_stateType == StateTypeSoft && !IsStateOK(old_state)) {
			SetStateType(StateTypeSoft);
			attempt = old_attempt + 1;
		}

		/* HARD state change (e.g. previously 2/3 and this next attempt). Reset attempt counter. */
		if (attempt >= GetMaxCheckAttempts()) {
			SetStateType(StateTypeHard);
			attempt = 1;
		}

		if (!IsStateOK(cr->GetState())) {
			SaveLastState(cr->GetState(), cr->GetExecutionEnd());
		}
	}

	if (!reachable)
		SetLastStateUnreachable(cr->GetExecutionEnd());

	SetCheckAttempt(attempt);

	ServiceState new_state = cr->GetState();
	SetStateRaw(new_state);

	bool stateChange;

	/* Exception on state change calculation for hosts. */
	if (checkableType == CheckableService)
		stateChange = (old_state != new_state);
	else
		stateChange = (Host::CalculateState(old_state) != Host::CalculateState(new_state));

	/* Store the current last state change for the next iteration. */
	SetPreviousStateChange(GetLastStateChange());

	if (stateChange) {
		SetLastStateChange(cr->GetExecutionEnd());

		/* remove acknowledgements */
		if (GetAcknowledgement() == AcknowledgementNormal ||
			(GetAcknowledgement() == AcknowledgementSticky && IsStateOK(new_state))) {
			ClearAcknowledgement("");
		}
	}

	bool remove_acknowledgement_comments = false;

	if (GetAcknowledgement() == AcknowledgementNone)
		remove_acknowledgement_comments = true;

	bool hardChange = (GetStateType() == StateTypeHard && old_stateType == StateTypeSoft);

	if (stateChange && old_stateType == StateTypeHard && GetStateType() == StateTypeHard)
		hardChange = true;

	bool is_volatile = GetVolatile();

	if (hardChange || is_volatile) {
		SetLastHardStateRaw(new_state);
		SetLastHardStateChange(cr->GetExecutionEnd());
		SetLastHardStatesRaw(GetLastHardStatesRaw() / 100u + new_state * 100u);
	}

	if (stateChange) {
		SetLastSoftStatesRaw(GetLastSoftStatesRaw() / 100u + new_state * 100u);
	}

	cr->SetPreviousHardState(ServiceState(GetLastHardStatesRaw() % 100u));

	if (!IsStateOK(new_state))
		TriggerDowntimes(cr->GetExecutionEnd());

	/* statistics for external tools */
	Checkable::UpdateStatistics(cr, checkableType);

	bool in_downtime = IsInDowntime();

	bool send_notification = false;
	bool suppress_notification = !notification_reachable || in_downtime || IsAcknowledged();

	/* Send notifications when a hard state change occurred. */
	if (hardChange && !(old_stateType == StateTypeSoft && IsStateOK(new_state)))
		send_notification = true;
	/* Or if the checkable is volatile and in a HARD state. */
	else if (is_volatile && GetStateType() == StateTypeHard)
		send_notification = true;

	if (IsStateOK(old_state) && old_stateType == StateTypeSoft)
		send_notification = false; /* Don't send notifications for SOFT-OK -> HARD-OK. */

	if (is_volatile && IsStateOK(old_state) && IsStateOK(new_state))
		send_notification = false; /* Don't send notifications for volatile OK -> OK changes. */

	if (remove_acknowledgement_comments)
		RemoveAckComments(String(), cr->GetExecutionEnd());

	Dictionary::Ptr vars_after = new Dictionary({
		{ "state", new_state },
		{ "state_type", GetStateType() },
		{ "attempt", GetCheckAttempt() },
		{ "reachable", reachable }
	});

	if (old_cr)
		cr->SetVarsBefore(old_cr->GetVarsAfter());

	cr->SetVarsAfter(vars_after);

	if (service) {
		SetLastCheckResult(cr);
	} else {
		bool wasProblem = GetProblem();

		SetLastCheckResult(cr);

		if (GetProblem() != wasProblem) {
			auto services = host->GetServices();
			for (auto& service : services) {
				Service::OnHostProblemChanged(service, cr, origin);
			}
		}
	}

	bool was_flapping = IsFlapping();

	UpdateFlappingStatus(cr->GetState());

	bool is_flapping = IsFlapping();

	// Don't recompute the next check when the current check isn't generated by this endpoint. When the check is
	// remotely generated we should've already received the "SetNextCheck" event before the "event::CheckResult"
	// cluster event. Otherwise, the next check received before this check will be invalidated and cause the Checkable
	// "next_check/next_update" in an HA setup to always be different from the other endpoint as the "m_SchedulingOffset"
	// is randomly initialised on each node.
	if (!origin) {
		if (cr->GetActive()) {
			UpdateNextCheck();
		} else {
			/* Reschedule the next check for external passive check results. The side effect of
			 * this is that for as long as we receive results for a service we
			 * won't execute any active checks. */
			double offset;
			double ttl = cr->GetTtl();

			if (ttl > 0)
				offset = ttl;
			else
				offset = GetCheckInterval();

			SetNextCheck(Utility::GetTime() + offset);
		}
	}

#ifdef I2_DEBUG /* I2_DEBUG */
	Log(LogDebug, "Checkable")
		<< "Flapping: Checkable " << GetName()
		<< " was: " << was_flapping
		<< " is: " << is_flapping
		<< " threshold low: " << GetFlappingThresholdLow()
		<< " threshold high: " << GetFlappingThresholdHigh()
		<< "% current: " << GetFlappingCurrent() << "%.";
#endif /* I2_DEBUG */

	OnNewCheckResult(this, cr, origin);

	/* signal status updates to for example db_ido */
	OnStateChanged(this);

	String old_state_str = (service ? Service::StateToString(old_state) : Host::StateToString(Host::CalculateState(old_state)));
	String new_state_str = (service ? Service::StateToString(new_state) : Host::StateToString(Host::CalculateState(new_state)));

	/* Whether a hard state change or a volatile state change except OK -> OK happened. */
	if (hardChange || (is_volatile && !(IsStateOK(old_state) && IsStateOK(new_state)))) {
		OnStateChange(this, cr, StateTypeHard, origin);
		Log(LogNotice, "Checkable")
			<< "State Change: Checkable '" << GetName() << "' hard state change from " << old_state_str << " to " << new_state_str << " detected." << (is_volatile ? " Checkable is volatile." : "");
	}
	/* Whether a state change happened or the state type is SOFT (must be logged too). */
	else if (stateChange || GetStateType() == StateTypeSoft) {
		OnStateChange(this, cr, StateTypeSoft, origin);
		Log(LogNotice, "Checkable")
			<< "State Change: Checkable '" << GetName() << "' soft state change from " << old_state_str << " to " << new_state_str << " detected.";
	}

	if (GetStateType() == StateTypeSoft || hardChange || recovery ||
		(is_volatile && !(IsStateOK(old_state) && IsStateOK(new_state))))
		ExecuteEventHandler();

	int suppressed_types = 0;

	/* Flapping start/end notifications */
	if (!was_flapping && is_flapping) {
		/* FlappingStart notifications happen on state changes, not in downtimes */
		if (!IsPaused()) {
			if (in_downtime) {
				suppressed_types |= NotificationFlappingStart;
			} else {
				OnNotificationsRequested(this, NotificationFlappingStart, cr, "", "", nullptr);
			}
		}

		Log(LogNotice, "Checkable")
			<< "Flapping Start: Checkable '" << GetName() << "' started flapping (Current flapping value "
			<< GetFlappingCurrent() << "% > high threshold " << GetFlappingThresholdHigh() << "%).";

		NotifyFlapping(origin);
	} else if (was_flapping && !is_flapping) {
		/* FlappingEnd notifications are independent of state changes, must not happen in downtime */
		if (!IsPaused()) {
			if (in_downtime) {
				suppressed_types |= NotificationFlappingEnd;
			} else {
				OnNotificationsRequested(this, NotificationFlappingEnd, cr, "", "", nullptr);
			}
		}

		Log(LogNotice, "Checkable")
			<< "Flapping Stop: Checkable '" << GetName() << "' stopped flapping (Current flapping value "
			<< GetFlappingCurrent() << "% < low threshold " << GetFlappingThresholdLow() << "%).";

		NotifyFlapping(origin);
	}

	if (send_notification && !is_flapping) {
		if (!IsPaused()) {
			/* If there are still some pending suppressed state notifications, keep the suppression until these are
			 * handled by Checkable::FireSuppressedNotifications().
			 */
			bool pending = GetSuppressedNotifications() & (NotificationRecovery|NotificationProblem);

			if (suppress_notification || pending) {
				suppressed_types |= (recovery ? NotificationRecovery : NotificationProblem);
			} else {
				OnNotificationsRequested(this, recovery ? NotificationRecovery : NotificationProblem, cr, "", "", nullptr);
			}
		}
	}

	if (suppressed_types) {
		/* If some notifications were suppressed, but just because of e.g. a downtime,
		 * stash them into a notification types bitmask for maybe re-sending later.
		 */

		int suppressed_types_before (GetSuppressedNotifications());
		int suppressed_types_after (suppressed_types_before | suppressed_types);

		const int conflict = NotificationFlappingStart | NotificationFlappingEnd;
		if ((suppressed_types_after & conflict) == conflict) {
			/* Flapping start and end cancel out each other. */
			suppressed_types_after &= ~conflict;
		}

		const int stateNotifications = NotificationRecovery | NotificationProblem;
		if (!(suppressed_types_before & stateNotifications) && (suppressed_types & stateNotifications)) {
			/* A state-related notification is suppressed for the first time, store the previous state. When
			 * notifications are no longer suppressed, this can be compared with the current state to determine
			 * if a notification must be sent. This is done differently compared to flapping notifications just above
			 * as for state notifications, problem and recovery don't always cancel each other. For example,
			 * WARNING -> OK -> CRITICAL generates both types once, but there should still be a notification.
			 */
			SetStateBeforeSuppression(old_stateType == StateTypeHard ? old_state : ServiceOK);
		}

		if (suppressed_types_after != suppressed_types_before) {
			SetSuppressedNotifications(suppressed_types_after);
		}
	}

	/* update reachability for child objects */
	if ((stateChange || hardChange) && !children.empty() && (affectsPreviousStateChildren || AffectsChildren()))
		OnReachabilityChanged(this, cr, children, origin);

	olock.Unlock();

	if (recovery) {
		for (auto& child : children) {
			if (child->GetProblem() && child->GetEnableActiveChecks()) {
				auto nextCheck (now + Utility::Random() % 60);

				ObjectLock oLock (child);

				if (nextCheck < child->GetNextCheck()) {
					child->SetNextCheck(nextCheck);
				}
			}
		}
	}

	if (stateChange) {
		/* reschedule direct parents */
		for (const Checkable::Ptr& parent : GetParents()) {
			if (parent.get() == this)
				continue;

			if (!parent->GetEnableActiveChecks())
				continue;

			if (parent->GetNextCheck() >= now + parent->GetRetryInterval()) {
				ObjectLock olock(parent);
				parent->SetNextCheck(now);
			}
		}
	}

	return Result::Ok;
}

void Checkable::ExecuteRemoteCheck(const WaitGroup::Ptr& producer, const Dictionary::Ptr& resolvedMacros)
{
	CONTEXT("Executing remote check for object '" << GetName() << "'");

	double scheduled_start = GetNextCheck();
	double before_check = Utility::GetTime();

	CheckResult::Ptr cr = new CheckResult();
	cr->SetScheduleStart(scheduled_start);
	cr->SetExecutionStart(before_check);

	GetCheckCommand()->Execute(this, cr, producer, resolvedMacros, true);
}

void Checkable::ExecuteCheck(const WaitGroup::Ptr& producer)
{
	CONTEXT("Executing check for object '" << GetName() << "'");

	/* don't run another check if there is one pending */
	if (m_CheckRunning.exchange(true))
		return; // Should never happen as the checker already takes care of this.

	/* keep track of scheduling info in case the check type doesn't provide its own information */
	double scheduled_start = GetNextCheck();
	double before_check = Utility::GetTime();

	SetLastCheckStarted(Utility::GetTime());

	/* This calls SetNextCheck() which updates the CheckerComponent's idle/pending
	 * queues and ensures that checks are not fired multiple times. ProcessCheckResult()
	 * is called too late. See #6421.
	 */
	UpdateNextCheck();

	bool reachable = IsReachable();

	{
		ObjectLock olock(this);

		SetLastStateRaw(GetStateRaw());
		SetLastStateType(GetLastStateType());
		SetLastReachable(reachable);
	}

	CheckResult::Ptr cr = new CheckResult();

	cr->SetScheduleStart(scheduled_start);
	cr->SetExecutionStart(before_check);

	Endpoint::Ptr endpoint = GetCommandEndpoint();
	bool local = !endpoint || endpoint == Endpoint::GetLocalEndpoint();

	if (local) {
		GetCheckCommand()->Execute(this, cr, producer, nullptr, false);
	} else {
		Dictionary::Ptr macros = new Dictionary();
		GetCheckCommand()->Execute(this, cr, producer, macros, false);

		if (endpoint->GetConnected()) {
			/* perform check on remote endpoint */
			Dictionary::Ptr message = new Dictionary();
			message->Set("jsonrpc", "2.0");
			message->Set("method", "event::ExecuteCommand");

			Host::Ptr host;
			Service::Ptr service;
			tie(host, service) = GetHostService(this);

			Dictionary::Ptr params = new Dictionary();
			message->Set("params", params);
			params->Set("command_type", "check_command");
			params->Set("command", GetCheckCommand()->GetName());
			params->Set("host", host->GetName());

			if (service)
				params->Set("service", service->GetShortName());

			double checkTimeout = GetCheckCommand()->GetTimeout();

			/*
			 * If the host/service object specifies the 'check_timeout' attribute,
			 * forward this to the remote endpoint to limit the command execution time.
			 */
			if (auto ckCheckTimeout(GetCheckTimeout()); !ckCheckTimeout.IsEmpty()) {
				checkTimeout = Convert::ToDouble(ckCheckTimeout);
				params->Set("check_timeout", ckCheckTimeout);
			}

			params->Set("macros", macros);

			ApiListener::Ptr listener = ApiListener::GetInstance();

			if (listener)
				listener->SyncSendMessage(endpoint, message);

			/* Re-schedule the check so we don't run it again until after we've received
			 * a check result from the remote instance. The check will be re-scheduled
			 * using the proper check interval once we've received a check result.
			 */
			SetNextCheck(Utility::GetTime() + checkTimeout + 30);

		/*
		 * Let the user know that there was a problem with the check if
		 * 1) The endpoint is not syncing (replay log, etc.)
		 * 2) Outside of the cold startup window (5min)
		 */
		} else if (!endpoint->GetSyncing() && Application::GetInstance()->GetStartTime() < Utility::GetTime() - 300) {
			/* fail to perform check on unconnected endpoint */
			cr->SetState(ServiceUnknown);

			String output = "Remote Icinga instance '" + endpoint->GetName() + "' is not connected to ";

			Endpoint::Ptr localEndpoint = Endpoint::GetLocalEndpoint();

			if (localEndpoint)
				output += "'" + localEndpoint->GetName() + "'";
			else
				output += "this instance";

			cr->SetOutput(output);

			ProcessCheckResult(cr, producer);
		}

		/**
		 * If this is a remote check, we don't know when the check result will be received and processed.
		 * Therefore, we must mark the check as no longer running here, otherwise, no further checks
		 * would be executed for this checkable as it would always appear as having a running check
		 * (see the check at the start of this function).
		 */
		m_CheckRunning.store(false);
	}
}

void Checkable::UpdateStatistics(const CheckResult::Ptr& cr, CheckableType type)
{
	time_t ts = cr->GetScheduleEnd();

	if (type == CheckableHost) {
		if (cr->GetActive())
			CIB::UpdateActiveHostChecksStatistics(ts, 1);
		else
			CIB::UpdatePassiveHostChecksStatistics(ts, 1);
	} else if (type == CheckableService) {
		if (cr->GetActive())
			CIB::UpdateActiveServiceChecksStatistics(ts, 1);
		else
			CIB::UpdatePassiveServiceChecksStatistics(ts, 1);
	} else {
		Log(LogWarning, "Checkable", "Unknown checkable type for statistic update.");
	}
}

void Checkable::IncreasePendingChecks()
{
	std::unique_lock<std::mutex> lock(m_StatsMutex);
	m_PendingChecks++;
}

void Checkable::DecreasePendingChecks()
{
	std::unique_lock<std::mutex> lock(m_StatsMutex);
	m_PendingChecks--;
	m_PendingChecksCV.notify_one();
}

int Checkable::GetPendingChecks()
{
	std::unique_lock<std::mutex> lock(m_StatsMutex);
	return m_PendingChecks;
}

void Checkable::AquirePendingCheckSlot(int maxPendingChecks)
{
	std::unique_lock<std::mutex> lock(m_StatsMutex);
	while (m_PendingChecks >= maxPendingChecks)
		m_PendingChecksCV.wait(lock);

	m_PendingChecks++;
}