mirror of https://github.com/Icinga/icinga2.git
717 lines
22 KiB
C++
717 lines
22 KiB
C++
/* Icinga 2 | (c) 2012 Icinga GmbH | GPLv2+ */
|
|
|
|
#include "icinga/checkable.hpp"
|
|
#include "icinga/service.hpp"
|
|
#include "icinga/host.hpp"
|
|
#include "icinga/checkcommand.hpp"
|
|
#include "icinga/icingaapplication.hpp"
|
|
#include "icinga/cib.hpp"
|
|
#include "icinga/clusterevents.hpp"
|
|
#include "remote/messageorigin.hpp"
|
|
#include "remote/apilistener.hpp"
|
|
#include "base/objectlock.hpp"
|
|
#include "base/logger.hpp"
|
|
#include "base/convert.hpp"
|
|
#include "base/utility.hpp"
|
|
#include "base/context.hpp"
|
|
|
|
using namespace icinga;
|
|
|
|
boost::signals2::signal<void (const Checkable::Ptr&, const CheckResult::Ptr&, const MessageOrigin::Ptr&)> Checkable::OnNewCheckResult;
|
|
boost::signals2::signal<void (const Checkable::Ptr&, const CheckResult::Ptr&, StateType, const MessageOrigin::Ptr&)> Checkable::OnStateChange;
|
|
boost::signals2::signal<void (const Checkable::Ptr&, const CheckResult::Ptr&, std::set<Checkable::Ptr>, const MessageOrigin::Ptr&)> Checkable::OnReachabilityChanged;
|
|
boost::signals2::signal<void (const Checkable::Ptr&, NotificationType, const CheckResult::Ptr&, const String&, const String&, const MessageOrigin::Ptr&)> Checkable::OnNotificationsRequested;
|
|
boost::signals2::signal<void (const Checkable::Ptr&)> Checkable::OnNextCheckUpdated;
|
|
|
|
Atomic<uint_fast64_t> Checkable::CurrentConcurrentChecks (0);
|
|
|
|
std::mutex Checkable::m_StatsMutex;
|
|
int Checkable::m_PendingChecks = 0;
|
|
std::condition_variable Checkable::m_PendingChecksCV;
|
|
|
|
CheckCommand::Ptr Checkable::GetCheckCommand() const
|
|
{
|
|
return dynamic_pointer_cast<CheckCommand>(NavigateCheckCommandRaw());
|
|
}
|
|
|
|
TimePeriod::Ptr Checkable::GetCheckPeriod() const
|
|
{
|
|
return TimePeriod::GetByName(GetCheckPeriodRaw());
|
|
}
|
|
|
|
void Checkable::SetSchedulingOffset(long offset)
|
|
{
|
|
m_SchedulingOffset = offset;
|
|
}
|
|
|
|
long Checkable::GetSchedulingOffset()
|
|
{
|
|
return m_SchedulingOffset;
|
|
}
|
|
|
|
void Checkable::UpdateNextCheck(const MessageOrigin::Ptr& origin)
|
|
{
|
|
double interval;
|
|
|
|
if (GetStateType() == StateTypeSoft && GetLastCheckResult() != nullptr)
|
|
interval = GetRetryInterval();
|
|
else
|
|
interval = GetCheckInterval();
|
|
|
|
double now = Utility::GetTime();
|
|
double adj = 0;
|
|
|
|
if (interval > 1)
|
|
adj = fmod(now * 100 + GetSchedulingOffset(), interval * 100) / 100.0;
|
|
|
|
if (adj != 0.0)
|
|
adj = std::min(0.5 + fmod(GetSchedulingOffset(), interval * 5) / 100.0, adj);
|
|
|
|
double nextCheck = now - adj + interval;
|
|
double lastCheck = GetLastCheck();
|
|
|
|
Log(LogDebug, "Checkable")
|
|
<< "Update checkable '" << GetName() << "' with check interval '" << GetCheckInterval()
|
|
<< "' from last check time at " << Utility::FormatDateTime("%Y-%m-%d %H:%M:%S %z", (lastCheck < 0 ? 0 : lastCheck))
|
|
<< " (" << GetLastCheck() << ") to next check time at " << Utility::FormatDateTime("%Y-%m-%d %H:%M:%S %z", nextCheck) << " (" << nextCheck << ").";
|
|
|
|
SetNextCheck(nextCheck, false, origin);
|
|
}
|
|
|
|
bool Checkable::HasBeenChecked() const
|
|
{
|
|
return GetLastCheckResult() != nullptr;
|
|
}
|
|
|
|
double Checkable::GetLastCheck() const
|
|
{
|
|
CheckResult::Ptr cr = GetLastCheckResult();
|
|
double schedule_end = -1;
|
|
|
|
if (cr)
|
|
schedule_end = cr->GetScheduleEnd();
|
|
|
|
return schedule_end;
|
|
}
|
|
|
|
Checkable::ProcessingResult Checkable::ProcessCheckResult(const CheckResult::Ptr& cr, const MessageOrigin::Ptr& origin)
|
|
{
|
|
using Result = Checkable::ProcessingResult;
|
|
|
|
{
|
|
ObjectLock olock(this);
|
|
m_CheckRunning = false;
|
|
}
|
|
|
|
if (!cr)
|
|
return Result::NoCheckResult;
|
|
|
|
double now = Utility::GetTime();
|
|
|
|
if (cr->GetScheduleStart() == 0)
|
|
cr->SetScheduleStart(now);
|
|
|
|
if (cr->GetScheduleEnd() == 0)
|
|
cr->SetScheduleEnd(now);
|
|
|
|
if (cr->GetExecutionStart() == 0)
|
|
cr->SetExecutionStart(now);
|
|
|
|
if (cr->GetExecutionEnd() == 0)
|
|
cr->SetExecutionEnd(now);
|
|
|
|
if (!origin || origin->IsLocal())
|
|
cr->SetSchedulingSource(IcingaApplication::GetInstance()->GetNodeName());
|
|
|
|
Endpoint::Ptr command_endpoint = GetCommandEndpoint();
|
|
|
|
if (cr->GetCheckSource().IsEmpty()) {
|
|
if ((!origin || origin->IsLocal()))
|
|
cr->SetCheckSource(IcingaApplication::GetInstance()->GetNodeName());
|
|
|
|
/* override check source if command_endpoint was defined */
|
|
if (command_endpoint && !GetExtension("agent_check"))
|
|
cr->SetCheckSource(command_endpoint->GetName());
|
|
}
|
|
|
|
/* agent checks go through the api */
|
|
if (command_endpoint && GetExtension("agent_check")) {
|
|
ApiListener::Ptr listener = ApiListener::GetInstance();
|
|
|
|
if (listener) {
|
|
/* send message back to its origin */
|
|
Dictionary::Ptr message = ClusterEvents::MakeCheckResultMessage(this, cr);
|
|
listener->SyncSendMessage(command_endpoint, message);
|
|
}
|
|
|
|
return Result::Ok;
|
|
|
|
}
|
|
|
|
if (!IsActive())
|
|
return Result::CheckableInactive;
|
|
|
|
bool reachable = IsReachable();
|
|
bool notification_reachable = IsReachable(DependencyNotification);
|
|
|
|
ObjectLock olock(this);
|
|
|
|
CheckResult::Ptr old_cr = GetLastCheckResult();
|
|
ServiceState old_state = GetStateRaw();
|
|
StateType old_stateType = GetStateType();
|
|
long old_attempt = GetCheckAttempt();
|
|
bool recovery = false;
|
|
|
|
/* When we have an check result already (not after fresh start),
|
|
* prevent to accept old check results and allow overrides for
|
|
* CRs happened in the future.
|
|
*/
|
|
if (old_cr) {
|
|
double currentCRTimestamp = old_cr->GetExecutionStart();
|
|
double newCRTimestamp = cr->GetExecutionStart();
|
|
|
|
/* Our current timestamp may be from the future (wrong server time adjusted again). Allow overrides here. */
|
|
if (currentCRTimestamp > now) {
|
|
/* our current CR is from the future, let the new CR override it. */
|
|
Log(LogDebug, "Checkable")
|
|
<< std::fixed << std::setprecision(6) << "Processing check result for checkable '" << GetName() << "' from "
|
|
<< Utility::FormatDateTime("%Y-%m-%d %H:%M:%S %z", newCRTimestamp) << " (" << newCRTimestamp
|
|
<< "). Overriding since ours is from the future at "
|
|
<< Utility::FormatDateTime("%Y-%m-%d %H:%M:%S %z", currentCRTimestamp) << " (" << currentCRTimestamp << ").";
|
|
} else {
|
|
/* Current timestamp is from the past, but the new timestamp is even more in the past. Skip it. */
|
|
if (newCRTimestamp < currentCRTimestamp) {
|
|
Log(LogDebug, "Checkable")
|
|
<< std::fixed << std::setprecision(6) << "Skipping check result for checkable '" << GetName() << "' from "
|
|
<< Utility::FormatDateTime("%Y-%m-%d %H:%M:%S %z", newCRTimestamp) << " (" << newCRTimestamp
|
|
<< "). It is in the past compared to ours at "
|
|
<< Utility::FormatDateTime("%Y-%m-%d %H:%M:%S %z", currentCRTimestamp) << " (" << currentCRTimestamp << ").";
|
|
return Result::NewerCheckResultPresent;
|
|
}
|
|
}
|
|
}
|
|
|
|
/* The ExecuteCheck function already sets the old state, but we need to do it again
|
|
* in case this was a passive check result. */
|
|
SetLastStateRaw(old_state);
|
|
SetLastStateType(old_stateType);
|
|
SetLastReachable(reachable);
|
|
|
|
Host::Ptr host;
|
|
Service::Ptr service;
|
|
tie(host, service) = GetHostService(this);
|
|
|
|
CheckableType checkableType = CheckableHost;
|
|
if (service)
|
|
checkableType = CheckableService;
|
|
|
|
long attempt = 1;
|
|
|
|
std::set<Checkable::Ptr> children = GetChildren();
|
|
|
|
if (IsStateOK(cr->GetState())) {
|
|
SetStateType(StateTypeHard); // NOT-OK -> HARD OK
|
|
|
|
if (!IsStateOK(old_state))
|
|
recovery = true;
|
|
|
|
ResetNotificationNumbers();
|
|
SaveLastState(ServiceOK, cr->GetExecutionEnd());
|
|
} else {
|
|
/* OK -> NOT-OK change, first SOFT state. Reset attempt counter. */
|
|
if (IsStateOK(old_state)) {
|
|
SetStateType(StateTypeSoft);
|
|
attempt = 1;
|
|
}
|
|
|
|
/* SOFT state change, increase attempt counter. */
|
|
if (old_stateType == StateTypeSoft && !IsStateOK(old_state)) {
|
|
SetStateType(StateTypeSoft);
|
|
attempt = old_attempt + 1;
|
|
}
|
|
|
|
/* HARD state change (e.g. previously 2/3 and this next attempt). Reset attempt counter. */
|
|
if (attempt >= GetMaxCheckAttempts()) {
|
|
SetStateType(StateTypeHard);
|
|
attempt = 1;
|
|
}
|
|
|
|
if (!IsStateOK(cr->GetState())) {
|
|
SaveLastState(cr->GetState(), cr->GetExecutionEnd());
|
|
}
|
|
}
|
|
|
|
if (!reachable)
|
|
SetLastStateUnreachable(cr->GetExecutionEnd());
|
|
|
|
SetCheckAttempt(attempt);
|
|
|
|
ServiceState new_state = cr->GetState();
|
|
SetStateRaw(new_state);
|
|
|
|
bool stateChange;
|
|
|
|
/* Exception on state change calculation for hosts. */
|
|
if (checkableType == CheckableService)
|
|
stateChange = (old_state != new_state);
|
|
else
|
|
stateChange = (Host::CalculateState(old_state) != Host::CalculateState(new_state));
|
|
|
|
/* Store the current last state change for the next iteration. */
|
|
SetPreviousStateChange(GetLastStateChange());
|
|
|
|
if (stateChange) {
|
|
SetLastStateChange(cr->GetExecutionEnd());
|
|
|
|
/* remove acknowledgements */
|
|
if (GetAcknowledgement() == AcknowledgementNormal ||
|
|
(GetAcknowledgement() == AcknowledgementSticky && IsStateOK(new_state))) {
|
|
ClearAcknowledgement("");
|
|
}
|
|
}
|
|
|
|
bool remove_acknowledgement_comments = false;
|
|
|
|
if (GetAcknowledgement() == AcknowledgementNone)
|
|
remove_acknowledgement_comments = true;
|
|
|
|
bool hardChange = (GetStateType() == StateTypeHard && old_stateType == StateTypeSoft);
|
|
|
|
if (stateChange && old_stateType == StateTypeHard && GetStateType() == StateTypeHard)
|
|
hardChange = true;
|
|
|
|
bool is_volatile = GetVolatile();
|
|
|
|
if (hardChange || is_volatile) {
|
|
SetLastHardStateRaw(new_state);
|
|
SetLastHardStateChange(cr->GetExecutionEnd());
|
|
SetLastHardStatesRaw(GetLastHardStatesRaw() / 100u + new_state * 100u);
|
|
}
|
|
|
|
if (stateChange) {
|
|
SetLastSoftStatesRaw(GetLastSoftStatesRaw() / 100u + new_state * 100u);
|
|
}
|
|
|
|
cr->SetPreviousHardState(ServiceState(GetLastHardStatesRaw() % 100u));
|
|
|
|
if (!IsStateOK(new_state))
|
|
TriggerDowntimes(cr->GetExecutionEnd());
|
|
|
|
/* statistics for external tools */
|
|
Checkable::UpdateStatistics(cr, checkableType);
|
|
|
|
bool in_downtime = IsInDowntime();
|
|
|
|
bool send_notification = false;
|
|
bool suppress_notification = !notification_reachable || in_downtime || IsAcknowledged();
|
|
|
|
/* Send notifications whether when a hard state change occurred. */
|
|
if (hardChange && !(old_stateType == StateTypeSoft && IsStateOK(new_state)))
|
|
send_notification = true;
|
|
/* Or if the checkable is volatile and in a HARD state. */
|
|
else if (is_volatile && GetStateType() == StateTypeHard)
|
|
send_notification = true;
|
|
|
|
if (IsStateOK(old_state) && old_stateType == StateTypeSoft)
|
|
send_notification = false; /* Don't send notifications for SOFT-OK -> HARD-OK. */
|
|
|
|
if (is_volatile && IsStateOK(old_state) && IsStateOK(new_state))
|
|
send_notification = false; /* Don't send notifications for volatile OK -> OK changes. */
|
|
|
|
olock.Unlock();
|
|
|
|
if (remove_acknowledgement_comments)
|
|
RemoveAckComments(String(), cr->GetExecutionEnd());
|
|
|
|
Dictionary::Ptr vars_after = new Dictionary({
|
|
{ "state", new_state },
|
|
{ "state_type", GetStateType() },
|
|
{ "attempt", GetCheckAttempt() },
|
|
{ "reachable", reachable }
|
|
});
|
|
|
|
if (old_cr)
|
|
cr->SetVarsBefore(old_cr->GetVarsAfter());
|
|
|
|
cr->SetVarsAfter(vars_after);
|
|
|
|
olock.Lock();
|
|
|
|
if (service) {
|
|
SetLastCheckResult(cr);
|
|
} else {
|
|
bool wasProblem = GetProblem();
|
|
|
|
SetLastCheckResult(cr);
|
|
|
|
if (GetProblem() != wasProblem) {
|
|
auto services = host->GetServices();
|
|
olock.Unlock();
|
|
for (auto& service : services) {
|
|
Service::OnHostProblemChanged(service, cr, origin);
|
|
}
|
|
olock.Lock();
|
|
}
|
|
}
|
|
|
|
bool was_flapping = IsFlapping();
|
|
|
|
UpdateFlappingStatus(cr->GetState());
|
|
|
|
bool is_flapping = IsFlapping();
|
|
|
|
// Don't recompute the next check when the current check isn't generated by this endpoint. When the check is
|
|
// remotely generated we should've already received the "SetNextCheck" event before the "event::CheckResult"
|
|
// cluster event. Otherwise, the next check received before this check will be invalidated and cause the Checkable
|
|
// "next_check/next_update" in a HA setup to always be different from the other endpoint as the "m_SchedulingOffset"
|
|
// is randomly initialised on each node.
|
|
if (!origin) {
|
|
if (cr->GetActive()) {
|
|
UpdateNextCheck();
|
|
} else {
|
|
/* Reschedule the next check for external passive check results. The side effect of
|
|
* this is that for as long as we receive results for a service we
|
|
* won't execute any active checks. */
|
|
double offset;
|
|
double ttl = cr->GetTtl();
|
|
|
|
if (ttl > 0)
|
|
offset = ttl;
|
|
else
|
|
offset = GetCheckInterval();
|
|
|
|
SetNextCheck(Utility::GetTime() + offset);
|
|
}
|
|
}
|
|
|
|
olock.Unlock();
|
|
|
|
#ifdef I2_DEBUG /* I2_DEBUG */
|
|
Log(LogDebug, "Checkable")
|
|
<< "Flapping: Checkable " << GetName()
|
|
<< " was: " << was_flapping
|
|
<< " is: " << is_flapping
|
|
<< " threshold low: " << GetFlappingThresholdLow()
|
|
<< " threshold high: " << GetFlappingThresholdHigh()
|
|
<< "% current: " << GetFlappingCurrent() << "%.";
|
|
#endif /* I2_DEBUG */
|
|
|
|
if (recovery) {
|
|
for (auto& child : children) {
|
|
if (child->GetProblem() && child->GetEnableActiveChecks()) {
|
|
auto nextCheck (now + Utility::Random() % 60);
|
|
|
|
ObjectLock oLock (child);
|
|
|
|
if (nextCheck < child->GetNextCheck()) {
|
|
child->SetNextCheck(nextCheck);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if (stateChange) {
|
|
/* reschedule direct parents */
|
|
for (const Checkable::Ptr& parent : GetParents()) {
|
|
if (parent.get() == this)
|
|
continue;
|
|
|
|
if (!parent->GetEnableActiveChecks())
|
|
continue;
|
|
|
|
if (parent->GetNextCheck() >= now + parent->GetRetryInterval()) {
|
|
ObjectLock olock(parent);
|
|
parent->SetNextCheck(now);
|
|
}
|
|
}
|
|
}
|
|
|
|
OnNewCheckResult(this, cr, origin);
|
|
|
|
/* signal status updates to for example db_ido */
|
|
OnStateChanged(this);
|
|
|
|
String old_state_str = (service ? Service::StateToString(old_state) : Host::StateToString(Host::CalculateState(old_state)));
|
|
String new_state_str = (service ? Service::StateToString(new_state) : Host::StateToString(Host::CalculateState(new_state)));
|
|
|
|
/* Whether a hard state change or a volatile state change except OK -> OK happened. */
|
|
if (hardChange || (is_volatile && !(IsStateOK(old_state) && IsStateOK(new_state)))) {
|
|
OnStateChange(this, cr, StateTypeHard, origin);
|
|
Log(LogNotice, "Checkable")
|
|
<< "State Change: Checkable '" << GetName() << "' hard state change from " << old_state_str << " to " << new_state_str << " detected." << (is_volatile ? " Checkable is volatile." : "");
|
|
}
|
|
/* Whether a state change happened or the state type is SOFT (must be logged too). */
|
|
else if (stateChange || GetStateType() == StateTypeSoft) {
|
|
OnStateChange(this, cr, StateTypeSoft, origin);
|
|
Log(LogNotice, "Checkable")
|
|
<< "State Change: Checkable '" << GetName() << "' soft state change from " << old_state_str << " to " << new_state_str << " detected.";
|
|
}
|
|
|
|
if (GetStateType() == StateTypeSoft || hardChange || recovery ||
|
|
(is_volatile && !(IsStateOK(old_state) && IsStateOK(new_state))))
|
|
ExecuteEventHandler();
|
|
|
|
int suppressed_types = 0;
|
|
|
|
/* Flapping start/end notifications */
|
|
if (!was_flapping && is_flapping) {
|
|
/* FlappingStart notifications happen on state changes, not in downtimes */
|
|
if (!IsPaused()) {
|
|
if (in_downtime) {
|
|
suppressed_types |= NotificationFlappingStart;
|
|
} else {
|
|
OnNotificationsRequested(this, NotificationFlappingStart, cr, "", "", nullptr);
|
|
}
|
|
}
|
|
|
|
Log(LogNotice, "Checkable")
|
|
<< "Flapping Start: Checkable '" << GetName() << "' started flapping (Current flapping value "
|
|
<< GetFlappingCurrent() << "% > high threshold " << GetFlappingThresholdHigh() << "%).";
|
|
|
|
NotifyFlapping(origin);
|
|
} else if (was_flapping && !is_flapping) {
|
|
/* FlappingEnd notifications are independent from state changes, must not happen in downtine */
|
|
if (!IsPaused()) {
|
|
if (in_downtime) {
|
|
suppressed_types |= NotificationFlappingEnd;
|
|
} else {
|
|
OnNotificationsRequested(this, NotificationFlappingEnd, cr, "", "", nullptr);
|
|
}
|
|
}
|
|
|
|
Log(LogNotice, "Checkable")
|
|
<< "Flapping Stop: Checkable '" << GetName() << "' stopped flapping (Current flapping value "
|
|
<< GetFlappingCurrent() << "% < low threshold " << GetFlappingThresholdLow() << "%).";
|
|
|
|
NotifyFlapping(origin);
|
|
}
|
|
|
|
if (send_notification && !is_flapping) {
|
|
if (!IsPaused()) {
|
|
/* If there are still some pending suppressed state notification, keep the suppression until these are
|
|
* handled by Checkable::FireSuppressedNotifications().
|
|
*/
|
|
bool pending = GetSuppressedNotifications() & (NotificationRecovery|NotificationProblem);
|
|
|
|
if (suppress_notification || pending) {
|
|
suppressed_types |= (recovery ? NotificationRecovery : NotificationProblem);
|
|
} else {
|
|
OnNotificationsRequested(this, recovery ? NotificationRecovery : NotificationProblem, cr, "", "", nullptr);
|
|
}
|
|
}
|
|
}
|
|
|
|
if (suppressed_types) {
|
|
/* If some notifications were suppressed, but just because of e.g. a downtime,
|
|
* stash them into a notification types bitmask for maybe re-sending later.
|
|
*/
|
|
|
|
ObjectLock olock (this);
|
|
int suppressed_types_before (GetSuppressedNotifications());
|
|
int suppressed_types_after (suppressed_types_before | suppressed_types);
|
|
|
|
const int conflict = NotificationFlappingStart | NotificationFlappingEnd;
|
|
if ((suppressed_types_after & conflict) == conflict) {
|
|
/* Flapping start and end cancel out each other. */
|
|
suppressed_types_after &= ~conflict;
|
|
}
|
|
|
|
const int stateNotifications = NotificationRecovery | NotificationProblem;
|
|
if (!(suppressed_types_before & stateNotifications) && (suppressed_types & stateNotifications)) {
|
|
/* A state-related notification is suppressed for the first time, store the previous state. When
|
|
* notifications are no longer suppressed, this can be compared with the current state to determine
|
|
* if a notification must be sent. This is done differently compared to flapping notifications just above
|
|
* as for state notifications, problem and recovery don't always cancel each other. For example,
|
|
* WARNING -> OK -> CRITICAL generates both types once, but there should still be a notification.
|
|
*/
|
|
SetStateBeforeSuppression(old_stateType == StateTypeHard ? old_state : ServiceOK);
|
|
}
|
|
|
|
if (suppressed_types_after != suppressed_types_before) {
|
|
SetSuppressedNotifications(suppressed_types_after);
|
|
}
|
|
}
|
|
|
|
/* update reachability for child objects */
|
|
if ((stateChange || hardChange) && !children.empty())
|
|
OnReachabilityChanged(this, cr, children, origin);
|
|
|
|
return Result::Ok;
|
|
}
|
|
|
|
void Checkable::ExecuteRemoteCheck(const Dictionary::Ptr& resolvedMacros)
|
|
{
|
|
CONTEXT("Executing remote check for object '" << GetName() << "'");
|
|
|
|
double scheduled_start = GetNextCheck();
|
|
double before_check = Utility::GetTime();
|
|
|
|
CheckResult::Ptr cr = new CheckResult();
|
|
cr->SetScheduleStart(scheduled_start);
|
|
cr->SetExecutionStart(before_check);
|
|
|
|
GetCheckCommand()->Execute(this, cr, resolvedMacros, true);
|
|
}
|
|
|
|
void Checkable::ExecuteCheck()
|
|
{
|
|
CONTEXT("Executing check for object '" << GetName() << "'");
|
|
|
|
/* keep track of scheduling info in case the check type doesn't provide its own information */
|
|
double scheduled_start = GetNextCheck();
|
|
double before_check = Utility::GetTime();
|
|
|
|
SetLastCheckStarted(Utility::GetTime());
|
|
|
|
/* This calls SetNextCheck() which updates the CheckerComponent's idle/pending
|
|
* queues and ensures that checks are not fired multiple times. ProcessCheckResult()
|
|
* is called too late. See #6421.
|
|
*/
|
|
UpdateNextCheck();
|
|
|
|
bool reachable = IsReachable();
|
|
|
|
{
|
|
ObjectLock olock(this);
|
|
|
|
/* don't run another check if there is one pending */
|
|
if (m_CheckRunning)
|
|
return;
|
|
|
|
m_CheckRunning = true;
|
|
|
|
SetLastStateRaw(GetStateRaw());
|
|
SetLastStateType(GetLastStateType());
|
|
SetLastReachable(reachable);
|
|
}
|
|
|
|
CheckResult::Ptr cr = new CheckResult();
|
|
|
|
cr->SetScheduleStart(scheduled_start);
|
|
cr->SetExecutionStart(before_check);
|
|
|
|
Endpoint::Ptr endpoint = GetCommandEndpoint();
|
|
bool local = !endpoint || endpoint == Endpoint::GetLocalEndpoint();
|
|
|
|
if (local) {
|
|
GetCheckCommand()->Execute(this, cr, nullptr, false);
|
|
} else {
|
|
Dictionary::Ptr macros = new Dictionary();
|
|
GetCheckCommand()->Execute(this, cr, macros, false);
|
|
|
|
if (endpoint->GetConnected()) {
|
|
/* perform check on remote endpoint */
|
|
Dictionary::Ptr message = new Dictionary();
|
|
message->Set("jsonrpc", "2.0");
|
|
message->Set("method", "event::ExecuteCommand");
|
|
|
|
Host::Ptr host;
|
|
Service::Ptr service;
|
|
tie(host, service) = GetHostService(this);
|
|
|
|
Dictionary::Ptr params = new Dictionary();
|
|
message->Set("params", params);
|
|
params->Set("command_type", "check_command");
|
|
params->Set("command", GetCheckCommand()->GetName());
|
|
params->Set("host", host->GetName());
|
|
|
|
if (service)
|
|
params->Set("service", service->GetShortName());
|
|
|
|
/*
|
|
* If the host/service object specifies the 'check_timeout' attribute,
|
|
* forward this to the remote endpoint to limit the command execution time.
|
|
*/
|
|
if (!GetCheckTimeout().IsEmpty())
|
|
params->Set("check_timeout", GetCheckTimeout());
|
|
|
|
params->Set("macros", macros);
|
|
|
|
ApiListener::Ptr listener = ApiListener::GetInstance();
|
|
|
|
if (listener)
|
|
listener->SyncSendMessage(endpoint, message);
|
|
|
|
/* Re-schedule the check so we don't run it again until after we've received
|
|
* a check result from the remote instance. The check will be re-scheduled
|
|
* using the proper check interval once we've received a check result.
|
|
*/
|
|
SetNextCheck(Utility::GetTime() + GetCheckCommand()->GetTimeout() + 30);
|
|
|
|
/*
|
|
* Let the user know that there was a problem with the check if
|
|
* 1) The endpoint is not syncing (replay log, etc.)
|
|
* 2) Outside of the cold startup window (5min)
|
|
*/
|
|
} else if (!endpoint->GetSyncing() && Application::GetInstance()->GetStartTime() < Utility::GetTime() - 300) {
|
|
/* fail to perform check on unconnected endpoint */
|
|
cr->SetState(ServiceUnknown);
|
|
|
|
String output = "Remote Icinga instance '" + endpoint->GetName() + "' is not connected to ";
|
|
|
|
Endpoint::Ptr localEndpoint = Endpoint::GetLocalEndpoint();
|
|
|
|
if (localEndpoint)
|
|
output += "'" + localEndpoint->GetName() + "'";
|
|
else
|
|
output += "this instance";
|
|
|
|
cr->SetOutput(output);
|
|
|
|
ProcessCheckResult(cr);
|
|
}
|
|
|
|
{
|
|
ObjectLock olock(this);
|
|
m_CheckRunning = false;
|
|
}
|
|
}
|
|
}
|
|
|
|
void Checkable::UpdateStatistics(const CheckResult::Ptr& cr, CheckableType type)
|
|
{
|
|
time_t ts = cr->GetScheduleEnd();
|
|
|
|
if (type == CheckableHost) {
|
|
if (cr->GetActive())
|
|
CIB::UpdateActiveHostChecksStatistics(ts, 1);
|
|
else
|
|
CIB::UpdatePassiveHostChecksStatistics(ts, 1);
|
|
} else if (type == CheckableService) {
|
|
if (cr->GetActive())
|
|
CIB::UpdateActiveServiceChecksStatistics(ts, 1);
|
|
else
|
|
CIB::UpdatePassiveServiceChecksStatistics(ts, 1);
|
|
} else {
|
|
Log(LogWarning, "Checkable", "Unknown checkable type for statistic update.");
|
|
}
|
|
}
|
|
|
|
void Checkable::IncreasePendingChecks()
|
|
{
|
|
std::unique_lock<std::mutex> lock(m_StatsMutex);
|
|
m_PendingChecks++;
|
|
}
|
|
|
|
void Checkable::DecreasePendingChecks()
|
|
{
|
|
std::unique_lock<std::mutex> lock(m_StatsMutex);
|
|
m_PendingChecks--;
|
|
m_PendingChecksCV.notify_one();
|
|
}
|
|
|
|
int Checkable::GetPendingChecks()
|
|
{
|
|
std::unique_lock<std::mutex> lock(m_StatsMutex);
|
|
return m_PendingChecks;
|
|
}
|
|
|
|
void Checkable::AquirePendingCheckSlot(int maxPendingChecks)
|
|
{
|
|
std::unique_lock<std::mutex> lock(m_StatsMutex);
|
|
while (m_PendingChecks >= maxPendingChecks)
|
|
m_PendingChecksCV.wait(lock);
|
|
|
|
m_PendingChecks++;
|
|
}
|