icinga2/lib/icinga/checkable-check.cpp
Alexander A. Klimov 6dffc57a37 Checkable#ProcessCheckResult(): only clean up ack comments older than check result
Normally if for some reason an ack comment still exists on a checkable not
acked anymore, still clean it up. But while replaying log config objects
incl. ack comments come before check results and acks. I.e. 1) ack comment,
2) DOWN check result and 3) ack. Not 1) DOWN check result, 2) ack and 3) ack
comment. So the checkable is temporarily not acked, but already has the ack
comment. In this case the DOWN check result which is older than the ack
comment shall not clean up the latter.
2023-07-04 10:56:30 +02:00

706 lines
22 KiB
C++

/* Icinga 2 | (c) 2012 Icinga GmbH | GPLv2+ */
#include "icinga/checkable.hpp"
#include "icinga/service.hpp"
#include "icinga/host.hpp"
#include "icinga/checkcommand.hpp"
#include "icinga/icingaapplication.hpp"
#include "icinga/cib.hpp"
#include "icinga/clusterevents.hpp"
#include "remote/messageorigin.hpp"
#include "remote/apilistener.hpp"
#include "base/objectlock.hpp"
#include "base/logger.hpp"
#include "base/convert.hpp"
#include "base/utility.hpp"
#include "base/context.hpp"
using namespace icinga;
boost::signals2::signal<void (const Checkable::Ptr&, const CheckResult::Ptr&, const MessageOrigin::Ptr&)> Checkable::OnNewCheckResult;
boost::signals2::signal<void (const Checkable::Ptr&, const CheckResult::Ptr&, StateType, const MessageOrigin::Ptr&)> Checkable::OnStateChange;
boost::signals2::signal<void (const Checkable::Ptr&, const CheckResult::Ptr&, std::set<Checkable::Ptr>, const MessageOrigin::Ptr&)> Checkable::OnReachabilityChanged;
boost::signals2::signal<void (const Checkable::Ptr&, NotificationType, const CheckResult::Ptr&, const String&, const String&, const MessageOrigin::Ptr&)> Checkable::OnNotificationsRequested;
boost::signals2::signal<void (const Checkable::Ptr&)> Checkable::OnNextCheckUpdated;
Atomic<uint_fast64_t> Checkable::CurrentConcurrentChecks (0);
std::mutex Checkable::m_StatsMutex;
int Checkable::m_PendingChecks = 0;
std::condition_variable Checkable::m_PendingChecksCV;
CheckCommand::Ptr Checkable::GetCheckCommand() const
{
return dynamic_pointer_cast<CheckCommand>(NavigateCheckCommandRaw());
}
TimePeriod::Ptr Checkable::GetCheckPeriod() const
{
return TimePeriod::GetByName(GetCheckPeriodRaw());
}
void Checkable::SetSchedulingOffset(long offset)
{
m_SchedulingOffset = offset;
}
long Checkable::GetSchedulingOffset()
{
return m_SchedulingOffset;
}
void Checkable::UpdateNextCheck(const MessageOrigin::Ptr& origin)
{
double interval;
if (GetStateType() == StateTypeSoft && GetLastCheckResult() != nullptr)
interval = GetRetryInterval();
else
interval = GetCheckInterval();
double now = Utility::GetTime();
double adj = 0;
if (interval > 1)
adj = fmod(now * 100 + GetSchedulingOffset(), interval * 100) / 100.0;
if (adj != 0.0)
adj = std::min(0.5 + fmod(GetSchedulingOffset(), interval * 5) / 100.0, adj);
double nextCheck = now - adj + interval;
double lastCheck = GetLastCheck();
Log(LogDebug, "Checkable")
<< "Update checkable '" << GetName() << "' with check interval '" << GetCheckInterval()
<< "' from last check time at " << Utility::FormatDateTime("%Y-%m-%d %H:%M:%S %z", (lastCheck < 0 ? 0 : lastCheck))
<< " (" << GetLastCheck() << ") to next check time at " << Utility::FormatDateTime("%Y-%m-%d %H:%M:%S %z", nextCheck) << " (" << nextCheck << ").";
SetNextCheck(nextCheck, false, origin);
}
bool Checkable::HasBeenChecked() const
{
return GetLastCheckResult() != nullptr;
}
double Checkable::GetLastCheck() const
{
CheckResult::Ptr cr = GetLastCheckResult();
double schedule_end = -1;
if (cr)
schedule_end = cr->GetScheduleEnd();
return schedule_end;
}
void Checkable::ProcessCheckResult(const CheckResult::Ptr& cr, const MessageOrigin::Ptr& origin)
{
{
ObjectLock olock(this);
m_CheckRunning = false;
}
if (!cr)
return;
double now = Utility::GetTime();
if (cr->GetScheduleStart() == 0)
cr->SetScheduleStart(now);
if (cr->GetScheduleEnd() == 0)
cr->SetScheduleEnd(now);
if (cr->GetExecutionStart() == 0)
cr->SetExecutionStart(now);
if (cr->GetExecutionEnd() == 0)
cr->SetExecutionEnd(now);
if (!origin || origin->IsLocal())
cr->SetSchedulingSource(IcingaApplication::GetInstance()->GetNodeName());
Endpoint::Ptr command_endpoint = GetCommandEndpoint();
if (cr->GetCheckSource().IsEmpty()) {
if ((!origin || origin->IsLocal()))
cr->SetCheckSource(IcingaApplication::GetInstance()->GetNodeName());
/* override check source if command_endpoint was defined */
if (command_endpoint && !GetExtension("agent_check"))
cr->SetCheckSource(command_endpoint->GetName());
}
/* agent checks go through the api */
if (command_endpoint && GetExtension("agent_check")) {
ApiListener::Ptr listener = ApiListener::GetInstance();
if (listener) {
/* send message back to its origin */
Dictionary::Ptr message = ClusterEvents::MakeCheckResultMessage(this, cr);
listener->SyncSendMessage(command_endpoint, message);
}
return;
}
if (!IsActive())
return;
bool reachable = IsReachable();
bool notification_reachable = IsReachable(DependencyNotification);
ObjectLock olock(this);
CheckResult::Ptr old_cr = GetLastCheckResult();
ServiceState old_state = GetStateRaw();
StateType old_stateType = GetStateType();
long old_attempt = GetCheckAttempt();
bool recovery = false;
/* When we have an check result already (not after fresh start),
* prevent to accept old check results and allow overrides for
* CRs happened in the future.
*/
if (old_cr) {
double currentCRTimestamp = old_cr->GetExecutionStart();
double newCRTimestamp = cr->GetExecutionStart();
/* Our current timestamp may be from the future (wrong server time adjusted again). Allow overrides here. */
if (currentCRTimestamp > now) {
/* our current CR is from the future, let the new CR override it. */
Log(LogDebug, "Checkable")
<< std::fixed << std::setprecision(6) << "Processing check result for checkable '" << GetName() << "' from "
<< Utility::FormatDateTime("%Y-%m-%d %H:%M:%S %z", newCRTimestamp) << " (" << newCRTimestamp
<< "). Overriding since ours is from the future at "
<< Utility::FormatDateTime("%Y-%m-%d %H:%M:%S %z", currentCRTimestamp) << " (" << currentCRTimestamp << ").";
} else {
/* Current timestamp is from the past, but the new timestamp is even more in the past. Skip it. */
if (newCRTimestamp < currentCRTimestamp) {
Log(LogDebug, "Checkable")
<< std::fixed << std::setprecision(6) << "Skipping check result for checkable '" << GetName() << "' from "
<< Utility::FormatDateTime("%Y-%m-%d %H:%M:%S %z", newCRTimestamp) << " (" << newCRTimestamp
<< "). It is in the past compared to ours at "
<< Utility::FormatDateTime("%Y-%m-%d %H:%M:%S %z", currentCRTimestamp) << " (" << currentCRTimestamp << ").";
return;
}
}
}
/* The ExecuteCheck function already sets the old state, but we need to do it again
* in case this was a passive check result. */
SetLastStateRaw(old_state);
SetLastStateType(old_stateType);
SetLastReachable(reachable);
Host::Ptr host;
Service::Ptr service;
tie(host, service) = GetHostService(this);
CheckableType checkableType = CheckableHost;
if (service)
checkableType = CheckableService;
long attempt = 1;
std::set<Checkable::Ptr> children = GetChildren();
if (IsStateOK(cr->GetState())) {
SetStateType(StateTypeHard); // NOT-OK -> HARD OK
if (!IsStateOK(old_state))
recovery = true;
ResetNotificationNumbers();
SaveLastState(ServiceOK, cr->GetExecutionEnd());
} else {
/* OK -> NOT-OK change, first SOFT state. Reset attempt counter. */
if (IsStateOK(old_state)) {
SetStateType(StateTypeSoft);
attempt = 1;
}
/* SOFT state change, increase attempt counter. */
if (old_stateType == StateTypeSoft && !IsStateOK(old_state)) {
SetStateType(StateTypeSoft);
attempt = old_attempt + 1;
}
/* HARD state change (e.g. previously 2/3 and this next attempt). Reset attempt counter. */
if (attempt >= GetMaxCheckAttempts()) {
SetStateType(StateTypeHard);
attempt = 1;
}
if (!IsStateOK(cr->GetState())) {
SaveLastState(cr->GetState(), cr->GetExecutionEnd());
}
}
if (!reachable)
SetLastStateUnreachable(cr->GetExecutionEnd());
SetCheckAttempt(attempt);
ServiceState new_state = cr->GetState();
SetStateRaw(new_state);
bool stateChange;
/* Exception on state change calculation for hosts. */
if (checkableType == CheckableService)
stateChange = (old_state != new_state);
else
stateChange = (Host::CalculateState(old_state) != Host::CalculateState(new_state));
/* Store the current last state change for the next iteration. */
SetPreviousStateChange(GetLastStateChange());
if (stateChange) {
SetLastStateChange(cr->GetExecutionEnd());
/* remove acknowledgements */
if (GetAcknowledgement() == AcknowledgementNormal ||
(GetAcknowledgement() == AcknowledgementSticky && IsStateOK(new_state))) {
ClearAcknowledgement("");
}
}
bool remove_acknowledgement_comments = false;
if (GetAcknowledgement() == AcknowledgementNone)
remove_acknowledgement_comments = true;
bool hardChange = (GetStateType() == StateTypeHard && old_stateType == StateTypeSoft);
if (stateChange && old_stateType == StateTypeHard && GetStateType() == StateTypeHard)
hardChange = true;
bool is_volatile = GetVolatile();
if (hardChange || is_volatile) {
SetLastHardStateRaw(new_state);
SetLastHardStateChange(cr->GetExecutionEnd());
SetLastHardStatesRaw(GetLastHardStatesRaw() / 100u + new_state * 100u);
}
if (stateChange) {
SetLastSoftStatesRaw(GetLastSoftStatesRaw() / 100u + new_state * 100u);
}
cr->SetPreviousHardState(ServiceState(GetLastHardStatesRaw() % 100u));
if (!IsStateOK(new_state))
TriggerDowntimes(cr->GetExecutionEnd());
/* statistics for external tools */
Checkable::UpdateStatistics(cr, checkableType);
bool in_downtime = IsInDowntime();
bool send_notification = false;
bool suppress_notification = !notification_reachable || in_downtime || IsAcknowledged();
/* Send notifications whether when a hard state change occurred. */
if (hardChange && !(old_stateType == StateTypeSoft && IsStateOK(new_state)))
send_notification = true;
/* Or if the checkable is volatile and in a HARD state. */
else if (is_volatile && GetStateType() == StateTypeHard)
send_notification = true;
if (IsStateOK(old_state) && old_stateType == StateTypeSoft)
send_notification = false; /* Don't send notifications for SOFT-OK -> HARD-OK. */
if (is_volatile && IsStateOK(old_state) && IsStateOK(new_state))
send_notification = false; /* Don't send notifications for volatile OK -> OK changes. */
olock.Unlock();
if (remove_acknowledgement_comments)
RemoveAckComments(String(), cr->GetExecutionEnd());
Dictionary::Ptr vars_after = new Dictionary({
{ "state", new_state },
{ "state_type", GetStateType() },
{ "attempt", GetCheckAttempt() },
{ "reachable", reachable }
});
if (old_cr)
cr->SetVarsBefore(old_cr->GetVarsAfter());
cr->SetVarsAfter(vars_after);
olock.Lock();
if (service) {
SetLastCheckResult(cr);
} else {
bool wasProblem = GetProblem();
SetLastCheckResult(cr);
if (GetProblem() != wasProblem) {
auto services = host->GetServices();
olock.Unlock();
for (auto& service : services) {
Service::OnHostProblemChanged(service, cr, origin);
}
olock.Lock();
}
}
bool was_flapping = IsFlapping();
UpdateFlappingStatus(cr->GetState());
bool is_flapping = IsFlapping();
if (cr->GetActive()) {
UpdateNextCheck(origin);
} else {
/* Reschedule the next check for external passive check results. The side effect of
* this is that for as long as we receive results for a service we
* won't execute any active checks. */
double offset;
double ttl = cr->GetTtl();
if (ttl > 0)
offset = ttl;
else
offset = GetCheckInterval();
SetNextCheck(Utility::GetTime() + offset, false, origin);
}
olock.Unlock();
#ifdef I2_DEBUG /* I2_DEBUG */
Log(LogDebug, "Checkable")
<< "Flapping: Checkable " << GetName()
<< " was: " << was_flapping
<< " is: " << is_flapping
<< " threshold low: " << GetFlappingThresholdLow()
<< " threshold high: " << GetFlappingThresholdHigh()
<< "% current: " << GetFlappingCurrent() << "%.";
#endif /* I2_DEBUG */
if (recovery) {
for (auto& child : children) {
if (child->GetProblem() && child->GetEnableActiveChecks()) {
auto nextCheck (now + Utility::Random() % 60);
ObjectLock oLock (child);
if (nextCheck < child->GetNextCheck()) {
child->SetNextCheck(nextCheck);
}
}
}
}
if (stateChange) {
/* reschedule direct parents */
for (const Checkable::Ptr& parent : GetParents()) {
if (parent.get() == this)
continue;
if (!parent->GetEnableActiveChecks())
continue;
if (parent->GetNextCheck() >= now + parent->GetRetryInterval()) {
ObjectLock olock(parent);
parent->SetNextCheck(now);
}
}
}
OnNewCheckResult(this, cr, origin);
/* signal status updates to for example db_ido */
OnStateChanged(this);
String old_state_str = (service ? Service::StateToString(old_state) : Host::StateToString(Host::CalculateState(old_state)));
String new_state_str = (service ? Service::StateToString(new_state) : Host::StateToString(Host::CalculateState(new_state)));
/* Whether a hard state change or a volatile state change except OK -> OK happened. */
if (hardChange || (is_volatile && !(IsStateOK(old_state) && IsStateOK(new_state)))) {
OnStateChange(this, cr, StateTypeHard, origin);
Log(LogNotice, "Checkable")
<< "State Change: Checkable '" << GetName() << "' hard state change from " << old_state_str << " to " << new_state_str << " detected." << (is_volatile ? " Checkable is volatile." : "");
}
/* Whether a state change happened or the state type is SOFT (must be logged too). */
else if (stateChange || GetStateType() == StateTypeSoft) {
OnStateChange(this, cr, StateTypeSoft, origin);
Log(LogNotice, "Checkable")
<< "State Change: Checkable '" << GetName() << "' soft state change from " << old_state_str << " to " << new_state_str << " detected.";
}
if (GetStateType() == StateTypeSoft || hardChange || recovery ||
(is_volatile && !(IsStateOK(old_state) && IsStateOK(new_state))))
ExecuteEventHandler();
int suppressed_types = 0;
/* Flapping start/end notifications */
if (!was_flapping && is_flapping) {
/* FlappingStart notifications happen on state changes, not in downtimes */
if (!IsPaused()) {
if (in_downtime) {
suppressed_types |= NotificationFlappingStart;
} else {
OnNotificationsRequested(this, NotificationFlappingStart, cr, "", "", nullptr);
}
}
Log(LogNotice, "Checkable")
<< "Flapping Start: Checkable '" << GetName() << "' started flapping (Current flapping value "
<< GetFlappingCurrent() << "% > high threshold " << GetFlappingThresholdHigh() << "%).";
NotifyFlapping(origin);
} else if (was_flapping && !is_flapping) {
/* FlappingEnd notifications are independent from state changes, must not happen in downtine */
if (!IsPaused()) {
if (in_downtime) {
suppressed_types |= NotificationFlappingEnd;
} else {
OnNotificationsRequested(this, NotificationFlappingEnd, cr, "", "", nullptr);
}
}
Log(LogNotice, "Checkable")
<< "Flapping Stop: Checkable '" << GetName() << "' stopped flapping (Current flapping value "
<< GetFlappingCurrent() << "% < low threshold " << GetFlappingThresholdLow() << "%).";
NotifyFlapping(origin);
}
if (send_notification && !is_flapping) {
if (!IsPaused()) {
/* If there are still some pending suppressed state notification, keep the suppression until these are
* handled by Checkable::FireSuppressedNotifications().
*/
bool pending = GetSuppressedNotifications() & (NotificationRecovery|NotificationProblem);
if (suppress_notification || pending) {
suppressed_types |= (recovery ? NotificationRecovery : NotificationProblem);
} else {
OnNotificationsRequested(this, recovery ? NotificationRecovery : NotificationProblem, cr, "", "", nullptr);
}
}
}
if (suppressed_types) {
/* If some notifications were suppressed, but just because of e.g. a downtime,
* stash them into a notification types bitmask for maybe re-sending later.
*/
ObjectLock olock (this);
int suppressed_types_before (GetSuppressedNotifications());
int suppressed_types_after (suppressed_types_before | suppressed_types);
const int conflict = NotificationFlappingStart | NotificationFlappingEnd;
if ((suppressed_types_after & conflict) == conflict) {
/* Flapping start and end cancel out each other. */
suppressed_types_after &= ~conflict;
}
const int stateNotifications = NotificationRecovery | NotificationProblem;
if (!(suppressed_types_before & stateNotifications) && (suppressed_types & stateNotifications)) {
/* A state-related notification is suppressed for the first time, store the previous state. When
* notifications are no longer suppressed, this can be compared with the current state to determine
* if a notification must be sent. This is done differently compared to flapping notifications just above
* as for state notifications, problem and recovery don't always cancel each other. For example,
* WARNING -> OK -> CRITICAL generates both types once, but there should still be a notification.
*/
SetStateBeforeSuppression(old_stateType == StateTypeHard ? old_state : ServiceOK);
}
if (suppressed_types_after != suppressed_types_before) {
SetSuppressedNotifications(suppressed_types_after);
}
}
/* update reachability for child objects */
if ((stateChange || hardChange) && !children.empty())
OnReachabilityChanged(this, cr, children, origin);
}
void Checkable::ExecuteRemoteCheck(const Dictionary::Ptr& resolvedMacros)
{
CONTEXT("Executing remote check for object '" + GetName() + "'");
double scheduled_start = GetNextCheck();
double before_check = Utility::GetTime();
CheckResult::Ptr cr = new CheckResult();
cr->SetScheduleStart(scheduled_start);
cr->SetExecutionStart(before_check);
GetCheckCommand()->Execute(this, cr, resolvedMacros, true);
}
void Checkable::ExecuteCheck()
{
CONTEXT("Executing check for object '" + GetName() + "'");
/* keep track of scheduling info in case the check type doesn't provide its own information */
double scheduled_start = GetNextCheck();
double before_check = Utility::GetTime();
SetLastCheckStarted(Utility::GetTime());
/* This calls SetNextCheck() which updates the CheckerComponent's idle/pending
* queues and ensures that checks are not fired multiple times. ProcessCheckResult()
* is called too late. See #6421.
*/
UpdateNextCheck();
bool reachable = IsReachable();
{
ObjectLock olock(this);
/* don't run another check if there is one pending */
if (m_CheckRunning)
return;
m_CheckRunning = true;
SetLastStateRaw(GetStateRaw());
SetLastStateType(GetLastStateType());
SetLastReachable(reachable);
}
CheckResult::Ptr cr = new CheckResult();
cr->SetScheduleStart(scheduled_start);
cr->SetExecutionStart(before_check);
Endpoint::Ptr endpoint = GetCommandEndpoint();
bool local = !endpoint || endpoint == Endpoint::GetLocalEndpoint();
if (local) {
GetCheckCommand()->Execute(this, cr, nullptr, false);
} else {
Dictionary::Ptr macros = new Dictionary();
GetCheckCommand()->Execute(this, cr, macros, false);
if (endpoint->GetConnected()) {
/* perform check on remote endpoint */
Dictionary::Ptr message = new Dictionary();
message->Set("jsonrpc", "2.0");
message->Set("method", "event::ExecuteCommand");
Host::Ptr host;
Service::Ptr service;
tie(host, service) = GetHostService(this);
Dictionary::Ptr params = new Dictionary();
message->Set("params", params);
params->Set("command_type", "check_command");
params->Set("command", GetCheckCommand()->GetName());
params->Set("host", host->GetName());
if (service)
params->Set("service", service->GetShortName());
/*
* If the host/service object specifies the 'check_timeout' attribute,
* forward this to the remote endpoint to limit the command execution time.
*/
if (!GetCheckTimeout().IsEmpty())
params->Set("check_timeout", GetCheckTimeout());
params->Set("macros", macros);
ApiListener::Ptr listener = ApiListener::GetInstance();
if (listener)
listener->SyncSendMessage(endpoint, message);
/* Re-schedule the check so we don't run it again until after we've received
* a check result from the remote instance. The check will be re-scheduled
* using the proper check interval once we've received a check result.
*/
SetNextCheck(Utility::GetTime() + GetCheckCommand()->GetTimeout() + 30);
/*
* Let the user know that there was a problem with the check if
* 1) The endpoint is not syncing (replay log, etc.)
* 2) Outside of the cold startup window (5min)
*/
} else if (!endpoint->GetSyncing() && Application::GetInstance()->GetStartTime() < Utility::GetTime() - 300) {
/* fail to perform check on unconnected endpoint */
cr->SetState(ServiceUnknown);
String output = "Remote Icinga instance '" + endpoint->GetName() + "' is not connected to ";
Endpoint::Ptr localEndpoint = Endpoint::GetLocalEndpoint();
if (localEndpoint)
output += "'" + localEndpoint->GetName() + "'";
else
output += "this instance";
cr->SetOutput(output);
ProcessCheckResult(cr);
}
{
ObjectLock olock(this);
m_CheckRunning = false;
}
}
}
void Checkable::UpdateStatistics(const CheckResult::Ptr& cr, CheckableType type)
{
time_t ts = cr->GetScheduleEnd();
if (type == CheckableHost) {
if (cr->GetActive())
CIB::UpdateActiveHostChecksStatistics(ts, 1);
else
CIB::UpdatePassiveHostChecksStatistics(ts, 1);
} else if (type == CheckableService) {
if (cr->GetActive())
CIB::UpdateActiveServiceChecksStatistics(ts, 1);
else
CIB::UpdatePassiveServiceChecksStatistics(ts, 1);
} else {
Log(LogWarning, "Checkable", "Unknown checkable type for statistic update.");
}
}
void Checkable::IncreasePendingChecks()
{
std::unique_lock<std::mutex> lock(m_StatsMutex);
m_PendingChecks++;
}
void Checkable::DecreasePendingChecks()
{
std::unique_lock<std::mutex> lock(m_StatsMutex);
m_PendingChecks--;
m_PendingChecksCV.notify_one();
}
int Checkable::GetPendingChecks()
{
std::unique_lock<std::mutex> lock(m_StatsMutex);
return m_PendingChecks;
}
void Checkable::AquirePendingCheckSlot(int maxPendingChecks)
{
std::unique_lock<std::mutex> lock(m_StatsMutex);
while (m_PendingChecks >= maxPendingChecks)
m_PendingChecksCV.wait(lock);
m_PendingChecks++;
}