icinga2/lib/icinga/service-check.cpp

554 lines
15 KiB
C++
Raw Normal View History

/******************************************************************************
* Icinga 2 *
2013-09-25 07:43:57 +02:00
* Copyright (C) 2012-2013 Icinga Development Team (http://www.icinga.org/) *
* *
* This program is free software; you can redistribute it and/or *
* modify it under the terms of the GNU General Public License *
* as published by the Free Software Foundation; either version 2 *
* of the License, or (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program; if not, write to the Free Software Foundation *
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. *
******************************************************************************/
2013-03-16 21:18:53 +01:00
#include "icinga/service.h"
#include "icinga/checkcommand.h"
2013-03-17 20:19:29 +01:00
#include "icinga/icingaapplication.h"
#include "icinga/cib.h"
2013-03-16 21:18:53 +01:00
#include "base/dynamictype.h"
#include "base/objectlock.h"
#include "base/logger_fwd.h"
#include "base/convert.h"
#include "base/utility.h"
#include "base/exception.h"
2013-03-16 21:18:53 +01:00
#include <boost/foreach.hpp>
2013-07-30 22:38:33 +02:00
#include <boost/algorithm/string/replace.hpp>
using namespace icinga;
boost::signals2::signal<void (const Service::Ptr&, const CheckResult::Ptr&, const String&)> Service::OnNewCheckResult;
boost::signals2::signal<void (const Service::Ptr&, const CheckResult::Ptr&, StateType, const String&)> Service::OnStateChange;
boost::signals2::signal<void (const Service::Ptr&, NotificationType, const CheckResult::Ptr&, const String&, const String&)> Service::OnNotificationsRequested;
2013-08-28 11:12:20 +02:00
boost::signals2::signal<void (const Service::Ptr&, double, const String&)> Service::OnNextCheckChanged;
boost::signals2::signal<void (const Service::Ptr&, bool, const String&)> Service::OnForceNextCheckChanged;
2013-08-29 13:09:26 +02:00
boost::signals2::signal<void (const Service::Ptr&, bool, const String&)> Service::OnForceNextNotificationChanged;
2013-08-28 11:12:20 +02:00
boost::signals2::signal<void (const Service::Ptr&, bool, const String&)> Service::OnEnableActiveChecksChanged;
boost::signals2::signal<void (const Service::Ptr&, bool, const String&)> Service::OnEnablePassiveChecksChanged;
boost::signals2::signal<void (const Service::Ptr&, bool, const String&)> Service::OnEnableNotificationsChanged;
boost::signals2::signal<void (const Service::Ptr&, bool, const String&)> Service::OnEnableFlappingChanged;
boost::signals2::signal<void (const Service::Ptr&, FlappingState)> Service::OnFlappingChanged;
CheckCommand::Ptr Service::GetCheckCommand(void) const
{
2013-10-26 09:41:45 +02:00
return CheckCommand::GetByName(GetCheckCommandRaw());
}
2013-03-13 16:04:53 +01:00
TimePeriod::Ptr Service::GetCheckPeriod(void) const
{
2013-10-26 09:41:45 +02:00
return TimePeriod::GetByName(GetCheckPeriodRaw());
2013-03-13 16:04:53 +01:00
}
double Service::GetCheckInterval(void) const
{
2013-10-26 09:41:45 +02:00
if (!GetOverrideCheckInterval().IsEmpty())
return GetOverrideCheckInterval();
else
2013-10-26 09:41:45 +02:00
return GetCheckIntervalRaw();
}
void Service::SetCheckInterval(double interval)
{
2013-10-26 09:41:45 +02:00
SetOverrideCheckInterval(interval);
}
double Service::GetRetryInterval(void) const
{
2013-10-26 09:41:45 +02:00
if (!GetOverrideRetryInterval().IsEmpty())
return GetOverrideRetryInterval();
else
2013-10-26 09:41:45 +02:00
return GetRetryIntervalRaw();
}
void Service::SetRetryInterval(double interval)
{
2013-10-26 09:41:45 +02:00
SetOverrideRetryInterval(interval);
}
void Service::SetSchedulingOffset(long offset)
{
2013-02-26 10:13:54 +01:00
m_SchedulingOffset = offset;
}
long Service::GetSchedulingOffset(void)
{
2013-02-26 10:13:54 +01:00
return m_SchedulingOffset;
}
2013-08-28 11:12:20 +02:00
void Service::SetNextCheck(double nextCheck, const String& authority)
{
2013-10-26 09:41:45 +02:00
SetNextCheckRaw(nextCheck);
2013-11-10 16:53:57 +01:00
OnNextCheckChanged(GetSelf(), nextCheck, authority);
}
double Service::GetNextCheck(void)
{
2013-10-26 09:41:45 +02:00
return GetNextCheckRaw();
}
void Service::UpdateNextCheck(void)
{
2013-03-02 09:07:47 +01:00
ObjectLock olock(this);
double interval;
if (GetStateType() == StateTypeSoft && GetLastCheckResult() != NULL)
interval = GetRetryInterval();
else
interval = GetCheckInterval();
double now = Utility::GetTime();
double adj = 0;
if (interval > 1)
adj = fmod(now * 100 + GetSchedulingOffset(), interval * 100) / 100.0;
SetNextCheck(now - adj + interval);
}
bool Service::HasBeenChecked(void) const
{
return GetLastCheckResult() != NULL;
}
double Service::GetLastCheck(void) const
{
CheckResult::Ptr cr = GetLastCheckResult();
double schedule_end = -1;
2013-10-26 09:41:45 +02:00
if (cr)
schedule_end = cr->GetScheduleEnd();
return schedule_end;
}
bool Service::GetEnableActiveChecks(void) const
{
2013-10-26 09:41:45 +02:00
if (!GetOverrideEnableActiveChecks().IsEmpty())
return GetOverrideEnableActiveChecks();
else
2013-10-26 09:41:45 +02:00
return GetEnableActiveChecksRaw();
}
2013-08-28 11:12:20 +02:00
void Service::SetEnableActiveChecks(bool enabled, const String& authority)
{
2013-10-26 09:41:45 +02:00
SetOverrideEnableActiveChecks(enabled);
2013-08-28 11:12:20 +02:00
2013-11-10 16:53:57 +01:00
OnEnableActiveChecksChanged(GetSelf(), enabled, authority);
}
bool Service::GetEnablePassiveChecks(void) const
{
2013-10-26 09:41:45 +02:00
if (!GetOverrideEnablePassiveChecks().IsEmpty())
return GetOverrideEnablePassiveChecks();
else
2013-10-26 09:41:45 +02:00
return GetEnablePassiveChecksRaw();
}
2013-08-28 11:12:20 +02:00
void Service::SetEnablePassiveChecks(bool enabled, const String& authority)
{
2013-10-26 09:41:45 +02:00
SetOverrideEnablePassiveChecks(enabled);
2013-08-28 11:12:20 +02:00
2013-11-10 16:53:57 +01:00
OnEnablePassiveChecksChanged(GetSelf(), enabled, authority);
}
bool Service::GetForceNextCheck(void) const
{
2013-10-26 09:41:45 +02:00
return GetForceNextCheckRaw();
}
2013-08-28 11:12:20 +02:00
void Service::SetForceNextCheck(bool forced, const String& authority)
{
2013-10-26 09:41:45 +02:00
SetForceNextCheckRaw(forced);
2013-08-28 11:12:20 +02:00
2013-11-10 16:53:57 +01:00
OnForceNextCheckChanged(GetSelf(), forced, authority);
}
void Service::ProcessCheckResult(const CheckResult::Ptr& cr, const String& authority)
{
2013-03-19 16:20:13 +01:00
double now = Utility::GetTime();
if (cr->GetScheduleStart() == 0)
cr->SetScheduleStart(now);
2013-03-19 16:20:13 +01:00
if (cr->GetScheduleEnd() == 0)
cr->SetScheduleEnd(now);
2013-03-19 16:20:13 +01:00
if (cr->GetExecutionStart() == 0)
cr->SetExecutionStart(now);
2013-03-19 16:20:13 +01:00
if (cr->GetExecutionEnd() == 0)
cr->SetExecutionEnd(now);
2013-03-19 16:20:13 +01:00
String check_source = cr->GetCheckSource();
if (check_source.IsEmpty())
cr->SetCheckSource(authority);
2013-03-06 11:03:50 +01:00
bool reachable = IsReachable();
2013-03-19 13:04:30 +01:00
Host::Ptr host = GetHost();
bool host_reachable = true;
if (host)
host_reachable = host->IsReachable();
ASSERT(!OwnsLock());
2013-03-02 09:07:47 +01:00
ObjectLock olock(this);
CheckResult::Ptr old_cr = GetLastCheckResult();
ServiceState old_state = GetState();
2013-03-07 12:04:20 +01:00
StateType old_stateType = GetStateType();
2013-10-26 09:41:45 +02:00
long old_attempt = GetCheckAttempt();
2013-02-24 01:10:34 +01:00
bool recovery;
if (old_cr && cr->GetExecutionStart() < old_cr->GetExecutionStart())
return;
2013-03-25 18:36:15 +01:00
/* The ExecuteCheck function already sets the old state, but we need to do it again
* in case this was a passive check result. */
2013-03-07 12:04:20 +01:00
SetLastState(old_state);
SetLastStateType(old_stateType);
2013-03-19 13:04:30 +01:00
SetLastReachable(reachable);
2013-03-07 12:04:20 +01:00
2013-03-19 13:04:30 +01:00
long attempt;
if (cr->GetState() == StateOK) {
if (old_state == StateOK && old_stateType == StateTypeSoft)
SetStateType(StateTypeHard); // SOFT OK -> HARD OK
attempt = 1;
2013-02-24 01:10:34 +01:00
recovery = true;
2013-07-18 17:04:09 +02:00
ResetNotificationNumbers();
SetLastStateOK(Utility::GetTime());
} else {
2013-03-19 13:04:30 +01:00
if (old_attempt >= GetMaxCheckAttempts()) {
SetStateType(StateTypeHard);
attempt = 1;
} else if (GetStateType() == StateTypeSoft || GetState() == StateOK) {
SetStateType(StateTypeSoft);
2013-03-19 13:04:30 +01:00
attempt = old_attempt + 1;
} else {
attempt = old_attempt;
}
2013-02-24 01:10:34 +01:00
recovery = false;
switch (cr->GetState()) {
case StateWarning:
SetLastStateWarning(Utility::GetTime());
break;
case StateCritical:
SetLastStateCritical(Utility::GetTime());
break;
case StateUnknown:
SetLastStateUnknown(Utility::GetTime());
break;
}
}
2013-07-18 18:28:23 +02:00
if (!reachable)
SetLastStateUnreachable(Utility::GetTime());
2013-10-26 09:41:45 +02:00
SetCheckAttempt(attempt);
SetState(cr->GetState());
bool call_eventhandler = false;
2013-06-21 10:20:29 +02:00
bool stateChange = (old_state != GetState());
if (stateChange) {
SetLastStateChange(now);
/* remove acknowledgements */
if (GetAcknowledgement() == AcknowledgementNormal ||
(GetAcknowledgement() == AcknowledgementSticky && GetStateType() == StateTypeHard && GetState() == StateOK)) {
ClearAcknowledgement();
}
/* reschedule service dependencies */
2013-03-02 09:07:47 +01:00
BOOST_FOREACH(const Service::Ptr& parent, GetParentServices()) {
2013-03-04 15:52:42 +01:00
ObjectLock olock(parent);
parent->SetNextCheck(Utility::GetTime());
}
/* reschedule host dependencies */
2013-03-02 09:07:47 +01:00
BOOST_FOREACH(const Host::Ptr& parent, GetParentHosts()) {
Service::Ptr service = parent->GetCheckService();
2013-03-04 15:52:42 +01:00
if (service && service->GetName() != GetName()) {
ObjectLock olock(service);
service->SetNextCheck(Utility::GetTime());
}
}
call_eventhandler = true;
}
bool remove_acknowledgement_comments = false;
if (GetAcknowledgement() == AcknowledgementNone)
remove_acknowledgement_comments = true;
bool hardChange = (GetStateType() == StateTypeHard && old_stateType == StateTypeSoft);
2013-03-21 11:37:34 +01:00
if (old_state != GetState() && old_stateType == StateTypeHard && GetStateType() == StateTypeHard)
hardChange = true;
2013-10-26 09:41:45 +02:00
if (GetVolatile())
hardChange = true;
if (hardChange) {
SetLastHardState(GetState());
2013-03-02 09:07:47 +01:00
SetLastHardStateChange(now);
}
2013-03-02 09:07:47 +01:00
if (GetState() != StateOK)
TriggerDowntimes();
2013-03-02 09:07:47 +01:00
Service::UpdateStatistics(cr);
2013-03-18 12:55:41 +01:00
bool in_downtime = IsInDowntime();
bool send_notification = hardChange && reachable && !in_downtime && !IsAcknowledged();
if (old_state == StateOK && old_stateType == StateTypeSoft)
send_notification = false; /* Don't send notifications for SOFT-OK -> HARD-OK. */
2013-10-26 09:41:45 +02:00
bool send_downtime_notification = (GetLastInDowntime() != in_downtime);
SetLastInDowntime(in_downtime);
2013-03-06 11:03:50 +01:00
2013-03-02 09:07:47 +01:00
olock.Unlock();
if (remove_acknowledgement_comments)
RemoveCommentsByType(CommentAcknowledgement);
Dictionary::Ptr vars_after = make_shared<Dictionary>();
2013-03-19 13:04:30 +01:00
vars_after->Set("state", GetState());
vars_after->Set("state_type", GetStateType());
2013-10-26 09:41:45 +02:00
vars_after->Set("attempt", GetCheckAttempt());
2013-03-19 13:04:30 +01:00
vars_after->Set("reachable", reachable);
vars_after->Set("host_reachable", host_reachable);
if (old_cr)
cr->SetVarsBefore(old_cr->GetVarsAfter());
2013-03-19 13:04:30 +01:00
cr->SetVarsAfter(vars_after);
2013-03-19 13:04:30 +01:00
olock.Lock();
SetLastCheckResult(cr);
2013-06-21 10:20:29 +02:00
bool was_flapping, is_flapping;
was_flapping = IsFlapping();
if (GetStateType() == StateTypeHard)
UpdateFlappingStatus(stateChange);
2013-06-21 10:20:29 +02:00
is_flapping = IsFlapping();
olock.Unlock();
// Log(LogDebug, "icinga", "Flapping: Service " + GetName() +
// " was: " + Convert::ToString(was_flapping) +
// " is: " + Convert::ToString(is_flapping) +
// " threshold: " + Convert::ToString(GetFlappingThreshold()) +
// "% current: " + Convert::ToString(GetFlappingCurrent()) + "%.");
2013-11-10 16:53:57 +01:00
OnNewCheckResult(GetSelf(), cr, authority);
OnStateChanged(GetSelf());
2013-03-02 09:07:47 +01:00
2013-11-10 16:53:57 +01:00
if (hardChange)
OnStateChange(GetSelf(), cr, StateTypeHard, authority);
else if (stateChange)
OnStateChange(GetSelf(), cr, StateTypeSoft, authority);
2013-09-25 18:01:08 +02:00
if (call_eventhandler)
ExecuteEventHandler();
2013-03-18 12:55:41 +01:00
if (send_downtime_notification)
OnNotificationsRequested(GetSelf(), in_downtime ? NotificationDowntimeStart : NotificationDowntimeEnd, cr, "", "");
2013-03-18 12:55:41 +01:00
if (!was_flapping && is_flapping) {
OnNotificationsRequested(GetSelf(), NotificationFlappingStart, cr, "", "");
Log(LogDebug, "icinga", "Flapping: Service " + GetName() + " started flapping (" + Convert::ToString(GetFlappingThreshold()) + "% < " + Convert::ToString(GetFlappingCurrent()) + "%).");
OnFlappingChanged(GetSelf(), FlappingStarted);
} else if (was_flapping && !is_flapping) {
OnNotificationsRequested(GetSelf(), NotificationFlappingEnd, cr, "", "");
Log(LogDebug, "icinga", "Flapping: Service " + GetName() + " stopped flapping (" + Convert::ToString(GetFlappingThreshold()) + "% >= " + Convert::ToString(GetFlappingCurrent()) + "%).");
OnFlappingChanged(GetSelf(), FlappingStopped);
} else if (send_notification)
OnNotificationsRequested(GetSelf(), recovery ? NotificationRecovery : NotificationProblem, cr, "", "");
}
ServiceState Service::StateFromString(const String& state)
{
2013-02-24 01:10:34 +01:00
if (state == "OK")
return StateOK;
2013-02-24 01:10:34 +01:00
else if (state == "WARNING")
return StateWarning;
2013-02-24 01:10:34 +01:00
else if (state == "CRITICAL")
return StateCritical;
else
return StateUnknown;
}
String Service::StateToString(ServiceState state)
{
switch (state) {
case StateOK:
2013-02-24 01:10:34 +01:00
return "OK";
case StateWarning:
2013-02-24 01:10:34 +01:00
return "WARNING";
case StateCritical:
2013-02-24 01:10:34 +01:00
return "CRITICAL";
case StateUnknown:
default:
2013-02-24 01:10:34 +01:00
return "UNKNOWN";
}
}
2013-03-07 12:04:20 +01:00
StateType Service::StateTypeFromString(const String& type)
{
2013-02-24 01:10:34 +01:00
if (type == "SOFT")
return StateTypeSoft;
else
return StateTypeHard;
}
2013-03-07 12:04:20 +01:00
String Service::StateTypeToString(StateType type)
{
if (type == StateTypeSoft)
2013-02-24 01:10:34 +01:00
return "SOFT";
else
2013-02-24 01:10:34 +01:00
return "HARD";
}
2013-03-25 18:36:15 +01:00
void Service::ExecuteCheck(void)
{
ASSERT(!OwnsLock());
2013-02-24 01:10:34 +01:00
2013-03-19 14:13:58 +01:00
bool reachable = IsReachable();
2013-03-06 11:03:50 +01:00
{
ObjectLock olock(this);
/* don't run another check if there is one pending */
2013-03-25 18:36:15 +01:00
if (m_CheckRunning)
2013-03-06 11:03:50 +01:00
return;
m_CheckRunning = true;
SetLastState(GetState());
SetLastStateType(GetLastStateType());
2013-03-19 14:13:58 +01:00
SetLastReachable(reachable);
}
/* keep track of scheduling info in case the check type doesn't provide its own information */
2013-11-15 14:26:38 +01:00
double scheduled_start = GetNextCheck();
double before_check = Utility::GetTime();
2013-02-24 01:10:34 +01:00
2013-03-04 15:52:42 +01:00
Service::Ptr self = GetSelf();
CheckResult::Ptr result;
try {
CheckCommand::Ptr command = GetCheckCommand();
if (!command) {
Log(LogDebug, "icinga", "No check_command found for service '" + GetName() + "'. Skipping execution.");
return;
}
result = command->Execute(GetSelf());
2013-03-16 21:18:53 +01:00
} catch (const std::exception& ex) {
std::ostringstream msgbuf;
msgbuf << "Exception occured during check for service '"
<< GetName() << "': " << DiagnosticInformation(ex);
String message = msgbuf.str();
2013-03-16 21:18:53 +01:00
Log(LogWarning, "icinga", message);
result = make_shared<CheckResult>();
result->SetState(StateUnknown);
result->SetOutput(message);
}
double after_check = Utility::GetTime();
2013-03-25 18:36:15 +01:00
if (result) {
2013-11-15 14:26:38 +01:00
if (result->GetScheduleStart() == 0)
result->SetScheduleStart(scheduled_start);
2013-11-15 14:26:38 +01:00
if (result->GetScheduleEnd() == 0)
result->SetScheduleEnd(after_check);
2013-02-24 01:10:34 +01:00
2013-11-15 14:26:38 +01:00
if (result->GetExecutionStart() == 0)
result->SetExecutionStart(before_check);
2013-11-15 14:26:38 +01:00
if (result->GetExecutionEnd() == 0)
result->SetExecutionEnd(after_check);
}
2013-03-02 09:07:47 +01:00
if (result)
ProcessCheckResult(result);
2013-03-07 12:04:20 +01:00
/* figure out when the next check is for this service; the call to
* ProcessCheckResult() should've already done this but lets do it again
* just in case there was no check result. */
UpdateNextCheck();
2013-02-24 01:10:34 +01:00
{
ObjectLock olock(this);
2013-03-06 11:03:50 +01:00
m_CheckRunning = false;
2013-02-24 01:10:34 +01:00
}
}
void Service::UpdateStatistics(const CheckResult::Ptr& cr)
{
time_t ts = cr->GetScheduleEnd();
if (ts == 0)
ts = static_cast<time_t>(Utility::GetTime());
if (cr->GetActive())
CIB::UpdateActiveChecksStatistics(ts, 1);
else
CIB::UpdatePassiveChecksStatistics(ts, 1);
}
2013-02-24 01:10:34 +01:00
double Service::CalculateExecutionTime(const CheckResult::Ptr& cr)
2013-02-24 01:10:34 +01:00
{
if (!cr)
return 0;
2013-02-24 01:10:34 +01:00
return cr->GetExecutionEnd() - cr->GetExecutionStart();
2013-02-24 01:10:34 +01:00
}
double Service::CalculateLatency(const CheckResult::Ptr& cr)
2013-02-24 01:10:34 +01:00
{
if (!cr)
return 0;
2013-02-24 01:10:34 +01:00
double latency = (cr->GetScheduleEnd() - cr->GetScheduleStart()) - CalculateExecutionTime(cr);
if (latency < 0)
latency = 0;
return latency;
2013-02-24 01:10:34 +01:00
}