/******************************************************************************
 * Icinga 2                                                                   *
 * Copyright (C) 2012 Icinga Development Team (http://www.icinga.org/)        *
 *                                                                            *
 * This program is free software; you can redistribute it and/or              *
 * modify it under the terms of the GNU General Public License                *
 * as published by the Free Software Foundation; either version 2             *
 * of the License, or (at your option) any later version.                     *
 *                                                                            *
 * This program is distributed in the hope that it will be useful,            *
 * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the               *
 * GNU General Public License for more details.                               *
 *                                                                            *
 * You should have received a copy of the GNU General Public License          *
 * along with this program; if not, write to the Free Software Foundation     *
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.             *
 ******************************************************************************/

#include "icinga/service.h"
#include "icinga/checkcommand.h"
#include "icinga/icingaapplication.h"
#include "icinga/checkresultmessage.h"
#include "icinga/flappingmessage.h"
#include "icinga/cib.h"
#include "remoting/endpointmanager.h"
#include "base/dynamictype.h"
#include "base/objectlock.h"
#include "base/logger_fwd.h"
#include "base/convert.h"
/* NOTE(review): the header names of the following three directives are missing
 * in this copy of the file (the <...> parts appear to have been stripped).
 * This file uses boost::make_shared, BOOST_FOREACH and
 * boost::diagnostic_information - restore the includes from upstream. */
#include
#include
#include

using namespace icinga;

/** Number of check attempts used when max_check_attempts is not configured. */
const int Service::DefaultMaxCheckAttempts = 3;

/** Check interval used when none is configured (presumably seconds - TODO confirm). */
const double Service::DefaultCheckInterval = 5 * 60;

/** The default retry interval is the check interval divided by this value. */
const double Service::CheckIntervalDivisor = 5.0;

/* NOTE(review): the signal signatures (template arguments) are missing in this
 * copy of the file; they must match the declarations in icinga/service.h. */
boost::signals2::signal Service::OnCheckerChanged;
boost::signals2::signal Service::OnNextCheckChanged;
boost::signals2::signal Service::OnFlappingChanged;

/**
 * Looks up the check command object by the configured command name.
 *
 * @returns The result of CheckCommand::GetByName() for m_CheckCommand.
 */
CheckCommand::Ptr Service::GetCheckCommand(void) const
{
	return CheckCommand::GetByName(m_CheckCommand);
}

/**
 * @returns The configured maximum number of check attempts, or
 *          DefaultMaxCheckAttempts when the attribute is empty.
 */
long Service::GetMaxCheckAttempts(void) const
{
	if (m_MaxCheckAttempts.IsEmpty())
		return DefaultMaxCheckAttempts;

	return m_MaxCheckAttempts;
}

/**
 * Looks up the check period object by the configured period name.
 *
 * @returns The result of TimePeriod::GetByName() for m_CheckPeriod.
 */
TimePeriod::Ptr Service::GetCheckPeriod(void) const
{
	return TimePeriod::GetByName(m_CheckPeriod);
}

/**
 * @returns The configured check interval, or DefaultCheckInterval when the
 *          attribute is empty.
 */
double Service::GetCheckInterval(void) const
{
	if (m_CheckInterval.IsEmpty())
		return DefaultCheckInterval;

	return m_CheckInterval;
}

/**
 * @returns The configured retry interval, or the check interval divided by
 *          CheckIntervalDivisor when the attribute is empty.
 */
double Service::GetRetryInterval(void) const
{
	if (m_RetryInterval.IsEmpty())
		return GetCheckInterval() / CheckIntervalDivisor;

	return m_RetryInterval;
}

/**
 * @returns The array of checker patterns (may be a null pointer, see
 *          IsAllowedChecker()).
 */
Array::Ptr Service::GetCheckers(void) const
{
	return m_Checkers;
}

/**
 * Stores the scheduling offset that UpdateNextCheck() mixes into the
 * next-check calculation.
 */
void Service::SetSchedulingOffset(long offset)
{
	m_SchedulingOffset = offset;
}

/** @returns The scheduling offset set via SetSchedulingOffset(). */
long Service::GetSchedulingOffset(void)
{
	return m_SchedulingOffset;
}

/** Stores the timestamp of the next check and touches "next_check". */
void Service::SetNextCheck(double nextCheck)
{
	m_NextCheck = nextCheck;
	Touch("next_check");
}

/** @returns The timestamp stored by SetNextCheck(). */
double Service::GetNextCheck(void)
{
	return m_NextCheck;
}

/**
 * Calculates and stores the time of the next check.
 *
 * The retry interval is used while the service is in a soft state, the
 * regular check interval otherwise. For intervals greater than 1 the result
 * is aligned to a grid derived from the current time and the scheduling
 * offset.
 */
void Service::UpdateNextCheck(void)
{
	ObjectLock olock(this);

	double interval;

	if (GetStateType() == StateTypeSoft)
		interval = GetRetryInterval();
	else
		interval = GetCheckInterval();

	double now = Utility::GetTime();
	double adj = 0;

	/* Distance of "now" from the previous grid point; the calculation is
	 * done with a factor of 100 and scaled back afterwards. */
	if (interval > 1)
		adj = fmod(now * 100 + GetSchedulingOffset(), interval * 100) / 100.0;

	SetNextCheck(now - adj + interval);
}

/** Stores the name of the current checker and touches "current_checker". */
void Service::SetCurrentChecker(const String& checker)
{
	m_CurrentChecker = checker;
	Touch("current_checker");
}

/** @returns The checker name stored by SetCurrentChecker(). */
String Service::GetCurrentChecker(void) const
{
	return m_CurrentChecker;
}

/** Stores the current check attempt and touches "check_attempt". */
void Service::SetCurrentCheckAttempt(long attempt)
{
	m_CheckAttempt = attempt;
	Touch("check_attempt");
}

/** @returns The current check attempt, or 1 when the attribute is empty. */
long Service::GetCurrentCheckAttempt(void) const
{
	if (m_CheckAttempt.IsEmpty())
		return 1;

	return m_CheckAttempt;
}

/** Stores the service state as an integer and touches "state". */
void Service::SetState(ServiceState state)
{
	/* NOTE(review): cast target restored as int to mirror GetState() - confirm. */
	m_State = static_cast<int>(state);
	Touch("state");
}

/** @returns The stored state, or StateUnknown when the attribute is empty. */
ServiceState Service::GetState(void) const
{
	if (m_State.IsEmpty())
		return StateUnknown;

	int ivalue = static_cast<int>(m_State);
	return static_cast<ServiceState>(ivalue);
}

/** Stores the previous service state and touches "last_state". */
void Service::SetLastState(ServiceState state)
{
	/* NOTE(review): cast target restored as int to mirror GetLastState() - confirm. */
	m_LastState = static_cast<int>(state);
	Touch("last_state");
}

/** @returns The stored previous state, or StateUnknown when the attribute is empty. */
ServiceState Service::GetLastState(void) const
{
	if (m_LastState.IsEmpty())
		return StateUnknown;

	int ivalue = static_cast<int>(m_LastState);
	return static_cast<ServiceState>(ivalue);
}

/** Stores the state type as an integer and touches "state_type". */
void Service::SetStateType(StateType type)
{
	/* NOTE(review): cast target restored as int to mirror GetStateType() - confirm. */
	m_StateType = static_cast<int>(type);
	Touch("state_type");
}

/** @returns The stored state type, or StateTypeSoft when the attribute is empty. */
StateType Service::GetStateType(void) const
{
	if (m_StateType.IsEmpty())
		return StateTypeSoft;

	int ivalue = static_cast<int>(m_StateType);
	return static_cast<StateType>(ivalue);
}

/** Stores the previous state type and touches "last_state_type". */
void Service::SetLastStateType(StateType type)
{
	/* NOTE(review): cast target restored as int to mirror GetLastStateType() - confirm. */
	m_LastStateType = static_cast<int>(type);
	Touch("last_state_type");
}

/** @returns The stored previous state type, or StateTypeSoft when the attribute is empty. */
StateType Service::GetLastStateType(void) const
{
	if (m_LastStateType.IsEmpty())
		return StateTypeSoft;

	int ivalue = static_cast<int>(m_LastStateType);
	return static_cast<StateType>(ivalue);
}

/** Stores the previous reachability and touches "last_reachable". */
void Service::SetLastReachable(bool reachable)
{
	m_LastReachable = reachable;
	Touch("last_reachable");
}

/** @returns The stored previous reachability, or true when the attribute is empty. */
bool Service::GetLastReachable(void) const
{
	if (m_LastReachable.IsEmpty())
		return true;

	return m_LastReachable;
}

/** Stores the most recent check result and touches "last_result". */
void Service::SetLastCheckResult(const Dictionary::Ptr& result)
{
	m_LastResult = result;
	Touch("last_result");
}

/** @returns The check result stored by SetLastCheckResult(). */
Dictionary::Ptr Service::GetLastCheckResult(void) const
{
	return m_LastResult;
}

/** Stores the time of the last state change and touches "last_state_change". */
void Service::SetLastStateChange(double ts)
{
	m_LastStateChange = ts;
	Touch("last_state_change");
}

/**
 * @returns The time of the last state change, or the application's start
 *          time when the attribute is empty.
 */
double Service::GetLastStateChange(void) const
{
	if (m_LastStateChange.IsEmpty())
		return IcingaApplication::GetInstance()->GetStartTime();

	return m_LastStateChange;
}

/** Stores the time of the last hard state change and touches "last_hard_state_change". */
void Service::SetLastHardStateChange(double ts)
{
	m_LastHardStateChange = ts;
	Touch("last_hard_state_change");
}

/**
 * @returns The time of the last hard state change, or the application's
 *          start time when the attribute is empty.
 */
double Service::GetLastHardStateChange(void) const
{
	if (m_LastHardStateChange.IsEmpty())
		return IcingaApplication::GetInstance()->GetStartTime();

	return m_LastHardStateChange;
}

/** @returns Whether active checks are enabled; true when the attribute is empty. */
bool Service::GetEnableActiveChecks(void) const
{
	if (m_EnableActiveChecks.IsEmpty())
		return true;
	else
		return m_EnableActiveChecks;
}

/** Stores the flag as 1/0 and touches "enable_active_checks". */
void Service::SetEnableActiveChecks(bool enabled)
{
	m_EnableActiveChecks = enabled ? 1 : 0;
	Touch("enable_active_checks");
}

/** @returns Whether passive checks are enabled; true when the attribute is empty. */
bool Service::GetEnablePassiveChecks(void) const
{
	if (m_EnablePassiveChecks.IsEmpty())
		return true;
	else
		return m_EnablePassiveChecks;
}

/** Stores the flag as 1/0 and touches "enable_passive_checks". */
void Service::SetEnablePassiveChecks(bool enabled)
{
	m_EnablePassiveChecks = enabled ? 1 : 0;
	Touch("enable_passive_checks");
}

/** @returns Whether the next check is forced; false when the attribute is empty. */
bool Service::GetForceNextCheck(void) const
{
	if (m_ForceNextCheck.IsEmpty())
		return false;

	return static_cast<bool>(m_ForceNextCheck);
}

/** Stores the flag as 1/0 and touches "force_next_check". */
void Service::SetForceNextCheck(bool forced)
{
	m_ForceNextCheck = forced ? 1 : 0;
	Touch("force_next_check");
}

/**
 * Applies a check result to this service.
 *
 * Missing timing keys in the result are filled in with the current time.
 * The function then updates state, state type and check attempt, removes
 * acknowledgements where applicable, reschedules parent services and parent
 * hosts' check services on a state change, updates the check statistics and
 * the flapping status, stores and seals the result, multicasts a
 * "checker::CheckResult" message and finally runs the event handler and
 * requests notifications as needed.
 *
 * Must not be called while the caller holds this object's lock (asserted).
 *
 * @param cr The check result; it is modified and sealed by this function.
 */
void Service::ProcessCheckResult(const Dictionary::Ptr& cr)
{
	double now = Utility::GetTime();

	/* Fill in any timing information the result does not provide. */
	if (!cr->Contains("schedule_start"))
		cr->Set("schedule_start", now);

	if (!cr->Contains("schedule_end"))
		cr->Set("schedule_end", now);

	if (!cr->Contains("execution_start"))
		cr->Set("execution_start", now);

	if (!cr->Contains("execution_end"))
		cr->Set("execution_end", now);

	/* Reachability is determined before the object lock is taken. */
	bool reachable = IsReachable();

	Host::Ptr host = GetHost();
	bool host_reachable = true;

	if (host)
		host_reachable = host->IsReachable();

	ASSERT(!OwnsLock());
	ObjectLock olock(this);

	Dictionary::Ptr old_cr = GetLastCheckResult();
	ServiceState old_state = GetState();
	StateType old_stateType = GetStateType();
	long old_attempt = GetCurrentCheckAttempt();
	bool recovery;

	/* The ExecuteCheck function already sets the old state, but we need to do it again
	 * in case this was a passive check result. */
	SetLastState(old_state);
	SetLastStateType(old_stateType);
	SetLastReachable(reachable);

	long attempt;

	if (cr->Get("state") == StateOK) {
		if (old_state == StateOK && old_stateType == StateTypeSoft)
			SetStateType(StateTypeHard); // SOFT OK -> HARD OK

		attempt = 1;
		recovery = true;
	} else {
		if (old_attempt >= GetMaxCheckAttempts()) {
			/* Attempts exhausted: the problem becomes a hard state. */
			SetStateType(StateTypeHard);
			attempt = 1;
		} else if (GetStateType() == StateTypeSoft || GetState() == StateOK) {
			SetStateType(StateTypeSoft);
			attempt = old_attempt + 1;
		} else {
			attempt = old_attempt;
		}

		recovery = false;
	}

	SetCurrentCheckAttempt(attempt);

	int state = cr->Get("state");
	SetState(static_cast<ServiceState>(state));

	bool call_eventhandler = false;

	bool stateChange = (old_state != GetState());

	if (stateChange) {
		SetLastStateChange(now);

		/* remove acknowledgements */
		if (GetAcknowledgement() == AcknowledgementNormal ||
		    (GetAcknowledgement() == AcknowledgementSticky && GetStateType() == StateTypeHard && GetState() == StateOK)) {
			SetAcknowledgement(AcknowledgementNone);
			SetAcknowledgementExpiry(0);
		}

		/* reschedule service dependencies */
		BOOST_FOREACH(const Service::Ptr& parent, GetParentServices()) {
			ObjectLock parent_lock(parent);
			parent->SetNextCheck(Utility::GetTime());
		}

		/* reschedule host dependencies */
		BOOST_FOREACH(const Host::Ptr& parent, GetParentHosts()) {
			Service::Ptr service = parent->GetHostCheckService();

			/* Skip the host check service if it is this very service. */
			if (service && service->GetName() != GetName()) {
				ObjectLock service_lock(service);
				service->SetNextCheck(Utility::GetTime());
			}
		}

		call_eventhandler = true;
	}

	bool remove_acknowledgement_comments = false;

	if (GetAcknowledgement() == AcknowledgementNone)
		remove_acknowledgement_comments = true;

	bool hardChange = (GetStateType() == StateTypeHard && old_stateType == StateTypeSoft);

	if (old_state != GetState() && old_stateType == StateTypeHard && GetStateType() == StateTypeHard)
		hardChange = true;

	if (IsVolatile())
		hardChange = true;

	if (hardChange)
		SetLastHardStateChange(now);

	if (GetState() != StateOK)
		TriggerDowntimes();

	Service::UpdateStatistics(cr);

	bool in_downtime = IsInDowntime();
	bool send_notification = hardChange && reachable && !in_downtime && !IsAcknowledged();

	if (old_state == StateOK && old_stateType == StateTypeSoft)
		send_notification = false; /* Don't send notifications for SOFT-OK -> HARD-OK. */

	bool send_downtime_notification = m_LastInDowntime != in_downtime;
	m_LastInDowntime = in_downtime;
	Touch("last_in_downtime");

	olock.Unlock();

	if (remove_acknowledgement_comments)
		RemoveCommentsByType(CommentAcknowledgement);

	/* Snapshot of the service's variables after this result was applied. */
	Dictionary::Ptr vars_after = boost::make_shared<Dictionary>();
	vars_after->Set("state", GetState());
	vars_after->Set("state_type", GetStateType());
	vars_after->Set("attempt", GetCurrentCheckAttempt());
	vars_after->Set("reachable", reachable);
	vars_after->Set("host_reachable", host_reachable);

	/* The previous result's snapshot becomes this result's "before" state. */
	if (old_cr)
		cr->Set("vars_before", old_cr->Get("vars_after"));

	cr->Set("vars_after", vars_after);

	cr->Seal();

	olock.Lock();
	SetLastCheckResult(cr);

	bool was_flapping, is_flapping;

	was_flapping = IsFlapping();
	UpdateFlappingStatus(stateChange);
	is_flapping = IsFlapping();

	olock.Unlock();

	Log(LogDebug, "icinga", "Flapping: Service " + GetName() +
	    " was: " + Convert::ToString(was_flapping) +
	    " is: " + Convert::ToString(is_flapping) +
	    " threshold: " + Convert::ToString(GetFlappingThreshold()) +
	    "% current: " + Convert::ToString(GetFlappingCurrent()) + "%.");

	/* Flush the object so other instances see the service's
	 * new state when they receive the CheckResult message */
	Flush();

	RequestMessage rm;
	rm.SetMethod("checker::CheckResult");

	/* TODO: add _old_ state to message */
	CheckResultMessage params;
	params.SetService(GetName());
	params.SetCheckResult(cr);

	rm.SetParams(params);

	EndpointManager::GetInstance()->SendMulticastMessage(rm);

	if (call_eventhandler)
		ExecuteEventHandler();

	if (send_downtime_notification)
		RequestNotifications(in_downtime ? NotificationDowntimeStart : NotificationDowntimeEnd, cr);

	/* Flapping transitions take precedence over problem/recovery notifications. */
	if (!was_flapping && is_flapping) {
		RequestNotifications(NotificationFlappingStart, cr);

		RequestMessage flapping_rm;
		flapping_rm.SetMethod("icinga::Flapping");

		FlappingMessage flapping_params;
		flapping_params.SetService(GetName());
		flapping_params.SetState(FlappingStarted);

		flapping_rm.SetParams(flapping_params);

		EndpointManager::GetInstance()->SendMulticastMessage(flapping_rm);

		Log(LogDebug, "icinga", "Flapping: Service " + GetName() + " started flapping (" + Convert::ToString(GetFlappingThreshold()) + "% < " + Convert::ToString(GetFlappingCurrent()) + "%).");
	} else if (was_flapping && !is_flapping) {
		RequestNotifications(NotificationFlappingEnd, cr);

		RequestMessage flapping_rm;
		flapping_rm.SetMethod("icinga::Flapping");

		FlappingMessage flapping_params;
		flapping_params.SetService(GetName());
		flapping_params.SetState(FlappingStopped);

		flapping_rm.SetParams(flapping_params);

		EndpointManager::GetInstance()->SendMulticastMessage(flapping_rm);

		Log(LogDebug, "icinga", "Flapping: Service " + GetName() + " stopped flapping (" + Convert::ToString(GetFlappingThreshold()) + "% >= " + Convert::ToString(GetFlappingCurrent()) + "%).");
	} else if (send_notification)
		RequestNotifications(recovery ? NotificationRecovery : NotificationProblem, cr);
}

/**
 * Converts a state name to a ServiceState.
 *
 * @param state One of "OK", "WARNING", "CRITICAL" or "UNCHECKABLE".
 * @returns The matching state; StateUnknown for any other string.
 */
ServiceState Service::StateFromString(const String& state)
{
	if (state == "OK")
		return StateOK;
	else if (state == "WARNING")
		return StateWarning;
	else if (state == "CRITICAL")
		return StateCritical;
	else if (state == "UNCHECKABLE")
		return StateUncheckable;
	else
		return StateUnknown;
}

/**
 * Converts a ServiceState to its name.
 *
 * @returns "OK", "WARNING", "CRITICAL" or "UNCHECKABLE"; "UNKNOWN" for
 *          StateUnknown and any other value.
 */
String Service::StateToString(ServiceState state)
{
	switch (state) {
		case StateOK:
			return "OK";
		case StateWarning:
			return "WARNING";
		case StateCritical:
			return "CRITICAL";
		case StateUncheckable:
			return "UNCHECKABLE";
		case StateUnknown:
		default:
			return "UNKNOWN";
	}
}

/**
 * Converts a state type name to a StateType.
 *
 * @returns StateTypeSoft for "SOFT"; StateTypeHard for any other string.
 */
StateType Service::StateTypeFromString(const String& type)
{
	if (type == "SOFT")
		return StateTypeSoft;
	else
		return StateTypeHard;
}

/**
 * Converts a StateType to its name.
 *
 * @returns "SOFT" for StateTypeSoft, "HARD" otherwise.
 */
String Service::StateTypeToString(StateType type)
{
	if (type == StateTypeSoft)
		return "SOFT";
	else
		return "HARD";
}

/**
 * Checks whether the given checker may check this service.
 *
 * @param checker The checker's name.
 * @returns true if no checker list is set or if any pattern in the list
 *          matches the name (via Utility::Match); false otherwise.
 */
bool Service::IsAllowedChecker(const String& checker) const
{
	Array::Ptr checkers = GetCheckers();

	if (!checkers)
		return true;

	ObjectLock olock(checkers);

	BOOST_FOREACH(const Value& pattern, checkers) {
		if (Utility::Match(pattern, checker))
			return true;
	}

	return false;
}

/**
 * Runs the service's check command and processes its result.
 *
 * Returns immediately if a check is already running. Exceptions derived from
 * std::exception that are thrown by the check are logged and turned into a
 * result with state StateUnknown. Timing and checker information missing from
 * the result is filled in before it is handed to ProcessCheckResult().
 *
 * Must not be called while the caller holds this object's lock (asserted).
 */
void Service::ExecuteCheck(void)
{
	ASSERT(!OwnsLock());

	bool reachable = IsReachable();

	{
		ObjectLock olock(this);

		/* don't run another check if there is one pending */
		if (m_CheckRunning)
			return;

		m_CheckRunning = true;

		/* Remember the state the service had before this check. */
		SetLastState(GetState());
		SetLastStateType(GetStateType());
		SetLastReachable(reachable);
	}

	/* keep track of scheduling info in case the check type doesn't provide its own information */
	Dictionary::Ptr checkInfo = boost::make_shared<Dictionary>();
	checkInfo->Set("schedule_start", GetNextCheck());
	checkInfo->Set("execution_start", Utility::GetTime());

	/* NOTE(review): unused below; presumably kept to hold a reference to this
	 * object for the duration of the check - confirm before removing. */
	Service::Ptr self = GetSelf();

	Dictionary::Ptr result;

	try {
		CheckCommand::Ptr command = GetCheckCommand();

		/* Without a check command there is nothing to execute. Do not return
		 * here: m_CheckRunning must be cleared again below, otherwise every
		 * later call would bail out at the "check pending" test above. */
		if (command)
			result = command->Execute(GetSelf());
	} catch (const std::exception& ex) {
		std::ostringstream msgbuf;
		msgbuf << "Exception occured during check for service '"
		       << GetName() << "': " << boost::diagnostic_information(ex);
		String message = msgbuf.str();

		Log(LogWarning, "icinga", message);

		result = boost::make_shared<Dictionary>();
		result->Set("state", StateUnknown);
		result->Set("output", message);
	}

	checkInfo->Set("execution_end", Utility::GetTime());
	checkInfo->Set("schedule_end", Utility::GetTime());
	checkInfo->Seal();

	if (result) {
		if (!result->Contains("schedule_start"))
			result->Set("schedule_start", checkInfo->Get("schedule_start"));

		if (!result->Contains("schedule_end"))
			result->Set("schedule_end", checkInfo->Get("schedule_end"));

		if (!result->Contains("execution_start"))
			result->Set("execution_start", checkInfo->Get("execution_start"));

		if (!result->Contains("execution_end"))
			result->Set("execution_end", checkInfo->Get("execution_end"));

		/* checkInfo never receives a "macros" entry in this function, so this
		 * copies whatever Get() yields for a missing key. */
		if (!result->Contains("macros"))
			result->Set("macros", checkInfo->Get("macros"));

		if (!result->Contains("active"))
			result->Set("active", 1);

		if (!result->Contains("current_checker"))
			result->Set("current_checker", EndpointManager::GetInstance()->GetIdentity());
	}

	if (result)
		ProcessCheckResult(result);

	/* figure out when the next check is for this service; the call to
	 * ProcessCheckResult() should've already done this but lets do it again
	 * just in case there was no check result. */
	UpdateNextCheck();

	{
		ObjectLock olock(this);
		m_CheckRunning = false;
	}
}

/**
 * Records a check result in the active or passive check statistics.
 *
 * The timestamp is the result's "schedule_end" value, or the current time if
 * that is empty. A result counts as active when its "active" value is empty
 * or true.
 *
 * @param cr The check result.
 */
void Service::UpdateStatistics(const Dictionary::Ptr& cr)
{
	time_t ts;
	Value schedule_end = cr->Get("schedule_end");

	if (!schedule_end.IsEmpty())
		ts = static_cast<time_t>(schedule_end);
	else
		ts = static_cast<time_t>(Utility::GetTime());

	Value active = cr->Get("active");

	if (active.IsEmpty() || static_cast<bool>(active))
		CIB::UpdateActiveChecksStatistics(ts, 1);
	else
		CIB::UpdatePassiveChecksStatistics(ts, 1);
}

/**
 * Calculates how long a check took to execute.
 *
 * @param cr The check result (may be a null pointer).
 * @returns execution_end - execution_start; 0 if the result is null or
 *          either key is missing.
 */
double Service::CalculateExecutionTime(const Dictionary::Ptr& cr)
{
	double execution_start = 0, execution_end = 0;

	if (cr) {
		if (!cr->Contains("execution_start") || !cr->Contains("execution_end"))
			return 0;

		execution_start = cr->Get("execution_start");
		execution_end = cr->Get("execution_end");
	}

	return (execution_end - execution_start);
}

/**
 * Calculates the latency of a check: the scheduling duration minus the
 * execution time.
 *
 * @param cr The check result (may be a null pointer).
 * @returns (schedule_end - schedule_start) - CalculateExecutionTime(cr);
 *          0 if either scheduling key is missing.
 */
double Service::CalculateLatency(const Dictionary::Ptr& cr)
{
	double schedule_start = 0, schedule_end = 0;

	if (cr) {
		if (!cr->Contains("schedule_start") || !cr->Contains("schedule_end"))
			return 0;

		schedule_start = cr->Get("schedule_start");
		schedule_end = cr->Get("schedule_end");
	}

	return (schedule_end - schedule_start) - CalculateExecutionTime(cr);
}