2019-02-25 14:48:22 +01:00
/* Icinga 2 | (c) 2012 Icinga GmbH | GPLv2+ */
2013-02-09 18:39:43 +01:00
2014-05-25 16:23:35 +02:00
# include "icinga/checkable.hpp"
# include "icinga/service.hpp"
# include "icinga/host.hpp"
# include "icinga/checkcommand.hpp"
# include "icinga/icingaapplication.hpp"
# include "icinga/cib.hpp"
2015-10-19 17:31:18 +02:00
# include "icinga/clusterevents.hpp"
2014-05-25 16:23:35 +02:00
# include "remote/messageorigin.hpp"
2014-11-13 11:23:57 +01:00
# include "remote/apilistener.hpp"
2014-05-25 16:23:35 +02:00
# include "base/objectlock.hpp"
2014-10-19 14:21:12 +02:00
# include "base/logger.hpp"
2014-05-25 16:23:35 +02:00
# include "base/convert.hpp"
# include "base/utility.hpp"
# include "base/context.hpp"
2013-02-09 18:39:43 +01:00
using namespace icinga ;
2015-08-04 14:47:44 +02:00
boost : : signals2 : : signal < void ( const Checkable : : Ptr & , const CheckResult : : Ptr & , const MessageOrigin : : Ptr & ) > Checkable : : OnNewCheckResult ;
boost : : signals2 : : signal < void ( const Checkable : : Ptr & , const CheckResult : : Ptr & , StateType , const MessageOrigin : : Ptr & ) > Checkable : : OnStateChange ;
boost : : signals2 : : signal < void ( const Checkable : : Ptr & , const CheckResult : : Ptr & , std : : set < Checkable : : Ptr > , const MessageOrigin : : Ptr & ) > Checkable : : OnReachabilityChanged ;
2016-06-07 12:44:12 +02:00
boost : : signals2 : : signal < void ( const Checkable : : Ptr & , NotificationType , const CheckResult : : Ptr & , const String & , const String & , const MessageOrigin : : Ptr & ) > Checkable : : OnNotificationsRequested ;
2016-01-22 18:42:15 +01:00
boost : : signals2 : : signal < void ( const Checkable : : Ptr & ) > Checkable : : OnNextCheckUpdated ;
2014-04-03 15:36:13 +02:00
2019-08-14 17:43:06 +02:00
Atomic < uint_fast64_t > Checkable : : CurrentConcurrentChecks ( 0 ) ;
2021-02-02 10:16:04 +01:00
std : : mutex Checkable : : m_StatsMutex ;
2016-05-12 13:46:22 +02:00
int Checkable : : m_PendingChecks = 0 ;
2021-02-02 10:16:04 +01:00
std : : condition_variable Checkable : : m_PendingChecksCV ;
2016-05-12 13:46:22 +02:00
2018-01-04 04:25:35 +01:00
/**
 * Returns the check command of this checkable.
 *
 * @returns The object returned by NavigateCheckCommandRaw(), dynamically cast
 *          to CheckCommand.
 */
CheckCommand::Ptr Checkable::GetCheckCommand() const
{
	CheckCommand::Ptr command = dynamic_pointer_cast<CheckCommand>(NavigateCheckCommandRaw());

	return command;
}
2018-01-04 04:25:35 +01:00
/**
 * Returns the check period of this checkable.
 *
 * @returns The TimePeriod looked up by the name GetCheckPeriodRaw() yields.
 */
TimePeriod::Ptr Checkable::GetCheckPeriod() const
{
	TimePeriod::Ptr period = TimePeriod::GetByName(GetCheckPeriodRaw());

	return period;
}
2014-04-03 15:36:13 +02:00
void Checkable : : SetSchedulingOffset ( long offset )
2013-02-09 18:39:43 +01:00
{
2013-02-26 10:13:54 +01:00
m_SchedulingOffset = offset ;
2013-02-09 18:39:43 +01:00
}
2018-01-04 04:25:35 +01:00
long Checkable : : GetSchedulingOffset ( )
2013-02-09 18:39:43 +01:00
{
2013-02-26 10:13:54 +01:00
return m_SchedulingOffset ;
2013-02-09 18:39:43 +01:00
}
2016-03-15 13:02:38 +01:00
void Checkable : : UpdateNextCheck ( const MessageOrigin : : Ptr & origin )
2013-02-09 18:39:43 +01:00
{
double interval ;
2017-12-14 15:37:20 +01:00
if ( GetStateType ( ) = = StateTypeSoft & & GetLastCheckResult ( ) ! = nullptr )
2013-02-09 18:39:43 +01:00
interval = GetRetryInterval ( ) ;
else
interval = GetCheckInterval ( ) ;
double now = Utility : : GetTime ( ) ;
double adj = 0 ;
if ( interval > 1 )
2013-03-20 15:25:53 +01:00
adj = fmod ( now * 100 + GetSchedulingOffset ( ) , interval * 100 ) / 100.0 ;
2013-02-09 18:39:43 +01:00
2019-01-09 11:27:33 +01:00
if ( adj ! = 0.0 )
adj = std : : min ( 0.5 + fmod ( GetSchedulingOffset ( ) , interval * 5 ) / 100.0 , adj ) ;
2016-05-24 11:05:29 +02:00
2018-07-02 16:17:33 +02:00
double nextCheck = now - adj + interval ;
2019-01-16 17:27:38 +01:00
double lastCheck = GetLastCheck ( ) ;
2018-07-02 16:17:33 +02:00
Log ( LogDebug , " Checkable " )
< < " Update checkable ' " < < GetName ( ) < < " ' with check interval ' " < < GetCheckInterval ( )
2019-01-16 17:27:38 +01:00
< < " ' from last check time at " < < Utility : : FormatDateTime ( " %Y-%m-%d %H:%M:%S %z " , ( lastCheck < 0 ? 0 : lastCheck ) )
2019-05-22 14:13:14 +02:00
< < " ( " < < GetLastCheck ( ) < < " ) to next check time at " < < Utility : : FormatDateTime ( " %Y-%m-%d %H:%M:%S %z " , nextCheck ) < < " ( " < < nextCheck < < " ). " ;
2018-07-02 16:17:33 +02:00
SetNextCheck ( nextCheck , false , origin ) ;
2013-02-09 18:39:43 +01:00
}
2018-01-04 04:25:35 +01:00
bool Checkable : : HasBeenChecked ( ) const
2013-07-05 09:35:49 +02:00
{
2017-12-14 15:37:20 +01:00
return GetLastCheckResult ( ) ! = nullptr ;
2013-07-05 09:35:49 +02:00
}
2018-01-04 04:25:35 +01:00
double Checkable : : GetLastCheck ( ) const
2013-07-05 09:35:49 +02:00
{
2013-11-09 14:22:38 +01:00
CheckResult : : Ptr cr = GetLastCheckResult ( ) ;
2013-07-05 09:35:49 +02:00
double schedule_end = - 1 ;
2013-10-26 09:41:45 +02:00
if ( cr )
2013-11-09 14:22:38 +01:00
schedule_end = cr - > GetScheduleEnd ( ) ;
2013-07-05 09:35:49 +02:00
return schedule_end ;
}
2022-04-26 13:33:59 +02:00
Checkable : : ProcessingResult Checkable : : ProcessCheckResult ( const CheckResult : : Ptr & cr , const MessageOrigin : : Ptr & origin )
2013-02-09 18:39:43 +01:00
{
2022-04-26 13:33:59 +02:00
using Result = Checkable : : ProcessingResult ;
2014-03-12 10:05:36 +01:00
{
ObjectLock olock ( this ) ;
m_CheckRunning = false ;
}
2018-05-29 13:51:34 +02:00
if ( ! cr )
2022-04-26 13:33:59 +02:00
return Result : : NoCheckResult ;
2016-08-08 14:17:44 +02:00
2013-03-19 16:20:13 +01:00
double now = Utility : : GetTime ( ) ;
2013-11-09 14:22:38 +01:00
if ( cr - > GetScheduleStart ( ) = = 0 )
cr - > SetScheduleStart ( now ) ;
2013-03-19 16:20:13 +01:00
2013-11-09 14:22:38 +01:00
if ( cr - > GetScheduleEnd ( ) = = 0 )
cr - > SetScheduleEnd ( now ) ;
2013-03-19 16:20:13 +01:00
2013-11-09 14:22:38 +01:00
if ( cr - > GetExecutionStart ( ) = = 0 )
cr - > SetExecutionStart ( now ) ;
2013-03-19 16:20:13 +01:00
2013-11-09 14:22:38 +01:00
if ( cr - > GetExecutionEnd ( ) = = 0 )
cr - > SetExecutionEnd ( now ) ;
2013-03-19 16:20:13 +01:00
2021-07-20 11:10:26 +02:00
if ( ! origin | | origin - > IsLocal ( ) )
cr - > SetSchedulingSource ( IcingaApplication : : GetInstance ( ) - > GetNodeName ( ) ) ;
2014-11-13 11:23:57 +01:00
Endpoint : : Ptr command_endpoint = GetCommandEndpoint ( ) ;
2021-01-25 16:05:03 +01:00
if ( cr - > GetCheckSource ( ) . IsEmpty ( ) ) {
if ( ( ! origin | | origin - > IsLocal ( ) ) )
cr - > SetCheckSource ( IcingaApplication : : GetInstance ( ) - > GetNodeName ( ) ) ;
/* override check source if command_endpoint was defined */
if ( command_endpoint & & ! GetExtension ( " agent_check " ) )
cr - > SetCheckSource ( command_endpoint - > GetName ( ) ) ;
}
2015-09-05 15:18:10 +02:00
/* agent checks go through the api */
2015-01-18 22:15:35 +01:00
if ( command_endpoint & & GetExtension ( " agent_check " ) ) {
2014-11-13 11:23:57 +01:00
ApiListener : : Ptr listener = ApiListener : : GetInstance ( ) ;
if ( listener ) {
2015-01-18 22:15:35 +01:00
/* send message back to its origin */
2015-10-19 17:31:18 +02:00
Dictionary : : Ptr message = ClusterEvents : : MakeCheckResultMessage ( this , cr ) ;
2014-11-13 11:23:57 +01:00
listener - > SyncSendMessage ( command_endpoint , message ) ;
}
2022-04-26 13:33:59 +02:00
return Result : : Ok ;
2015-01-18 22:15:35 +01:00
2014-11-13 11:23:57 +01:00
}
2018-05-29 13:51:34 +02:00
if ( ! IsActive ( ) )
2022-04-26 13:33:59 +02:00
return Result : : CheckableInactive ;
2018-05-29 13:51:34 +02:00
2013-03-06 11:03:50 +01:00
bool reachable = IsReachable ( ) ;
2014-02-27 11:05:55 +01:00
bool notification_reachable = IsReachable ( DependencyNotification ) ;
2013-03-06 11:03:50 +01:00
2013-03-02 09:07:47 +01:00
ObjectLock olock ( this ) ;
2013-11-09 14:22:38 +01:00
CheckResult : : Ptr old_cr = GetLastCheckResult ( ) ;
2014-04-03 15:36:13 +02:00
ServiceState old_state = GetStateRaw ( ) ;
2013-03-07 12:04:20 +01:00
StateType old_stateType = GetStateType ( ) ;
2013-10-26 09:41:45 +02:00
long old_attempt = GetCheckAttempt ( ) ;
2014-07-22 14:13:21 +02:00
bool recovery = false ;
2013-02-09 18:39:43 +01:00
2019-02-08 13:32:13 +01:00
/* When we have an check result already (not after fresh start),
* prevent to accept old check results and allow overrides for
* CRs happened in the future .
*/
if ( old_cr ) {
double currentCRTimestamp = old_cr - > GetExecutionStart ( ) ;
double newCRTimestamp = cr - > GetExecutionStart ( ) ;
/* Our current timestamp may be from the future (wrong server time adjusted again). Allow overrides here. */
if ( currentCRTimestamp > now ) {
/* our current CR is from the future, let the new CR override it. */
Log ( LogDebug , " Checkable " )
< < std : : fixed < < std : : setprecision ( 6 ) < < " Processing check result for checkable ' " < < GetName ( ) < < " ' from "
< < Utility : : FormatDateTime ( " %Y-%m-%d %H:%M:%S %z " , newCRTimestamp ) < < " ( " < < newCRTimestamp
< < " ). Overriding since ours is from the future at "
< < Utility : : FormatDateTime ( " %Y-%m-%d %H:%M:%S %z " , currentCRTimestamp ) < < " ( " < < currentCRTimestamp < < " ). " ;
} else {
/* Current timestamp is from the past, but the new timestamp is even more in the past. Skip it. */
if ( newCRTimestamp < currentCRTimestamp ) {
Log ( LogDebug , " Checkable " )
< < std : : fixed < < std : : setprecision ( 6 ) < < " Skipping check result for checkable ' " < < GetName ( ) < < " ' from "
< < Utility : : FormatDateTime ( " %Y-%m-%d %H:%M:%S %z " , newCRTimestamp ) < < " ( " < < newCRTimestamp
< < " ). It is in the past compared to ours at "
< < Utility : : FormatDateTime ( " %Y-%m-%d %H:%M:%S %z " , currentCRTimestamp ) < < " ( " < < currentCRTimestamp < < " ). " ;
2022-04-26 13:33:59 +02:00
return Result : : NewerCheckResultPresent ;
2019-02-08 13:32:13 +01:00
}
}
}
2013-09-12 17:39:29 +02:00
2013-03-25 18:36:15 +01:00
/* The ExecuteCheck function already sets the old state, but we need to do it again
2013-03-08 14:43:48 +01:00
* in case this was a passive check result . */
2014-04-03 15:36:13 +02:00
SetLastStateRaw ( old_state ) ;
2013-03-07 12:04:20 +01:00
SetLastStateType ( old_stateType ) ;
2013-03-19 13:04:30 +01:00
SetLastReachable ( reachable ) ;
2013-03-07 12:04:20 +01:00
2016-03-10 14:32:57 +01:00
Host : : Ptr host ;
Service : : Ptr service ;
tie ( host , service ) = GetHostService ( this ) ;
CheckableType checkableType = CheckableHost ;
if ( service )
checkableType = CheckableService ;
2014-03-10 08:56:31 +01:00
long attempt = 1 ;
2013-02-09 18:39:43 +01:00
2014-12-12 15:53:10 +01:00
std : : set < Checkable : : Ptr > children = GetChildren ( ) ;
2016-08-08 11:02:08 +02:00
if ( IsStateOK ( cr - > GetState ( ) ) ) {
2016-05-31 17:03:49 +02:00
SetStateType ( StateTypeHard ) ; // NOT-OK -> HARD OK
2013-02-09 18:39:43 +01:00
2016-08-24 11:13:19 +02:00
if ( ! IsStateOK ( old_state ) )
2016-05-31 17:03:49 +02:00
recovery = true ;
2014-07-22 14:13:21 +02:00
2013-07-18 17:04:09 +02:00
ResetNotificationNumbers ( ) ;
2020-07-09 10:44:38 +02:00
SaveLastState ( ServiceOK , cr - > GetExecutionEnd ( ) ) ;
2013-02-09 18:39:43 +01:00
} else {
2016-09-27 11:29:25 +02:00
/* OK -> NOT-OK change, first SOFT state. Reset attempt counter. */
if ( IsStateOK ( old_state ) ) {
2013-02-09 18:39:43 +01:00
SetStateType ( StateTypeSoft ) ;
2016-09-27 11:29:25 +02:00
attempt = 1 ;
}
/* SOFT state change, increase attempt counter. */
if ( old_stateType = = StateTypeSoft & & ! IsStateOK ( old_state ) ) {
2015-02-20 21:30:14 +01:00
SetStateType ( StateTypeSoft ) ;
2016-09-27 11:29:25 +02:00
attempt = old_attempt + 1 ;
}
/* HARD state change (e.g. previously 2/3 and this next attempt). Reset attempt counter. */
if ( attempt > = GetMaxCheckAttempts ( ) ) {
SetStateType ( StateTypeHard ) ;
attempt = 1 ;
2013-02-09 18:39:43 +01:00
}
2013-02-24 01:10:34 +01:00
2016-03-15 09:46:20 +01:00
if ( ! IsStateOK ( cr - > GetState ( ) ) ) {
2020-07-09 10:44:38 +02:00
SaveLastState ( cr - > GetState ( ) , cr - > GetExecutionEnd ( ) ) ;
2013-11-09 14:22:38 +01:00
}
2013-02-09 18:39:43 +01:00
}
2013-07-18 18:28:23 +02:00
if ( ! reachable )
2020-07-09 10:44:38 +02:00
SetLastStateUnreachable ( cr - > GetExecutionEnd ( ) ) ;
2013-07-18 18:28:23 +02:00
2013-10-26 09:41:45 +02:00
SetCheckAttempt ( attempt ) ;
2013-02-09 18:39:43 +01:00
2014-04-03 15:36:13 +02:00
ServiceState new_state = cr - > GetState ( ) ;
2020-03-04 10:55:07 +01:00
SetStateRaw ( new_state ) ;
2013-02-09 18:39:43 +01:00
2016-03-10 14:32:57 +01:00
bool stateChange ;
/* Exception on state change calculation for hosts. */
if ( checkableType = = CheckableService )
stateChange = ( old_state ! = new_state ) ;
else
stateChange = ( Host : : CalculateState ( old_state ) ! = Host : : CalculateState ( new_state ) ) ;
2019-03-27 11:43:14 +01:00
/* Store the current last state change for the next iteration. */
SetPreviousStateChange ( GetLastStateChange ( ) ) ;
2013-06-21 10:20:29 +02:00
if ( stateChange ) {
2020-07-09 10:44:38 +02:00
SetLastStateChange ( cr - > GetExecutionEnd ( ) ) ;
2013-02-09 18:39:43 +01:00
/* remove acknowledgements */
if ( GetAcknowledgement ( ) = = AcknowledgementNormal | |
2017-12-19 15:50:05 +01:00
( GetAcknowledgement ( ) = = AcknowledgementSticky & & IsStateOK ( new_state ) ) ) {
2019-11-28 17:46:12 +01:00
ClearAcknowledgement ( " " ) ;
2013-02-09 18:39:43 +01:00
}
}
2013-06-19 10:57:07 +02:00
bool remove_acknowledgement_comments = false ;
if ( GetAcknowledgement ( ) = = AcknowledgementNone )
remove_acknowledgement_comments = true ;
2013-03-20 15:25:53 +01:00
bool hardChange = ( GetStateType ( ) = = StateTypeHard & & old_stateType = = StateTypeSoft ) ;
2014-04-03 15:36:13 +02:00
if ( stateChange & & old_stateType = = StateTypeHard & & GetStateType ( ) = = StateTypeHard )
2013-03-21 11:37:34 +01:00
hardChange = true ;
2015-01-08 16:18:11 +01:00
bool is_volatile = GetVolatile ( ) ;
2013-06-13 12:24:20 +02:00
2015-01-08 16:18:11 +01:00
if ( hardChange | | is_volatile ) {
2014-04-03 15:36:13 +02:00
SetLastHardStateRaw ( new_state ) ;
2020-07-09 10:44:38 +02:00
SetLastHardStateChange ( cr - > GetExecutionEnd ( ) ) ;
2019-10-23 15:22:54 +02:00
SetLastHardStatesRaw ( GetLastHardStatesRaw ( ) / 100u + new_state * 100u ) ;
2013-07-05 09:35:49 +02:00
}
2013-03-02 09:07:47 +01:00
2019-11-04 11:02:07 +01:00
if ( stateChange ) {
SetLastSoftStatesRaw ( GetLastSoftStatesRaw ( ) / 100u + new_state * 100u ) ;
}
2022-01-10 19:18:11 +01:00
cr - > SetPreviousHardState ( ServiceState ( GetLastHardStatesRaw ( ) % 100u ) ) ;
2016-03-15 09:46:20 +01:00
if ( ! IsStateOK ( new_state ) )
2021-12-08 11:49:42 +01:00
TriggerDowntimes ( cr - > GetExecutionEnd ( ) ) ;
2013-02-09 18:39:43 +01:00
2014-05-26 20:56:34 +02:00
/* statistics for external tools */
2016-03-10 14:32:57 +01:00
Checkable : : UpdateStatistics ( cr , checkableType ) ;
2013-02-09 18:39:43 +01:00
2013-03-18 12:55:41 +01:00
bool in_downtime = IsInDowntime ( ) ;
2016-03-11 13:19:03 +01:00
bool send_notification = false ;
2019-07-02 11:23:16 +02:00
bool suppress_notification = ! notification_reachable | | in_downtime | | IsAcknowledged ( ) ;
2016-03-11 13:19:03 +01:00
2019-07-02 11:23:16 +02:00
/* Send notifications whether when a hard state change occurred. */
if ( hardChange & & ! ( old_stateType = = StateTypeSoft & & IsStateOK ( new_state ) ) )
send_notification = true ;
/* Or if the checkable is volatile and in a HARD state. */
else if ( is_volatile & & GetStateType ( ) = = StateTypeHard )
send_notification = true ;
2013-03-18 12:55:41 +01:00
2016-03-15 09:46:20 +01:00
if ( IsStateOK ( old_state ) & & old_stateType = = StateTypeSoft )
2013-03-20 15:25:53 +01:00
send_notification = false ; /* Don't send notifications for SOFT-OK -> HARD-OK. */
2016-03-15 09:46:20 +01:00
if ( is_volatile & & IsStateOK ( old_state ) & & IsStateOK ( new_state ) )
2015-01-08 16:18:11 +01:00
send_notification = false ; /* Don't send notifications for volatile OK -> OK changes. */
2013-03-02 09:07:47 +01:00
olock . Unlock ( ) ;
2013-02-09 18:39:43 +01:00
2013-06-19 10:57:07 +02:00
if ( remove_acknowledgement_comments )
RemoveCommentsByType ( CommentAcknowledgement ) ;
2018-01-11 11:17:38 +01:00
Dictionary : : Ptr vars_after = new Dictionary ( {
{ " state " , new_state } ,
{ " state_type " , GetStateType ( ) } ,
{ " attempt " , GetCheckAttempt ( ) } ,
{ " reachable " , reachable }
} ) ;
2013-03-19 13:04:30 +01:00
if ( old_cr )
2013-11-09 14:22:38 +01:00
cr - > SetVarsBefore ( old_cr - > GetVarsAfter ( ) ) ;
2013-03-19 13:04:30 +01:00
2013-11-09 14:22:38 +01:00
cr - > SetVarsAfter ( vars_after ) ;
2013-03-19 13:04:30 +01:00
2013-03-07 15:00:26 +01:00
olock . Lock ( ) ;
2020-03-04 10:55:07 +01:00
if ( service ) {
SetLastCheckResult ( cr ) ;
} else {
bool wasProblem = GetProblem ( ) ;
SetLastCheckResult ( cr ) ;
if ( GetProblem ( ) ! = wasProblem ) {
2020-08-11 15:24:54 +02:00
auto services = host - > GetServices ( ) ;
olock . Unlock ( ) ;
for ( auto & service : services ) {
2020-03-04 10:55:07 +01:00
Service : : OnHostProblemChanged ( service , cr , origin ) ;
}
2020-08-11 15:24:54 +02:00
olock . Lock ( ) ;
2020-03-04 10:55:07 +01:00
}
}
2013-06-21 10:20:29 +02:00
2017-10-19 17:32:52 +02:00
bool was_flapping = IsFlapping ( ) ;
2013-06-21 10:20:29 +02:00
2020-11-11 17:43:30 +01:00
UpdateFlappingStatus ( cr - > GetState ( ) ) ;
2016-03-10 14:32:57 +01:00
2017-10-19 17:32:52 +02:00
bool is_flapping = IsFlapping ( ) ;
2013-06-21 10:20:29 +02:00
2016-03-15 13:02:38 +01:00
if ( cr - > GetActive ( ) ) {
2016-05-24 11:05:29 +02:00
UpdateNextCheck ( origin ) ;
2016-03-15 13:02:38 +01:00
} else {
2018-01-11 17:10:46 +01:00
/* Reschedule the next check for external passive check results. The side effect of
* this is that for as long as we receive results for a service we
2016-03-15 13:02:38 +01:00
* won ' t execute any active checks . */
2018-01-11 17:10:46 +01:00
double offset ;
double ttl = cr - > GetTtl ( ) ;
if ( ttl > 0 )
offset = ttl ;
else
offset = GetCheckInterval ( ) ;
SetNextCheck ( Utility : : GetTime ( ) + offset , false , origin ) ;
2016-03-15 13:02:38 +01:00
}
2013-03-07 15:00:26 +01:00
olock . Unlock ( ) ;
2017-11-08 12:12:27 +01:00
# ifdef I2_DEBUG /* I2_DEBUG */
Log ( LogDebug , " Checkable " )
2017-12-19 15:50:05 +01:00
< < " Flapping: Checkable " < < GetName ( )
< < " was: " < < was_flapping
< < " is: " < < is_flapping
< < " threshold low: " < < GetFlappingThresholdLow ( )
< < " threshold high: " < < GetFlappingThresholdHigh ( )
< < " % current: " < < GetFlappingCurrent ( ) < < " %. " ;
2017-11-08 12:12:27 +01:00
# endif /* I2_DEBUG */
2013-07-01 17:25:30 +02:00
2022-02-17 16:13:25 +01:00
if ( recovery ) {
for ( auto & child : children ) {
if ( child - > GetProblem ( ) & & child - > GetEnableActiveChecks ( ) ) {
auto nextCheck ( now + Utility : : Random ( ) % 60 ) ;
ObjectLock oLock ( child ) ;
if ( nextCheck < child - > GetNextCheck ( ) ) {
child - > SetNextCheck ( nextCheck ) ;
}
}
}
}
if ( stateChange ) {
/* reschedule direct parents */
for ( const Checkable : : Ptr & parent : GetParents ( ) ) {
if ( parent . get ( ) = = this )
continue ;
if ( ! parent - > GetEnableActiveChecks ( ) )
continue ;
if ( parent - > GetNextCheck ( ) > = now + parent - > GetRetryInterval ( ) ) {
ObjectLock olock ( parent ) ;
parent - > SetNextCheck ( now ) ;
}
}
}
2014-11-08 21:17:16 +01:00
OnNewCheckResult ( this , cr , origin ) ;
2014-03-09 18:06:24 +01:00
/* signal status updates to for example db_ido */
2014-11-08 21:17:16 +01:00
OnStateChanged ( this ) ;
2013-03-02 09:07:47 +01:00
2014-05-25 12:45:29 +02:00
String old_state_str = ( service ? Service : : StateToString ( old_state ) : Host : : StateToString ( Host : : CalculateState ( old_state ) ) ) ;
String new_state_str = ( service ? Service : : StateToString ( new_state ) : Host : : StateToString ( Host : : CalculateState ( new_state ) ) ) ;
2016-05-21 13:41:43 +02:00
/* Whether a hard state change or a volatile state change except OK -> OK happened. */
2016-05-21 14:16:47 +02:00
if ( hardChange | | ( is_volatile & & ! ( IsStateOK ( old_state ) & & IsStateOK ( new_state ) ) ) ) {
2014-11-08 21:17:16 +01:00
OnStateChange ( this , cr , StateTypeHard , origin ) ;
2014-10-19 17:52:17 +02:00
Log ( LogNotice , " Checkable " )
2017-12-19 15:50:05 +01:00
< < " State Change: Checkable ' " < < GetName ( ) < < " ' hard state change from " < < old_state_str < < " to " < < new_state_str < < " detected. " < < ( is_volatile ? " Checkable is volatile. " : " " ) ;
2016-06-13 10:09:18 +02:00
}
/* Whether a state change happened or the state type is SOFT (must be logged too). */
else if ( stateChange | | GetStateType ( ) = = StateTypeSoft ) {
2014-11-08 21:17:16 +01:00
OnStateChange ( this , cr , StateTypeSoft , origin ) ;
2014-10-19 17:52:17 +02:00
Log ( LogNotice , " Checkable " )
2017-12-19 15:50:05 +01:00
< < " State Change: Checkable ' " < < GetName ( ) < < " ' soft state change from " < < old_state_str < < " to " < < new_state_str < < " detected. " ;
2014-05-22 23:47:03 +02:00
}
2013-09-25 18:01:08 +02:00
2016-05-21 13:41:43 +02:00
if ( GetStateType ( ) = = StateTypeSoft | | hardChange | | recovery | |
2017-12-19 15:50:05 +01:00
( is_volatile & & ! ( IsStateOK ( old_state ) & & IsStateOK ( new_state ) ) ) )
2013-06-13 11:33:00 +02:00
ExecuteEventHandler ( ) ;
2019-07-02 11:23:16 +02:00
int suppressed_types = 0 ;
2016-06-13 10:12:38 +02:00
/* Flapping start/end notifications */
2019-07-02 11:23:16 +02:00
if ( ! was_flapping & & is_flapping ) {
2016-11-10 14:02:02 +01:00
/* FlappingStart notifications happen on state changes, not in downtimes */
2019-07-02 11:23:16 +02:00
if ( ! IsPaused ( ) ) {
if ( in_downtime ) {
suppressed_types | = NotificationFlappingStart ;
} else {
OnNotificationsRequested ( this , NotificationFlappingStart , cr , " " , " " , nullptr ) ;
}
}
2016-06-13 10:12:38 +02:00
Log ( LogNotice , " Checkable " )
2017-12-19 15:50:05 +01:00
< < " Flapping Start: Checkable ' " < < GetName ( ) < < " ' started flapping (Current flapping value "
< < GetFlappingCurrent ( ) < < " % > high threshold " < < GetFlappingThresholdHigh ( ) < < " %). " ;
2016-06-13 10:12:38 +02:00
NotifyFlapping ( origin ) ;
2019-07-02 11:23:16 +02:00
} else if ( was_flapping & & ! is_flapping ) {
2016-11-10 14:02:02 +01:00
/* FlappingEnd notifications are independent from state changes, must not happen in downtine */
2019-07-02 11:23:16 +02:00
if ( ! IsPaused ( ) ) {
if ( in_downtime ) {
suppressed_types | = NotificationFlappingEnd ;
} else {
OnNotificationsRequested ( this , NotificationFlappingEnd , cr , " " , " " , nullptr ) ;
}
}
2016-06-13 10:12:38 +02:00
Log ( LogNotice , " Checkable " )
2017-12-19 15:50:05 +01:00
< < " Flapping Stop: Checkable ' " < < GetName ( ) < < " ' stopped flapping (Current flapping value "
< < GetFlappingCurrent ( ) < < " % < low threshold " < < GetFlappingThresholdLow ( ) < < " %). " ;
2016-06-13 10:12:38 +02:00
NotifyFlapping ( origin ) ;
}
2016-08-24 11:13:19 +02:00
if ( send_notification & & ! is_flapping ) {
2019-07-02 11:23:16 +02:00
if ( ! IsPaused ( ) ) {
2022-01-28 15:15:38 +01:00
/* If there are still some pending suppressed state notification, keep the suppression until these are
* handled by Checkable : : FireSuppressedNotifications ( ) .
*/
bool pending = GetSuppressedNotifications ( ) & ( NotificationRecovery | NotificationProblem ) ;
if ( suppress_notification | | pending ) {
2019-07-02 11:23:16 +02:00
suppressed_types | = ( recovery ? NotificationRecovery : NotificationProblem ) ;
} else {
OnNotificationsRequested ( this , recovery ? NotificationRecovery : NotificationProblem , cr , " " , " " , nullptr ) ;
}
}
}
if ( suppressed_types ) {
/* If some notifications were suppressed, but just because of e.g. a downtime,
* stash them into a notification types bitmask for maybe re - sending later .
*/
ObjectLock olock ( this ) ;
int suppressed_types_before ( GetSuppressedNotifications ( ) ) ;
int suppressed_types_after ( suppressed_types_before | suppressed_types ) ;
2022-01-28 15:15:38 +01:00
const int conflict = NotificationFlappingStart | NotificationFlappingEnd ;
if ( ( suppressed_types_after & conflict ) = = conflict ) {
/* Flapping start and end cancel out each other. */
suppressed_types_after & = ~ conflict ;
}
2019-07-02 11:23:16 +02:00
2022-01-28 15:15:38 +01:00
const int stateNotifications = NotificationRecovery | NotificationProblem ;
if ( ! ( suppressed_types_before & stateNotifications ) & & ( suppressed_types & stateNotifications ) ) {
/* A state-related notification is suppressed for the first time, store the previous state. When
* notifications are no longer suppressed , this can be compared with the current state to determine
* if a notification must be sent . This is done differently compared to flapping notifications just above
* as for state notifications , problem and recovery don ' t always cancel each other . For example ,
* WARNING - > OK - > CRITICAL generates both types once , but there should still be a notification .
*/
SetStateBeforeSuppression ( old_stateType = = StateTypeHard ? old_state : ServiceOK ) ;
2019-07-02 11:23:16 +02:00
}
if ( suppressed_types_after ! = suppressed_types_before ) {
SetSuppressedNotifications ( suppressed_types_after ) ;
}
2016-02-22 19:43:44 +01:00
}
2022-01-25 11:38:05 +01:00
/* update reachability for child objects */
2022-02-03 11:12:53 +01:00
if ( ( stateChange | | hardChange ) & & ! children . empty ( ) )
2022-01-25 11:38:05 +01:00
OnReachabilityChanged ( this , cr , children , origin ) ;
2022-04-26 13:33:59 +02:00
return Result : : Ok ;
2013-02-09 18:39:43 +01:00
}
2015-01-18 22:15:35 +01:00
void Checkable : : ExecuteRemoteCheck ( const Dictionary : : Ptr & resolvedMacros )
{
2022-11-24 12:40:36 +01:00
CONTEXT ( " Executing remote check for object ' " < < GetName ( ) < < " ' " ) ;
2015-01-18 22:15:35 +01:00
double scheduled_start = GetNextCheck ( ) ;
double before_check = Utility : : GetTime ( ) ;
CheckResult : : Ptr cr = new CheckResult ( ) ;
cr - > SetScheduleStart ( scheduled_start ) ;
cr - > SetExecutionStart ( before_check ) ;
GetCheckCommand ( ) - > Execute ( this , cr , resolvedMacros , true ) ;
}
2018-01-04 04:25:35 +01:00
void Checkable : : ExecuteCheck ( )
2013-02-09 18:39:43 +01:00
{
2022-11-24 12:40:36 +01:00
CONTEXT ( " Executing check for object ' " < < GetName ( ) < < " ' " ) ;
2013-11-19 07:49:41 +01:00
2016-05-10 11:12:37 +02:00
/* keep track of scheduling info in case the check type doesn't provide its own information */
double scheduled_start = GetNextCheck ( ) ;
double before_check = Utility : : GetTime ( ) ;
2020-03-05 15:42:07 +01:00
SetLastCheckStarted ( Utility : : GetTime ( ) ) ;
2018-07-02 16:17:53 +02:00
/* This calls SetNextCheck() which updates the CheckerComponent's idle/pending
* queues and ensures that checks are not fired multiple times . ProcessCheckResult ( )
* is called too late . See # 6421.
*/
UpdateNextCheck ( ) ;
2013-03-19 14:13:58 +01:00
bool reachable = IsReachable ( ) ;
2013-03-06 11:03:50 +01:00
{
ObjectLock olock ( this ) ;
/* don't run another check if there is one pending */
2013-03-25 18:36:15 +01:00
if ( m_CheckRunning )
2013-03-06 11:03:50 +01:00
return ;
m_CheckRunning = true ;
2013-03-08 14:43:48 +01:00
2014-04-03 15:36:13 +02:00
SetLastStateRaw ( GetStateRaw ( ) ) ;
2013-03-08 14:43:48 +01:00
SetLastStateType ( GetLastStateType ( ) ) ;
2013-03-19 14:13:58 +01:00
SetLastReachable ( reachable ) ;
2013-02-09 18:39:43 +01:00
}
2015-01-18 22:15:35 +01:00
CheckResult : : Ptr cr = new CheckResult ( ) ;
2013-03-25 18:36:15 +01:00
2015-01-18 22:15:35 +01:00
cr - > SetScheduleStart ( scheduled_start ) ;
cr - > SetExecutionStart ( before_check ) ;
2013-02-09 18:39:43 +01:00
2014-11-13 11:23:57 +01:00
Endpoint : : Ptr endpoint = GetCommandEndpoint ( ) ;
2015-01-18 22:15:35 +01:00
bool local = ! endpoint | | endpoint = = Endpoint : : GetLocalEndpoint ( ) ;
2014-11-13 11:23:57 +01:00
2015-01-18 22:15:35 +01:00
if ( local ) {
2017-12-14 15:37:20 +01:00
GetCheckCommand ( ) - > Execute ( this , cr , nullptr , false ) ;
2015-01-18 22:15:35 +01:00
} else {
Dictionary : : Ptr macros = new Dictionary ( ) ;
GetCheckCommand ( ) - > Execute ( this , cr , macros , false ) ;
2014-11-13 11:23:57 +01:00
2015-10-22 10:52:38 +02:00
if ( endpoint - > GetConnected ( ) ) {
2015-01-18 22:15:35 +01:00
/* perform check on remote endpoint */
2014-11-13 11:23:57 +01:00
Dictionary : : Ptr message = new Dictionary ( ) ;
message - > Set ( " jsonrpc " , " 2.0 " ) ;
message - > Set ( " method " , " event::ExecuteCommand " ) ;
Host : : Ptr host ;
Service : : Ptr service ;
tie ( host , service ) = GetHostService ( this ) ;
Dictionary : : Ptr params = new Dictionary ( ) ;
message - > Set ( " params " , params ) ;
params - > Set ( " command_type " , " check_command " ) ;
params - > Set ( " command " , GetCheckCommand ( ) - > GetName ( ) ) ;
params - > Set ( " host " , host - > GetName ( ) ) ;
if ( service )
params - > Set ( " service " , service - > GetShortName ( ) ) ;
2020-02-27 11:46:52 +01:00
/*
* If the host / service object specifies the ' check_timeout ' attribute ,
* forward this to the remote endpoint to limit the command execution time .
*/
if ( ! GetCheckTimeout ( ) . IsEmpty ( ) )
params - > Set ( " check_timeout " , GetCheckTimeout ( ) ) ;
2014-11-13 11:23:57 +01:00
params - > Set ( " macros " , macros ) ;
ApiListener : : Ptr listener = ApiListener : : GetInstance ( ) ;
if ( listener )
listener - > SyncSendMessage ( endpoint , message ) ;
2015-01-18 22:15:35 +01:00
2016-01-21 10:32:38 +01:00
/* Re-schedule the check so we don't run it again until after we've received
2017-12-19 15:50:05 +01:00
* a check result from the remote instance . The check will be re - scheduled
* using the proper check interval once we ' ve received a check result .
*/
2016-01-21 10:32:38 +01:00
SetNextCheck ( Utility : : GetTime ( ) + GetCheckCommand ( ) - > GetTimeout ( ) + 30 ) ;
2020-02-11 12:49:40 +01:00
/*
* Let the user know that there was a problem with the check if
* 1 ) The endpoint is not syncing ( replay log , etc . )
* 2 ) Outside of the cold startup window ( 5 min )
*/
2016-10-24 08:38:58 +02:00
} else if ( ! endpoint - > GetSyncing ( ) & & Application : : GetInstance ( ) - > GetStartTime ( ) < Utility : : GetTime ( ) - 300 ) {
2015-01-18 22:15:35 +01:00
/* fail to perform check on unconnected endpoint */
cr - > SetState ( ServiceUnknown ) ;
2016-04-19 09:35:48 +02:00
String output = " Remote Icinga instance ' " + endpoint - > GetName ( ) + " ' is not connected to " ;
Endpoint : : Ptr localEndpoint = Endpoint : : GetLocalEndpoint ( ) ;
if ( localEndpoint )
output + = " ' " + localEndpoint - > GetName ( ) + " ' " ;
else
output + = " this instance " ;
cr - > SetOutput ( output ) ;
2015-01-18 22:15:35 +01:00
ProcessCheckResult ( cr ) ;
2014-11-13 11:23:57 +01:00
}
{
ObjectLock olock ( this ) ;
m_CheckRunning = false ;
}
}
2013-02-09 18:39:43 +01:00
}
2013-02-11 23:37:39 +01:00
2014-05-26 20:56:34 +02:00
void Checkable : : UpdateStatistics ( const CheckResult : : Ptr & cr , CheckableType type )
2013-02-11 23:37:39 +01:00
{
2013-11-09 14:22:38 +01:00
time_t ts = cr - > GetScheduleEnd ( ) ;
2014-05-26 20:56:34 +02:00
if ( type = = CheckableHost ) {
if ( cr - > GetActive ( ) )
CIB : : UpdateActiveHostChecksStatistics ( ts , 1 ) ;
else
CIB : : UpdatePassiveHostChecksStatistics ( ts , 1 ) ;
} else if ( type = = CheckableService ) {
if ( cr - > GetActive ( ) )
CIB : : UpdateActiveServiceChecksStatistics ( ts , 1 ) ;
else
CIB : : UpdatePassiveServiceChecksStatistics ( ts , 1 ) ;
} else {
2014-05-28 13:45:45 +02:00
Log ( LogWarning , " Checkable " , " Unknown checkable type for statistic update. " ) ;
2014-05-26 20:56:34 +02:00
}
2013-02-11 23:37:39 +01:00
}
2016-05-12 13:46:22 +02:00
2018-01-04 04:25:35 +01:00
void Checkable : : IncreasePendingChecks ( )
2016-05-12 13:46:22 +02:00
{
2021-02-02 10:16:04 +01:00
std : : unique_lock < std : : mutex > lock ( m_StatsMutex ) ;
2016-05-12 13:46:22 +02:00
m_PendingChecks + + ;
}
2018-01-04 04:25:35 +01:00
void Checkable : : DecreasePendingChecks ( )
2016-05-12 13:46:22 +02:00
{
2021-02-02 10:16:04 +01:00
std : : unique_lock < std : : mutex > lock ( m_StatsMutex ) ;
2016-05-12 13:46:22 +02:00
m_PendingChecks - - ;
2018-01-16 10:40:08 +01:00
m_PendingChecksCV . notify_one ( ) ;
2016-05-12 13:46:22 +02:00
}
2018-01-04 04:25:35 +01:00
int Checkable : : GetPendingChecks ( )
2016-05-12 13:46:22 +02:00
{
2021-02-02 10:16:04 +01:00
std : : unique_lock < std : : mutex > lock ( m_StatsMutex ) ;
2016-05-12 13:46:22 +02:00
return m_PendingChecks ;
}
2018-01-16 10:40:08 +01:00
void Checkable : : AquirePendingCheckSlot ( int maxPendingChecks )
{
2021-02-02 10:16:04 +01:00
std : : unique_lock < std : : mutex > lock ( m_StatsMutex ) ;
2018-01-16 10:40:08 +01:00
while ( m_PendingChecks > = maxPendingChecks )
m_PendingChecksCV . wait ( lock ) ;
m_PendingChecks + + ;
}