pandorafms/pandora_server/util/pandora_watchdog.sh

125 lines
4.1 KiB
Bash
Raw Normal View History

#!/bin/bash
2023-07-03 17:20:25 +02:00
# Copyright (c) 2005-2023 Pandora FMS
# Author: Sancho Lerena <slerena@artica.es> 2009
# Licence: GPL2
#
# daemon_watchdog
#
# Generic watchdog to detect if a daemon is running. If the daemon cannot be
# restarted, it executes a user-defined command to notify that the daemon is
# down and stays in standby (without notifying / checking) until the daemon
# is alive again.
# Default configuration is for Pandora FMS Server daemon
# =====================================================================
# Configuration begins here. Please enclose values in "" if they contain blank spaces
# DAEMON_WATCHDOG: name of this script. It is the text searched for in the
# process list to find out whether the watchdog is already running.
DAEMON_WATCHDOG="pandora_watchdog.sh"

# DAEMON_CHECK: the monitored daemon. Use the full path and parameters
# exactly as they are shown by "ps aux" or "ps -Alf".
DAEMON_CHECK="/usr/bin/pandora_server /etc/pandora/pandora_server.conf"

# DAEMON_RESTART: command run to try to restart the daemon.
DAEMON_RESTART="/etc/init.d/pandora_server restart"

# DAEMON_DEADWAIT: time (passed to sleep) to wait after a restart attempt
# before the daemon is considered to be really down.
DAEMON_DEADWAIT=90

# DAEMON_ALERT: command/script executed when the daemon is still down after
# the restart attempt and the DAEMON_DEADWAIT pause.
DAEMON_ALERT="/usr/bin/pandora_alert"

# DAEMON_LOOP: interval (passed to sleep) between two liveness checks.
# DO NOT use values under 3-5 seconds or the watchdog could be CPU consuming.
# NEVER NEVER NEVER use 0 or the watchdog will take 100% CPU!
DAEMON_LOOP=7

# Export everything in one go so child processes see the configuration.
export DAEMON_WATCHDOG DAEMON_CHECK DAEMON_RESTART \
  DAEMON_DEADWAIT DAEMON_ALERT DAEMON_LOOP
# Configuration stops here
# =====================================================================
# Single-instance guard: count the "ps aux" lines that mention this script's
# name (ignoring the grep processes themselves) and give up when there are
# more than two of them.
# NOTE(review): the threshold of 2 presumably accounts for this script plus
# the subshell that runs the check below -- confirm before changing it.
RUNNING_CHECK=$(ps aux | grep "$DAEMON_WATCHDOG" | grep -v grep | wc -l)

if [ "$RUNNING_CHECK" -gt 2 ]; then
  echo "Aborting, seems that there are more '$DAEMON_WATCHDOG' running in this system"
  # Leave a trace in syslog as well, since stdout may not be visible.
  logger $DAEMON_WATCHDOG aborted execution because another watchdog seems to be running
  # Exit statuses are 0-255; 255 is the status bash reports for "exit -1".
  exit 255
fi
# Standby flag. It must always start at 0 -- do not alter. The main loop sets
# it to 1 once the alert command has been run and puts it back to 0 as soon
# as the daemon is seen running again.
DAEMON_STANDBY=0
export DAEMON_STANDBY
# pidof_daemon
#
# Replacement for pidof, which does not work in the same way in different
# Linux distros. Prints the PID (second column of "ps aux") of the last
# process whose ps line contains $DAEMON_CHECK, or an empty line when no
# such process is listed.
#
# Globals:  DAEMON_CHECK (read) - command line of the monitored daemon.
# Outputs:  the PID, or an empty line, on stdout.
#
# The function body is a subshell "( ... )", so DAEMON_PID assigned in here
# does not modify the caller's variable of the same name.
function pidof_daemon () (
  # "ww" asks ps for unlimited output width, so a long command line is not
  # cut to the terminal/COLUMNS width before grep gets to see it.
  #
  # grep -F matches DAEMON_CHECK as a literal string: it is a command line,
  # not a regular expression, and characters such as "." must not act as
  # wildcards. -e keeps a value starting with "-" from being read as an
  # option.
  #
  # "grep -v grep" drops the grep processes, whose own command line also
  # contains DAEMON_CHECK.
  DAEMON_PID=$(ps auxww \
    | grep -F -e "$DAEMON_CHECK" \
    | grep -v grep \
    | tail -n 1 \
    | awk '{ print $2 }')
  printf '%s\n' "$DAEMON_PID"
)
# Main script
#
# Before monitoring anything, make sure the daemon binary exists. The binary
# is the first whitespace-separated word of DAEMON_CHECK; the rest (its
# parameters) is discarded into "_".
read -r DAEMON_BINARY _ <<< "$DAEMON_CHECK"

# The quoted test also aborts when DAEMON_CHECK is empty or blank; an
# unquoted empty value would collapse the test to "[ ! -f ]", which is
# false, and the check would be silently skipped.
if [ ! -f "$DAEMON_BINARY" ]; then
  echo "Daemon you want to check is not present in the system. Aborting watchdog"
  # Non-zero status so callers can tell the watchdog refused to start.
  exit 1
fi
# Monitoring loop; runs forever, one pass every DAEMON_LOOP.
#
# Each pass looks up the daemon PID with pidof_daemon:
#   - PID found: standby is cleared.
#   - PID missing and not in standby: DAEMON_RESTART is run, the loop waits
#     DAEMON_DEADWAIT and looks again; if the daemon is still missing,
#     DAEMON_ALERT is run and standby is entered.
#   - PID missing and in standby: nothing is done besides the checkpoint echo.
while true; do
  DAEMON_PID=$(pidof_daemon)

  if [ -n "$DAEMON_PID" ]; then
    # Daemon is running: leave standby so the next failure is handled again.
    echo "Checkpoint #1B $DAEMON_PID "
    DAEMON_STANDBY=0
  else
    echo "Checkpoint #1 $DAEMON_PID "

    if [ "$DAEMON_STANDBY" = 0 ]; then
      # Daemon down, first detection: try to restart it.
      logger $DAEMON_WATCHDOG restarting $DAEMON_CHECK
      # Left unquoted on purpose: the value is a command plus its arguments.
      $DAEMON_RESTART > /dev/null 2>&1

      # Wait DAEMON_DEADWAIT before considering the daemon dead.
      echo "Going to DAEMON_DEADEWAIT"
      sleep "$DAEMON_DEADWAIT"

      DAEMON_PID=$(pidof_daemon)
      if [ -z "$DAEMON_PID" ]; then
        # Still not running after the restart attempt: raise the alert.
        echo "I cannot startup again the process"
        logger $DAEMON_WATCHDOG $DAEMON_CHECK is dead, alerting !
        $DAEMON_ALERT > /dev/null 2>&1

        # Stay in standby until the daemon is seen alive again, so the
        # restart/alert sequence is not repeated on every pass.
        logger $DAEMON_WATCHDOG "Entering in Stabdby mode"
        DAEMON_STANDBY=1
      fi
    fi
  fi

  sleep "$DAEMON_LOOP"
done