#!/bin/bash
# Copyright (c) 2005-2009 Artica ST
# Author: Sancho Lerena 2009
# Licence: GPL2
#
# daemon_watchdog
#
# Generic watchdog to detect whether a daemon is running. If the daemon
# cannot be restarted, a user-defined command is executed to notify that the
# daemon is down, and the watchdog continues in standby (without notifying /
# restarting) until the daemon is alive again.

# Default configuration is for Pandora FMS Server daemon
# =====================================================================
# Configuration begins here. Please use "" if data contain blank spaces

export DAEMON_WATCHDOG=pandora_watchdog.sh
# DAEMON_WATCHDOG: Name of this script. Used to check if it is running already

export DAEMON_CHECK="/usr/bin/pandora_server /etc/pandora/pandora_server.conf"
# DAEMON_CHECK: Daemon monitored, please use full path and parameters exactly
#               as they are shown by "ps aux" or "ps -Alf"

export DAEMON_RESTART="/etc/init.d/pandora_server restart"
# DAEMON_RESTART: Command used to try to restart the daemon

export DAEMON_DEADWAIT=90
# DAEMON_DEADWAIT: Seconds to wait after a restart attempt before checking
#                  again and considering the daemon really down.

export DAEMON_ALERT="/usr/bin/pandora_alert"
# DAEMON_ALERT: Command/script executed if the daemon is still down after
#               the restart attempt and the DAEMON_DEADWAIT pause.

export DAEMON_LOOP=7
# DAEMON_LOOP: Interval (seconds) between checks that the daemon is alive.
#              DO NOT use values under 3-5 seconds or it could be CPU consuming.
#              NEVER NEVER NEVER use 0 value or it gets 100% CPU!.

# Configuration stops here
# =====================================================================

# Check if another instance of this script is already running.
# grep -F matches the script name literally, so the "." in it is not treated
# as a regex wildcard; "--" protects against a name starting with "-".
# NOTE(review): the threshold of 2 presumably allows for this script itself
# plus the subshell forked for the command substitution - confirm.
RUNNING_CHECK=$(ps aux | grep -F -- "$DAEMON_WATCHDOG" | grep -v grep | wc -l)

# Arithmetic context tolerates an empty or whitespace-padded count.
if (( RUNNING_CHECK > 2 ))
then
    echo "Aborting, seems that there are more '$DAEMON_WATCHDOG' running in this system"
    logger "$DAEMON_WATCHDOG aborted execution because another watchdog seems to be running"
    # 255 is the status bash reports for the out-of-range "exit -1".
    exit 255
fi

# This value always must be 0 at start.
# Do not alter: DAEMON_STANDBY is the watchdog's own state flag
# (0 = restart/alert when the daemon is missing, 1 = standby after an alert).
export DAEMON_STANDBY=0

#######################################
# Replacement for pidof, which does not work the same way on different
# Linux distros.
# Globals:   DAEMON_CHECK (read)
# Outputs:   PID column of the last "ps aux" line containing DAEMON_CHECK,
#            or an empty line when no process matches.
#######################################
function pidof_daemon () {
    local pid

    # COLUMNS is passed in the environment of ps itself: when the command is
    # run in a narrow terminal, ps aux does not report more than COLUMNS
    # characters and the match on the full command line would fail.
    # grep -F matches DAEMON_CHECK literally instead of as a regex.
    pid=$(COLUMNS=300 ps aux | grep -F -- "$DAEMON_CHECK" | grep -v grep | tail -1 | awk '{ print $2 }')
    printf '%s\n' "$pid"
}

# Main script

# The first word of DAEMON_CHECK is the daemon executable.
read -r daemon_binary _ <<< "$DAEMON_CHECK"

# An empty DAEMON_CHECK is rejected as well: it would match every process.
if [[ -z "$daemon_binary" || ! -f "$daemon_binary" ]]
then
    echo "Daemon you want to check is not present in the system. Aborting watchdog"
    exit
fi

while true
do
    DAEMON_PID=$(pidof_daemon)

    if [[ -z "$DAEMON_PID" ]]
    then
        echo "Checkpoint #1 $DAEMON_PID "
        if [[ "$DAEMON_STANDBY" == 0 ]]
        then
            # Daemon down, first detection
            # Restart it !
            logger "$DAEMON_WATCHDOG restarting $DAEMON_CHECK"
            # Intentionally unquoted: DAEMON_RESTART holds a command plus arguments.
            # shellcheck disable=SC2086
            $DAEMON_RESTART > /dev/null 2>&1

            # Just WAIT another DAEMON_DEADWAIT before considering it DEAD
            echo "Going to DAEMON_DEADEWAIT"
            sleep "$DAEMON_DEADWAIT"

            DAEMON_PID=$(pidof_daemon)
            if [[ -z "$DAEMON_PID" ]]
            then
                # Is dead and can't be restarted properly. Execute alert
                echo "I cannot startup again the process"
                logger "$DAEMON_WATCHDOG $DAEMON_CHECK is dead, alerting !"
                # Intentionally unquoted: DAEMON_ALERT may hold a command plus arguments.
                # shellcheck disable=SC2086
                $DAEMON_ALERT > /dev/null 2>&1

                # Watchdog goes into STANDBY mode until the process is alive again
                logger "$DAEMON_WATCHDOG Entering in Stabdby mode"
                DAEMON_STANDBY=1
            fi
        fi
    else
        # Daemon is alive: leave standby so the next outage is handled again.
        echo "Checkpoint #1B $DAEMON_PID "
        DAEMON_STANDBY=0
    fi

    sleep "$DAEMON_LOOP"
done