Check and restart server threads if necessary.

Ref pandora_enterprise#2593.
This commit is contained in:
Ramon Novoa 2018-08-10 11:34:56 +02:00
parent 506beefca4
commit 9a9956296f
3 changed files with 177 additions and 115 deletions

View File

@ -101,8 +101,21 @@ sub pandora_startup () {
# Generate the encryption key after reading the passphrase.
$Config{"encryption_key"} = enterprise_hook('pandora_get_encryption_key', [\%Config, $Config{"encryption_passphrase"}]);
# Update the agent cache.
threads->create('enterprise_hook', ('update_agent_cache', [\%Config]))->detach() if ($Config{'node_metaconsole'} == 1);
# Kill any running server threads.
stop_server_threads();
# Start the task execution thread.
start_server_thread(\&pandora_server_tasks, [\%Config]);
# Start the policy queue thread.
start_server_thread(\&pandora_process_policy_queue, [\%Config]) if ($Config{'__enterprise_enabled'} == 1 && $Config{'policy_manager'} == 1);
# Start the event replication thread. Do not start with start_server_thread, this thread may exit on its own.
threads->create(\&pandora_process_event_replication, [\%Config]) if($Config{'__enterprise_enabled'} == 1 && $Config{'event_replication'} == 1);
# Update the agent cache. Do not start with start_server_thread, this thread updates the agent cache and exits.
threads->create(\&enterprise_hook, ['update_agent_cache', [\%Config]])->detach() if ($Config{'node_metaconsole'} == 1);
pandora_audit (\%Config, $Config{'rb_product_name'} . ' Server Daemon starting', 'SYSTEM', 'System', $DBH);
# Load servers
@ -143,9 +156,11 @@ sub pandora_restart (;$) {
my $sleep_time = @_ > 0 ? $_[0] : $Config{'restart_delay'};
# Stop the servers
foreach my $server (@Servers) {
$server->stop ();
}
eval {
foreach my $server (@Servers) {
$server->stop ();
}
};
# Remove the servers
while (pop (@Servers)) {};
@ -274,111 +289,110 @@ sub pandora_server_tasks ($) {
# Get the console DB connection
my $dbh = db_connect ($pa_config->{'dbengine'}, $pa_config->{'dbname'}, $pa_config->{'dbhost'}, $pa_config->{'dbport'},
$pa_config->{'dbuser'}, $pa_config->{'dbpass'});
my $counter = 0;
while ($RUN == 1) {
eval{
if (pandora_is_master($pa_config) == 1) {
while ($THRRUN == 1) {
if (pandora_is_master($pa_config) == 1) {
# TASKS EXECUTED EVERY 5 SECONDS (Low latency tasks)
# --------------------------------------------------
if (($counter % 5) == 0) {
# TASKS EXECUTED EVERY 5 SECONDS (Low latency tasks)
# --------------------------------------------------
if (($counter % 5) == 0) {
# Update forced alerts
pandora_exec_forced_alerts ($pa_config, $dbh);
}
# TASKS EXECUTED EVERY 30 SECONDS (Mid latency tasks)
# ---------------------------------------------------
if (($counter % 30) == 0) {
# Update module status and fired alert counts
my @agents = get_db_rows ($dbh, 'SELECT id_agente, nombre, update_module_count, update_alert_count, update_secondary_groups FROM tagente WHERE disabled = 0 AND (update_module_count=1 OR update_alert_count=1 OR update_secondary_groups=1)');
foreach my $agent (@agents) {
logger ($pa_config, "Updating module status and fired alert counts for agent " . $agent->{'nombre'}, 10);
if ($agent->{'update_module_count'} == 1) {
pandora_update_agent_module_count ($pa_config, $dbh, $agent->{'id_agente'});
}
if ($agent->{'update_alert_count'} == 1) {
pandora_update_agent_alert_count ($pa_config, $dbh, $agent->{'id_agente'});
}
if ($agent->{'update_secondary_groups'} == 1) {
pandora_update_secondary_groups_cache ($pa_config, $dbh, $agent->{'id_agente'});
}
}
# Keepalive module control (very DB intensive, not run frequently).
pandora_module_keep_alive_nd ($pa_config, $dbh);
# Set the status of unknown modules
pandora_module_unknown ($pa_config, $dbh);
# Check if an autodisabled agent needs to be autodisable
pandora_disable_autodisable_agents ($pa_config, $dbh);
}
# TASKS EXECUTED EVERY 60 SECONDS (High latency tasks)
# ----------------------------------------------------
if (($counter % 60) == 0) {
# Downtimes are executed only 30 x Server Threshold secs
pandora_planned_downtime ($pa_config, $dbh);
# Realtime stats (Only master server!) - ( VERY HEAVY !)
# Realtimestats == 1, generated by WEB Console, not by server!
if (defined($pa_config->{"realtimestats"}) && $pa_config->{"realtimestats"} == 0){
# Check if I need to refresh stats
my $last_execution_stats = get_db_value ($dbh, "SELECT MAX(utimestamp) FROM tgroup_stat");
if (!defined($last_execution_stats) || $last_execution_stats < (time() - $pa_config->{"stats_interval"})){
pandora_group_statistics ($pa_config, $dbh);
pandora_server_statistics ($pa_config, $dbh);
}
}
# Event auto-expiry
my $expiry_time = $pa_config->{"event_expiry_time"};
my $expiry_window = $pa_config->{"event_expiry_window"};
if ($expiry_time > 0 && $expiry_window > 0 && $expiry_window > $expiry_time) {
my $time_ref = time ();
my $expiry_limit = $time_ref - $expiry_time;
my $expiry_window = $time_ref - $expiry_window;
db_do ($dbh, 'UPDATE tevento SET estado=1, ack_utimestamp=? WHERE estado=0 AND utimestamp < ? AND utimestamp > ?', $time_ref, $expiry_limit, $expiry_window);
}
}
# Update forced alerts
pandora_exec_forced_alerts ($pa_config, $dbh);
}
# COMMON TASKS (master and non-master)
# ---------------------------------------------------------------
# Rotate Log File
# TASKS EXECUTED EVERY 30 SECONDS (Mid latency tasks)
# ---------------------------------------------------
if (($counter % 30) == 0) {
pandora_rotate_logfile($pa_config);
# Set event storm protection
pandora_set_event_storm_protection (pandora_get_tconfig_token ($dbh, 'event_storm_protection', 0));
}
# Pandora self monitoring
if (defined($pa_config->{"self_monitoring"})
&& $pa_config->{"self_monitoring"} == 1
&& !is_metaconsole($pa_config)
&& $counter % $pa_config->{'self_monitoring_interval'} == 0) {
pandora_self_monitoring ($pa_config, $dbh);
}
# Avoid counter overflow
if ($counter > 10000){
$counter = 0;
# Update module status and fired alert counts
my @agents = get_db_rows ($dbh, 'SELECT id_agente, nombre, update_module_count, update_alert_count, update_secondary_groups FROM tagente WHERE disabled = 0 AND (update_module_count=1 OR update_alert_count=1 OR update_secondary_groups=1)');
foreach my $agent (@agents) {
logger ($pa_config, "Updating module status and fired alert counts for agent " . $agent->{'nombre'}, 10);
if ($agent->{'update_module_count'} == 1) {
pandora_update_agent_module_count ($pa_config, $dbh, $agent->{'id_agente'});
}
if ($agent->{'update_alert_count'} == 1) {
pandora_update_agent_alert_count ($pa_config, $dbh, $agent->{'id_agente'});
}
if ($agent->{'update_secondary_groups'} == 1) {
pandora_update_secondary_groups_cache ($pa_config, $dbh, $agent->{'id_agente'});
}
}
# Keepalive module control (very DB intensive, not run frequently).
pandora_module_keep_alive_nd ($pa_config, $dbh);
# Set the status of unknown modules
pandora_module_unknown ($pa_config, $dbh);
# Check if an autodisabled agent needs to be autodisable
pandora_disable_autodisable_agents ($pa_config, $dbh);
}
else {
$counter++;
# TASKS EXECUTED EVERY 60 SECONDS (High latency tasks)
# ----------------------------------------------------
if (($counter % 60) == 0) {
# Downtimes are executed only 30 x Server Threshold secs
pandora_planned_downtime ($pa_config, $dbh);
# Realtime stats (Only master server!) - ( VERY HEAVY !)
# Realtimestats == 1, generated by WEB Console, not by server!
if (defined($pa_config->{"realtimestats"}) && $pa_config->{"realtimestats"} == 0){
# Check if I need to refresh stats
my $last_execution_stats = get_db_value ($dbh, "SELECT MAX(utimestamp) FROM tgroup_stat");
if (!defined($last_execution_stats) || $last_execution_stats < (time() - $pa_config->{"stats_interval"})){
pandora_group_statistics ($pa_config, $dbh);
pandora_server_statistics ($pa_config, $dbh);
}
}
# Event auto-expiry
my $expiry_time = $pa_config->{"event_expiry_time"};
my $expiry_window = $pa_config->{"event_expiry_window"};
if ($expiry_time > 0 && $expiry_window > 0 && $expiry_window > $expiry_time) {
my $time_ref = time ();
my $expiry_limit = $time_ref - $expiry_time;
my $expiry_window = $time_ref - $expiry_window;
db_do ($dbh, 'UPDATE tevento SET estado=1, ack_utimestamp=? WHERE estado=0 AND utimestamp < ? AND utimestamp > ?', $time_ref, $expiry_limit, $expiry_window);
}
}
};
}
# COMMON TASKS (master and non-master)
# ---------------------------------------------------------------
# Rotate Log File
if (($counter % 30) == 0) {
pandora_rotate_logfile($pa_config);
# Set event storm protection
pandora_set_event_storm_protection (pandora_get_tconfig_token ($dbh, 'event_storm_protection', 0));
}
# Pandora self monitoring
if (defined($pa_config->{"self_monitoring"})
&& $pa_config->{"self_monitoring"} == 1
&& !is_metaconsole($pa_config)
&& $counter % $pa_config->{'self_monitoring_interval'} == 0) {
pandora_self_monitoring ($pa_config, $dbh);
}
# Avoid counter overflow
if ($counter > 10000){
$counter = 0;
}
else {
$counter++;
}
sleep (1);
}
db_disconnect($dbh);
}
################################################################################
@ -525,21 +539,13 @@ sub main() {
# Load enterprise module
if (enterprise_load (\%Config) == 0) {
$Config{'__enterprise_enabled'} = 0;
print_message (\%Config, " [*] Pandora FMS Enterprise module not available.", 1);
logger (\%Config, " [*] Pandora FMS Enterprise module not available.", 1);
} else {
$Config{'__enterprise_enabled'} = 1;
print_message (\%Config, " [*] " . pandora_get_initial_product_name() . " Enterprise module loaded.", 1);
logger (\%Config, " [*] " . pandora_get_initial_product_name() . " Enterprise module loaded.", 1);
if($Config{'policy_manager'} == 1) {
# Start thread to patrol policy queue
threads->create('pandora_process_policy_queue', (\%Config))->detach();
}
if($Config{'event_replication'} == 1) {
# Start thread to process event replication
threads->create('pandora_process_event_replication', (\%Config))->detach();
}
}
# Save the start time for warmup intervals.
@ -559,9 +565,6 @@ sub main() {
pandora_event (\%Config, "Warmup mode for events started.", 0, 0, 0, 0, 0, 'system', 0, $DBH);
}
# Start thread to execute server tasks on the master server
threads->create('pandora_server_tasks', (\%Config))->detach();
# Generate 'going up' events
foreach my $server (@Servers) {
$server->upEvent ();
@ -612,6 +615,9 @@ sub main() {
$server->update();
}
# Make sure all server threads are running.
die("Server thread crashed.") unless (check_server_threads() == 1);
db_do ($DBH,
"UPDATE tserver SET status = 0
WHERE UNIX_TIMESTAMP(now())-UNIX_TIMESTAMP(keepalive) > 2*server_keepalive"

View File

@ -4557,7 +4557,7 @@ sub pandora_process_event_replication ($) {
logger($pa_config, "Starting replication events process.", 1);
while(1) {
while($THRRUN == 1) {
# If we are not the master server sleep and check again.
if (pandora_is_master($pa_config) == 0) {
@ -4569,6 +4569,8 @@ sub pandora_process_event_replication ($) {
sleep ($replication_interval);
enterprise_hook('pandora_replicate_copy_events',[$pa_config, $dbh, $dbh_metaconsole, $metaconsole_server_id, $replication_mode]);
}
db_disconnect($dbh);
}
##########################################################################
@ -4588,7 +4590,7 @@ sub pandora_process_policy_queue ($) {
logger($pa_config, "Starting policy queue patrol process.", 1);
while(1) {
while($THRRUN == 1) {
# If we are not the master server sleep and check again.
if (pandora_is_master($pa_config) == 0) {
@ -4615,7 +4617,9 @@ sub pandora_process_policy_queue ($) {
}
enterprise_hook('pandora_finish_queue_operation', [$dbh, $operation->{'id'}]);
}
}
db_disconnect($dbh);
}
##########################################################################

View File

@ -77,6 +77,7 @@ our @EXPORT = qw(
MODULE_WARNING
MODULE_UNKNOWN
MODULE_NOTINIT
$THRRUN
api_call_url
cron_get_closest_in_range
cron_next_execution
@ -115,6 +116,9 @@ our @EXPORT = qw(
valid_regex
set_file_permissions
uri_encode
check_server_threads
start_server_thread
stop_server_threads
);
# ID of the different servers
@ -307,6 +311,12 @@ while (my ($ent, $chr) = each(%ENT2CHR)) {
$CHR2ENT{$chr} = "&" . $ent . ";";
}
# Threads started by the Pandora FMS Server.
my @ServerThreads;
# Keep threads running.
our $THRRUN :shared = 1;
###############################################################################
# Sets user:group owner for the given file
###############################################################################
@ -1740,6 +1750,48 @@ sub api_call_url {
return undef;
}
################################################################################
# Start a server thread and keep track of it.
################################################################################
sub start_server_thread {
	# Arguments:
	#   $fn   - code reference (or sub name) handed to threads->create.
	#   $args - array reference whose elements become the thread's arguments.
	# Returns the value of push(), i.e. the number of tracked threads.
	my $fn = shift;
	my $args = shift;

	# Raise the shared run flag first: thread loops in this code base spin
	# on "$THRRUN == 1".
	$THRRUN = 1;

	# Create the thread in scalar context and keep its handle so that it can
	# be checked and stopped later.
	my $thread = threads->create($fn, @{$args});
	push(@ServerThreads, $thread);
}
################################################################################
# Check the status of server threads. Returns 1 if all are running, 0 otherwise.
################################################################################
sub check_server_threads {
	# Takes no arguments; any that are passed are ignored.
	# Returns 1 when every thread tracked in @ServerThreads reports
	# is_running(), and 0 as soon as one that does not is found. An empty
	# list of tracked threads also yields 1.
	foreach my $thr (@ServerThreads) {
		return 0 unless $thr->is_running();
	}

	return 1;
}
################################################################################
# Stop all server threads.
################################################################################
sub stop_server_threads {
	# Takes no arguments; any that are passed are ignored.

	# Signal the threads to exit.
	$THRRUN = 0;

	# Wait for every tracked thread to finish instead of detaching it. A
	# detached thread that is still sleeping would miss the stop signal if
	# start_server_thread() sets $THRRUN back to 1 right afterwards, and it
	# would then keep running alongside its replacement.
	# NOTE: join() blocks until the thread leaves its loop, so this call can
	# take as long as the longest sleep or task inside a server thread.
	foreach my $thr (@ServerThreads) {
		$thr->join();
	}

	# Forget the old handles so that only newly started threads are tracked.
	@ServerThreads = ();
}
# End of function declaration
# End of defined Code