Skip to content
This repository has been archived by the owner on Jan 30, 2024. It is now read-only.

Commit

Permalink
bug fixes for configuration reloads, fix a crash if the configuration…
Browse files Browse the repository at this point in the history
… file disappeared
  • Loading branch information
shthead committed May 13, 2016
1 parent 35eea7f commit 75e428d
Show file tree
Hide file tree
Showing 2 changed files with 77 additions and 30 deletions.
10 changes: 10 additions & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
@@ -1,3 +1,13 @@
v0.4.3 - 13/05/16
------------------------------
BUG FIXES
* Changes to interval, rise/fall values and logcheck values were not
being reloaded correctly.
* Certain changes that caused a configuration error to be detected when
reloading the config prevented any further changes from being applied.
* If the configuration file disappeared while the script was running (e.g. it
was removed) it would cause the script to crash.

v0.4.2 - 13/05/16
------------------------------
BUG FIXES
Expand Down
97 changes: 67 additions & 30 deletions healthcheck.pl
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@

########## Begin Script ##########

my $script_version = '0.4.2';
my $script_version = '0.4.3';

# Get this script's name
my $name = basename($0);
Expand All @@ -66,7 +66,7 @@
select STDOUT; $| = 1;

# Various variables that are used
my ($cfg,$logger,$service_state,$service_metric,$service_nexthop,@service_ips,$statusfile,$pidfile,$pid);
my ($cfg,$logger,$service_state,$service_metric,$service_nexthop,@service_ips,$statusfile,$pidfile,$pid,$cmd,$logcheck);

# Initialise the script with some basic sanity checks
init();
Expand Down Expand Up @@ -286,12 +286,27 @@ sub run_announce {
# Set the rise/fall values initially
my $service_rise = 0;
my $service_fall = 0;

# Get the configured values for rise/fall
my $service_rise_req = get_value('rise');
my $service_fall_req = get_value('fall');

# Get the log path
my $log_path = get_value('logfile');

# Get the debug option
my $service_debug = get_value('debug');

# Get the check interval
my $interval = get_value('interval');

# Get the check command. This will be prepended with the "timeout" utility for executing the check.
my $command = get_value('command');
my $timeout = get_value('timeout');
$cmd = "timeout $timeout $command";

# Should we log the check output
$logcheck = get_value('logcheck');

# Last result variable
my $last_result = undef;
Expand All @@ -301,10 +316,11 @@ sub run_announce {

# Get the current hash of the config file to check for changes
my $config_md5 = file_md5_hex($config);
my $new_config_md5 = $config_md5;

# Config is valid
my $config_valid = 'valid';

# Start loop
while (1) {

Expand All @@ -313,30 +329,60 @@ sub run_announce {

$logger->debug("$check: Check start");

# Check the hash of the config file. If it has changed since the last run, re-read the config to make sure that we use the correct values.
my $new_config_md5 = file_md5_hex($config);
# Check the hash of the config file only if it still exists.

if (-f $config) { $new_config_md5 = file_md5_hex($config); }
else { $logger->info("$check: The configuration file has disappeared?"); }

# If the config hash has changed since the last run, reload the config, validate it and use the new values.
if ($new_config_md5 ne $config_md5) {
# File has changed, validate config
$logger->debug("$check: Configuration file has changed since last check, validating config");
# File has changed, re-read and validate config
$logger->debug("$check: Configuration file has changed since last check, reloading and validating config");
$cfg->ReadConfig;
$config_valid = validate_config($check);
if ($config_valid ne 'valid') {
$logger->error("$check: Configuration file is not valid. Not reloading any changes. Error: $config_valid");
} else {
# Re-read config
$cfg->ReadConfig;

# Check the log path is still the same
if (get_value('logfile') ne $log_path || get_value('debug') ne $service_debug) {
# Restart logger due to config change
$log_path = get_value('logfile');
$service_debug = get_value('debug');
$logger = start_log();
}

# Check that the list of IPs, metric and next hop address are still the same. If there are changes and the service state is currently up (routes are announced) the routes need to be withdrawn and then announced.

# Check if the interval for checks has been changed
if (get_value('interval') ne $interval) {
$logger->info("$check: Check interval has been changed from $interval to ".get_value('interval'));
$interval = get_value('interval');
}

# Check if the option to log check output has been changed
if (get_value('logcheck') ne $logcheck) {
$logger->info("$check: Check logging has changed from $logcheck to ".get_value('logcheck'));
$logcheck = get_value('logcheck');
}

# Check if the values for rise/fall have changed
if (get_value('rise') ne $service_rise_req) {
$logger->info("$check: Check rise value has changed from $service_rise_req to ".get_value('rise'));
$service_rise_req = get_value('rise');
}
if (get_value('fall') ne $service_fall_req) {
$logger->info("$check: Check fall value has changed from $service_fall_req to ".get_value('fall'));
$service_fall_req = get_value('fall');
}

# Check if the timeout or check command has been changed.
if (get_value('timeout') ne $timeout || get_value('command') ne $command) {
if (get_value('timeout') ne $timeout) { $logger->info("$check: Check timeout has been changed from $timeout to ".get_value('timeout')); $timeout = get_value('timeout'); }
if (get_value('command') ne $command) { $logger->info("$check: Check command has been changed from $command to ".get_value('command')); $command = get_value('command'); }
$cmd = "timeout $timeout $command";
}

# Define the variable $changes - this will be set if there are changes that require all routes to be withdrawn and announced again.
my $changes;

# Check if the metric has changed
if ($service_metric ne get_value('metric')) {
$logger->info("$check: Metric for routes has changed from $service_metric to ".get_value('metric'));
Expand Down Expand Up @@ -469,7 +515,7 @@ sub run_announce {
if ($service_state eq 'down') {
$service_rise++;
# Check if the value of $service_rise is high enough to mark the service as up
if ($service_rise >= get_value('rise')) {
if ($service_rise >= $service_rise_req) {
$logger->info("$check: Last check succeeded. Service has met the number of success checks required, marking as up and announcing IP's");
# Service should be marked as up.
$service_state = 'up';
Expand All @@ -480,10 +526,10 @@ sub run_announce {
# Set the process status
proc_status('UP');
} else {
my $service_rise_left = get_value('rise') - $service_rise;
my $service_rise_left = $service_rise_req - $service_rise;
$logger->info("$check: Last check succeeded. Service needs $service_rise_left checks to succeed before it is active");
# Set the process status
proc_status("DOWN | RISING $service_rise/".get_value('rise'));
proc_status("DOWN | RISING $service_rise/$service_rise_req");
}
}

Expand All @@ -493,7 +539,7 @@ sub run_announce {
if ($service_state eq 'up') {
$service_fall++;
# Check if the value of $service_fall is high enough to mark the service as down
if ($service_fall >= get_value('fall')) {
if ($service_fall >= $service_fall_req) {
$logger->info("$check: Last check failed. Service has met the number of failure checks required, marking service as down and withdrawing IP's");
# Service should be marked as down
$service_state = 'down';
Expand All @@ -504,19 +550,16 @@ sub run_announce {
# Set the process status
proc_status('DOWN');
} else {
my $service_fall_left = get_value('fall') - $service_fall;
my $service_fall_left = $service_fall_req - $service_fall;
$logger->info("$check: Last check failed. Service needs $service_fall_left checks to fail before it is down");
# Set the process status
proc_status("UP | FALLING $service_fall/".get_value('fall'));
proc_status("UP | FALLING $service_fall/$service_fall_req");
}
}

}
}

# Get the check interval
my $interval = get_value('interval');

# Check how long this check took, sleep for the appropriate amount of time to start next check
my $end = time();
my $runtime = $end - $start;
Expand Down Expand Up @@ -560,7 +603,7 @@ sub validate_config {
elsif (! looks_like_number(get_value('timeout',$section))) { $errors .= " - Timeout specified is not a number\n"; }

# Ensure that the check timeout is less than the check interval
if (get_value('timeout',$section) >= get_value('interval',$section)) { $errors .= " - The timeout specified is larger than the check interval\n"; }
if (get_value('timeout',$section) > get_value('interval',$section)) { $errors .= " - The timeout specified is larger than the check interval\n"; }

# Ensure that there is a rise value set and it is valid
if (! get_value('rise',$section)) { $errors .= " - No rise value specified\n"; }
Expand Down Expand Up @@ -765,18 +808,12 @@ sub withdraw_ips {

# Sub to execute check command
sub run_check {
my $cmd = get_value('command');
my $timeout = get_value('timeout');

# Remove quotes from start/end of the command if it is quoted
if ($cmd =~ /^"/) {
$cmd =~ s/^"//;
$cmd =~ s/"$//;
}

# Prepend the timeout to the check command
$cmd = "timeout $timeout $cmd";

$logger->debug("$check: Attempting to fork and run check command [$cmd]");

# opening a pipe creates a forked process
Expand All @@ -790,7 +827,7 @@ sub run_check {
close($pipe);
my $exit_code = $?;
$logger->debug("$check: Executed check command [$cmd]. Return code [$exit_code]");
if (get_value('logcheck') eq 'yes') {
if ($logcheck eq 'yes') {
$logger->debug("$check: Output from [$cmd]: @result");
}
return ($exit_code);
Expand Down

0 comments on commit 75e428d

Please sign in to comment.