Merge pull request #195 from philippthun/invoke-restart-drain-for-failed-healthcheck

Invoke restart drain for failed healthcheck
sethboyles authored May 13, 2021
2 parents 1659236 + dd4128b commit 7aae29e
Showing 7 changed files with 64 additions and 31 deletions.
12 changes: 6 additions & 6 deletions jobs/cloud_controller_ng/monit
@@ -2,18 +2,18 @@ check process cloud_controller_ng
with pidfile /var/vcap/sys/run/bpm/cloud_controller_ng/cloud_controller_ng.pid
start program "/var/vcap/jobs/bpm/bin/bpm start cloud_controller_ng"
stop program "/var/vcap/jobs/bpm/bin/bpm stop cloud_controller_ng"
depends on ccng_monit_http_healthcheck
group vcap
if totalmem > <%= p("cc.thresholds.api.alert_if_above_mb") %> Mb for 3 cycles then alert
if totalmem > <%= p("cc.thresholds.api.restart_if_consistently_above_mb") %> Mb for <%= p("cc.thresholds.api.restart_if_consistently_above_mb_cycles") %> cycles then exec "/var/vcap/jobs/cloud_controller_ng/bin/restart_drain"
if totalmem > <%= p("cc.thresholds.api.restart_if_above_mb") %> Mb for 3 cycles then exec "/var/vcap/jobs/cloud_controller_ng/bin/restart_drain"


check process ccng_monit_http_healthcheck
with pidfile /var/vcap/sys/run/bpm/cloud_controller_ng/ccng_monit_http_healthcheck.pid
start program "/var/vcap/jobs/bpm/bin/bpm start cloud_controller_ng -p ccng_monit_http_healthcheck"
stop program "/var/vcap/jobs/bpm/bin/bpm stop cloud_controller_ng -p ccng_monit_http_healthcheck"
group vcap
with pidfile /var/vcap/sys/run/bpm/cloud_controller_ng/ccng_monit_http_healthcheck.pid
start program "/var/vcap/jobs/bpm/bin/bpm start cloud_controller_ng -p ccng_monit_http_healthcheck"
stop program "/var/vcap/jobs/bpm/bin/bpm stop cloud_controller_ng -p ccng_monit_http_healthcheck"
if 1 restart within 2 cycles then exec "/var/vcap/jobs/cloud_controller_ng/bin/restart_drain"
depends on nginx_cc
group vcap

<% (1..(p("cc.jobs.local.number_of_workers"))).each do |index| %>
check process cloud_controller_worker_local_<%= index %>
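For orientation, a minimal sketch of how an operator could inspect the resulting monit process group on a Cloud Controller VM (an assumption for illustration: shell access to the VM and BOSH's bundled monit at /var/vcap/bosh/bin/monit, the same binary restart_drain.sh.erb calls; output format varies by monit version):

# List all monit-managed processes on the VM and their current state.
/var/vcap/bosh/bin/monit summary

# Show detailed status for the healthcheck process governed by the
# "if 1 restart within 2 cycles then exec .../restart_drain" rule above.
/var/vcap/bosh/bin/monit status ccng_monit_http_healthcheck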
8 changes: 8 additions & 0 deletions jobs/cloud_controller_ng/spec
@@ -355,6 +355,14 @@ properties:
default: 6
description: "Maximum health check timeout (in seconds). Health checks will be retried until this time limit is reached. This should be less than or equal to your route_registrar.routes.api.health_check.timeout"

cc.ccng_monit_http_healthcheck_retries:
default: 5
description: "Number of retries performed by the ccng_monit_http_healthcheck process"

cc.ccng_monit_http_healthcheck_timeout_per_retry:
default: 2
description: "Timeout (in seconds) for each HTTP request sent by the ccng_monit_http_healthcheck process"

cc.jobs.global.timeout_in_seconds:
description: "The longest any job can take before it is cancelled unless overridden per job"
default: 14400 # 4 hours
@@ -30,28 +30,26 @@ PORT=<%= p("cc.public_tls.port") %>
PROTOCOL="https"
URL="https://${HOST}:${PORT}/healthz"

source /var/vcap/packages/capi_utils/monit_utils.sh

echo $(date --rfc-3339=ns) 'Waiting for Cloud Controller to initially become healthy at'

wait_for_server_to_become_healthy "${URL}" "<%= p("cc.api_post_start_healthcheck_timeout_in_seconds") %>"

echo $(date --rfc-3339=ns) 'Initial check passed, will now restart CC over on repeated failures'
echo $(date --rfc-3339=ns) 'Will restart CC over on repeated failures'

trap log_failure EXIT

# if we fail to curl it 5 times in a row across 50 seconds, die so monit will restart us
set -e
# If we fail to curl the healthz endpoint 5 times (can be changed with cc.ccng_monit_http_healthcheck_retries) with
# a delay of 10 seconds between each retry, exit in order to trigger a restart of cloud controller through monit.
# Each curl has an individual timeout of 2 seconds (can be changed with cc.ccng_monit_http_healthcheck_timeout_per_retry).
while true; do
if ! curl \
set +e
curl \
-sS \
--max-time <%= p('cc.api_health_check_timeout_per_retry') %> \
--retry 5 \
--max-time <%= p('cc.ccng_monit_http_healthcheck_timeout_per_retry') %> \
--retry <%= p('cc.ccng_monit_http_healthcheck_retries') %> \
--retry-delay 10 \
-A "ccng_monit_http_healthcheck" \
-k \
"${URL}" > /dev/null ; then
status=$?
"${URL}" > /dev/null
status=$?
set -e
if [[ $status > 0 ]] ; then
echo $(date --rfc-3339=ns) "ccng_monit_http_healthcheck failed to curl <${URL}>: exit code $status"
exit $status
fi
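For reference, with the default property values introduced in the spec (retries: 5, timeout per retry: 2 seconds), the ERB above would render the curl call roughly as follows; this is an illustrative expansion, not a verbatim copy of the generated script:

set +e
curl \
  -sS \
  --max-time 2 \
  --retry 5 \
  --retry-delay 10 \
  -A "ccng_monit_http_healthcheck" \
  -k \
  "${URL}" > /dev/null
status=$?
set -e

With these defaults, a fully failing pass through the loop is bounded by roughly a minute (each attempt capped at 2 seconds, with 10 seconds between retries) before the script exits non-zero and monit restarts it.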
@@ -18,4 +18,3 @@ if [[ $status > 0 ]] ; then
echo $(date --rfc-3339=ns) "Failed to hit ${URL}"
exit $status
fi

14 changes: 7 additions & 7 deletions jobs/cloud_controller_ng/templates/restart_drain.sh.erb
@@ -7,9 +7,9 @@ PIDFILE="/var/vcap/sys/run/cloud_controller_ng/restart_drain.pid"
[[ -s "$PIDFILE" ]] && exit

function on_exit {
# Enable monitoring of nginx_cc. This also enables monitoring for
# cloud_controller_ng and ccng_monit_http_healthcheck.
/var/vcap/bosh/bin/monit monitor nginx_cc
# Re-enable monitoring of ccng_monit_http_healthcheck. This also enables
# monitoring of nginx_cc and cloud_controller_ng.
/var/vcap/bosh/bin/monit monitor ccng_monit_http_healthcheck
rm -f $PIDFILE
}

@@ -19,9 +19,9 @@ echo "$BASHPID" > "$PIDFILE"
LOGFILE="/var/vcap/sys/log/cloud_controller_ng/drain/restart_drain.log"
echo "$(date) - pid: $BASHPID - Monit triggered shutdown drain" >> "$LOGFILE"

# The health check fails as soon as nginx stops accepting new requests. It must
# be unmonitored to not interfere with the graceful shutdown. This also
# unmonitors cloud_controller_ng and nginx_cc.
/var/vcap/bosh/bin/monit unmonitor ccng_monit_http_healthcheck
# Unmonitor cloud_controller_ng. This also unmonitors nginx_cc and
# ccng_monit_http_healthcheck. Monit should not interfere with the graceful
# shutdown.
/var/vcap/bosh/bin/monit unmonitor cloud_controller_ng

/var/vcap/jobs/cloud_controller_ng/bin/shutdown_drain 1>&2
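A sketch of how the drain path could be exercised manually on a VM for debugging (paths taken from the monit file and the script above; note that this really does drain and restart the Cloud Controller, so it is illustrative only):

# Invoke the same script that monit's "exec" action runs.
/var/vcap/jobs/cloud_controller_ng/bin/restart_drain

# Follow the drain log written by the script.
tail -f /var/vcap/sys/log/cloud_controller_ng/drain/restart_drain.log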
3 changes: 0 additions & 3 deletions jobs/cloud_controller_ng/templates/shutdown_drain.rb.erb
@@ -6,8 +6,5 @@ $LOAD_PATH.unshift('/var/vcap/packages/cloud_controller_ng/cloud_controller_ng/l
require 'cloud_controller/drain'

@drain = VCAP::CloudController::Drain.new('/var/vcap/sys/log/cloud_controller_ng')
@drain.log_invocation(ARGV)
@drain.shutdown_nginx('/var/vcap/sys/run/bpm/cloud_controller_ng/nginx.pid', <%= p("cc.nginx_drain_timeout") %>)
@drain.shutdown_cc('/var/vcap/sys/run/bpm/cloud_controller_ng/cloud_controller_ng.pid')

puts 0 # tell bosh the drain script succeeded
31 changes: 31 additions & 0 deletions spec/cloud_controller_ng/healthcheck_spec.rb
@@ -0,0 +1,31 @@
# frozen_string_literal: true

require 'rspec'
require 'bosh/template/test'

module Bosh::Template::Test
describe 'health check template rendering' do
let(:release_path) { File.join(File.dirname(__FILE__), '../..') }
let(:release) { ReleaseDir.new(release_path) }
let(:job) { release.job('cloud_controller_ng') }

describe 'bin/ccng_monit_http_healthcheck' do
let(:template) { job.template('bin/ccng_monit_http_healthcheck') }

it 'renders the default value' do
rendered_file = template.render(consumes: {})
expect(rendered_file).to include('--max-time 2')
expect(rendered_file).to include('--retry 5')
end

context 'when custom values are provided' do
it 'renders the provided values' do
rendered_file = template.render({ 'cc' => { 'ccng_monit_http_healthcheck_timeout_per_retry' => 30,
'ccng_monit_http_healthcheck_retries' => 2 } }, consumes: {})
expect(rendered_file).to include('--max-time 30')
expect(rendered_file).to include('--retry 2')
end
end
end
end
end
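The new spec renders the job templates via the bosh-template test helpers. Assuming the release's usual Ruby tooling (a Gemfile that provides rspec and bosh-template; the repo's actual test invocation may differ), it could be run locally with something like:

# From the release root:
bundle install
bundle exec rspec spec/cloud_controller_ng/healthcheck_spec.rb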
