From ec2fcfa550518c43ed3042b33023da34af96fb86 Mon Sep 17 00:00:00 2001
From: Philipp Thun
Date: Fri, 30 Apr 2021 14:33:25 +0200
Subject: [PATCH 1/4] Call restart_drain when ccng_monit_http_healthcheck fails

The current dependency chain in monit is:

  nginx_cc -> cloud_controller_ng -> ccng_monit_http_healthcheck

The health check is defined as the base process, so that a failure
triggers a restart of the dependent processes.

This is now changed to the logical dependency chain, i.e. each process
depends on the next one in the list, so cloud_controller_ng is started
first and the health check last:

  ccng_monit_http_healthcheck -> nginx_cc -> cloud_controller_ng

This removes the implicit restart trigger for the other processes in
case the health check fails; a monit directive is therefore added that
explicitly invokes the 'restart_drain' script instead.
---
 jobs/cloud_controller_ng/monit         | 12 ++++++------
 .../templates/restart_drain.sh.erb     | 14 +++++++-------
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/jobs/cloud_controller_ng/monit b/jobs/cloud_controller_ng/monit
index 2e0add1c84..1d1b6f9c06 100644
--- a/jobs/cloud_controller_ng/monit
+++ b/jobs/cloud_controller_ng/monit
@@ -2,18 +2,18 @@ check process cloud_controller_ng
   with pidfile /var/vcap/sys/run/bpm/cloud_controller_ng/cloud_controller_ng.pid
   start program "/var/vcap/jobs/bpm/bin/bpm start cloud_controller_ng"
   stop program "/var/vcap/jobs/bpm/bin/bpm stop cloud_controller_ng"
-  depends on ccng_monit_http_healthcheck
   group vcap

   if totalmem > <%= p("cc.thresholds.api.alert_if_above_mb") %> Mb for 3 cycles then alert
   if totalmem > <%= p("cc.thresholds.api.restart_if_consistently_above_mb") %> Mb for <%= p("cc.thresholds.api.restart_if_consistently_above_mb_cycles") %> cycles then exec "/var/vcap/jobs/cloud_controller_ng/bin/restart_drain"
   if totalmem > <%= p("cc.thresholds.api.restart_if_above_mb") %> Mb for 3 cycles then exec "/var/vcap/jobs/cloud_controller_ng/bin/restart_drain"
-
 check process ccng_monit_http_healthcheck
-  with pidfile /var/vcap/sys/run/bpm/cloud_controller_ng/ccng_monit_http_healthcheck.pid
-  start program "/var/vcap/jobs/bpm/bin/bpm start cloud_controller_ng -p ccng_monit_http_healthcheck"
-  stop program "/var/vcap/jobs/bpm/bin/bpm stop cloud_controller_ng -p ccng_monit_http_healthcheck"
-  group vcap
+  with pidfile /var/vcap/sys/run/bpm/cloud_controller_ng/ccng_monit_http_healthcheck.pid
+  start program "/var/vcap/jobs/bpm/bin/bpm start cloud_controller_ng -p ccng_monit_http_healthcheck"
+  stop program "/var/vcap/jobs/bpm/bin/bpm stop cloud_controller_ng -p ccng_monit_http_healthcheck"
+  if 1 restart within 2 cycles then exec "/var/vcap/jobs/cloud_controller_ng/bin/restart_drain"
+  depends on nginx_cc
+  group vcap

 <% (1..(p("cc.jobs.local.number_of_workers"))).each do |index| %>
 check process cloud_controller_worker_local_<%= index %>
diff --git a/jobs/cloud_controller_ng/templates/restart_drain.sh.erb b/jobs/cloud_controller_ng/templates/restart_drain.sh.erb
index 263b2c37c7..b037de50b6 100644
--- a/jobs/cloud_controller_ng/templates/restart_drain.sh.erb
+++ b/jobs/cloud_controller_ng/templates/restart_drain.sh.erb
@@ -7,9 +7,9 @@ PIDFILE="/var/vcap/sys/run/cloud_controller_ng/restart_drain.pid"

 [[ -s "$PIDFILE" ]] && exit

 function on_exit {
-  # Enable monitoring of nginx_cc. This also enables monitoring for
-  # cloud_controller_ng and ccng_monit_http_healthcheck.
-  /var/vcap/bosh/bin/monit monitor nginx_cc
+  # Re-enable monitoring of ccng_monit_http_healthcheck. This also enables
+  # monitoring of nginx_cc and cloud_controller_ng.
+  /var/vcap/bosh/bin/monit monitor ccng_monit_http_healthcheck
   rm -f $PIDFILE
 }
@@ -19,9 +19,9 @@ echo "$BASHPID" > "$PIDFILE"
 LOGFILE="/var/vcap/sys/log/cloud_controller_ng/drain/restart_drain.log"
 echo "$(date) - pid: $BASHPID - Monit triggered shutdown drain" >> "$LOGFILE"

-# The health check fails as soon as nginx stops accepting new requests. It must
-# be unmonitored to not interfere with the graceful shutdown. This also
-# unmonitors cloud_controller_ng and nginx_cc.
-/var/vcap/bosh/bin/monit unmonitor ccng_monit_http_healthcheck
+# Unmonitor cloud_controller_ng. This also unmonitors nginx_cc and
+# ccng_monit_http_healthcheck. Monit should not interfere with the graceful
+# shutdown.
+/var/vcap/bosh/bin/monit unmonitor cloud_controller_ng

 /var/vcap/jobs/cloud_controller_ng/bin/shutdown_drain 1>&2
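Reviewer note, not part of the patch: the commands below trace the full drain cycle the patch above wires up, in the order they end up running. The paths and monit invocations are taken from the patch itself; the step-by-step commentary is an interpretation added for review. Do not run this as one script; steps 2-4 happen inside restart_drain.

    # 1. monit execs the drain wrapper, either via the totalmem thresholds on
    #    cloud_controller_ng or via the new 'if 1 restart within 2 cycles' rule
    #    on ccng_monit_http_healthcheck.
    /var/vcap/jobs/cloud_controller_ng/bin/restart_drain

    # 2. restart_drain first takes monit out of the picture: unmonitoring
    #    cloud_controller_ng also unmonitors nginx_cc and
    #    ccng_monit_http_healthcheck, which depend on it (transitively).
    /var/vcap/bosh/bin/monit unmonitor cloud_controller_ng

    # 3. The graceful shutdown stops nginx first and then Cloud Controller.
    /var/vcap/jobs/cloud_controller_ng/bin/shutdown_drain 1>&2

    # 4. On exit, monitoring is re-enabled at the end of the chain; monitoring
    #    ccng_monit_http_healthcheck also brings nginx_cc and
    #    cloud_controller_ng back.
    /var/vcap/bosh/bin/monit monitor ccng_monit_http_healthcheck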
From f22e7e092eda7172ec7cd8afff2c29b68750525a Mon Sep 17 00:00:00 2001
From: Philipp Thun
Date: Fri, 30 Apr 2021 14:37:49 +0200
Subject: [PATCH 2/4] Remove obsolete waiting for cc, fix exit code handling

- The wait_for_server_to_become_healthy call is no longer needed, as
  the health check is now started after cc and nginx.

- The printed status code was always 0 - it was the exit code of the
  'if' statement, not of the 'curl' command.

Co-authored-by: Andy Paine
---
 .../ccng_monit_http_healthcheck.sh.erb | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/jobs/cloud_controller_ng/templates/ccng_monit_http_healthcheck.sh.erb b/jobs/cloud_controller_ng/templates/ccng_monit_http_healthcheck.sh.erb
index 8b37340693..0ea53fc608 100644
--- a/jobs/cloud_controller_ng/templates/ccng_monit_http_healthcheck.sh.erb
+++ b/jobs/cloud_controller_ng/templates/ccng_monit_http_healthcheck.sh.erb
@@ -30,28 +30,24 @@ PORT=<%= p("cc.public_tls.port") %>
 PROTOCOL="https"
 URL="https://${HOST}:${PORT}/healthz"

-source /var/vcap/packages/capi_utils/monit_utils.sh
-
-echo $(date --rfc-3339=ns) 'Waiting for Cloud Controller to initially become healthy at'
-
-wait_for_server_to_become_healthy "${URL}" "<%= p("cc.api_post_start_healthcheck_timeout_in_seconds") %>"
-
-echo $(date --rfc-3339=ns) 'Initial check passed, will now restart CC over on repeated failures'
+echo $(date --rfc-3339=ns) 'Will restart CC on repeated failures'

 trap log_failure EXIT

 # if we fail to curl it 5 times in a row across 50 seconds, die so monit will restart us
-set -e
 while true; do
-  if ! curl \
+  set +e
+  curl \
     -sS \
     --max-time <%= p('cc.api_health_check_timeout_per_retry') %> \
     --retry 5 \
     --retry-delay 10 \
     -A "ccng_monit_http_healthcheck" \
     -k \
-    "${URL}" > /dev/null ; then
-    status=$?
+    "${URL}" > /dev/null
+  status=$?
+  set -e
+  if [[ $status > 0 ]] ; then
     echo $(date --rfc-3339=ns) "ccng_monit_http_healthcheck failed to curl <${URL}>: exit code $status"
     exit $status
   fi
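Reviewer note, not part of the patch: the second bullet in the commit message above is subtle. Inside an 'if ! command; then' branch, $? holds the exit status of the negated test that just succeeded, not of the failing command, which is why the logged status code was always 0. A minimal standalone bash illustration:

    #!/usr/bin/env bash

    # Old pattern: the 'if' condition is '! false', which succeeds, so $?
    # inside the branch is 0 and the real exit code is lost.
    if ! false; then
      status=$?
      echo "inside if: status=$status"        # prints 0
    fi

    # New pattern (essentially what the patch does, plus toggling 'set -e'):
    # run the command on its own and capture $? immediately.
    false
    status=$?
    echo "captured directly: status=$status"  # prints 1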
From 2bb10a560d50c88e4490ceabe1c6f6e27b309122 Mon Sep 17 00:00:00 2001
From: Philipp Thun
Date: Fri, 30 Apr 2021 16:26:54 +0200
Subject: [PATCH 3/4] Add (optional) properties to configure the ccng_monit_http_healthcheck

Instead of re-using config options from the route registration health
check, add dedicated options to configure timeout and retries for the
ccng_monit_http_healthcheck process.

There are situations when cloud controller is busy processing multiple
long-running requests. Although it makes sense to temporarily
deregister its route to prevent additional load on the instance, the
(internal) health check should not restart the instance at the same
time, but only when it stays busy for too long.
---
 jobs/cloud_controller_ng/spec                |  8 +++++
 .../ccng_monit_http_healthcheck.sh.erb       |  8 +++--
 spec/cloud_controller_ng/healthcheck_spec.rb | 31 +++++++++++++++++++
 3 files changed, 44 insertions(+), 3 deletions(-)
 create mode 100644 spec/cloud_controller_ng/healthcheck_spec.rb

diff --git a/jobs/cloud_controller_ng/spec b/jobs/cloud_controller_ng/spec
index bccdf03aaa..ac27e452ca 100644
--- a/jobs/cloud_controller_ng/spec
+++ b/jobs/cloud_controller_ng/spec
@@ -355,6 +355,14 @@ properties:
     default: 6
     description: "Maximum health check timeout (in seconds). Health checks will be retried until this time limit is reached. This should be less than or equal to your route_registrar.routes.api.health_check.timeout"

+  cc.ccng_monit_http_healthcheck_retries:
+    default: 5
+    description: "Number of retries performed by the ccng_monit_http_healthcheck process"
+
+  cc.ccng_monit_http_healthcheck_timeout_per_retry:
+    default: 2
+    description: "Timeout (in seconds) for each HTTP request sent by the ccng_monit_http_healthcheck process"
+
   cc.jobs.global.timeout_in_seconds:
     description: "The longest any job can take before it is cancelled unless overridden per job"
     default: 14400 # 4 hours
diff --git a/jobs/cloud_controller_ng/templates/ccng_monit_http_healthcheck.sh.erb b/jobs/cloud_controller_ng/templates/ccng_monit_http_healthcheck.sh.erb
index 0ea53fc608..a44a5d52ad 100644
--- a/jobs/cloud_controller_ng/templates/ccng_monit_http_healthcheck.sh.erb
+++ b/jobs/cloud_controller_ng/templates/ccng_monit_http_healthcheck.sh.erb
@@ -34,13 +34,15 @@ echo $(date --rfc-3339=ns) 'Will restart CC on repeated failures'

 trap log_failure EXIT

-# if we fail to curl it 5 times in a row across 50 seconds, die so monit will restart us
+# If we fail to curl the healthz endpoint 5 times (can be changed with cc.ccng_monit_http_healthcheck_retries) with
+# a delay of 10 seconds between each retry, exit in order to trigger a restart of cloud controller through monit.
+# Each curl has an individual timeout of 2 seconds (can be changed with cc.ccng_monit_http_healthcheck_timeout_per_retry).
 while true; do
   set +e
   curl \
     -sS \
-    --max-time <%= p('cc.api_health_check_timeout_per_retry') %> \
-    --retry 5 \
+    --max-time <%= p('cc.ccng_monit_http_healthcheck_timeout_per_retry') %> \
+    --retry <%= p('cc.ccng_monit_http_healthcheck_retries') %> \
     --retry-delay 10 \
     -A "ccng_monit_http_healthcheck" \
     -k \
diff --git a/spec/cloud_controller_ng/healthcheck_spec.rb b/spec/cloud_controller_ng/healthcheck_spec.rb
new file mode 100644
index 0000000000..a98f6a7ebf
--- /dev/null
+++ b/spec/cloud_controller_ng/healthcheck_spec.rb
@@ -0,0 +1,31 @@
+# frozen_string_literal: true
+
+require 'rspec'
+require 'bosh/template/test'
+
+module Bosh::Template::Test
+  describe 'health check template rendering' do
+    let(:release_path) { File.join(File.dirname(__FILE__), '../..') }
+    let(:release) { ReleaseDir.new(release_path) }
+    let(:job) { release.job('cloud_controller_ng') }
+
+    describe 'bin/ccng_monit_http_healthcheck' do
+      let(:template) { job.template('bin/ccng_monit_http_healthcheck') }
+
+      it 'renders the default value' do
+        rendered_file = template.render({}, consumes: {})
+        expect(rendered_file).to include('--max-time 2')
+        expect(rendered_file).to include('--retry 5')
+      end
+
+      context 'when custom values are provided' do
+        it 'renders the provided values' do
+          rendered_file = template.render({ 'cc' => { 'ccng_monit_http_healthcheck_timeout_per_retry' => 30,
+                                                      'ccng_monit_http_healthcheck_retries' => 2 } }, consumes: {})
+          expect(rendered_file).to include('--max-time 30')
+          expect(rendered_file).to include('--retry 2')
+        end
+      end
+    end
+  end
+end
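Reviewer note, not part of the patch: the new spec above relies on the bosh-template test helpers, so it should be runnable on its own from a capi-release checkout, roughly as shown below. The exact Gemfile/bundler setup is assumed here and is not defined by this patch.

    # Assumes a Gemfile providing rspec and bosh-template, as template specs
    # in BOSH releases typically do.
    cd capi-release
    bundle install
    bundle exec rspec spec/cloud_controller_ng/healthcheck_spec.rb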
From dd4128b6653b8223519a3c9458f74ca9de57e60b Mon Sep 17 00:00:00 2001
From: Philipp Thun
Date: Mon, 3 May 2021 15:00:47 +0200
Subject: [PATCH 4/4] Improve the drain scripts

- bin/shutdown_drain does not need to echo '0'; this is already done by
  bin/drain
  (https://github.com/cloudfoundry/capi-release/blob/2b52d846f82c7dbe893ffc21f879c15a3fa1da36/jobs/cloud_controller_ng/templates/drain.sh.erb#L9).

- As the bin/shutdown_drain script is never invoked with arguments,
  logging the invocation is superfluous.
---
 .../templates/cloud_controller_api_health_check.erb      | 1 -
 jobs/cloud_controller_ng/templates/shutdown_drain.rb.erb | 3 ---
 2 files changed, 4 deletions(-)

diff --git a/jobs/cloud_controller_ng/templates/cloud_controller_api_health_check.erb b/jobs/cloud_controller_ng/templates/cloud_controller_api_health_check.erb
index 88c0e78d5c..6a5c716dd8 100644
--- a/jobs/cloud_controller_ng/templates/cloud_controller_api_health_check.erb
+++ b/jobs/cloud_controller_ng/templates/cloud_controller_api_health_check.erb
@@ -18,4 +18,3 @@ if [[ $status > 0 ]] ; then
   echo $(date --rfc-3339=ns) "Failed to hit ${URL}"
   exit $status
 fi
-
diff --git a/jobs/cloud_controller_ng/templates/shutdown_drain.rb.erb b/jobs/cloud_controller_ng/templates/shutdown_drain.rb.erb
index f3766b32dc..72e9278335 100644
--- a/jobs/cloud_controller_ng/templates/shutdown_drain.rb.erb
+++ b/jobs/cloud_controller_ng/templates/shutdown_drain.rb.erb
@@ -6,8 +6,5 @@ $LOAD_PATH.unshift('/var/vcap/packages/cloud_controller_ng/cloud_controller_ng/l
 require 'cloud_controller/drain'

 @drain = VCAP::CloudController::Drain.new('/var/vcap/sys/log/cloud_controller_ng')
-@drain.log_invocation(ARGV)
 @drain.shutdown_nginx('/var/vcap/sys/run/bpm/cloud_controller_ng/nginx.pid', <%= p("cc.nginx_drain_timeout") %>)
 @drain.shutdown_cc('/var/vcap/sys/run/bpm/cloud_controller_ng/cloud_controller_ng.pid')
-
-puts 0 # tell bosh the drain script succeeded
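Reviewer note, not part of the patch: background for the first bullet above. BOSH expects a drain script to print an integer to stdout (0 meaning the job is drained and no further wait is needed), and the linked bin/drain wrapper is the place that prints it, so shutdown_drain itself no longer has to. Roughly, the wrapper has the shape sketched below; this is paraphrased from the commit message and the linked drain.sh.erb, not copied from the repository.

    #!/usr/bin/env bash
    # Paraphrased sketch of jobs/cloud_controller_ng/templates/drain.sh.erb.
    # It delegates the actual work to shutdown_drain and is the single place
    # that reports success to BOSH.

    /var/vcap/jobs/cloud_controller_ng/bin/shutdown_drain 1>&2  # keep stdout clean for the drain protocol

    echo 0  # tell BOSH the drain finished; no additional wait required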