Merge pull request #195 from philippthun/invoke-restart-drain-for-failed-healthcheck

Invoke restart drain for failed healthcheck
sethboyles authored May 13, 2021
2 parents 1659236 + dd4128b commit 7aae29e
Showing 7 changed files with 64 additions and 31 deletions.
12 changes: 6 additions & 6 deletions jobs/cloud_controller_ng/monit
@@ -2,18 +2,18 @@ check process cloud_controller_ng
with pidfile /var/vcap/sys/run/bpm/cloud_controller_ng/cloud_controller_ng.pid
start program "/var/vcap/jobs/bpm/bin/bpm start cloud_controller_ng"
stop program "/var/vcap/jobs/bpm/bin/bpm stop cloud_controller_ng"
depends on ccng_monit_http_healthcheck
group vcap
if totalmem > <%= p("cc.thresholds.api.alert_if_above_mb") %> Mb for 3 cycles then alert
if totalmem > <%= p("cc.thresholds.api.restart_if_consistently_above_mb") %> Mb for <%= p("cc.thresholds.api.restart_if_consistently_above_mb_cycles") %> cycles then exec "/var/vcap/jobs/cloud_controller_ng/bin/restart_drain"
if totalmem > <%= p("cc.thresholds.api.restart_if_above_mb") %> Mb for 3 cycles then exec "/var/vcap/jobs/cloud_controller_ng/bin/restart_drain"


check process ccng_monit_http_healthcheck
with pidfile /var/vcap/sys/run/bpm/cloud_controller_ng/ccng_monit_http_healthcheck.pid
start program "/var/vcap/jobs/bpm/bin/bpm start cloud_controller_ng -p ccng_monit_http_healthcheck"
stop program "/var/vcap/jobs/bpm/bin/bpm stop cloud_controller_ng -p ccng_monit_http_healthcheck"
group vcap
with pidfile /var/vcap/sys/run/bpm/cloud_controller_ng/ccng_monit_http_healthcheck.pid
start program "/var/vcap/jobs/bpm/bin/bpm start cloud_controller_ng -p ccng_monit_http_healthcheck"
stop program "/var/vcap/jobs/bpm/bin/bpm stop cloud_controller_ng -p ccng_monit_http_healthcheck"
if 1 restart within 2 cycles then exec "/var/vcap/jobs/cloud_controller_ng/bin/restart_drain"
depends on nginx_cc
group vcap

<% (1..(p("cc.jobs.local.number_of_workers"))).each do |index| %>
check process cloud_controller_worker_local_<%= index %>
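For orientation, a minimal sketch of how an operator could inspect the resulting monit process group on a Cloud Controller VM (an assumption for illustration: shell access to the VM and BOSH's bundled monit at /var/vcap/bosh/bin/monit, the same binary restart_drain.sh.erb calls; output format varies by monit version):

# List all monit-managed processes on the VM and their current state.
/var/vcap/bosh/bin/monit summary

# Show detailed status for the healthcheck process governed by the
# "if 1 restart within 2 cycles then exec .../restart_drain" rule above.
/var/vcap/bosh/bin/monit status ccng_monit_http_healthcheck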
8 changes: 8 additions & 0 deletions jobs/cloud_controller_ng/spec
@@ -355,6 +355,14 @@ properties:
default: 6
description: "Maximum health check timeout (in seconds). Health checks will be retried until this time limit is reached. This should be less than or equal to your route_registrar.routes.api.health_check.timeout"

cc.ccng_monit_http_healthcheck_retries:
default: 5
description: "Number of retries performed by the ccng_monit_http_healthcheck process"

cc.ccng_monit_http_healthcheck_timeout_per_retry:
default: 2
description: "Timeout (in seconds) for each HTTP request sent by the ccng_monit_http_healthcheck process"

cc.jobs.global.timeout_in_seconds:
description: "The longest any job can take before it is cancelled unless overridden per job"
default: 14400 # 4 hours
@@ -30,28 +30,26 @@ PORT=<%= p("cc.public_tls.port") %>
PROTOCOL="https"
URL="https://${HOST}:${PORT}/healthz"

source /var/vcap/packages/capi_utils/monit_utils.sh

echo $(date --rfc-3339=ns) 'Waiting for Cloud Controller to initially become healthy at'

wait_for_server_to_become_healthy "${URL}" "<%= p("cc.api_post_start_healthcheck_timeout_in_seconds") %>"

echo $(date --rfc-3339=ns) 'Initial check passed, will now restart CC over on repeated failures'
echo $(date --rfc-3339=ns) 'Will restart CC over on repeated failures'

trap log_failure EXIT

# if we fail to curl it 5 times in a row across 50 seconds, die so monit will restart us
set -e
# If we fail to curl the healthz endpoint 5 times (can be changed with cc.ccng_monit_http_healthcheck_retries) with
# a delay of 10 seconds between each retry, exit in order to trigger a restart of cloud controller through monit.
# Each curl has an individual timeout of 2 seconds (can be changed with cc.ccng_monit_http_healthcheck_timeout_per_retry).
while true; do
if ! curl \
set +e
curl \
-sS \
--max-time <%= p('cc.api_health_check_timeout_per_retry') %> \
--retry 5 \
--max-time <%= p('cc.ccng_monit_http_healthcheck_timeout_per_retry') %> \
--retry <%= p('cc.ccng_monit_http_healthcheck_retries') %> \
--retry-delay 10 \
-A "ccng_monit_http_healthcheck" \
-k \
"${URL}" > /dev/null ; then
status=$?
"${URL}" > /dev/null
status=$?
set -e
if [[ $status > 0 ]] ; then
echo $(date --rfc-3339=ns) "ccng_monit_http_healthcheck failed to curl <${URL}>: exit code $status"
exit $status
fi
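For reference, with the default property values introduced in the spec (retries: 5, timeout per retry: 2 seconds), the ERB above would render the curl call roughly as follows; this is an illustrative expansion, not a verbatim copy of the generated script:

set +e
curl \
  -sS \
  --max-time 2 \
  --retry 5 \
  --retry-delay 10 \
  -A "ccng_monit_http_healthcheck" \
  -k \
  "${URL}" > /dev/null
status=$?
set -e

With these defaults, a fully failing pass through the loop is bounded by roughly a minute (each attempt capped at 2 seconds, with 10 seconds between retries) before the script exits non-zero and monit restarts it.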
@@ -18,4 +18,3 @@ if [[ $status > 0 ]] ; then
echo $(date --rfc-3339=ns) "Failed to hit ${URL}"
exit $status
fi

14 changes: 7 additions & 7 deletions jobs/cloud_controller_ng/templates/restart_drain.sh.erb
@@ -7,9 +7,9 @@ PIDFILE="/var/vcap/sys/run/cloud_controller_ng/restart_drain.pid"
[[ -s "$PIDFILE" ]] && exit

function on_exit {
# Enable monitoring of nginx_cc. This also enables monitoring for
# cloud_controller_ng and ccng_monit_http_healthcheck.
/var/vcap/bosh/bin/monit monitor nginx_cc
# Re-enable monitoring of ccng_monit_http_healthcheck. This also enables
# monitoring of nginx_cc and cloud_controller_ng.
/var/vcap/bosh/bin/monit monitor ccng_monit_http_healthcheck
rm -f $PIDFILE
}

@@ -19,9 +19,9 @@ echo "$BASHPID" > "$PIDFILE"
LOGFILE="/var/vcap/sys/log/cloud_controller_ng/drain/restart_drain.log"
echo "$(date) - pid: $BASHPID - Monit triggered shutdown drain" >> "$LOGFILE"

# The health check fails as soon as nginx stops accepting new requests. It must
# be unmonitored to not interfere with the graceful shutdown. This also
# unmonitors cloud_controller_ng and nginx_cc.
/var/vcap/bosh/bin/monit unmonitor ccng_monit_http_healthcheck
# Unmonitor cloud_controller_ng. This also unmonitors nginx_cc and
# ccng_monit_http_healthcheck. Monit should not interfere with the graceful
# shutdown.
/var/vcap/bosh/bin/monit unmonitor cloud_controller_ng

/var/vcap/jobs/cloud_controller_ng/bin/shutdown_drain 1>&2
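A sketch of how the drain path could be exercised manually on a VM for debugging (paths taken from the monit file and the script above; note that this really does drain and restart the Cloud Controller, so it is illustrative only):

# Invoke the same script that monit's "exec" action runs.
/var/vcap/jobs/cloud_controller_ng/bin/restart_drain

# Follow the drain log written by the script.
tail -f /var/vcap/sys/log/cloud_controller_ng/drain/restart_drain.log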
3 changes: 0 additions & 3 deletions jobs/cloud_controller_ng/templates/shutdown_drain.rb.erb
@@ -6,8 +6,5 @@ $LOAD_PATH.unshift('/var/vcap/packages/cloud_controller_ng/cloud_controller_ng/l
require 'cloud_controller/drain'

@drain = VCAP::CloudController::Drain.new('/var/vcap/sys/log/cloud_controller_ng')
@drain.log_invocation(ARGV)
@drain.shutdown_nginx('/var/vcap/sys/run/bpm/cloud_controller_ng/nginx.pid', <%= p("cc.nginx_drain_timeout") %>)
@drain.shutdown_cc('/var/vcap/sys/run/bpm/cloud_controller_ng/cloud_controller_ng.pid')

puts 0 # tell bosh the drain script succeeded
31 changes: 31 additions & 0 deletions spec/cloud_controller_ng/healthcheck_spec.rb
@@ -0,0 +1,31 @@
# frozen_string_literal: true

require 'rspec'
require 'bosh/template/test'

module Bosh::Template::Test
describe 'health check template rendering' do
let(:release_path) { File.join(File.dirname(__FILE__), '../..') }
let(:release) { ReleaseDir.new(release_path) }
let(:job) { release.job('cloud_controller_ng') }

describe 'bin/ccng_monit_http_healthcheck' do
let(:template) { job.template('bin/ccng_monit_http_healthcheck') }

it 'renders the default value' do
rendered_file = template.render(consumes: {})
expect(rendered_file).to include('--max-time 2')
expect(rendered_file).to include('--retry 5')
end

context 'when custom values are provided' do
it 'renders the provided values' do
rendered_file = template.render({ 'cc' => { 'ccng_monit_http_healthcheck_timeout_per_retry' => 30,
'ccng_monit_http_healthcheck_retries' => 2 } }, consumes: {})
expect(rendered_file).to include('--max-time 30')
expect(rendered_file).to include('--retry 2')
end
end
end
end
end
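The new spec renders the job templates via the bosh-template test helpers. Assuming the release's usual Ruby tooling (a Gemfile that provides rspec and bosh-template; the repo's actual test invocation may differ), it could be run locally with something like:

# From the release root:
bundle install
bundle exec rspec spec/cloud_controller_ng/healthcheck_spec.rb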
