From ec2fcfa550518c43ed3042b33023da34af96fb86 Mon Sep 17 00:00:00 2001
From: Philipp Thun
Date: Fri, 30 Apr 2021 14:33:25 +0200
Subject: [PATCH 1/4] Call restart_drain when ccng_monit_http_healthcheck fails

The current dependency chain in monit is:

  nginx_cc -> cloud_controller_ng -> ccng_monit_http_healthcheck

The health check is defined as the base process, so that a failure
triggers a restart of the dependent processes.

This is now changed to the logical dependency chain, i.e. each process
depends on the next one in the list, so cloud_controller_ng is started
first and the health check last:

  ccng_monit_http_healthcheck -> nginx_cc -> cloud_controller_ng

This removes the implicit restart trigger for the other processes in
case the health check fails; a monit directive is therefore added that
explicitly invokes the 'restart_drain' script instead.
---
 jobs/cloud_controller_ng/monit         | 12 ++++++------
 .../templates/restart_drain.sh.erb     | 14 +++++++-------
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/jobs/cloud_controller_ng/monit b/jobs/cloud_controller_ng/monit
index 2e0add1c84..1d1b6f9c06 100644
--- a/jobs/cloud_controller_ng/monit
+++ b/jobs/cloud_controller_ng/monit
@@ -2,18 +2,18 @@ check process cloud_controller_ng
   with pidfile /var/vcap/sys/run/bpm/cloud_controller_ng/cloud_controller_ng.pid
   start program "/var/vcap/jobs/bpm/bin/bpm start cloud_controller_ng"
   stop program "/var/vcap/jobs/bpm/bin/bpm stop cloud_controller_ng"
-  depends on ccng_monit_http_healthcheck
   group vcap

   if totalmem > <%= p("cc.thresholds.api.alert_if_above_mb") %> Mb for 3 cycles then alert
   if totalmem > <%= p("cc.thresholds.api.restart_if_consistently_above_mb") %> Mb for <%= p("cc.thresholds.api.restart_if_consistently_above_mb_cycles") %> cycles then exec "/var/vcap/jobs/cloud_controller_ng/bin/restart_drain"
   if totalmem > <%= p("cc.thresholds.api.restart_if_above_mb") %> Mb for 3 cycles then exec "/var/vcap/jobs/cloud_controller_ng/bin/restart_drain"
-
 check process ccng_monit_http_healthcheck
-  with pidfile /var/vcap/sys/run/bpm/cloud_controller_ng/ccng_monit_http_healthcheck.pid
-  start program "/var/vcap/jobs/bpm/bin/bpm start cloud_controller_ng -p ccng_monit_http_healthcheck"
-  stop program "/var/vcap/jobs/bpm/bin/bpm stop cloud_controller_ng -p ccng_monit_http_healthcheck"
-  group vcap
+  with pidfile /var/vcap/sys/run/bpm/cloud_controller_ng/ccng_monit_http_healthcheck.pid
+  start program "/var/vcap/jobs/bpm/bin/bpm start cloud_controller_ng -p ccng_monit_http_healthcheck"
+  stop program "/var/vcap/jobs/bpm/bin/bpm stop cloud_controller_ng -p ccng_monit_http_healthcheck"
+  if 1 restart within 2 cycles then exec "/var/vcap/jobs/cloud_controller_ng/bin/restart_drain"
+  depends on nginx_cc
+  group vcap

 <% (1..(p("cc.jobs.local.number_of_workers"))).each do |index| %>
 check process cloud_controller_worker_local_<%= index %>
diff --git a/jobs/cloud_controller_ng/templates/restart_drain.sh.erb b/jobs/cloud_controller_ng/templates/restart_drain.sh.erb
index 263b2c37c7..b037de50b6 100644
--- a/jobs/cloud_controller_ng/templates/restart_drain.sh.erb
+++ b/jobs/cloud_controller_ng/templates/restart_drain.sh.erb
@@ -7,9 +7,9 @@ PIDFILE="/var/vcap/sys/run/cloud_controller_ng/restart_drain.pid"

 [[ -s "$PIDFILE" ]] && exit

 function on_exit {
-  # Enable monitoring of nginx_cc. This also enables monitoring for
-  # cloud_controller_ng and ccng_monit_http_healthcheck.
-  /var/vcap/bosh/bin/monit monitor nginx_cc
+  # Re-enable monitoring of ccng_monit_http_healthcheck. This also enables
+  # monitoring of nginx_cc and cloud_controller_ng.
+  /var/vcap/bosh/bin/monit monitor ccng_monit_http_healthcheck
   rm -f $PIDFILE
 }
@@ -19,9 +19,9 @@ echo "$BASHPID" > "$PIDFILE"
 LOGFILE="/var/vcap/sys/log/cloud_controller_ng/drain/restart_drain.log"
 echo "$(date) - pid: $BASHPID - Monit triggered shutdown drain" >> "$LOGFILE"

-# The health check fails as soon as nginx stops accepting new requests. It must
-# be unmonitored to not interfere with the graceful shutdown. This also
-# unmonitors cloud_controller_ng and nginx_cc.
-/var/vcap/bosh/bin/monit unmonitor ccng_monit_http_healthcheck
+# Unmonitor cloud_controller_ng. This also unmonitors nginx_cc and
+# ccng_monit_http_healthcheck. Monit should not interfere with the graceful
+# shutdown.
+/var/vcap/bosh/bin/monit unmonitor cloud_controller_ng

 /var/vcap/jobs/cloud_controller_ng/bin/shutdown_drain 1>&2
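Reviewer note, not part of the patch: the commands below trace the full drain cycle the patch above wires up, in the order they end up running. The paths and monit invocations are taken from the patch itself; the step-by-step commentary is an interpretation added for review. Do not run this as one script; steps 2-4 happen inside restart_drain.

    # 1. monit execs the drain wrapper, either via the totalmem thresholds on
    #    cloud_controller_ng or via the new 'if 1 restart within 2 cycles' rule
    #    on ccng_monit_http_healthcheck.
    /var/vcap/jobs/cloud_controller_ng/bin/restart_drain

    # 2. restart_drain first takes monit out of the picture: unmonitoring
    #    cloud_controller_ng also unmonitors nginx_cc and
    #    ccng_monit_http_healthcheck, which depend on it (transitively).
    /var/vcap/bosh/bin/monit unmonitor cloud_controller_ng

    # 3. The graceful shutdown stops nginx first and then Cloud Controller.
    /var/vcap/jobs/cloud_controller_ng/bin/shutdown_drain 1>&2

    # 4. On exit, monitoring is re-enabled at the end of the chain; monitoring
    #    ccng_monit_http_healthcheck also brings nginx_cc and
    #    cloud_controller_ng back.
    /var/vcap/bosh/bin/monit monitor ccng_monit_http_healthcheck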
From f22e7e092eda7172ec7cd8afff2c29b68750525a Mon Sep 17 00:00:00 2001
From: Philipp Thun
Date: Fri, 30 Apr 2021 14:37:49 +0200
Subject: [PATCH 2/4] Remove obsolete waiting for cc, fix exit code handling

- The wait_for_server_to_become_healthy call is no longer needed, as
  the health check is now started after cc and nginx.

- The printed status code was always 0 - it was the exit code of the
  'if' statement, not of the 'curl' command.

Co-authored-by: Andy Paine
---
 .../ccng_monit_http_healthcheck.sh.erb | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/jobs/cloud_controller_ng/templates/ccng_monit_http_healthcheck.sh.erb b/jobs/cloud_controller_ng/templates/ccng_monit_http_healthcheck.sh.erb
index 8b37340693..0ea53fc608 100644
--- a/jobs/cloud_controller_ng/templates/ccng_monit_http_healthcheck.sh.erb
+++ b/jobs/cloud_controller_ng/templates/ccng_monit_http_healthcheck.sh.erb
@@ -30,28 +30,24 @@ PORT=<%= p("cc.public_tls.port") %>
 PROTOCOL="https"
 URL="https://${HOST}:${PORT}/healthz"

-source /var/vcap/packages/capi_utils/monit_utils.sh
-
-echo $(date --rfc-3339=ns) 'Waiting for Cloud Controller to initially become healthy at'
-
-wait_for_server_to_become_healthy "${URL}" "<%= p("cc.api_post_start_healthcheck_timeout_in_seconds") %>"
-
-echo $(date --rfc-3339=ns) 'Initial check passed, will now restart CC over on repeated failures'
+echo $(date --rfc-3339=ns) 'Will restart CC on repeated failures'

 trap log_failure EXIT

 # if we fail to curl it 5 times in a row across 50 seconds, die so monit will restart us
-set -e
 while true; do
-  if ! curl \
+  set +e
+  curl \
     -sS \
     --max-time <%= p('cc.api_health_check_timeout_per_retry') %> \
     --retry 5 \
     --retry-delay 10 \
     -A "ccng_monit_http_healthcheck" \
     -k \
-    "${URL}" > /dev/null ; then
-    status=$?
+    "${URL}" > /dev/null
+  status=$?
+  set -e
+  if [[ $status > 0 ]] ; then
     echo $(date --rfc-3339=ns) "ccng_monit_http_healthcheck failed to curl <${URL}>: exit code $status"
     exit $status
   fi
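Reviewer note, not part of the patch: the second bullet in the commit message above is subtle. Inside an 'if ! command; then' branch, $? holds the exit status of the negated test that just succeeded, not of the failing command, which is why the logged status code was always 0. A minimal standalone bash illustration:

    #!/usr/bin/env bash

    # Old pattern: the 'if' condition is '! false', which succeeds, so $?
    # inside the branch is 0 and the real exit code is lost.
    if ! false; then
      status=$?
      echo "inside if: status=$status"        # prints 0
    fi

    # New pattern (essentially what the patch does, plus toggling 'set -e'):
    # run the command on its own and capture $? immediately.
    false
    status=$?
    echo "captured directly: status=$status"  # prints 1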
From 2bb10a560d50c88e4490ceabe1c6f6e27b309122 Mon Sep 17 00:00:00 2001
From: Philipp Thun
Date: Fri, 30 Apr 2021 16:26:54 +0200
Subject: [PATCH 3/4] Add (optional) properties to configure the ccng_monit_http_healthcheck

Instead of re-using config options from the route registration health
check, add dedicated options to configure timeout and retries for the
ccng_monit_http_healthcheck process.

There are situations when cloud controller is busy processing multiple
long-running requests. Although it makes sense to temporarily
deregister its route to prevent additional load on the instance, the
(internal) health check should not restart the instance at the same
time, but only when it stays busy for too long.
---
 jobs/cloud_controller_ng/spec                |  8 +++++
 .../ccng_monit_http_healthcheck.sh.erb       |  8 +++--
 spec/cloud_controller_ng/healthcheck_spec.rb | 31 +++++++++++++++++++
 3 files changed, 44 insertions(+), 3 deletions(-)
 create mode 100644 spec/cloud_controller_ng/healthcheck_spec.rb

diff --git a/jobs/cloud_controller_ng/spec b/jobs/cloud_controller_ng/spec
index bccdf03aaa..ac27e452ca 100644
--- a/jobs/cloud_controller_ng/spec
+++ b/jobs/cloud_controller_ng/spec
@@ -355,6 +355,14 @@ properties:
     default: 6
     description: "Maximum health check timeout (in seconds). Health checks will be retried until this time limit is reached. This should be less than or equal to your route_registrar.routes.api.health_check.timeout"

+  cc.ccng_monit_http_healthcheck_retries:
+    default: 5
+    description: "Number of retries performed by the ccng_monit_http_healthcheck process"
+
+  cc.ccng_monit_http_healthcheck_timeout_per_retry:
+    default: 2
+    description: "Timeout (in seconds) for each HTTP request sent by the ccng_monit_http_healthcheck process"
+
   cc.jobs.global.timeout_in_seconds:
     description: "The longest any job can take before it is cancelled unless overridden per job"
     default: 14400 # 4 hours
diff --git a/jobs/cloud_controller_ng/templates/ccng_monit_http_healthcheck.sh.erb b/jobs/cloud_controller_ng/templates/ccng_monit_http_healthcheck.sh.erb
index 0ea53fc608..a44a5d52ad 100644
--- a/jobs/cloud_controller_ng/templates/ccng_monit_http_healthcheck.sh.erb
+++ b/jobs/cloud_controller_ng/templates/ccng_monit_http_healthcheck.sh.erb
@@ -34,13 +34,15 @@ echo $(date --rfc-3339=ns) 'Will restart CC on repeated failures'

 trap log_failure EXIT

-# if we fail to curl it 5 times in a row across 50 seconds, die so monit will restart us
+# If we fail to curl the healthz endpoint 5 times (can be changed with cc.ccng_monit_http_healthcheck_retries) with
+# a delay of 10 seconds between each retry, exit in order to trigger a restart of cloud controller through monit.
+# Each curl has an individual timeout of 2 seconds (can be changed with cc.ccng_monit_http_healthcheck_timeout_per_retry).
 while true; do
   set +e
   curl \
     -sS \
-    --max-time <%= p('cc.api_health_check_timeout_per_retry') %> \
-    --retry 5 \
+    --max-time <%= p('cc.ccng_monit_http_healthcheck_timeout_per_retry') %> \
+    --retry <%= p('cc.ccng_monit_http_healthcheck_retries') %> \
     --retry-delay 10 \
     -A "ccng_monit_http_healthcheck" \
     -k \
diff --git a/spec/cloud_controller_ng/healthcheck_spec.rb b/spec/cloud_controller_ng/healthcheck_spec.rb
new file mode 100644
index 0000000000..a98f6a7ebf
--- /dev/null
+++ b/spec/cloud_controller_ng/healthcheck_spec.rb
@@ -0,0 +1,31 @@
+# frozen_string_literal: true
+
+require 'rspec'
+require 'bosh/template/test'
+
+module Bosh::Template::Test
+  describe 'health check template rendering' do
+    let(:release_path) { File.join(File.dirname(__FILE__), '../..') }
+    let(:release) { ReleaseDir.new(release_path) }
+    let(:job) { release.job('cloud_controller_ng') }
+
+    describe 'bin/ccng_monit_http_healthcheck' do
+      let(:template) { job.template('bin/ccng_monit_http_healthcheck') }
+
+      it 'renders the default value' do
+        rendered_file = template.render({}, consumes: {})
+        expect(rendered_file).to include('--max-time 2')
+        expect(rendered_file).to include('--retry 5')
+      end
+
+      context 'when custom values are provided' do
+        it 'renders the provided values' do
+          rendered_file = template.render({ 'cc' => { 'ccng_monit_http_healthcheck_timeout_per_retry' => 30,
+                                                      'ccng_monit_http_healthcheck_retries' => 2 } }, consumes: {})
+          expect(rendered_file).to include('--max-time 30')
+          expect(rendered_file).to include('--retry 2')
+        end
+      end
+    end
+  end
+end
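Reviewer note, not part of the patch: the new spec above relies on the bosh-template test helpers, so it should be runnable on its own from a capi-release checkout, roughly as shown below. The exact Gemfile/bundler setup is assumed here and is not defined by this patch.

    # Assumes a Gemfile providing rspec and bosh-template, as template specs
    # in BOSH releases typically do.
    cd capi-release
    bundle install
    bundle exec rspec spec/cloud_controller_ng/healthcheck_spec.rb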
From dd4128b6653b8223519a3c9458f74ca9de57e60b Mon Sep 17 00:00:00 2001
From: Philipp Thun
Date: Mon, 3 May 2021 15:00:47 +0200
Subject: [PATCH 4/4] Improve the drain scripts

- bin/shutdown_drain does not need to echo '0'; this is already done by
  bin/drain
  (https://github.com/cloudfoundry/capi-release/blob/2b52d846f82c7dbe893ffc21f879c15a3fa1da36/jobs/cloud_controller_ng/templates/drain.sh.erb#L9).

- As the bin/shutdown_drain script is never invoked with arguments,
  logging the invocation is superfluous.
---
 .../templates/cloud_controller_api_health_check.erb      | 1 -
 jobs/cloud_controller_ng/templates/shutdown_drain.rb.erb | 3 ---
 2 files changed, 4 deletions(-)

diff --git a/jobs/cloud_controller_ng/templates/cloud_controller_api_health_check.erb b/jobs/cloud_controller_ng/templates/cloud_controller_api_health_check.erb
index 88c0e78d5c..6a5c716dd8 100644
--- a/jobs/cloud_controller_ng/templates/cloud_controller_api_health_check.erb
+++ b/jobs/cloud_controller_ng/templates/cloud_controller_api_health_check.erb
@@ -18,4 +18,3 @@ if [[ $status > 0 ]] ; then
   echo $(date --rfc-3339=ns) "Failed to hit ${URL}"
   exit $status
 fi
-
diff --git a/jobs/cloud_controller_ng/templates/shutdown_drain.rb.erb b/jobs/cloud_controller_ng/templates/shutdown_drain.rb.erb
index f3766b32dc..72e9278335 100644
--- a/jobs/cloud_controller_ng/templates/shutdown_drain.rb.erb
+++ b/jobs/cloud_controller_ng/templates/shutdown_drain.rb.erb
@@ -6,8 +6,5 @@ $LOAD_PATH.unshift('/var/vcap/packages/cloud_controller_ng/cloud_controller_ng/l
 require 'cloud_controller/drain'

 @drain = VCAP::CloudController::Drain.new('/var/vcap/sys/log/cloud_controller_ng')
-@drain.log_invocation(ARGV)
 @drain.shutdown_nginx('/var/vcap/sys/run/bpm/cloud_controller_ng/nginx.pid', <%= p("cc.nginx_drain_timeout") %>)
 @drain.shutdown_cc('/var/vcap/sys/run/bpm/cloud_controller_ng/cloud_controller_ng.pid')
-
-puts 0 # tell bosh the drain script succeeded
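Reviewer note, not part of the patch: background for the first bullet above. BOSH expects a drain script to print an integer to stdout (0 meaning the job is drained and no further wait is needed), and the linked bin/drain wrapper is the place that prints it, so shutdown_drain itself no longer has to. Roughly, the wrapper has the shape sketched below; this is paraphrased from the commit message and the linked drain.sh.erb, not copied from the repository.

    #!/usr/bin/env bash
    # Paraphrased sketch of jobs/cloud_controller_ng/templates/drain.sh.erb.
    # It delegates the actual work to shutdown_drain and is the single place
    # that reports success to BOSH.

    /var/vcap/jobs/cloud_controller_ng/bin/shutdown_drain 1>&2  # keep stdout clean for the drain protocol

    echo 0  # tell BOSH the drain finished; no additional wait required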