From bb4cdf81b80475b960750f77c78c6c9110a336a0 Mon Sep 17 00:00:00 2001 From: Andrew Kryczka Date: Tue, 7 May 2019 22:19:56 -0700 Subject: [PATCH] roachprod: remove monitor netcat command `roachprod monitor` assumes `nc` will exit as soon as the Cockroach server exits. This is not actually the case in newer versions of netcat (tested on Ubuntu 18.04+). This PR switches to a polling approach that calls `kill -0` once per second to monitor the Cockroach server's liveness. This should give us better portability, and we verified that the overhead is low (~0.65ms of a CPU core's time per `kill` invocation). Tested by running `roachprod monitor` locally, gradually killing the nodes, and observing the output: ``` 3: 28342 1: 28176 2: 28257 3: kill exited nonzero 3: dead 2: kill exited nonzero 2: dead 1: kill exited nonzero 1: dead ``` Fixes #37370. Release note: None --- pkg/cmd/roachprod/install/cluster_synced.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pkg/cmd/roachprod/install/cluster_synced.go b/pkg/cmd/roachprod/install/cluster_synced.go index a0c930dac064..7a4f38e31580 100644 --- a/pkg/cmd/roachprod/install/cluster_synced.go +++ b/pkg/cmd/roachprod/install/cluster_synced.go @@ -368,8 +368,10 @@ while :; do exit 0 {{- end}} if [ -n "${lastpid}" ]; then - nc localhost {{.Port}} >/dev/null 2>&1 - echo nc exited + while kill -0 "${lastpid}"; do + sleep 1 + done + echo "kill exited nonzero" else sleep 1 fi