Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

roachprod: show process exit status with monitor #66414

Merged
merged 2 commits into from
Jun 14, 2021
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 29 additions & 32 deletions pkg/cmd/roachprod/install/cluster_synced.go
Original file line number Diff line number Diff line change
Expand Up @@ -343,53 +343,60 @@ func (c *SyncedCluster) Monitor(ignoreEmptyNodes bool, oneShot bool) chan NodeMo
return
}

// On each monitored node, we loop looking for a cockroach process. In
// order to avoid polling with lsof, if we find a live process we use nc
// (netcat) to connect to the rpc port which will block until the server
// either decides to kill the connection or the process is killed.
// In one-shot we don't use nc and return after the first assessment
// of the process' health.
// On each monitored node, we loop looking for a cockroach process.
data := struct {
OneShot bool
IgnoreEmpty bool
Store string
Port int
Local bool
}{
OneShot: oneShot,
IgnoreEmpty: ignoreEmptyNodes,
Store: Cockroach{}.NodeDir(c, nodes[i], 1 /* storeIndex */),
Port: Cockroach{}.NodePort(c, nodes[i]),
Local: c.IsLocal(),
}

snippet := `
lastpid=0
{{ if .IgnoreEmpty}}
{{ if .IgnoreEmpty }}
if [ ! -f "{{.Store}}/CURRENT" ]; then
echo "skipped"
exit 0
fi
{{- end}}

lastpid=""
while :; do
{{ if .Local }}
pid=$(lsof -i :{{.Port}} -sTCP:LISTEN | awk '!/COMMAND/ {print $2}')
pid=${pid:-0} # default to 0
status=unknown
{{- else }}
pid=$(systemctl show cockroach --property MainPID --value)
status=$(systemctl show cockroach --property ExecMainStatus --value)
{{- end }}

if [ "${pid}" != "${lastpid}" ]; then
if [ -n "${lastpid}" -a -z "${pid}" ]; then
echo dead
# Output a dead event on every PID change, except if initial PID is live.
if [[ ! ("${lastpid}" == "" && "${pid}" != 0) ]]; then
echo "dead (exit status ${status})"
fi
if [ "${pid}" != 0 ]; then
echo "${pid}"
fi
lastpid=${pid}
if [ -n "${pid}" ]; then
echo ${pid}
fi
fi
{{if .OneShot }}

{{ if .OneShot }}
exit 0
{{- end}}
if [ -n "${lastpid}" ]; then
while kill -0 "${lastpid}"; do
{{- end }}

sleep 1
if [ "${pid}" != 0 ]; then
while kill -0 "${pid}"; do
sleep 1
done
echo "kill exited nonzero"
else
sleep 1
fi
done
`
Expand All @@ -401,22 +408,12 @@ done
return
}

// Request a PTY so that the script will receive a SIGPIPE
// when the session is closed.
// Request a PTY so that the script will receive a SIGPIPE when the
// session is closed.
if err := sess.RequestPty(); err != nil {
ch <- NodeMonitorInfo{Index: nodes[i], Err: err}
return
}
// Give the session a valid stdin pipe so that nc won't exit immediately.
// When nc does exit, we write to stdout, which has a side effect of
// checking whether the stdout pipe has broken. This allows us to detect
// when the roachprod process is killed.
inPipe, err := sess.StdinPipe()
if err != nil {
ch <- NodeMonitorInfo{Index: nodes[i], Err: err}
return
}
defer inPipe.Close()

var readerWg sync.WaitGroup
readerWg.Add(1)
Expand Down