Skip to content

Commit

Permalink
cmd/roachprod: automatically configure core dumps
Browse files Browse the repository at this point in the history
Automatically configure roachprod machines to generate core dumps, and
specify `GOTRACEBACK=crash` when running cockroach and other binaries so
that the Go runtime generates a core dump when panicking.

Fixes cockroachdb#34680

Release note: None
  • Loading branch information
petermattis committed Mar 1, 2019
1 parent ce3d213 commit 4102943
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 3 deletions.
2 changes: 1 addition & 1 deletion pkg/cmd/roachprod/install/cluster_synced.go
Original file line number Diff line number Diff line change
Expand Up @@ -386,7 +386,7 @@ func (c *SyncedCluster) Run(stdout, stderr io.Writer, nodes []int, title, cmd st
//
// That command should return immediately. And a "roachprod status" should
// reveal that the sleep command is running on the cluster.
nodeCmd := fmt.Sprintf(`export ROACHPROD=%d%s && bash -c %s`,
nodeCmd := fmt.Sprintf(`export ROACHPROD=%d%s GOTRACEBACK=crash && bash -c %s`,
nodes[i], c.Tag, ssh.Escape1(expandedCmd))
if c.IsLocal() {
nodeCmd = fmt.Sprintf("cd ${HOME}/local/%d ; %s", nodes[i], nodeCmd)
Expand Down
3 changes: 2 additions & 1 deletion pkg/cmd/roachprod/install/cockroach.go
Original file line number Diff line number Diff line change
Expand Up @@ -380,14 +380,15 @@ tar cvf certs.tar certs
// NB: this is awkward as when the process fails, the test runner will show an
// unhelpful empty error (since everything has been redirected away). This is
// unfortunately equally awkward to address.
cmd := "mkdir -p " + logDir + "; "
cmd := "ulimit -c unlimited; mkdir -p " + logDir + "; "
// TODO(peter): The ps and lslocks stuff is intended to debug why killing
// of a cockroach process sometimes doesn't release file locks immediately.
cmd += `echo ">>> roachprod start: $(date)" >> ` + logDir + "/roachprod.log; " +
`ps axeww -o pid -o command >> ` + logDir + "/roachprod.log; " +
`[ -x /usr/bin/lslocks ] && /usr/bin/lslocks >> ` + logDir + "/roachprod.log; "
cmd += keyCmd +
fmt.Sprintf(" export ROACHPROD=%d%s && ", nodes[i], c.Tag) +
"GOTRACEBACK=crash " +
"COCKROACH_SKIP_ENABLING_DIAGNOSTIC_REPORTING=1 " +
c.Env + " " + binary + " start " + strings.Join(args, " ") +
" >> " + logDir + "/cockroach.stdout 2>> " + logDir + "/cockroach.stderr" +
Expand Down
19 changes: 19 additions & 0 deletions pkg/cmd/roachprod/vm/aws/support.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,25 @@ sudo chronyc -a waitsync 30 0.01 | sudo tee -a /root/chrony.log
# root and non-root users. Load generators running a lot of concurrent
# workers bump into this often.
sudo sh -c 'echo "root - nofile 65536\n* - nofile 65536" > /etc/security/limits.d/10-roachprod-nofiles.conf'
# Enable core dumps
cat <<EOF > /etc/security/limits.d/core_unlimited.conf
* soft core unlimited
* hard core unlimited
root soft core unlimited
root hard core unlimited
EOF
mkdir -p /tmp/cores
chmod a+w /tmp/cores
CORE_PATTERN="/tmp/cores/core.%e.%p.%h.%t"
echo "$CORE_PATTERN" > /proc/sys/kernel/core_pattern
sed -i'~' 's/enabled=1/enabled=0/' /etc/default/apport
sed -i'~' '/.*kernel\\.core_pattern.*/c\\' /etc/sysctl.conf
echo "kernel.core_pattern=$CORE_PATTERN" >> /etc/sysctl.conf
sysctl --system # reload sysctl settings
sudo touch /mnt/data1/.roachprod-initialized
`

Expand Down
20 changes: 19 additions & 1 deletion pkg/cmd/roachprod/vm/gce/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,6 @@ sudo service sshguard stop
# root and non-root users. Load generators running a lot of concurrent
# workers bump into this often.
sudo sh -c 'echo "root - nofile 65536\n* - nofile 65536" > /etc/security/limits.d/10-roachprod-nofiles.conf'
sudo touch /mnt/data1/.roachprod-initialized
# Send TCP keepalives every minute since GCE will terminate idle connections
# after 10m. Note that keepalives still need to be requested by the application
Expand All @@ -69,7 +68,26 @@ net.ipv4.tcp_keepalive_time=60
net.ipv4.tcp_keepalive_intvl=60
net.ipv4.tcp_keepalive_probes=5
EOF
# Enable core dumps
cat <<EOF > /etc/security/limits.d/core_unlimited.conf
* soft core unlimited
* hard core unlimited
root soft core unlimited
root hard core unlimited
EOF
mkdir -p /tmp/cores
chmod a+w /tmp/cores
CORE_PATTERN="/tmp/cores/core.%e.%p.%h.%t"
echo "$CORE_PATTERN" > /proc/sys/kernel/core_pattern
sed -i'~' 's/enabled=1/enabled=0/' /etc/default/apport
sed -i'~' '/.*kernel\\.core_pattern.*/c\\' /etc/sysctl.conf
echo "kernel.core_pattern=$CORE_PATTERN" >> /etc/sysctl.conf
sysctl --system # reload sysctl settings
sudo touch /mnt/data1/.roachprod-initialized
`

// writeStartupScript writes the startup script to a temp file.
Expand Down

0 comments on commit 4102943

Please sign in to comment.