-
Notifications
You must be signed in to change notification settings - Fork 3.8k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #15717 from knz/jepsen-refactor
build: refactor the Jepsen test scripts
- Loading branch information
Showing
7 changed files
with
227 additions
and
84 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
# Source this file from one of the other jepsen scripts | ||
|
||
PS4="+($(basename $0)) " | ||
|
||
LOG_DIR="${COCKROACH_PATH}/artifacts" | ||
mkdir -p "${LOG_DIR}" | ||
cd "${LOG_DIR}" | ||
|
||
KEY_NAME="${KEY_NAME-google_compute_engine}" | ||
|
||
SSH_OPTIONS=(-o "ServerAliveInterval=60" -o "StrictHostKeyChecking no" -i "$HOME/.ssh/${KEY_NAME}") | ||
|
||
# Ensure that the terraform config is cancelled if one of the run scripts fails | ||
# or the entire thing is interrupted externally. | ||
# | ||
# destroy: error/signal handler. Tears the cluster down with | ||
# `terraform destroy`, reports the test recorded in currentTestName (if any) | ||
# as failed and finished, then exits with status 1. | ||
# `progress` and `tc` are defined further down in this file; that works | ||
# because they are only looked up when the handler actually runs. | ||
function destroy { | ||
# Keep going even if an individual cleanup step fails. | ||
set +e | ||
progress Destroying cluster... | ||
terraform destroy --var=key_name="${KEY_NAME}" --force || true | ||
|
||
# currentTestName is maintained by tc: non-empty means a test was started | ||
# and not yet finished, so mark it failed and close it. | ||
if test -n "${currentTestName:-}"; then | ||
tc Failed "$currentTestName" | ||
tc Finished "$currentTestName" | ||
fi | ||
exit 1 | ||
} | ||
# Installed for ERR and HUP/INT/TERM only; no EXIT trap is set here, so a | ||
# normal exit does not run destroy. | ||
trap destroy ERR SIGHUP SIGINT SIGTERM | ||
|
||
# tc EVENT NAME | ||
# Prints a TeamCity service message of the form | ||
#   ##teamcity[test<EVENT> name='Jepsen<NAME>'] | ||
# and tracks the running test in the global currentTestName: | ||
# EVENT "Started" records NAME, "Finished" clears it, and any other EVENT | ||
# leaves it untouched. | ||
function tc { | ||
# NOTE(review): "teamcity" is passed as an argument instead of being spelled | ||
# out in the format string, presumably so that the `set -x` trace of this | ||
# command does not itself look like a service message - confirm. | ||
printf "##%s[test%s name='Jepsen%s']\\n" teamcity "$1" "$2" | ||
case $1 in | ||
Started) currentTestName=$2 ;; | ||
Finished) currentTestName= ;; | ||
esac | ||
} | ||
|
||
# progress WORD... | ||
# Prints a TeamCity progress message: ##teamcity[progressMessage '<text>'], | ||
# where <text> is all arguments joined into one string by "$*" (i.e. with the | ||
# first character of IFS as separator). | ||
# NOTE(review): the text is inserted verbatim, with no escaping of characters | ||
# such as ' or ]; the callers visible here only pass plain text - confirm | ||
# that this stays true. | ||
function progress { | ||
printf "##%s[progressMessage '%s']\\n" teamcity "$*" | ||
} | ||
|
||
nemeses=( | ||
# big-skews disabled since they assume an eth0 interface. | ||
#"--nemesis big-skews" | ||
"--nemesis majority-ring" | ||
"--nemesis start-stop-2" | ||
"--nemesis start-kill-2" | ||
#"--nemesis majority-ring --nemesis2 big-skews" | ||
#"--nemesis big-skews --nemesis2 start-kill-2" | ||
"--nemesis majority-ring --nemesis2 start-kill-2" | ||
"--nemesis parts --nemesis2 start-kill-2" | ||
) | ||
|
||
tests=( | ||
"bank" | ||
"comments" | ||
"register" | ||
"monotonic" | ||
"sets" | ||
"sequential" | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
#!/usr/bin/env bash | ||
set -euxo pipefail | ||
COCKROACH_PATH="${GOPATH}/src/github.com/cockroachdb/cockroach" | ||
source "${COCKROACH_PATH}/build/jepsen-common.sh" | ||
|
||
progress Destroying cluster... | ||
terraform destroy --var=key_name="${KEY_NAME}" --force || true |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
#!/usr/bin/env bash | ||
set -euxo pipefail | ||
COCKROACH_PATH="${GOPATH}/src/github.com/cockroachdb/cockroach" | ||
source "${COCKROACH_PATH}/build/jepsen-common.sh" | ||
|
||
tc Started SetupCluster | ||
|
||
progress Preparing files | ||
# Copy the terraform config locally. We keep it in artifacts as well | ||
# as the terraform state file so that when troubleshooting a | ||
# failing test we can reuse exactly the same settings. | ||
cp -a "${COCKROACH_PATH}/cloud/gce/jepsen"/* . | ||
|
||
progress Generating controller SSH keys | ||
rm -f controller.id_rsa controller.id_rsa.pub | ||
ssh-keygen -f controller.id_rsa -N '' | ||
|
||
progress Spinning up the cluster | ||
# A failure here is caught by the trap handler. | ||
terraform apply --var=key_name="${KEY_NAME}" | ||
|
||
tc Finished SetupCluster |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
#!/usr/bin/env bash | ||
set -euxo pipefail | ||
COCKROACH_PATH="${GOPATH}/src/github.com/cockroachdb/cockroach" | ||
source "${COCKROACH_PATH}/build/jepsen-common.sh" | ||
|
||
testName=${1:?test label not specified} | ||
test=${2:?Jepsen test name not specified} | ||
nemesis=${3:?Jepsen nemesis flag(s) not specified} | ||
|
||
tc Started "$testName" | ||
|
||
# The test's log file will go to a sub-dir named after the test. Make it. | ||
artifacts_dir=$(echo "$testName"|tr / _) | ||
mkdir -p "${artifacts_dir}" | ||
|
||
# What is the controller again? | ||
controller=$(terraform output controller-ip) | ||
|
||
# Prepare the command to run the test. | ||
testcmd="cd jepsen/cockroachdb && set -eo pipefail && \ | ||
stdbuf -oL -eL \ | ||
~/lein run test \ | ||
--tarball file:///home/ubuntu/cockroach.tgz \ | ||
--username ubuntu \ | ||
--ssh-private-key ~/.ssh/id_rsa \ | ||
--nodes-file ~/nodes \ | ||
--os ubuntu \ | ||
--time-limit 180 \ | ||
--test-count 1 \ | ||
--test ${test} ${nemesis} \ | ||
2>&1 | stdbuf -oL tee invoke.log" | ||
|
||
exitcode=0 | ||
|
||
# Although we run tests of 3 minutes each, we use a timeout | ||
# much larger than that; this is because Jepsen for some tests | ||
# (e.g. register) runs a potentially long analysis after the test | ||
# itself has completed, before determining whether the test has | ||
# succeeded or not. | ||
# Run the test on the controller and pipe its output into a watcher subshell. | ||
# `set -o pipefail` is enabled at the top of this script, so the `if` takes | ||
# the failure branch when either the timeout/ssh side or the watcher exits | ||
# with a non-zero status. | ||
if timeout 15m ssh "${SSH_OPTIONS[@]}" "ubuntu@${controller}" "${testcmd}" \ | ||
| (set +x; i=1; IFS=' | ||
'; | ||
# The following loop displays a TC message at most once every 10 seconds | ||
# or so, with an excerpt from the jepsen log. | ||
prevsecs=0 | ||
while true; do | ||
# Fail if there is no jepsen logging message within 30 seconds. | ||
# bash's `read -t` returns a status greater than 128 when the timeout | ||
# expires; any other non-zero status is treated below as end of input. | ||
read -t 30 x | ||
status=$? | ||
if [ $status -gt 128 ]; then | ||
progress "Jepsen test was silent for too long, aborting" | ||
# timeout: kill ssh to abort the test. | ||
# NOTE(review): killall matches processes by name, so this is not limited | ||
# to the ssh started above - confirm that is acceptable on the agent. | ||
killall ssh | ||
# Exits the watcher subshell only; the non-zero status fails the pipeline. | ||
exit $status | ||
elif [ $status != 0 ]; then | ||
break | ||
fi | ||
secs=$(date +%s); | ||
if [ $secs -gt $(($prevsecs+10)) ]; then | ||
prevsecs=$secs | ||
echo "... $x ..." | ||
progress "Test still running, $i log lines" | ||
fi | ||
i=$(($i+1)) | ||
done; exit 0); then | ||
|
||
# Test passed. grab just the results file. | ||
progress "Test passed. Grabbing minimal logs..." | ||
scp "${SSH_OPTIONS[@]}" -C -r \ | ||
"ubuntu@${controller}:jepsen/cockroachdb/store/latest/{test.fressian,results.edn,latency-quantiles.png,latency-raw.png,rate.png}" \ | ||
"${artifacts_dir}" | ||
|
||
else | ||
# $? is still the status of the pipeline tested by the `if` above. | ||
progress "Test failed: exit code $?. Grabbing artifacts from controller..." | ||
exitcode=1 | ||
|
||
# Show the last few lines from the Jepsen run into the build log. | ||
ssh "${SSH_OPTIONS[@]}" "ubuntu@${controller}" "tail -n 100 jepsen/cockroachdb/invoke.log" >&2 || echo "Failed to extract the last lines from invoke.log." >&2 | ||
|
||
progress Creating archive from controller output | ||
# Now grab all the artifacts. | ||
# -h causes tar to follow symlinks; needed by the `latest` symlink. | ||
# The archive is streamed over ssh's stdout (-f-) into a local file. | ||
ssh "${SSH_OPTIONS[@]}" "ubuntu@${controller}" "tar -chj --ignore-failed-read -f- jepsen/cockroachdb/store/latest jepsen/cockroachdb/invoke.log" >"${artifacts_dir}"/failure-logs.tbz || echo "Failed to copy the files." >&2 | ||
|
||
progress Resetting latest run for next test | ||
# Reset the link for the next test run. | ||
ssh "${SSH_OPTIONS[@]}" "ubuntu@${controller}" "rm -f jepsen/cockroachdb/store/latest" || echo "Failed to remove the latest alias." >&2 | ||
|
||
tc Failed "$testName" | ||
fi | ||
|
||
tc Finished "$testName" | ||
|
||
# For debugging | ||
echo "##teamcity[publishArtifacts '${LOG_DIR}']" | ||
|
||
exit $exitcode |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
#!/usr/bin/env bash | ||
set -euxo pipefail | ||
COCKROACH_PATH="${GOPATH}/src/github.com/cockroachdb/cockroach" | ||
source "${COCKROACH_PATH}/build/jepsen-common.sh" | ||
|
||
tc SuiteStarted '' | ||
|
||
# Runs every test in tests[] against every nemesis combination in nemeses[], | ||
# one teamcity-jepsen-run-one.sh invocation per pair, grouped into one | ||
# TeamCity suite per test. | ||
# NOTE(review): exitcode is set to 1 when a run fails, but nothing in the | ||
# visible script reads it afterwards (there is no final `exit $exitcode`) - | ||
# confirm whether the failure status was meant to be propagated. | ||
exitcode=0 | ||
for test in "${tests[@]}"; do | ||
# Capitalize the test name (e.g. "bank" becomes "Bank"). | ||
caps=$(echo "${test:0:1}"|tr a-z A-Z)${test:1} | ||
|
||
tc SuiteStarted "$caps" | ||
|
||
for nemesis in "${nemeses[@]}"; do | ||
# Produce a test name. | ||
|
||
# Reduce "--nemesis X --nemesis2 Y" to "X+Y" | ||
# Steps: drop all spaces, strip the leading "--nemesis", then turn every | ||
# remaining "--nemesis2" / "--nemesis" into "+". For example | ||
# "--nemesis majority-ring --nemesis2 start-kill-2" becomes | ||
# "majority-ring+start-kill-2". | ||
nemname=${nemesis// /} | ||
nemname=${nemname#--nemesis} | ||
nemname=${nemname//--nemesis2/+} | ||
nemname=${nemname//--nemesis/+} | ||
|
||
# Generate a proper test name. | ||
testname=$caps/$nemname | ||
|
||
# Testing the command with `if !` keeps a failing run from aborting the | ||
# loop under `set -e`; the remaining combinations still run. | ||
if ! $BASH "${COCKROACH_PATH}/build/teamcity-jepsen-run-one.sh" "$testname" "$test" "$nemesis"; then | ||
exitcode=1 | ||
fi | ||
done | ||
|
||
tc SuiteFinished "$caps" | ||
done | ||
|
||
tc SuiteFinished '' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,90 +1,13 @@ | ||
#!/usr/bin/env bash | ||
set -euxo pipefail | ||
COCKROACH_PATH="${GOPATH}/src/github.com/cockroachdb/cockroach" | ||
source "${COCKROACH_PATH}/build/jepsen-common.sh" | ||
|
||
# This script provisions a Jepsen controller and 5 nodes, and runs tests | ||
# against them. | ||
|
||
COCKROACH_PATH="${GOPATH}/src/github.com/cockroachdb/cockroach" | ||
KEY_NAME="${KEY_NAME-google_compute_engine}" | ||
LOG_DIR="${COCKROACH_PATH}/artifacts" | ||
mkdir -p "${LOG_DIR}" | ||
|
||
cd "${COCKROACH_PATH}/cloud/gce/jepsen" | ||
|
||
# Generate ssh keys for the controller to talk to the workers. | ||
rm -f controller.id_rsa controller.id_rsa.pub | ||
ssh-keygen -f controller.id_rsa -N '' | ||
|
||
function destroy { | ||
set +e | ||
echo "Tearing down cluster..." | ||
terraform destroy --var=key_name="${KEY_NAME}" --force | ||
} | ||
trap destroy EXIT | ||
|
||
# Spin up the cluster. | ||
terraform apply --var=key_name="${KEY_NAME}" | ||
|
||
controller="$(terraform output controller-ip)" | ||
|
||
nemeses=( | ||
# big-skews disabled since they assume an eth0 interface. | ||
#"--nemesis big-skews" | ||
"--nemesis majority-ring" | ||
"--nemesis start-stop-2" | ||
"--nemesis start-kill-2" | ||
#"--nemesis majority-ring --nemesis2 big-skews" | ||
#"--nemesis big-skews --nemesis2 start-kill-2" | ||
"--nemesis majority-ring --nemesis2 start-kill-2" | ||
"--nemesis parts --nemesis2 start-kill-2" | ||
) | ||
|
||
tests=( | ||
"bank" | ||
"comments" | ||
"register" | ||
"monotonic" | ||
"sets" | ||
"sequential" | ||
) | ||
|
||
testcmd_base="cd jepsen/cockroachdb && ~/lein run test --tarball file:///home/ubuntu/cockroach.tgz --username ubuntu --ssh-private-key ~/.ssh/id_rsa --nodes-file ~/nodes --time-limit 180 --test-count 1 --os ubuntu" | ||
|
||
# Don't quit after just one test. | ||
# Can't have -x on when echoing the teamcity status lines or else we'll | ||
# get duplicates. | ||
set +ex | ||
for test in "${tests[@]}"; do | ||
for nemesis in "${nemeses[@]}"; do | ||
# We pipe stdout to /dev/null because it's already recorded by Jepsen | ||
# and placed in the artifacts for us. | ||
testcmd="${testcmd_base} --test ${test} ${nemesis} > /dev/null" | ||
echo "##teamcity[testStarted name='${test} ${nemesis}']" | ||
echo "Testing with args --test ${test} ${nemesis}" | ||
|
||
# Remove spaces from test name to get the artifacts subdirectory | ||
testname=$(echo "${test}${nemesis}" | sed 's/ //g') | ||
artifacts_dir="${LOG_DIR}/${testname}" | ||
mkdir -p "${artifacts_dir}" | ||
|
||
# Run each test over an ssh connection. | ||
# If this begins to time out frequently, let's do this via nohup and poll. | ||
# | ||
# shellcheck disable=SC2029 | ||
if ssh -o "ServerAliveInterval=60" -o "StrictHostKeyChecking no" -i "$HOME/.ssh/${KEY_NAME}" "ubuntu@${controller}" "${testcmd}"; then | ||
# Test passed. grab just the results file. | ||
echo "Test passed. Grabbing minimal logs..." | ||
scp -o "StrictHostKeyChecking no" -ri "$HOME/.ssh/${KEY_NAME}" "ubuntu@${controller}:jepsen/cockroachdb/store/latest/{test.fressian,results.edn,latency-quantiles.png,latency-raw.png,rate.png}" "${artifacts_dir}" | ||
else | ||
# Test failed: grab everything. | ||
echo "Test failed. Grabbing all logs..." | ||
archive_path="jepsen/cockroachdb/store/failure-logs.tgz" | ||
# -h causes tar to follow symlinks; needed by the `latest` symlink. | ||
ssh -o "StrictHostKeyChecking no" -i "$HOME/.ssh/${KEY_NAME}" "ubuntu@${controller}" "tar -chzf ${archive_path} jepsen/cockroachdb/store/latest" | ||
scp -o "StrictHostKeyChecking no" -ri "$HOME/.ssh/${KEY_NAME}" "ubuntu@${controller}:${archive_path}" "${artifacts_dir}" | ||
echo "##teamcity[testFailed name='${test} ${nemesis}']" | ||
fi | ||
echo "##teamcity[testFinished name='${test} ${nemesis}']" | ||
echo "##teamcity[publishArtifacts '${LOG_DIR}']" | ||
done | ||
done | ||
$BASH "${COCKROACH_PATH}/build/teamcity-jepsen-prepare.sh" | ||
$BASH "${COCKROACH_PATH}/build/teamcity-jepsen-run.sh" | ||
$BASH "${COCKROACH_PATH}/build/teamcity-jepsen-cleanup.sh" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters