Skip to content

Commit

Permalink
Merge pull request #15717 from knz/jepsen-refactor
Browse files Browse the repository at this point in the history
build: refactor the Jepsen test scripts
  • Loading branch information
knz authored May 12, 2017
2 parents 546199d + 4664404 commit 089a8b9
Show file tree
Hide file tree
Showing 7 changed files with 227 additions and 84 deletions.
59 changes: 59 additions & 0 deletions build/jepsen-common.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# Shared setup for the Jepsen scripts. Source this file from one of the other
# jepsen scripts.
#
# Reads:   COCKROACH_PATH (must be set by the sourcing script),
#          KEY_NAME (optional override of the SSH key name), HOME.
# Writes:  PS4, LOG_DIR, KEY_NAME, SSH_OPTIONS.
# Effects: creates ${COCKROACH_PATH}/artifacts and makes it the current
#          directory. There is no explicit error handling here; the sourcing
#          scripts run under `set -e`.

# Tag xtrace output with the name of the running script. "$0" is quoted and
# preceded by `--` so a path containing whitespace or starting with '-' is
# handed to basename intact.
PS4="+($(basename -- "$0")) "

LOG_DIR="${COCKROACH_PATH}/artifacts"
mkdir -p "${LOG_DIR}"
cd "${LOG_DIR}"

# `-` (not `:-`): only fall back to the default when KEY_NAME is unset.
KEY_NAME="${KEY_NAME-google_compute_engine}"

# Options shared by every ssh/scp invocation in the jepsen scripts.
SSH_OPTIONS=(-o "ServerAliveInterval=60" -o "StrictHostKeyChecking no" -i "$HOME/.ssh/${KEY_NAME}")

# Tear down the terraform-managed cluster and exit with status 1.
#
# Registered below as the handler for ERR and for external interruptions, so
# that the terraform config is cancelled if one of the run scripts fails or
# the entire thing is interrupted.
#
# Globals: KEY_NAME (read), currentTestName (read; maintained by tc)
destroy() {
  # From here on, keep going even if a cleanup step fails.
  set +e
  progress Destroying cluster...
  terraform destroy --var=key_name="${KEY_NAME}" --force || true

  # A test was still open when we got here: report it as failed and close it.
  if [[ -n "${currentTestName:-}" ]]; then
    tc Failed "$currentTestName"
    tc Finished "$currentTestName"
  fi
  exit 1
}
trap destroy ERR SIGHUP SIGINT SIGTERM

# Escape a string for use as a value inside a TeamCity service message.
# TeamCity uses '|' as the escape character: | ' [ ] must be prefixed with it
# and line breaks are written as |n / |r.
# Arguments: $1 - raw text
# Outputs:   the escaped text on stdout (no trailing newline)
_tc_escape() {
  local text=$1
  local bar='|' quote="'" open='[' close=']'
  # '|' must be handled first, otherwise the escapes added below would be
  # escaped a second time.
  text=${text//"$bar"/$bar$bar}
  text=${text//"$quote"/$bar$quote}
  text=${text//"$open"/$bar$open}
  text=${text//"$close"/$bar$close}
  text=${text//$'\n'/${bar}n}
  text=${text//$'\r'/${bar}r}
  printf '%s' "$text"
}

# Emit a TeamCity test service message and track the currently open test.
# Globals:   currentTestName (written: set on Started, cleared on Finished)
# Arguments: $1 - message suffix appended to "test" (e.g. Started, Failed,
#                 Finished, SuiteStarted, SuiteFinished)
#            $2 - test name; reported with a "Jepsen" prefix
# Outputs:   one service message line on stdout
function tc {
  # "teamcity" is supplied through %s rather than written in the format
  # string; presumably so that the xtrace echo of this command does not itself
  # look like a service message.
  printf "##%s[test%s name='Jepsen%s']\\n" teamcity "$1" "$(_tc_escape "$2")"
  case $1 in
    Started) currentTestName=$2 ;;
    Finished) currentTestName= ;;
  esac
}

# Emit a TeamCity progress message made of all arguments joined by spaces.
# Arguments: $@ - words of the message
# Outputs:   one service message line on stdout
function progress {
  printf "##%s[progressMessage '%s']\\n" teamcity "$(_tc_escape "$*")"
}

# Nemesis flag sets to run each test with; every element is the complete
# flag string for one run.
nemeses=(
  # big-skews disabled since they assume an eth0 interface.
  #"--nemesis big-skews"
  '--nemesis majority-ring'
  '--nemesis start-stop-2'
  '--nemesis start-kill-2'
  #"--nemesis majority-ring --nemesis2 big-skews"
  #"--nemesis big-skews --nemesis2 start-kill-2"
  '--nemesis majority-ring --nemesis2 start-kill-2'
  '--nemesis parts --nemesis2 start-kill-2'
)

# Names of the Jepsen tests to run.
tests=(bank comments register monotonic sets sequential)
7 changes: 7 additions & 0 deletions build/teamcity-jepsen-cleanup.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#!/usr/bin/env bash
# Destroys the terraform-managed Jepsen cluster.
# Requires GOPATH; KEY_NAME and the progress helper come from
# jepsen-common.sh.
set -euxo pipefail
COCKROACH_PATH=${GOPATH}/src/github.com/cockroachdb/cockroach
source "${COCKROACH_PATH}/build/jepsen-common.sh"

progress "Destroying cluster..."
# Best effort: a failing destroy does not fail this script.
terraform destroy "--var=key_name=${KEY_NAME}" --force || true
22 changes: 22 additions & 0 deletions build/teamcity-jepsen-prepare.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/usr/bin/env bash
# Prepares the Jepsen cluster: copies the terraform config, generates the
# controller's SSH key pair and applies the terraform config. The whole step
# is reported to TeamCity as the test "SetupCluster".
# Requires GOPATH; KEY_NAME, tc and progress come from jepsen-common.sh.
set -euxo pipefail
COCKROACH_PATH=${GOPATH}/src/github.com/cockroachdb/cockroach
source "${COCKROACH_PATH}/build/jepsen-common.sh"

tc Started SetupCluster

progress "Preparing files"
# Copy the terraform config locally. It is kept in the artifacts, next to the
# terraform state file, so that when troubleshooting a failing test exactly
# the same settings can be reused.
cp -a "${COCKROACH_PATH}/cloud/gce/jepsen"/* .

progress "Generating controller SSH keys"
# Remove any previous key pair first, then create a new one with an empty
# passphrase.
rm -f controller.id_rsa{,.pub}
ssh-keygen -f controller.id_rsa -N ""

progress "Spinning up the cluster"
# A failure here is caught by the trap handler.
terraform apply "--var=key_name=${KEY_NAME}"

tc Finished SetupCluster
97 changes: 97 additions & 0 deletions build/teamcity-jepsen-run-one.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
#!/usr/bin/env bash
# Runs a single Jepsen test on the controller over ssh, relays progress to
# TeamCity while it runs, and collects the resulting artifacts.
#
# Arguments:
#   $1 - test label: used in the TeamCity messages and (with '/' replaced by
#        '_') as the name of the artifacts sub-directory
#   $2 - Jepsen test name, passed to `--test`
#   $3 - Jepsen nemesis flag(s), appended verbatim to the test command
# Requires GOPATH; SSH_OPTIONS, LOG_DIR, tc and progress come from
# jepsen-common.sh.
# Exit status: 0 if the test passed, 1 otherwise.
set -euxo pipefail
COCKROACH_PATH="${GOPATH}/src/github.com/cockroachdb/cockroach"
source "${COCKROACH_PATH}/build/jepsen-common.sh"

testName=${1:?test label not specified}
test=${2:?Jepsen test name not specified}
nemesis=${3:?Jepsen nemesis flag(s) not specified}

tc Started "$testName"

# The test's log file will go to a sub-dir named after the test. Make it.
artifacts_dir=$(echo "$testName"|tr / _)
mkdir -p "${artifacts_dir}"

# Ask terraform for the address of the controller.
controller=$(terraform output controller-ip)

# Prepare the command to run the test. The continuation lines are deliberately
# not indented so that no extra whitespace ends up inside the command string.
testcmd="cd jepsen/cockroachdb && set -eo pipefail && \
stdbuf -oL -eL \
~/lein run test \
--tarball file:///home/ubuntu/cockroach.tgz \
--username ubuntu \
--ssh-private-key ~/.ssh/id_rsa \
--nodes-file ~/nodes \
--os ubuntu \
--time-limit 180 \
--test-count 1 \
--test ${test} ${nemesis} \
2>&1 | stdbuf -oL tee invoke.log"

exitcode=0

# Although we run tests of 3 minutes each, we use a timeout
# much larger than that; this is because Jepsen for some tests
# (e.g. register) runs a potentially long analysis after the test
# itself has completed, before determining whether the test has
# succeeded or not.
if timeout 15m ssh "${SSH_OPTIONS[@]}" "ubuntu@${controller}" "${testcmd}" \
  | (
    # Turn xtrace off for the loop; it would otherwise trace every line read.
    set +x
    i=1
    # Split on newlines only, so leading/trailing blanks of a log line are
    # kept.
    IFS=$'\n'
    # The following loop displays a TC message every 10 seconds
    # with an excerpt from the jepsen log.
    prevsecs=0
    while true; do
      # Fail if no jepsen logging message within 30 seconds.
      # -r keeps backslashes literal; without it a log line ending in a
      # backslash would be merged with the next one and the count below
      # would be off.
      read -r -t 30 x
      status=$?
      if (( status > 128 )); then
        # read reports a timeout with a status greater than 128.
        progress "Jepsen test was silent for too long, aborting"
        # timeout: kill ssh to abort the test.
        # NOTE: killall matches by name, so this is not limited to the ssh
        # process of this pipeline.
        killall ssh
        exit "$status"
      elif (( status != 0 )); then
        # Any other failure: no more input, the remote command has finished.
        break
      fi
      secs=$(date +%s)
      if (( secs > prevsecs + 10 )); then
        prevsecs=$secs
        echo "... $x ..."
        progress "Test still running, $i log lines"
      fi
      i=$((i + 1))
    done
    exit 0
  ); then

  # Test passed. grab just the results file.
  progress "Test passed. Grabbing minimal logs..."
  scp "${SSH_OPTIONS[@]}" -C -r \
    "ubuntu@${controller}:jepsen/cockroachdb/store/latest/{test.fressian,results.edn,latency-quantiles.png,latency-raw.png,rate.png}" \
    "${artifacts_dir}"

else
  # $? here is still the status of the pipeline tested by `if`.
  progress "Test failed: exit code $?. Grabbing artifacts from controller..."
  exitcode=1

  # Show the last few lines from the Jepsen run into the build log.
  ssh "${SSH_OPTIONS[@]}" "ubuntu@${controller}" "tail -n 100 jepsen/cockroachdb/invoke.log" >&2 || echo "Failed to extract the last lines from invoke.log." >&2

  progress Creating archive from controller output
  # Now grab all the artifacts.
  # -h causes tar to follow symlinks; needed by the `latest` symlink.
  ssh "${SSH_OPTIONS[@]}" "ubuntu@${controller}" "tar -chj --ignore-failed-read -f- jepsen/cockroachdb/store/latest jepsen/cockroachdb/invoke.log" >"${artifacts_dir}"/failure-logs.tbz || echo "Failed to copy the files." >&2

  progress Resetting latest run for next test
  # Reset the link for the next test run.
  ssh "${SSH_OPTIONS[@]}" "ubuntu@${controller}" "rm -f jepsen/cockroachdb/store/latest" || echo "Failed to remove the latest alias." >&2

  tc Failed "$testName"
fi

tc Finished "$testName"

# For debugging: ask TeamCity to publish the artifacts directory.
# As in tc/progress, "teamcity" is supplied through %s so that the xtrace echo
# of this command does not contain a second copy of the service message.
printf "##%s[publishArtifacts '%s']\\n" teamcity "${LOG_DIR}"

exit "$exitcode"
35 changes: 35 additions & 0 deletions build/teamcity-jepsen-run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#!/usr/bin/env bash
# Runs every Jepsen test against every nemesis combination by invoking
# teamcity-jepsen-run-one.sh once per (test, nemesis) pair. Each test is
# wrapped in its own TeamCity suite, inside one outer suite.
#
# Requires GOPATH; the tests and nemeses arrays and tc come from
# jepsen-common.sh.
# Exit status: 0 if every run succeeded, 1 if at least one run failed.
set -euxo pipefail
COCKROACH_PATH="${GOPATH}/src/github.com/cockroachdb/cockroach"
source "${COCKROACH_PATH}/build/jepsen-common.sh"

tc SuiteStarted ''

exitcode=0
for test in "${tests[@]}"; do
  # Capitalize the test name.
  caps=$(echo "${test:0:1}"|tr a-z A-Z)${test:1}

  tc SuiteStarted "$caps"

  for nemesis in "${nemeses[@]}"; do
    # Produce a test name.

    # Reduce "--nemesis X --nemesis2 Y" to "X+Y": drop the spaces, strip the
    # leading "--nemesis", then turn every remaining flag into a "+".
    nemname=${nemesis// /}
    nemname=${nemname#--nemesis}
    nemname=${nemname//--nemesis2/+}
    nemname=${nemname//--nemesis/+}

    # Generate a proper test name.
    testname=$caps/$nemname

    # A failing run is recorded but does not stop the remaining runs.
    if ! "$BASH" "${COCKROACH_PATH}/build/teamcity-jepsen-run-one.sh" "$testname" "$test" "$nemesis"; then
      exitcode=1
    fi
  done

  tc SuiteFinished "$caps"
done

tc SuiteFinished ''

# Propagate the collected result; without this the script would always exit
# with the status of the last tc call and exitcode would be dead.
exit "$exitcode"
87 changes: 5 additions & 82 deletions build/teamcity-jepsen.sh
Original file line number Diff line number Diff line change
@@ -1,90 +1,13 @@
#!/usr/bin/env bash
# NOTE(review): this span looks like a rendered diff of teamcity-jepsen.sh in
# which removed and added lines appear together without +/- markers (for
# example COCKROACH_PATH is assigned twice, and the nemeses/tests arrays and
# the destroy function also exist in jepsen-common.sh, which is sourced
# below) -- confirm which lines belong to the current version before treating
# this as one runnable script.
set -euxo pipefail
COCKROACH_PATH="${GOPATH}/src/github.com/cockroachdb/cockroach"
source "${COCKROACH_PATH}/build/jepsen-common.sh"

# This script provisions a Jepsen controller and 5 nodes, and runs tests
# against them.

COCKROACH_PATH="${GOPATH}/src/github.com/cockroachdb/cockroach"
# Use the caller's KEY_NAME when it is set; otherwise fall back to the default.
KEY_NAME="${KEY_NAME-google_compute_engine}"
LOG_DIR="${COCKROACH_PATH}/artifacts"
mkdir -p "${LOG_DIR}"

# Work from the directory holding the terraform config.
cd "${COCKROACH_PATH}/cloud/gce/jepsen"

# Generate ssh keys for the controller to talk to the workers.
rm -f controller.id_rsa controller.id_rsa.pub
ssh-keygen -f controller.id_rsa -N ''

# Tear the cluster down. Installed as an EXIT trap below, so it runs on every
# exit path of the script; errexit is switched off first so a failing step
# does not abort the handler.
function destroy {
set +e
echo "Tearing down cluster..."
terraform destroy --var=key_name="${KEY_NAME}" --force
}
trap destroy EXIT

# Spin up the cluster.
terraform apply --var=key_name="${KEY_NAME}"

# Address of the controller, as reported by terraform.
controller="$(terraform output controller-ip)"

# Nemesis flag sets; each element is appended verbatim to the test command.
nemeses=(
# big-skews disabled since they assume an eth0 interface.
#"--nemesis big-skews"
"--nemesis majority-ring"
"--nemesis start-stop-2"
"--nemesis start-kill-2"
#"--nemesis majority-ring --nemesis2 big-skews"
#"--nemesis big-skews --nemesis2 start-kill-2"
"--nemesis majority-ring --nemesis2 start-kill-2"
"--nemesis parts --nemesis2 start-kill-2"
)

# Jepsen test names, passed to --test.
tests=(
"bank"
"comments"
"register"
"monotonic"
"sets"
"sequential"
)

# Part of the remote command shared by all runs; the per-run --test and
# nemesis flags are appended inside the loop.
testcmd_base="cd jepsen/cockroachdb && ~/lein run test --tarball file:///home/ubuntu/cockroach.tgz --username ubuntu --ssh-private-key ~/.ssh/id_rsa --nodes-file ~/nodes --time-limit 180 --test-count 1 --os ubuntu"

# Don't quit after just one test.
# Can't have -x on when echoing the teamcity status lines or else we'll
# get duplicates.
set +ex
# Run every test with every nemesis flag set.
for test in "${tests[@]}"; do
for nemesis in "${nemeses[@]}"; do
# We pipe stdout to /dev/null because it's already recorded by Jepsen
# and placed in the artifacts for us.
testcmd="${testcmd_base} --test ${test} ${nemesis} > /dev/null"
echo "##teamcity[testStarted name='${test} ${nemesis}']"
echo "Testing with args --test ${test} ${nemesis}"

# Remove spaces from test name to get the artifacts subdirectory
testname=$(echo "${test}${nemesis}" | sed 's/ //g')
artifacts_dir="${LOG_DIR}/${testname}"
mkdir -p "${artifacts_dir}"

# Run each test over an ssh connection.
# If this begins to time out frequently, let's do this via nohup and poll.
#
# shellcheck disable=SC2029
if ssh -o "ServerAliveInterval=60" -o "StrictHostKeyChecking no" -i "$HOME/.ssh/${KEY_NAME}" "ubuntu@${controller}" "${testcmd}"; then
# Test passed. grab just the results file.
echo "Test passed. Grabbing minimal logs..."
scp -o "StrictHostKeyChecking no" -ri "$HOME/.ssh/${KEY_NAME}" "ubuntu@${controller}:jepsen/cockroachdb/store/latest/{test.fressian,results.edn,latency-quantiles.png,latency-raw.png,rate.png}" "${artifacts_dir}"
else
# Test failed: grab everything.
echo "Test failed. Grabbing all logs..."
# The archive is created on the controller first and then copied over.
archive_path="jepsen/cockroachdb/store/failure-logs.tgz"
# -h causes tar to follow symlinks; needed by the `latest` symlink.
ssh -o "StrictHostKeyChecking no" -i "$HOME/.ssh/${KEY_NAME}" "ubuntu@${controller}" "tar -chzf ${archive_path} jepsen/cockroachdb/store/latest"
scp -o "StrictHostKeyChecking no" -ri "$HOME/.ssh/${KEY_NAME}" "ubuntu@${controller}:${archive_path}" "${artifacts_dir}"
echo "##teamcity[testFailed name='${test} ${nemesis}']"
fi
echo "##teamcity[testFinished name='${test} ${nemesis}']"
echo "##teamcity[publishArtifacts '${LOG_DIR}']"
done
done
# Run the three phases in order, each in its own bash process: prepare the
# cluster, run the tests, clean up.
$BASH "${COCKROACH_PATH}/build/teamcity-jepsen-prepare.sh"
$BASH "${COCKROACH_PATH}/build/teamcity-jepsen-run.sh"
$BASH "${COCKROACH_PATH}/build/teamcity-jepsen-cleanup.sh"
4 changes: 2 additions & 2 deletions cloud/gce/jepsen/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ FILE
"cat /home/ubuntu/nodes | xargs -n1 ssh-keyscan -t rsa >> ~/.ssh/known_hosts",
"curl https://raw.githubusercontent.com/technomancy/leiningen/stable/bin/lein > /home/ubuntu/lein",
"chmod +x /home/ubuntu/lein",
"cd /home/ubuntu && git clone https://github.com/jepsen-io/jepsen",
"cd /home/ubuntu && git clone https://github.com/cockroachdb/jepsen && cd jepsen && git checkout tc-nightly",
]
}
}
Expand Down Expand Up @@ -162,7 +162,7 @@ resource "null_resource" "cockroach-runner" {
"sudo cp ~/.ssh/authorized_keys2 /root/.ssh/authorized_keys2",
# Download latest cockroach binary, zip so that Jepsen understands it
"mkdir -p /tmp/cockroach",
"curl http://s3.amazonaws.com/cockroach/cockroach/cockroach.$(curl http://s3.amazonaws.com/cockroach/cockroach/cockroach.LATEST) -o /tmp/cockroach/cockroach",
"curl -L https://edge-binaries.cockroachdb.com/cockroach/cockroach.linux-gnu-amd64.LATEST -o /tmp/cockroach/cockroach",
"chmod +x /tmp/cockroach/cockroach",
"tar -C /tmp -czf /home/ubuntu/cockroach.tgz cockroach",
]
Expand Down

0 comments on commit 089a8b9

Please sign in to comment.