Skip to content

Commit

Permalink
Refactor automation framework to accept arbitrary scripts after cluste…
Browse files Browse the repository at this point in the history
…r launch
  • Loading branch information
danpaul000 committed Mar 17, 2020
1 parent 7319833 commit 6541003
Show file tree
Hide file tree
Showing 5 changed files with 140 additions and 102 deletions.
62 changes: 61 additions & 1 deletion system-test/automation_utils.sh
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,10 @@ function analyze_packet_loss {

function wait_for_bootstrap_validator_stake_drop {
max_stake="$1"
if [[ $max_stake -eq 100 ]]; then
return
fi

source "${REPO_ROOT}"/net/common.sh
loadConfigFile

Expand All @@ -88,7 +92,63 @@ function get_slot {
# Prints the IP address of the bootstrap validator on stdout.
#
# Globals:  REPO_ROOT (read) - used to locate net/common.sh
# Outputs:  the first element of validatorIpList, followed by a newline
function get_bootstrap_validator_ip_address {
  source "${REPO_ROOT}"/net/common.sh
  # loadConfigFile is provided by net/common.sh; presumably it populates
  # validatorIpList - confirm against that file.
  loadConfigFile
  # Emit the address exactly once, quoted so it is never word-split or globbed;
  # callers capture this output with $(...) and expect a single line.
  echo "${validatorIpList[0]}"
}

# Queries InfluxDB for TPS, confirmation-time and tower-distance statistics
# and appends the parser's output to the results file.
#
# Globals (read): TESTNET_TAG, TEST_DURATION_SECONDS, INFLUX_HOST, REPO_ROOT,
#                 RESULT_FILE
# Calls:          execution_step (defined outside this function), curl, python
# Returns:        status of the final curl | python pipeline
function collect_performance_statistics {
  execution_step "Collect performance statistics about run"

  # Fragments shared by several statements.
  local retention="\"${TESTNET_TAG}\".\"autogen\""
  local recent="WHERE time > now() - ${TEST_DURATION_SECONDS}s"

  # Every statement is assembled line by line; each line is preceded by a
  # newline, so a statement always starts with one.
  local mean_tps max_tps mean_confirm max_confirm p99_confirm max_tower last_tower

  # Inner sub-select used by both TPS statements.
  local -a per_second_tps=(
    'SELECT MEDIAN(sum_count) AS "median_sum" FROM ('
    'SELECT SUM("count") AS "sum_count"'
    "FROM ${retention}.\"bank-process_transactions\""
    "${recent} AND count > 0"
    'GROUP BY time(1s), host_id)'
    'GROUP BY time(1s)'
    ')'
  )
  printf -v mean_tps '\n%s' \
    'SELECT ROUND(MEAN("median_sum")) as "mean_tps" FROM (' \
    "${per_second_tps[@]}"
  printf -v max_tps '\n%s' \
    'SELECT MAX("median_sum") as "max_tps" FROM (' \
    "${per_second_tps[@]}"

  # The three confirmation statements differ only in their SELECT line.
  local -a confirmation_source=(
    "FROM ${retention}.\"validator-confirmation\""
    "${recent}"
  )
  printf -v mean_confirm '\n%s' \
    'SELECT round(mean("duration_ms")) as "mean_confirmation_ms"' \
    "${confirmation_source[@]}"
  printf -v max_confirm '\n%s' \
    'SELECT round(max("duration_ms")) as "max_confirmation_ms"' \
    "${confirmation_source[@]}"
  printf -v p99_confirm '\n%s' \
    'SELECT round(percentile("duration_ms", 99)) as "99th_percentile_confirmation_ms"' \
    "${confirmation_source[@]}"

  # Tower distance is computed as last("slot") - last("root").
  local tower_select='SELECT last("slot") - last("root") as "tower_distance"'
  local tower_from="FROM ${retention}.\"tower-observed\""
  printf -v max_tower '\n%s' \
    'SELECT MAX("tower_distance") as "max_tower_distance" FROM (' \
    "${tower_select}" \
    "${tower_from}" \
    "${recent}" \
    'GROUP BY time(1s), host_id)'
  # This statement carries no time filter.
  printf -v last_tower '\n%s' \
    'SELECT MEAN("tower_distance") as "last_tower_distance" FROM (' \
    "${tower_select}" \
    "${tower_from}" \
    'GROUP BY host_id)'

  # Send all statements in a single request, separated by ';'.
  local statement batch=""
  for statement in \
    "${mean_tps}" "${max_tps}" \
    "${mean_confirm}" "${max_confirm}" "${p99_confirm}" \
    "${max_tower}" "${last_tower}"; do
    batch+="${batch:+;}${statement}"
  done

  curl -G "${INFLUX_HOST}/query?u=ro&p=topsecret" \
    --data-urlencode "db=${TESTNET_TAG}" \
    --data-urlencode "q=${batch}" \
    | python "${REPO_ROOT}/system-test/testnet-automation-json-parser.py" >>"$RESULT_FILE"
}

function upload_results_to_slack() {
Expand Down
5 changes: 0 additions & 5 deletions system-test/stake-operations-testcases/offline_stake_colo.yml

This file was deleted.

18 changes: 18 additions & 0 deletions system-test/stake-operations-testcases/offline_stake_gce.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Pipeline definition for the offline stake operations test on GCE.
# It runs system-test/testnet-automation.sh with TEST_TYPE "script", so that
# script executes the file named by CUSTOM_SCRIPT.
# NOTE(review): looks like a Buildkite pipeline (steps/agents/queue) - confirm.
steps:
- command: "system-test/testnet-automation.sh"
label: "Running Offline Stake Operations Tests"
env:
UPLOAD_RESULTS_TO_SLACK: "true"
CLOUD_PROVIDER: "gce"
TESTNET_TAG: "gce-perf-cpu-only"
ENABLE_GPU: "false"
TEST_DURATION_SECONDS: 30
# Single validator, no client nodes.
NUMBER_OF_VALIDATOR_NODES: 1
VALIDATOR_NODE_MACHINE_TYPE: "--machine-type n1-standard-16"
NUMBER_OF_CLIENT_NODES: 0
ADDITIONAL_FLAGS: ""
# NOTE(review): wait_for_bootstrap_validator_stake_drop returns immediately
# when its argument is 100; presumably this value is what gets passed - confirm.
BOOTSTRAP_VALIDATOR_MAX_STAKE_THRESHOLD: 100
TEST_TYPE: "script"
# Path is joined onto REPO_ROOT by testnet-automation.sh before execution.
CUSTOM_SCRIPT: "system-test/stake-operations-testcases/stake_test_automation.sh"
agents:
- "queue=gce-deploy"
11 changes: 3 additions & 8 deletions system-test/stake-operations-testcases/stake_test_automation.sh
Original file line number Diff line number Diff line change
@@ -1,17 +1,12 @@
#!/usr/bin/env bash

set -e
set -x
set -ex

# shellcheck disable=SC1090
# shellcheck disable=SC1091
source "$(dirname "$0")"/../automation_utils.sh

curl -sSf https://raw.githubusercontent.com/solana-labs/solana/v1.0.5/install/solana-install-init.sh | sh -s - 1.0.5

# Create a single node cluster on colo, then call offline_stake_operations.sh against that cluster
"${REPO_ROOT}"/net/colo.sh delete --reclaim-preemptible-reservations
"${REPO_ROOT}"/net/colo.sh create -n 1 -c 0 -p stake-ops-testnet --dedicated
"${REPO_ROOT}"/net/net.sh start -t edge
# Runs offline stake operations tests against a running cluster launched from the automation framework

bootstrapper_ip_address="$(get_bootstrap_validator_ip_address)"
entrypoint=http://"${bootstrapper_ip_address}":8899
Expand Down
146 changes: 58 additions & 88 deletions system-test/testnet-automation.sh
Original file line number Diff line number Diff line change
Expand Up @@ -147,30 +147,36 @@ function launch_testnet() {
SLOT_COUNT_START_SECONDS=$SECONDS
execution_step "Marking beginning of slot rate test - Slot: $START_SLOT, Seconds: $SLOT_COUNT_START_SECONDS"

if [[ -n $TEST_DURATION_SECONDS ]]; then
execution_step "Wait ${TEST_DURATION_SECONDS} seconds to complete test"
sleep "$TEST_DURATION_SECONDS"
elif [[ "$APPLY_PARTITIONS" = "true" ]]; then
STATS_START_SECONDS=$SECONDS
execution_step "Wait $PARTITION_INACTIVE_DURATION before beginning to apply partitions"
sleep "$PARTITION_INACTIVE_DURATION"
for (( i=1; i<=PARTITION_ITERATION_COUNT; i++ )); do
execution_step "Partition Iteration $i of $PARTITION_ITERATION_COUNT"
execution_step "Applying netem config $NETEM_CONFIG_FILE for $PARTITION_ACTIVE_DURATION seconds"
"${REPO_ROOT}"/net/net.sh netem --config-file "$NETEM_CONFIG_FILE"
sleep "$PARTITION_ACTIVE_DURATION"

execution_step "Resolving partitions for $PARTITION_INACTIVE_DURATION seconds"
"${REPO_ROOT}"/net/net.sh netem --config-file "$NETEM_CONFIG_FILE" --netem-cmd cleanup
case $TEST_TYPE in
fixed_duration)
execution_step "Wait ${TEST_DURATION_SECONDS} seconds to complete test"
sleep "$TEST_DURATION_SECONDS"
;;
partition)
STATS_START_SECONDS=$SECONDS
execution_step "Wait $PARTITION_INACTIVE_DURATION before beginning to apply partitions"
sleep "$PARTITION_INACTIVE_DURATION"
done
STATS_FINISH_SECONDS=$SECONDS
TEST_DURATION_SECONDS=$((STATS_FINISH_SECONDS - STATS_START_SECONDS))
else
# We should never get here
echo Test duration and partition config not defined
exit 1
fi
for (( i=1; i<=PARTITION_ITERATION_COUNT; i++ )); do
execution_step "Partition Iteration $i of $PARTITION_ITERATION_COUNT"
execution_step "Applying netem config $NETEM_CONFIG_FILE for $PARTITION_ACTIVE_DURATION seconds"
"${REPO_ROOT}"/net/net.sh netem --config-file "$NETEM_CONFIG_FILE"
sleep "$PARTITION_ACTIVE_DURATION"

execution_step "Resolving partitions for $PARTITION_INACTIVE_DURATION seconds"
"${REPO_ROOT}"/net/net.sh netem --config-file "$NETEM_CONFIG_FILE" --netem-cmd cleanup
sleep "$PARTITION_INACTIVE_DURATION"
done
STATS_FINISH_SECONDS=$SECONDS
TEST_DURATION_SECONDS=$((STATS_FINISH_SECONDS - STATS_START_SECONDS))
;;
script)
execution_step "Running custom script: ${REPO_ROOT}/${CUSTOM_SCRIPT}"
"$REPO_ROOT"/"$CUSTOM_SCRIPT"
;;
*)
echo "Error: Unsupported test type: $TEST_TYPE"
;;
esac

END_SLOT=$(get_slot)
SLOT_COUNT_END_SECONDS=$SECONDS
Expand All @@ -179,63 +185,10 @@ function launch_testnet() {
SLOTS_PER_SECOND="$(bc <<< "scale=3; ($END_SLOT - $START_SLOT)/($SLOT_COUNT_END_SECONDS - $SLOT_COUNT_START_SECONDS)")"
execution_step "Average slot rate: $SLOTS_PER_SECOND slots/second over $((SLOT_COUNT_END_SECONDS - SLOT_COUNT_START_SECONDS)) seconds"

execution_step "Collect statistics about run"
declare q_mean_tps='
SELECT ROUND(MEAN("median_sum")) as "mean_tps" FROM (
SELECT MEDIAN(sum_count) AS "median_sum" FROM (
SELECT SUM("count") AS "sum_count"
FROM "'$TESTNET_TAG'"."autogen"."bank-process_transactions"
WHERE time > now() - '"$TEST_DURATION_SECONDS"'s AND count > 0
GROUP BY time(1s), host_id)
GROUP BY time(1s)
)'

declare q_max_tps='
SELECT MAX("median_sum") as "max_tps" FROM (
SELECT MEDIAN(sum_count) AS "median_sum" FROM (
SELECT SUM("count") AS "sum_count"
FROM "'$TESTNET_TAG'"."autogen"."bank-process_transactions"
WHERE time > now() - '"$TEST_DURATION_SECONDS"'s AND count > 0
GROUP BY time(1s), host_id)
GROUP BY time(1s)
)'

declare q_mean_confirmation='
SELECT round(mean("duration_ms")) as "mean_confirmation_ms"
FROM "'$TESTNET_TAG'"."autogen"."validator-confirmation"
WHERE time > now() - '"$TEST_DURATION_SECONDS"'s'

declare q_max_confirmation='
SELECT round(max("duration_ms")) as "max_confirmation_ms"
FROM "'$TESTNET_TAG'"."autogen"."validator-confirmation"
WHERE time > now() - '"$TEST_DURATION_SECONDS"'s'

declare q_99th_confirmation='
SELECT round(percentile("duration_ms", 99)) as "99th_percentile_confirmation_ms"
FROM "'$TESTNET_TAG'"."autogen"."validator-confirmation"
WHERE time > now() - '"$TEST_DURATION_SECONDS"'s'

declare q_max_tower_distance_observed='
SELECT MAX("tower_distance") as "max_tower_distance" FROM (
SELECT last("slot") - last("root") as "tower_distance"
FROM "'$TESTNET_TAG'"."autogen"."tower-observed"
WHERE time > now() - '"$TEST_DURATION_SECONDS"'s
GROUP BY time(1s), host_id)'

declare q_last_tower_distance_observed='
SELECT MEAN("tower_distance") as "last_tower_distance" FROM (
SELECT last("slot") - last("root") as "tower_distance"
FROM "'$TESTNET_TAG'"."autogen"."tower-observed"
GROUP BY host_id)'

curl -G "${INFLUX_HOST}/query?u=ro&p=topsecret" \
--data-urlencode "db=${TESTNET_TAG}" \
--data-urlencode "q=$q_mean_tps;$q_max_tps;$q_mean_confirmation;$q_max_confirmation;$q_99th_confirmation;$q_max_tower_distance_observed;$q_last_tower_distance_observed" |
python "${REPO_ROOT}"/system-test/testnet-automation-json-parser.py >>"$RESULT_FILE"
[[ -n $SKIP_PERF_RESULTS ]] || collect_performance_statistics

echo "slots_per_second: $SLOTS_PER_SECOND" >>"$RESULT_FILE"

execution_step "Writing test results to ${RESULT_FILE}"
RESULT_DETAILS=$(<"$RESULT_FILE")
upload-ci-artifact "$RESULT_FILE"
}
Expand Down Expand Up @@ -292,18 +245,35 @@ if [[ "$USE_PUBLIC_IP_ADDRESSES" = "true" ]]; then
maybePublicIpAddresses="-P"
fi

: "${CLIENT_DELAY_START:=0}"

if [[ -z $APPLY_PARTITIONS ]]; then
APPLY_PARTITIONS=false
fi
if [[ "$APPLY_PARTITIONS" = "true" ]]; then
if [[ -n $TEST_DURATION_SECONDS ]]; then
echo Cannot accept TEST_DURATION_SECONDS and a parition looping config
exit 1
execution_step "Checking for required parameters"

# Names of the environment variables that the selected TEST_TYPE requires.
# This has to be an array: the validation loop that follows iterates over
# "${testTypeRequiredParameters[@]}" and dereferences each element with
# ${!name}. Appending with a plain `+=NAME` would concatenate the names into
# one string, which names no real variable and is therefore always reported
# as missing.
testTypeRequiredParameters=()
case "$TEST_TYPE" in
  fixed_duration)
    testTypeRequiredParameters+=(TEST_DURATION_SECONDS)
    ;;
  partition)
    testTypeRequiredParameters+=(
      NETEM_CONFIG_FILE
      PARTITION_ACTIVE_DURATION
      PARTITION_INACTIVE_DURATION
      PARTITION_ITERATION_COUNT
    )
    ;;
  script)
    testTypeRequiredParameters+=(CUSTOM_SCRIPT)
    ;;
  *)
    echo "Error: Unsupported test type: $TEST_TYPE"
    # Stop here: with no required parameters listed, the validation below
    # would pass and the run would continue with an unknown test type.
    exit 1
    ;;
esac

missingParameters=
for i in "${testTypeRequiredParameters[@]}"; do
if [[ -z ${!i} ]]; then
missingParameters+="${i} "
fi
elif [[ -z $TEST_DURATION_SECONDS ]]; then
echo TEST_DURATION_SECONDS not defined
done

if [[ -n $missingParameters ]]; then
echo "Error: For test type $TEST_TYPE, the following required parameters are missing: ${missingParameters[*]}"
exit 1
fi

Expand Down

0 comments on commit 6541003

Please sign in to comment.