Select the testnet-automation flow with TEST_TYPE (fixed_duration, partition
or script), move the InfluxDB statistics queries out of launch_testnet into
collect_performance_statistics in automation_utils.sh, and run the offline
stake operations test through the automation framework on GCE instead of a
hand-built colo cluster.

Review notes for this revision of the patch:
- Required and missing parameter names are held in bash arrays and appended
  with `+=( ... )`. A plain `name+=VALUE` concatenates strings, so the four
  partition parameters became one bogus variable name whose indirect
  expansion `${!i}` is always empty; every partition run was rejected.
- An unsupported TEST_TYPE exits 1 in both case statements, matching the
  removed if/elif/else which exited instead of printing and continuing.
- CUSTOM_SCRIPT is quoted where it is executed.
- The testnet-automation.sh hunk headers account for the two added `exit 1`
  lines; the blob ids on its index line describe the patch before that change.
- Each diff line is back on its own physical line; indentation and blank
  context lines were reconstructed and should be checked against the tree.

diff --git a/system-test/automation_utils.sh b/system-test/automation_utils.sh
index 213c09887f856a..581d242631ab40 100755
--- a/system-test/automation_utils.sh
+++ b/system-test/automation_utils.sh
@@ -62,6 +62,10 @@ function analyze_packet_loss {
 
 function wait_for_bootstrap_validator_stake_drop {
   max_stake="$1"
+  if [[ $max_stake -eq 100 ]]; then
+    return
+  fi
+
   source "${REPO_ROOT}"/net/common.sh
   loadConfigFile
 
@@ -88,7 +92,63 @@ function get_slot {
 function get_bootstrap_validator_ip_address {
   source "${REPO_ROOT}"/net/common.sh
   loadConfigFile
-  echo ${validatorIpList[0]}
+  echo "${validatorIpList[0]}"
+}
+
+function collect_performance_statistics {
+  execution_step "Collect performance statistics about run"
+  declare q_mean_tps='
+    SELECT ROUND(MEAN("median_sum")) as "mean_tps" FROM (
+      SELECT MEDIAN(sum_count) AS "median_sum" FROM (
+        SELECT SUM("count") AS "sum_count"
+          FROM "'$TESTNET_TAG'"."autogen"."bank-process_transactions"
+          WHERE time > now() - '"$TEST_DURATION_SECONDS"'s AND count > 0
+          GROUP BY time(1s), host_id)
+      GROUP BY time(1s)
+    )'
+
+  declare q_max_tps='
+    SELECT MAX("median_sum") as "max_tps" FROM (
+      SELECT MEDIAN(sum_count) AS "median_sum" FROM (
+        SELECT SUM("count") AS "sum_count"
+          FROM "'$TESTNET_TAG'"."autogen"."bank-process_transactions"
+          WHERE time > now() - '"$TEST_DURATION_SECONDS"'s AND count > 0
+          GROUP BY time(1s), host_id)
+      GROUP BY time(1s)
+    )'
+
+  declare q_mean_confirmation='
+    SELECT round(mean("duration_ms")) as "mean_confirmation_ms"
+      FROM "'$TESTNET_TAG'"."autogen"."validator-confirmation"
+      WHERE time > now() - '"$TEST_DURATION_SECONDS"'s'
+
+  declare q_max_confirmation='
+    SELECT round(max("duration_ms")) as "max_confirmation_ms"
+      FROM "'$TESTNET_TAG'"."autogen"."validator-confirmation"
+      WHERE time > now() - '"$TEST_DURATION_SECONDS"'s'
+
+  declare q_99th_confirmation='
+    SELECT round(percentile("duration_ms", 99)) as "99th_percentile_confirmation_ms"
+      FROM "'$TESTNET_TAG'"."autogen"."validator-confirmation"
+      WHERE time > now() - '"$TEST_DURATION_SECONDS"'s'
+
+  declare q_max_tower_distance_observed='
+    SELECT MAX("tower_distance") as "max_tower_distance" FROM (
+      SELECT last("slot") - last("root") as "tower_distance"
+        FROM "'$TESTNET_TAG'"."autogen"."tower-observed"
+        WHERE time > now() - '"$TEST_DURATION_SECONDS"'s
+        GROUP BY time(1s), host_id)'
+
+  declare q_last_tower_distance_observed='
+    SELECT MEAN("tower_distance") as "last_tower_distance" FROM (
+      SELECT last("slot") - last("root") as "tower_distance"
+        FROM "'$TESTNET_TAG'"."autogen"."tower-observed"
+        GROUP BY host_id)'
+
+  curl -G "${INFLUX_HOST}/query?u=ro&p=topsecret" \
+    --data-urlencode "db=${TESTNET_TAG}" \
+    --data-urlencode "q=$q_mean_tps;$q_max_tps;$q_mean_confirmation;$q_max_confirmation;$q_99th_confirmation;$q_max_tower_distance_observed;$q_last_tower_distance_observed" |
+    python "${REPO_ROOT}"/system-test/testnet-automation-json-parser.py >>"$RESULT_FILE"
 }
 
 function upload_results_to_slack() {
diff --git a/system-test/stake-operations-testcases/offline_stake_colo.yml b/system-test/stake-operations-testcases/offline_stake_colo.yml
deleted file mode 100755
index e0a5a3ee9140ed..00000000000000
--- a/system-test/stake-operations-testcases/offline_stake_colo.yml
+++ /dev/null
@@ -1,5 +0,0 @@
-steps:
-  - command: "system-test/stake-operations-testcases/stake_test_automation.sh"
-    label: "Running Offline Stake Operations Tests"
-    agents:
-      - "queue=colo-deploy"
diff --git a/system-test/stake-operations-testcases/offline_stake_gce.yml b/system-test/stake-operations-testcases/offline_stake_gce.yml
new file mode 100755
index 00000000000000..3b38ddb085e9d7
--- /dev/null
+++ b/system-test/stake-operations-testcases/offline_stake_gce.yml
@@ -0,0 +1,18 @@
+steps:
+  - command: "system-test/testnet-automation.sh"
+    label: "Running Offline Stake Operations Tests"
+    env:
+      UPLOAD_RESULTS_TO_SLACK: "true"
+      CLOUD_PROVIDER: "gce"
+      TESTNET_TAG: "gce-perf-cpu-only"
+      ENABLE_GPU: "false"
+      TEST_DURATION_SECONDS: 30
+      NUMBER_OF_VALIDATOR_NODES: 1
+      VALIDATOR_NODE_MACHINE_TYPE: "--machine-type n1-standard-16"
+      NUMBER_OF_CLIENT_NODES: 0
+      ADDITIONAL_FLAGS: ""
+      BOOTSTRAP_VALIDATOR_MAX_STAKE_THRESHOLD: 100
+      TEST_TYPE: "script"
+      CUSTOM_SCRIPT: "system-test/stake-operations-testcases/stake_test_automation.sh"
+    agents:
+      - "queue=gce-deploy"
diff --git a/system-test/stake-operations-testcases/stake_test_automation.sh b/system-test/stake-operations-testcases/stake_test_automation.sh
index ba00d813688de4..1f21376b6d9ae9 100755
--- a/system-test/stake-operations-testcases/stake_test_automation.sh
+++ b/system-test/stake-operations-testcases/stake_test_automation.sh
@@ -1,17 +1,12 @@
 #!/usr/bin/env bash
 
-set -e
-set -x
+set -ex
 
+# shellcheck disable=SC1090
 # shellcheck disable=SC1091
 source "$(dirname "$0")"/../automation_utils.sh
 
-curl -sSf https://raw.githubusercontent.com/solana-labs/solana/v1.0.5/install/solana-install-init.sh | sh -s - 1.0.5
-
-# Create a single node cluster on colo, then call offline_stake_operations.sh against that cluster
-"${REPO_ROOT}"/net/colo.sh delete --reclaim-preemptible-reservations
-"${REPO_ROOT}"/net/colo.sh create -n 1 -c 0 -p stake-ops-testnet --dedicated
-"${REPO_ROOT}"/net/net.sh start -t edge
+# Runs offline stake operations tests against a running cluster launched from the automation framework
 
 bootstrapper_ip_address="$(get_bootstrap_validator_ip_address)"
 entrypoint=http://"${bootstrapper_ip_address}":8899
diff --git a/system-test/testnet-automation.sh b/system-test/testnet-automation.sh
index abd1b31bab6a14..3c69b4a762590f 100755
--- a/system-test/testnet-automation.sh
+++ b/system-test/testnet-automation.sh
@@ -147,30 +147,36 @@ function launch_testnet() {
   SLOT_COUNT_START_SECONDS=$SECONDS
   execution_step "Marking beginning of slot rate test - Slot: $START_SLOT, Seconds: $SLOT_COUNT_START_SECONDS"
 
-  if [[ -n $TEST_DURATION_SECONDS ]]; then
-    execution_step "Wait ${TEST_DURATION_SECONDS} seconds to complete test"
-    sleep "$TEST_DURATION_SECONDS"
-  elif [[ "$APPLY_PARTITIONS" = "true" ]]; then
-    STATS_START_SECONDS=$SECONDS
-    execution_step "Wait $PARTITION_INACTIVE_DURATION before beginning to apply partitions"
-    sleep "$PARTITION_INACTIVE_DURATION"
-    for (( i=1; i<=PARTITION_ITERATION_COUNT; i++ )); do
-      execution_step "Partition Iteration $i of $PARTITION_ITERATION_COUNT"
-      execution_step "Applying netem config $NETEM_CONFIG_FILE for $PARTITION_ACTIVE_DURATION seconds"
-      "${REPO_ROOT}"/net/net.sh netem --config-file "$NETEM_CONFIG_FILE"
-      sleep "$PARTITION_ACTIVE_DURATION"
-
-      execution_step "Resolving partitions for $PARTITION_INACTIVE_DURATION seconds"
-      "${REPO_ROOT}"/net/net.sh netem --config-file "$NETEM_CONFIG_FILE" --netem-cmd cleanup
+  case $TEST_TYPE in
+    fixed_duration)
+      execution_step "Wait ${TEST_DURATION_SECONDS} seconds to complete test"
+      sleep "$TEST_DURATION_SECONDS"
+      ;;
+    partition)
+      STATS_START_SECONDS=$SECONDS
+      execution_step "Wait $PARTITION_INACTIVE_DURATION before beginning to apply partitions"
       sleep "$PARTITION_INACTIVE_DURATION"
-    done
-    STATS_FINISH_SECONDS=$SECONDS
-    TEST_DURATION_SECONDS=$((STATS_FINISH_SECONDS - STATS_START_SECONDS))
-  else
-    # We should never get here
-    echo Test duration and partition config not defined
-    exit 1
-  fi
+      for (( i=1; i<=PARTITION_ITERATION_COUNT; i++ )); do
+        execution_step "Partition Iteration $i of $PARTITION_ITERATION_COUNT"
+        execution_step "Applying netem config $NETEM_CONFIG_FILE for $PARTITION_ACTIVE_DURATION seconds"
+        "${REPO_ROOT}"/net/net.sh netem --config-file "$NETEM_CONFIG_FILE"
+        sleep "$PARTITION_ACTIVE_DURATION"
+
+        execution_step "Resolving partitions for $PARTITION_INACTIVE_DURATION seconds"
+        "${REPO_ROOT}"/net/net.sh netem --config-file "$NETEM_CONFIG_FILE" --netem-cmd cleanup
+        sleep "$PARTITION_INACTIVE_DURATION"
+      done
+      STATS_FINISH_SECONDS=$SECONDS
+      TEST_DURATION_SECONDS=$((STATS_FINISH_SECONDS - STATS_START_SECONDS))
+      ;;
+    script)
+      "${REPO_ROOT}/${CUSTOM_SCRIPT}"
+      ;;
+    *)
+      echo "Error: Unsupported test type: $TEST_TYPE"
+      exit 1
+      ;;
+  esac
 
   END_SLOT=$(get_slot)
   SLOT_COUNT_END_SECONDS=$SECONDS
@@ -179,63 +185,10 @@ function launch_testnet() {
   SLOTS_PER_SECOND="$(bc <<< "scale=3; ($END_SLOT - $START_SLOT)/($SLOT_COUNT_END_SECONDS - $SLOT_COUNT_START_SECONDS)")"
   execution_step "Average slot rate: $SLOTS_PER_SECOND slots/second over $((SLOT_COUNT_END_SECONDS - SLOT_COUNT_START_SECONDS)) seconds"
 
-  execution_step "Collect statistics about run"
-  declare q_mean_tps='
-    SELECT ROUND(MEAN("median_sum")) as "mean_tps" FROM (
-      SELECT MEDIAN(sum_count) AS "median_sum" FROM (
-        SELECT SUM("count") AS "sum_count"
-          FROM "'$TESTNET_TAG'"."autogen"."bank-process_transactions"
-          WHERE time > now() - '"$TEST_DURATION_SECONDS"'s AND count > 0
-          GROUP BY time(1s), host_id)
-      GROUP BY time(1s)
-    )'
-
-  declare q_max_tps='
-    SELECT MAX("median_sum") as "max_tps" FROM (
-      SELECT MEDIAN(sum_count) AS "median_sum" FROM (
-        SELECT SUM("count") AS "sum_count"
-          FROM "'$TESTNET_TAG'"."autogen"."bank-process_transactions"
-          WHERE time > now() - '"$TEST_DURATION_SECONDS"'s AND count > 0
-          GROUP BY time(1s), host_id)
-      GROUP BY time(1s)
-    )'
-
-  declare q_mean_confirmation='
-    SELECT round(mean("duration_ms")) as "mean_confirmation_ms"
-      FROM "'$TESTNET_TAG'"."autogen"."validator-confirmation"
-      WHERE time > now() - '"$TEST_DURATION_SECONDS"'s'
-
-  declare q_max_confirmation='
-    SELECT round(max("duration_ms")) as "max_confirmation_ms"
-      FROM "'$TESTNET_TAG'"."autogen"."validator-confirmation"
-      WHERE time > now() - '"$TEST_DURATION_SECONDS"'s'
-
-  declare q_99th_confirmation='
-    SELECT round(percentile("duration_ms", 99)) as "99th_percentile_confirmation_ms"
-      FROM "'$TESTNET_TAG'"."autogen"."validator-confirmation"
-      WHERE time > now() - '"$TEST_DURATION_SECONDS"'s'
-
-  declare q_max_tower_distance_observed='
-    SELECT MAX("tower_distance") as "max_tower_distance" FROM (
-      SELECT last("slot") - last("root") as "tower_distance"
-        FROM "'$TESTNET_TAG'"."autogen"."tower-observed"
-        WHERE time > now() - '"$TEST_DURATION_SECONDS"'s
-        GROUP BY time(1s), host_id)'
-
-  declare q_last_tower_distance_observed='
-    SELECT MEAN("tower_distance") as "last_tower_distance" FROM (
-      SELECT last("slot") - last("root") as "tower_distance"
-        FROM "'$TESTNET_TAG'"."autogen"."tower-observed"
-        GROUP BY host_id)'
-
-  curl -G "${INFLUX_HOST}/query?u=ro&p=topsecret" \
-    --data-urlencode "db=${TESTNET_TAG}" \
-    --data-urlencode "q=$q_mean_tps;$q_max_tps;$q_mean_confirmation;$q_max_confirmation;$q_99th_confirmation;$q_max_tower_distance_observed;$q_last_tower_distance_observed" |
-    python "${REPO_ROOT}"/system-test/testnet-automation-json-parser.py >>"$RESULT_FILE"
+  [[ -n $SKIP_PERF_RESULTS ]] || collect_performance_statistics
 
   echo "slots_per_second: $SLOTS_PER_SECOND" >>"$RESULT_FILE"
 
-  execution_step "Writing test results to ${RESULT_FILE}"
   RESULT_DETAILS=$(<"$RESULT_FILE")
   upload-ci-artifact "$RESULT_FILE"
 }
@@ -292,18 +245,36 @@ if [[ "$USE_PUBLIC_IP_ADDRESSES" = "true" ]]; then
   maybePublicIpAddresses="-P"
 fi
 
-: "${CLIENT_DELAY_START:=0}"
-
-if [[ -z $APPLY_PARTITIONS ]]; then
-  APPLY_PARTITIONS=false
-fi
-if [[ "$APPLY_PARTITIONS" = "true" ]]; then
-  if [[ -n $TEST_DURATION_SECONDS ]]; then
-    echo Cannot accept TEST_DURATION_SECONDS and a parition looping config
-    exit 1
+execution_step "Checking for required parameters"
+testTypeRequiredParameters=()
+case $TEST_TYPE in
+  fixed_duration)
+    testTypeRequiredParameters+=(TEST_DURATION_SECONDS)
+    ;;
+  partition)
+    testTypeRequiredParameters+=(NETEM_CONFIG_FILE)
+    testTypeRequiredParameters+=(PARTITION_ACTIVE_DURATION)
+    testTypeRequiredParameters+=(PARTITION_INACTIVE_DURATION)
+    testTypeRequiredParameters+=(PARTITION_ITERATION_COUNT)
+    ;;
+  script)
+    testTypeRequiredParameters+=(CUSTOM_SCRIPT)
+    ;;
+  *)
+    echo "Error: Unsupported test type: $TEST_TYPE"
+    exit 1
+    ;;
+esac
+
+missingParameters=()
+for i in "${testTypeRequiredParameters[@]}"; do
+  if [[ -z ${!i} ]]; then
+    missingParameters+=("$i")
   fi
-elif [[ -z $TEST_DURATION_SECONDS ]]; then
-  echo TEST_DURATION_SECONDS not defined
+done
+
+if (( ${#missingParameters[@]} > 0 )); then
+  echo "Error: For test type $TEST_TYPE, the following required parameters are missing: ${missingParameters[*]}"
   exit 1
 fi
 