Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Automated test framework can run scripts on launched clusters. Add offline stake operations test case and script. #8510

Merged
merged 10 commits into from
Mar 18, 2020
68 changes: 67 additions & 1 deletion system-test/automation_utils.sh
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,10 @@ function analyze_packet_loss {

function wait_for_bootstrap_validator_stake_drop {
max_stake="$1"
if [[ $max_stake -eq 100 ]]; then
danpaul000 marked this conversation as resolved.
Show resolved Hide resolved
return
fi

source "${REPO_ROOT}"/net/common.sh
loadConfigFile

Expand All @@ -85,6 +89,68 @@ function get_slot {
ssh "${sshOptions[@]}" "${validatorIpList[0]}" '$HOME/.cargo/bin/solana slot'
}

# Print the IP address of the bootstrap validator on stdout.
#
# Sources net/common.sh from $REPO_ROOT, calls loadConfigFile (presumably
# defined by that file -- confirm), and emits the first entry of
# validatorIpList.
function get_bootstrap_validator_ip_address {
  . "${REPO_ROOT}/net/common.sh"
  loadConfigFile

  # The first entry of validatorIpList is treated as the bootstrap validator.
  declare bootstrap_ip="${validatorIpList[0]}"
  echo "$bootstrap_ip"
}

# Query the metrics endpoint for summary statistics about the test run and
# append the parsed result to $RESULT_FILE.
#
# Reads from the environment: TESTNET_TAG, TEST_DURATION_SECONDS, INFLUX_HOST,
# REPO_ROOT and RESULT_FILE. Calls execution_step, which is defined outside
# this block.
function collect_performance_statistics {
execution_step "Collect performance statistics about run"
# Mean TPS: sum "count" per host per second, take the median of those sums
# for each second, then the rounded mean of the per-second medians over the
# last TEST_DURATION_SECONDS seconds.
declare q_mean_tps='
SELECT ROUND(MEAN("median_sum")) as "mean_tps" FROM (
SELECT MEDIAN(sum_count) AS "median_sum" FROM (
SELECT SUM("count") AS "sum_count"
FROM "'$TESTNET_TAG'"."autogen"."bank-process_transactions"
WHERE time > now() - '"$TEST_DURATION_SECONDS"'s AND count > 0
GROUP BY time(1s), host_id)
GROUP BY time(1s)
)'

# Max TPS: same per-second medians as q_mean_tps, but reports their maximum.
declare q_max_tps='
SELECT MAX("median_sum") as "max_tps" FROM (
SELECT MEDIAN(sum_count) AS "median_sum" FROM (
SELECT SUM("count") AS "sum_count"
FROM "'$TESTNET_TAG'"."autogen"."bank-process_transactions"
WHERE time > now() - '"$TEST_DURATION_SECONDS"'s AND count > 0
GROUP BY time(1s), host_id)
GROUP BY time(1s)
)'

# Rounded mean of "duration_ms" from validator-confirmation over the window.
declare q_mean_confirmation='
SELECT round(mean("duration_ms")) as "mean_confirmation_ms"
FROM "'$TESTNET_TAG'"."autogen"."validator-confirmation"
WHERE time > now() - '"$TEST_DURATION_SECONDS"'s'

# Rounded maximum of "duration_ms" from validator-confirmation over the window.
declare q_max_confirmation='
SELECT round(max("duration_ms")) as "max_confirmation_ms"
FROM "'$TESTNET_TAG'"."autogen"."validator-confirmation"
WHERE time > now() - '"$TEST_DURATION_SECONDS"'s'

# Rounded 99th percentile of "duration_ms" from validator-confirmation over the window.
declare q_99th_confirmation='
SELECT round(percentile("duration_ms", 99)) as "99th_percentile_confirmation_ms"
FROM "'$TESTNET_TAG'"."autogen"."validator-confirmation"
WHERE time > now() - '"$TEST_DURATION_SECONDS"'s'

# Largest (last slot - last root) seen for any host in any one-second bucket
# of the window, read from tower-observed.
declare q_max_tower_distance_observed='
SELECT MAX("tower_distance") as "max_tower_distance" FROM (
SELECT last("slot") - last("root") as "tower_distance"
FROM "'$TESTNET_TAG'"."autogen"."tower-observed"
WHERE time > now() - '"$TEST_DURATION_SECONDS"'s
GROUP BY time(1s), host_id)'

# Mean across hosts of each host's most recent (slot - root) from tower-observed.
# NOTE(review): unlike the other queries this one has no time-window filter --
# confirm that is intentional.
declare q_last_tower_distance_observed='
SELECT MEAN("tower_distance") as "last_tower_distance" FROM (
SELECT last("slot") - last("root") as "tower_distance"
FROM "'$TESTNET_TAG'"."autogen"."tower-observed"
GROUP BY host_id)'

# Send all seven queries in one ';'-separated request against database
# $TESTNET_TAG, then pipe the response through the parser script and append
# its output to $RESULT_FILE.
curl -G "${INFLUX_HOST}/query?u=ro&p=topsecret" \
--data-urlencode "db=${TESTNET_TAG}" \
--data-urlencode "q=$q_mean_tps;$q_max_tps;$q_mean_confirmation;$q_max_confirmation;$q_99th_confirmation;$q_max_tower_distance_observed;$q_last_tower_distance_observed" |
python "${REPO_ROOT}"/system-test/testnet-automation-json-parser.py >>"$RESULT_FILE"
}

function upload_results_to_slack() {
echo --- Uploading results to Slack Performance Results App

Expand Down Expand Up @@ -166,7 +232,7 @@ function upload_results_to_slack() {
{
"type": "divider"
},
{
{
"type": "section",
"text": {
"type": "mrkdwn",
Expand Down
1 change: 1 addition & 0 deletions system-test/deprecated-testcases/colo-cpu-only-perf.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,6 @@ steps:
NUMBER_OF_CLIENT_NODES: 2
CLIENT_OPTIONS: "bench-tps=2=--tx_count 20000 --thread-batch-sleep-ms 250"
ADDITIONAL_FLAGS: ""
TEST_TYPE: "fixed_duration"
agents:
- "queue=colo-deploy"
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,6 @@ steps:
NUMBER_OF_CLIENT_NODES: 2
CLIENT_OPTIONS: "bench-tps=2=--tx_count 30000 --thread-batch-sleep-ms 250"
ADDITIONAL_FLAGS: ""
TEST_TYPE: "fixed_duration"
agents:
- "queue=colo-deploy"
1 change: 1 addition & 0 deletions system-test/deprecated-testcases/colo-gpu-perf.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,6 @@ steps:
NUMBER_OF_CLIENT_NODES: 2
CLIENT_OPTIONS: "bench-tps=2=--tx_count 20000 --thread-batch-sleep-ms 250"
ADDITIONAL_FLAGS: ""
TEST_TYPE: "fixed_duration"
agents:
- "queue=colo-deploy"
1 change: 1 addition & 0 deletions system-test/deprecated-testcases/gce-gpu-perf-100-node.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,6 @@ steps:
ALLOW_BOOT_FAILURES: "true"
USE_PUBLIC_IP_ADDRESSES: "true"
ADDITIONAL_FLAGS: "--dedicated"
TEST_TYPE: "fixed_duration"
agents:
- "queue=gce-deploy"
1 change: 1 addition & 0 deletions system-test/partition-testcases/colo-3-partition.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,6 @@ steps:
PARTITION_ACTIVE_DURATION: 30
PARTITION_INACTIVE_DURATION: 30
PARTITION_ITERATION_COUNT: 5
TEST_TYPE: "partition"
agents:
- "queue=colo-deploy"
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,6 @@ steps:
PARTITION_ACTIVE_DURATION: 60
PARTITION_INACTIVE_DURATION: 60
PARTITION_ITERATION_COUNT: 10
TEST_TYPE: "partition"
agents:
- "queue=colo-deploy"
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,6 @@ steps:
PARTITION_ACTIVE_DURATION: 60
PARTITION_INACTIVE_DURATION: 300
PARTITION_ITERATION_COUNT: 1
TEST_TYPE: "partition"
agents:
- "queue=colo-deploy"
1 change: 1 addition & 0 deletions system-test/partition-testcases/gce-5-node-3-partition.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,5 +18,6 @@ steps:
PARTITION_ACTIVE_DURATION: 30
PARTITION_INACTIVE_DURATION: 30
PARTITION_ITERATION_COUNT: 5
TEST_TYPE: "partition"
agents:
- "queue=testnet-deploy"
Original file line number Diff line number Diff line change
Expand Up @@ -18,5 +18,6 @@ steps:
PARTITION_ACTIVE_DURATION: 60
PARTITION_INACTIVE_DURATION: 300
PARTITION_ITERATION_COUNT: 5
TEST_TYPE: "partition"
agents:
- "queue=gce-deploy"
Original file line number Diff line number Diff line change
Expand Up @@ -18,5 +18,6 @@ steps:
PARTITION_ACTIVE_DURATION: 60
PARTITION_INACTIVE_DURATION: 300
PARTITION_ITERATION_COUNT: 1
TEST_TYPE: "partition"
agents:
- "queue=gce-deploy"
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,6 @@ steps:
TESTNET_ZONES: "us-west-1a,us-west-1c,us-east-1a,eu-west-1a"
USE_PUBLIC_IP_ADDRESSES: "true"
ADDITIONAL_FLAGS: ""
TEST_TYPE: "fixed_duration"
agents:
- "queue=aws-deploy"
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,6 @@ steps:
TESTNET_ZONES: "us-west-1a,us-west-1c,us-east-1a,eu-west-1a"
USE_PUBLIC_IP_ADDRESSES: "true"
ADDITIONAL_FLAGS: ""
TEST_TYPE: "fixed_duration"
agents:
- "queue=aws-deploy"
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,6 @@ steps:
TESTNET_ZONES: "westus"
USE_PUBLIC_IP_ADDRESSES: "true"
ADDITIONAL_FLAGS: ""
TEST_TYPE: "fixed_duration"
agents:
- "queue=azure-deploy"
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,6 @@ steps:
NUMBER_OF_CLIENT_NODES: 1
CLIENT_OPTIONS: "bench-tps=1=--tx_count 40000 --thread-batch-sleep-ms 250"
ADDITIONAL_FLAGS: ""
TEST_TYPE: "fixed_duration"
agents:
- "queue=colo-deploy"
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,6 @@ steps:
NUMBER_OF_CLIENT_NODES: 1
CLIENT_OPTIONS: "bench-tps=1=--tx_count 40000 --thread-batch-sleep-ms 250"
ADDITIONAL_FLAGS: ""
TEST_TYPE: "fixed_duration"
agents:
- "queue=colo-deploy"
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,6 @@ steps:
NUMBER_OF_CLIENT_NODES: 1
CLIENT_OPTIONS: "bench-tps=1=--tx_count 60000 --thread-batch-sleep-ms 250"
ADDITIONAL_FLAGS: ""
TEST_TYPE: "fixed_duration"
agents:
- "queue=colo-deploy"
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,6 @@ steps:
TESTNET_ZONES: "us-west1-a,us-west1-b,us-central1-a,europe-west4-a"
USE_PUBLIC_IP_ADDRESSES: "true"
ADDITIONAL_FLAGS: "--dedicated"
TEST_TYPE: "fixed_duration"
agents:
- "queue=gce-deploy"
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,6 @@ steps:
TESTNET_ZONES: "us-west1-a,us-west1-b,us-central1-a,europe-west4-a"
USE_PUBLIC_IP_ADDRESSES: "true"
ADDITIONAL_FLAGS: "--dedicated"
TEST_TYPE: "fixed_duration"
agents:
- "queue=gce-deploy"
1 change: 1 addition & 0 deletions system-test/performance-testcases/gce-gpu-perf-10-node.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,6 @@ steps:
TESTNET_ZONES: "us-west1-a,us-west1-b,us-central1-a,europe-west4-a"
USE_PUBLIC_IP_ADDRESSES: "true"
ADDITIONAL_FLAGS: "--dedicated"
TEST_TYPE: "fixed_duration"
agents:
- "queue=gce-deploy"
1 change: 1 addition & 0 deletions system-test/performance-testcases/gce-gpu-perf-25-node.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,6 @@ steps:
TESTNET_ZONES: "us-west1-a,us-west1-b,us-central1-a,europe-west4-a"
USE_PUBLIC_IP_ADDRESSES: "true"
ADDITIONAL_FLAGS: "--dedicated"
TEST_TYPE: "fixed_duration"
agents:
- "queue=gce-deploy"
1 change: 1 addition & 0 deletions system-test/performance-testcases/gce-gpu-perf-5-node.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,6 @@ steps:
TESTNET_ZONES: "us-west1-a,us-west1-b,us-central1-a,europe-west4-a"
USE_PUBLIC_IP_ADDRESSES: "true"
ADDITIONAL_FLAGS: "--dedicated"
TEST_TYPE: "fixed_duration"
agents:
- "queue=gce-deploy"
1 change: 1 addition & 0 deletions system-test/performance-testcases/gce-gpu-perf-50-node.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,6 @@ steps:
ALLOW_BOOT_FAILURES: "true"
USE_PUBLIC_IP_ADDRESSES: "true"
ADDITIONAL_FLAGS: "--dedicated"
TEST_TYPE: "fixed_duration"
agents:
- "queue=gce-deploy"
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,12 @@ steps:
CLOUD_PROVIDER: "colo"
TESTNET_TAG: "colo-perf-cpu-only"
ENABLE_GPU: "false"
TEST_DURATION_SECONDS: 30
TEST_DURATION_SECONDS: 60
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are the config changes besides the TEST_TYPE additions intentional or debug artifacts?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Debug artifacts mostly. I just threw the "sanity-testcases" dir in there to run short tests that exercise the functionality of the testing framework, rather than the limits of the cluster. So I can point a buildkite job on a PR against this file rather than an expensive/long nightly-style testcase.

NUMBER_OF_VALIDATOR_NODES: 1
NUMBER_OF_CLIENT_NODES: 1
CLIENT_OPTIONS: "bench-tps=1=--tx_count 40000 --thread-batch-sleep-ms 250"
ADDITIONAL_FLAGS: ""
BOOTSTRAP_VALIDATOR_MAX_STAKE_THRESHOLD: 100
BOOTSTRAP_VALIDATOR_MAX_STAKE_THRESHOLD: 99
TEST_TYPE: "fixed_duration"
agents:
- "queue=colo-deploy"
5 changes: 3 additions & 2 deletions system-test/sanity-testcases/colo-partition-sanity-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ steps:
UPLOAD_RESULTS_TO_SLACK: "true"
CLOUD_PROVIDER: "colo"
TESTNET_TAG: "colo-perf-cpu-only"
NUMBER_OF_VALIDATOR_NODES: 4
NUMBER_OF_VALIDATOR_NODES: 3
ENABLE_GPU: "false"
NUMBER_OF_CLIENT_NODES: 1
CLIENT_OPTIONS: "bench-tps=1=--tx_count 15000 --thread-batch-sleep-ms 250"
Expand All @@ -15,6 +15,7 @@ steps:
PARTITION_ACTIVE_DURATION: 30
PARTITION_INACTIVE_DURATION: 30
PARTITION_ITERATION_COUNT: 2
BOOTSTRAP_VALIDATOR_MAX_STAKE_THRESHOLD: 100
BOOTSTRAP_VALIDATOR_MAX_STAKE_THRESHOLD: 66
TEST_TYPE: "partition"
agents:
- "queue=colo-deploy"
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,6 @@ steps:
NUMBER_OF_CLIENT_NODES: 1
CLIENT_OPTIONS: "bench-tps=1=--tx_count 30000 --thread-batch-sleep-ms 250"
ADDITIONAL_FLAGS: ""
TEST_TYPE: "fixed_duration"
agents:
- "queue=colo-deploy"
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,6 @@ steps:
NUMBER_OF_CLIENT_NODES: 1
CLIENT_OPTIONS: "bench-tps=1=--tx_count 30000 --thread-batch-sleep-ms 250"
ADDITIONAL_FLAGS: ""
TEST_TYPE: "fixed_duration"
agents:
- "queue=colo-deploy"
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,6 @@ steps:
TESTNET_ZONES: "us-west1-a"
USE_PUBLIC_IP_ADDRESSES: "false"
ADDITIONAL_FLAGS: "--dedicated"
TEST_TYPE: "fixed_duration"
agents:
- "queue=gce-deploy"
1 change: 1 addition & 0 deletions system-test/stability-testcases/gce-stability-5-node.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,6 @@ steps:
TESTNET_ZONES: "us-west1-a,us-west1-b,us-central1-a,europe-west4-a"
USE_PUBLIC_IP_ADDRESSES: "true"
ADDITIONAL_FLAGS: "--dedicated"
TEST_TYPE: "fixed_duration"
agents:
- "queue=stability-deploy"
17 changes: 17 additions & 0 deletions system-test/stake-operations-testcases/offline_stake_colo.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Pipeline step that runs system-test/testnet-automation.sh for the offline
# stake operations test on the colo provider (agent queue "colo-deploy").
# TEST_TYPE is "script" and CUSTOM_SCRIPT names stake_test_automation.sh;
# presumably the automation script runs CUSTOM_SCRIPT on the launched
# cluster -- confirm against testnet-automation.sh.
steps:
- command: "system-test/testnet-automation.sh"
label: "Running Offline Stake Operations Tests on Colo"
env:
UPLOAD_RESULTS_TO_SLACK: "true"
CLOUD_PROVIDER: "colo"
ENABLE_GPU: "false"
TEST_DURATION_SECONDS: 30
NUMBER_OF_VALIDATOR_NODES: 1
NUMBER_OF_CLIENT_NODES: 0
ADDITIONAL_FLAGS: ""
BOOTSTRAP_VALIDATOR_MAX_STAKE_THRESHOLD: 100
SKIP_PERF_RESULTS: "true"
TEST_TYPE: "script"
CUSTOM_SCRIPT: "system-test/stake-operations-testcases/stake_test_automation.sh"
agents:
- "queue=colo-deploy"
18 changes: 18 additions & 0 deletions system-test/stake-operations-testcases/offline_stake_gce.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Pipeline step that runs system-test/testnet-automation.sh for the offline
# stake operations test on the gce provider (agent queue "gce-deploy").
# TEST_TYPE is "script" and CUSTOM_SCRIPT names stake_test_automation.sh;
# presumably the automation script runs CUSTOM_SCRIPT on the launched
# cluster -- confirm against testnet-automation.sh.
steps:
- command: "system-test/testnet-automation.sh"
label: "Running Offline Stake Operations Tests on GCE"
env:
UPLOAD_RESULTS_TO_SLACK: "true"
CLOUD_PROVIDER: "gce"
ENABLE_GPU: "false"
TEST_DURATION_SECONDS: 30
NUMBER_OF_VALIDATOR_NODES: 1
VALIDATOR_NODE_MACHINE_TYPE: "--machine-type n1-standard-16"
NUMBER_OF_CLIENT_NODES: 0
ADDITIONAL_FLAGS: "--dedicated"
BOOTSTRAP_VALIDATOR_MAX_STAKE_THRESHOLD: 100
SKIP_PERF_RESULTS: "true"
TEST_TYPE: "script"
CUSTOM_SCRIPT: "system-test/stake-operations-testcases/stake_test_automation.sh"
agents:
- "queue=gce-deploy"
Loading