From 6541003cb2a82c4a9ef23b4fdf21f3878bfc1085 Mon Sep 17 00:00:00 2001
From: Dan Albert <dan@solana.com>
Date: Mon, 16 Mar 2020 16:21:54 -0700
Subject: [PATCH] Refactor automation framework to accept arbitrary scripts
 after cluster launch

---
 system-test/automation_utils.sh               |  62 +++++++-
 .../offline_stake_colo.yml                    |   5 -
 .../offline_stake_gce.yml                     |  18 +++
 .../stake_test_automation.sh                  |  11 +-
 system-test/testnet-automation.sh             | 146 +++++++-----------
 5 files changed, 140 insertions(+), 102 deletions(-)
 delete mode 100755 system-test/stake-operations-testcases/offline_stake_colo.yml
 create mode 100755 system-test/stake-operations-testcases/offline_stake_gce.yml

diff --git a/system-test/automation_utils.sh b/system-test/automation_utils.sh
index 213c09887f856a..581d242631ab40 100755
--- a/system-test/automation_utils.sh
+++ b/system-test/automation_utils.sh
@@ -62,6 +62,10 @@ function analyze_packet_loss {
 
 function wait_for_bootstrap_validator_stake_drop {
   max_stake="$1"
+  if [[ $max_stake -eq 100 ]]; then
+    return
+  fi
+
   source "${REPO_ROOT}"/net/common.sh
   loadConfigFile
 
@@ -88,7 +92,63 @@ function get_slot {
 function get_bootstrap_validator_ip_address {
   source "${REPO_ROOT}"/net/common.sh
   loadConfigFile
-  echo ${validatorIpList[0]}
+  echo "${validatorIpList[0]}"  # prints the first validatorIpList entry; quoted so it is not word-split or globbed
+}
+
+function collect_performance_statistics {  # Query run metrics from $INFLUX_HOST and append the parsed results to $RESULT_FILE
+  execution_step "Collect performance statistics about run"
+  declare q_mean_tps='
+    SELECT ROUND(MEAN("median_sum")) as "mean_tps" FROM (
+      SELECT MEDIAN(sum_count) AS "median_sum" FROM (
+        SELECT SUM("count") AS "sum_count"
+          FROM "'$TESTNET_TAG'"."autogen"."bank-process_transactions"
+          WHERE time > now() - '"$TEST_DURATION_SECONDS"'s AND count > 0
+          GROUP BY time(1s), host_id)
+      GROUP BY time(1s)
+    )'
+  # Same per-second aggregation as q_mean_tps, but reports the maximum instead of the rounded mean.
+  declare q_max_tps='
+    SELECT MAX("median_sum") as "max_tps" FROM (
+      SELECT MEDIAN(sum_count) AS "median_sum" FROM (
+        SELECT SUM("count") AS "sum_count"
+          FROM "'$TESTNET_TAG'"."autogen"."bank-process_transactions"
+          WHERE time > now() - '"$TEST_DURATION_SECONDS"'s AND count > 0
+          GROUP BY time(1s), host_id)
+      GROUP BY time(1s)
+    )'
+  # Rounded mean of duration_ms from validator-confirmation over the last TEST_DURATION_SECONDS seconds.
+  declare q_mean_confirmation='
+    SELECT round(mean("duration_ms")) as "mean_confirmation_ms"
+      FROM "'$TESTNET_TAG'"."autogen"."validator-confirmation"
+      WHERE time > now() - '"$TEST_DURATION_SECONDS"'s'
+  # Rounded maximum of the same duration_ms series over the same window.
+  declare q_max_confirmation='
+    SELECT round(max("duration_ms")) as "max_confirmation_ms"
+      FROM "'$TESTNET_TAG'"."autogen"."validator-confirmation"
+      WHERE time > now() - '"$TEST_DURATION_SECONDS"'s'
+  # Rounded 99th percentile of the same duration_ms series over the same window.
+  declare q_99th_confirmation='
+    SELECT round(percentile("duration_ms", 99)) as "99th_percentile_confirmation_ms"
+      FROM "'$TESTNET_TAG'"."autogen"."validator-confirmation"
+      WHERE time > now() - '"$TEST_DURATION_SECONDS"'s'
+  # Largest (last slot - last root) found in any 1s / host_id bucket of tower-observed within the window.
+  declare q_max_tower_distance_observed='
+    SELECT MAX("tower_distance") as "max_tower_distance" FROM (
+      SELECT last("slot") - last("root") as "tower_distance"
+        FROM "'$TESTNET_TAG'"."autogen"."tower-observed"
+        WHERE time > now() - '"$TEST_DURATION_SECONDS"'s
+        GROUP BY time(1s), host_id)'
+  # Mean over hosts of the per-host (last slot - last root); unlike the queries above it has no time filter.
+  declare q_last_tower_distance_observed='
+      SELECT MEAN("tower_distance") as "last_tower_distance" FROM (
+            SELECT last("slot") - last("root") as "tower_distance"
+              FROM "'$TESTNET_TAG'"."autogen"."tower-observed"
+              GROUP BY host_id)'
+  # All seven queries are sent in one request, separated by semicolons. NOTE(review): user and password are hard-coded in the URL.
+  curl -G "${INFLUX_HOST}/query?u=ro&p=topsecret" \
+    --data-urlencode "db=${TESTNET_TAG}" \
+    --data-urlencode "q=$q_mean_tps;$q_max_tps;$q_mean_confirmation;$q_max_confirmation;$q_99th_confirmation;$q_max_tower_distance_observed;$q_last_tower_distance_observed" |
+    python "${REPO_ROOT}"/system-test/testnet-automation-json-parser.py >>"$RESULT_FILE"
 }
 
 function upload_results_to_slack() {
diff --git a/system-test/stake-operations-testcases/offline_stake_colo.yml b/system-test/stake-operations-testcases/offline_stake_colo.yml
deleted file mode 100755
index e0a5a3ee9140ed..00000000000000
--- a/system-test/stake-operations-testcases/offline_stake_colo.yml
+++ /dev/null
@@ -1,5 +0,0 @@
-steps:
-  - command: "system-test/stake-operations-testcases/stake_test_automation.sh"
-    label: "Running Offline Stake Operations Tests"
-    agents:
-      - "queue=colo-deploy"
diff --git a/system-test/stake-operations-testcases/offline_stake_gce.yml b/system-test/stake-operations-testcases/offline_stake_gce.yml
new file mode 100755
index 00000000000000..3b38ddb085e9d7
--- /dev/null
+++ b/system-test/stake-operations-testcases/offline_stake_gce.yml
@@ -0,0 +1,18 @@
+steps:
+  - command: "system-test/testnet-automation.sh"
+    label: "Running Offline Stake Operations Tests"
+    env:
+      UPLOAD_RESULTS_TO_SLACK: "true"
+      CLOUD_PROVIDER: "gce"
+      TESTNET_TAG: "gce-perf-cpu-only"  # also used as the database name in the performance-statistics queries
+      ENABLE_GPU: "false"
+      TEST_DURATION_SECONDS: 30  # interpolated into the "now() - <N>s" window of the performance-statistics queries
+      NUMBER_OF_VALIDATOR_NODES: 1
+      VALIDATOR_NODE_MACHINE_TYPE: "--machine-type n1-standard-16"
+      NUMBER_OF_CLIENT_NODES: 0
+      ADDITIONAL_FLAGS: ""
+      BOOTSTRAP_VALIDATOR_MAX_STAKE_THRESHOLD: 100  # NOTE(review): presumably passed to wait_for_bootstrap_validator_stake_drop, which returns immediately for 100 - confirm
+      TEST_TYPE: "script"  # selects the script branch of testnet-automation.sh, which executes CUSTOM_SCRIPT
+      CUSTOM_SCRIPT: "system-test/stake-operations-testcases/stake_test_automation.sh"  # resolved relative to REPO_ROOT
+    agents:
+      - "queue=gce-deploy"
diff --git a/system-test/stake-operations-testcases/stake_test_automation.sh b/system-test/stake-operations-testcases/stake_test_automation.sh
index ba00d813688de4..1f21376b6d9ae9 100755
--- a/system-test/stake-operations-testcases/stake_test_automation.sh
+++ b/system-test/stake-operations-testcases/stake_test_automation.sh
@@ -1,17 +1,12 @@
 #!/usr/bin/env bash
 
-set -e
-set -x
+set -ex
 
+# shellcheck disable=SC1090
 # shellcheck disable=SC1091
 source "$(dirname "$0")"/../automation_utils.sh
 
-curl -sSf https://raw.githubusercontent.com/solana-labs/solana/v1.0.5/install/solana-install-init.sh | sh -s - 1.0.5
-
-# Create a single node cluster on colo, then call offline_stake_operations.sh against that cluster
-"${REPO_ROOT}"/net/colo.sh delete --reclaim-preemptible-reservations
-"${REPO_ROOT}"/net/colo.sh create -n 1 -c 0 -p stake-ops-testnet --dedicated
-"${REPO_ROOT}"/net/net.sh start -t edge
+# Runs offline stake operations tests against a running cluster launched from the automation framework
 
 bootstrapper_ip_address="$(get_bootstrap_validator_ip_address)"
 entrypoint=http://"${bootstrapper_ip_address}":8899
diff --git a/system-test/testnet-automation.sh b/system-test/testnet-automation.sh
index abd1b31bab6a14..78aca2a5a3645a 100755
--- a/system-test/testnet-automation.sh
+++ b/system-test/testnet-automation.sh
@@ -147,30 +147,36 @@ function launch_testnet() {
   SLOT_COUNT_START_SECONDS=$SECONDS
   execution_step "Marking beginning of slot rate test - Slot: $START_SLOT, Seconds: $SLOT_COUNT_START_SECONDS"
 
-  if [[ -n $TEST_DURATION_SECONDS ]]; then
-    execution_step "Wait ${TEST_DURATION_SECONDS} seconds to complete test"
-    sleep "$TEST_DURATION_SECONDS"
-  elif [[ "$APPLY_PARTITIONS" = "true" ]]; then
-    STATS_START_SECONDS=$SECONDS
-    execution_step "Wait $PARTITION_INACTIVE_DURATION before beginning to apply partitions"
-    sleep "$PARTITION_INACTIVE_DURATION"
-    for (( i=1; i<=PARTITION_ITERATION_COUNT; i++ )); do
-      execution_step "Partition Iteration $i of $PARTITION_ITERATION_COUNT"
-      execution_step "Applying netem config $NETEM_CONFIG_FILE for $PARTITION_ACTIVE_DURATION seconds"
-      "${REPO_ROOT}"/net/net.sh netem --config-file "$NETEM_CONFIG_FILE"
-      sleep "$PARTITION_ACTIVE_DURATION"
-
-      execution_step "Resolving partitions for $PARTITION_INACTIVE_DURATION seconds"
-      "${REPO_ROOT}"/net/net.sh netem --config-file "$NETEM_CONFIG_FILE" --netem-cmd cleanup
+  case $TEST_TYPE in  # run the workload selected by TEST_TYPE
+    fixed_duration)
+      execution_step "Wait ${TEST_DURATION_SECONDS} seconds to complete test"
+      sleep "$TEST_DURATION_SECONDS"
+      ;;
+    partition)
+      STATS_START_SECONDS=$SECONDS  # start of the window used to derive TEST_DURATION_SECONDS below
+      execution_step "Wait $PARTITION_INACTIVE_DURATION before beginning to apply partitions"
       sleep "$PARTITION_INACTIVE_DURATION"
-    done
-    STATS_FINISH_SECONDS=$SECONDS
-    TEST_DURATION_SECONDS=$((STATS_FINISH_SECONDS - STATS_START_SECONDS))
-  else
-    # We should never get here
-    echo Test duration and partition config not defined
-    exit 1
-  fi
+      for (( i=1; i<=PARTITION_ITERATION_COUNT; i++ )); do
+        execution_step "Partition Iteration $i of $PARTITION_ITERATION_COUNT"
+        execution_step "Applying netem config $NETEM_CONFIG_FILE for $PARTITION_ACTIVE_DURATION seconds"
+        "${REPO_ROOT}"/net/net.sh netem --config-file "$NETEM_CONFIG_FILE"
+        sleep "$PARTITION_ACTIVE_DURATION"
+        execution_step "Resolving partitions for $PARTITION_INACTIVE_DURATION seconds"
+        "${REPO_ROOT}"/net/net.sh netem --config-file "$NETEM_CONFIG_FILE" --netem-cmd cleanup
+        sleep "$PARTITION_INACTIVE_DURATION"
+      done
+      STATS_FINISH_SECONDS=$SECONDS
+      TEST_DURATION_SECONDS=$((STATS_FINISH_SECONDS - STATS_START_SECONDS))  # the partition phase defines the stats window
+      ;;
+    script)
+      execution_step "Running custom script: ${REPO_ROOT}/${CUSTOM_SCRIPT}"
+      "$REPO_ROOT"/"$CUSTOM_SCRIPT"  # CUSTOM_SCRIPT is a path relative to the repository root
+      ;;
+    *)
+      echo "Error: Unsupported test type: $TEST_TYPE"
+      exit 1  # abort as the replaced else-branch did, instead of collecting results for a test that never ran
+      ;;
+  esac
 
   END_SLOT=$(get_slot)
   SLOT_COUNT_END_SECONDS=$SECONDS
@@ -179,63 +185,10 @@ function launch_testnet() {
   SLOTS_PER_SECOND="$(bc <<< "scale=3; ($END_SLOT - $START_SLOT)/($SLOT_COUNT_END_SECONDS - $SLOT_COUNT_START_SECONDS)")"
   execution_step "Average slot rate: $SLOTS_PER_SECOND slots/second over $((SLOT_COUNT_END_SECONDS - SLOT_COUNT_START_SECONDS)) seconds"
 
-  execution_step "Collect statistics about run"
-  declare q_mean_tps='
-    SELECT ROUND(MEAN("median_sum")) as "mean_tps" FROM (
-      SELECT MEDIAN(sum_count) AS "median_sum" FROM (
-        SELECT SUM("count") AS "sum_count"
-          FROM "'$TESTNET_TAG'"."autogen"."bank-process_transactions"
-          WHERE time > now() - '"$TEST_DURATION_SECONDS"'s AND count > 0
-          GROUP BY time(1s), host_id)
-      GROUP BY time(1s)
-    )'
-
-  declare q_max_tps='
-    SELECT MAX("median_sum") as "max_tps" FROM (
-      SELECT MEDIAN(sum_count) AS "median_sum" FROM (
-        SELECT SUM("count") AS "sum_count"
-          FROM "'$TESTNET_TAG'"."autogen"."bank-process_transactions"
-          WHERE time > now() - '"$TEST_DURATION_SECONDS"'s AND count > 0
-          GROUP BY time(1s), host_id)
-      GROUP BY time(1s)
-    )'
-
-  declare q_mean_confirmation='
-    SELECT round(mean("duration_ms")) as "mean_confirmation_ms"
-      FROM "'$TESTNET_TAG'"."autogen"."validator-confirmation"
-      WHERE time > now() - '"$TEST_DURATION_SECONDS"'s'
-
-  declare q_max_confirmation='
-    SELECT round(max("duration_ms")) as "max_confirmation_ms"
-      FROM "'$TESTNET_TAG'"."autogen"."validator-confirmation"
-      WHERE time > now() - '"$TEST_DURATION_SECONDS"'s'
-
-  declare q_99th_confirmation='
-    SELECT round(percentile("duration_ms", 99)) as "99th_percentile_confirmation_ms"
-      FROM "'$TESTNET_TAG'"."autogen"."validator-confirmation"
-      WHERE time > now() - '"$TEST_DURATION_SECONDS"'s'
-
-  declare q_max_tower_distance_observed='
-    SELECT MAX("tower_distance") as "max_tower_distance" FROM (
-      SELECT last("slot") - last("root") as "tower_distance"
-        FROM "'$TESTNET_TAG'"."autogen"."tower-observed"
-        WHERE time > now() - '"$TEST_DURATION_SECONDS"'s
-        GROUP BY time(1s), host_id)'
-
-  declare q_last_tower_distance_observed='
-      SELECT MEAN("tower_distance") as "last_tower_distance" FROM (
-            SELECT last("slot") - last("root") as "tower_distance"
-              FROM "'$TESTNET_TAG'"."autogen"."tower-observed"
-              GROUP BY host_id)'
-
-  curl -G "${INFLUX_HOST}/query?u=ro&p=topsecret" \
-    --data-urlencode "db=${TESTNET_TAG}" \
-    --data-urlencode "q=$q_mean_tps;$q_max_tps;$q_mean_confirmation;$q_max_confirmation;$q_99th_confirmation;$q_max_tower_distance_observed;$q_last_tower_distance_observed" |
-    python "${REPO_ROOT}"/system-test/testnet-automation-json-parser.py >>"$RESULT_FILE"
+  [[ -n $SKIP_PERF_RESULTS ]] || collect_performance_statistics  # any non-empty SKIP_PERF_RESULTS value (even "false") skips the statistics
 
   echo "slots_per_second: $SLOTS_PER_SECOND" >>"$RESULT_FILE"
 
-  execution_step "Writing test results to ${RESULT_FILE}"
   RESULT_DETAILS=$(<"$RESULT_FILE")
   upload-ci-artifact "$RESULT_FILE"
 }
@@ -292,18 +245,35 @@ if [[ "$USE_PUBLIC_IP_ADDRESSES" = "true" ]]; then
   maybePublicIpAddresses="-P"
 fi
 
-: "${CLIENT_DELAY_START:=0}"
-
-if [[ -z $APPLY_PARTITIONS ]]; then
-  APPLY_PARTITIONS=false
-fi
-if [[ "$APPLY_PARTITIONS" = "true" ]]; then
-  if [[ -n $TEST_DURATION_SECONDS ]]; then
-    echo Cannot accept TEST_DURATION_SECONDS and a parition looping config
-    exit 1
+execution_step "Checking for required parameters"
+# Names of the variables that must be non-empty for the selected TEST_TYPE (an array, so each name stays a separate element)
+testTypeRequiredParameters=()
+case $TEST_TYPE in
+  fixed_duration)
+    testTypeRequiredParameters+=(TEST_DURATION_SECONDS)
+    ;;
+  partition)
+    testTypeRequiredParameters+=(NETEM_CONFIG_FILE PARTITION_ACTIVE_DURATION)
+    testTypeRequiredParameters+=(PARTITION_INACTIVE_DURATION PARTITION_ITERATION_COUNT)
+    ;;
+  script)
+    testTypeRequiredParameters+=(CUSTOM_SCRIPT)
+    ;;
+  *)
+    echo "Error: Unsupported test type: $TEST_TYPE"
+    exit 1  # fail before any cluster is launched for a test type nothing can run
+    ;;
+esac
+
+missingParameters=
+for i in "${testTypeRequiredParameters[@]}"; do
+  if [[ -z ${!i} ]]; then  # indirect expansion: the value of the variable whose name is stored in i
+    missingParameters+="${i} "
   fi
-elif [[ -z $TEST_DURATION_SECONDS ]]; then
-  echo TEST_DURATION_SECONDS not defined
+done
+
+if [[ -n $missingParameters ]]; then
+  echo "Error: For test type $TEST_TYPE, the following required parameters are missing: ${missingParameters[*]}"
   exit 1
 fi