From 350ff35835ec32923fa85f9dba74154bf2194f23 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Thu, 10 Sep 2020 20:40:33 +0900 Subject: [PATCH 01/57] Test against live clusters --- ci/buildkite-pipeline.sh | 30 +++++--- ci/live-cluster-sanity.sh | 142 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 162 insertions(+), 10 deletions(-) create mode 100755 ci/live-cluster-sanity.sh diff --git a/ci/buildkite-pipeline.sh b/ci/buildkite-pipeline.sh index c92426f28af5ce..a0be208ef58889 100755 --- a/ci/buildkite-pipeline.sh +++ b/ci/buildkite-pipeline.sh @@ -125,8 +125,9 @@ wait_step() { } all_test_steps() { - command_step checks ". ci/rust-version.sh; ci/docker-run.sh \$\$rust_nightly_docker_image ci/test-checks.sh" 20 - wait_step + #command_step checks ". ci/rust-version.sh; ci/docker-run.sh \$\$rust_nightly_docker_image ci/test-checks.sh" 20 + #wait_step + true # Coverage... if affects \ @@ -137,16 +138,18 @@ all_test_steps() { ^ci/test-coverage.sh \ ^scripts/coverage.sh \ ; then - command_step coverage ". ci/rust-version.sh; ci/docker-run.sh \$\$rust_nightly_docker_image ci/test-coverage.sh" 30 - wait_step + #command_step coverage ". ci/rust-version.sh; ci/docker-run.sh \$\$rust_nightly_docker_image ci/test-coverage.sh" 30 + #wait_step + true else annotate --style info --context test-coverage \ "Coverage skipped as no .rs files were modified" fi # Full test suite - command_step stable ". ci/rust-version.sh; ci/docker-run.sh \$\$rust_stable_docker_image ci/test-stable.sh" 60 - wait_step + # command_step stable ". 
ci/rust-version.sh; ci/docker-run.sh \$\$rust_stable_docker_image ci/test-stable.sh" 60 + #wait_step + true # Perf test suite if affects \ @@ -169,6 +172,12 @@ all_test_steps() { artifact_paths: "log-*.txt" agents: - "queue=cuda" + - command: "ci/live-cluster-sanity.sh" + name: "live-cluster-sanity" + timeout_in_minutes: 40 + artifact_paths: "log-*.txt" + agents: + - "queue=gce-deploy" EOF else annotate --style info \ @@ -220,13 +229,14 @@ EOF } pull_or_push_steps() { - command_step sanity "ci/test-sanity.sh" 5 - wait_step + #command_step sanity "ci/test-sanity.sh" 5 + #wait_step # Check for any .sh file changes if affects .sh$; then - command_step shellcheck "ci/shellcheck.sh" 5 - wait_step + #command_step shellcheck "ci/shellcheck.sh" 5 + #wait_step + true fi # Run the full test suite by default, skipping only if modifications are local diff --git a/ci/live-cluster-sanity.sh b/ci/live-cluster-sanity.sh new file mode 100755 index 00000000000000..68589ce4c047ac --- /dev/null +++ b/ci/live-cluster-sanity.sh @@ -0,0 +1,142 @@ +#!/usr/bin/env bash +set -e +cd "$(dirname "$0")/.." + +source ci/_ +source ci/rust-version.sh stable +source ci/upload-ci-artifact.sh + +escaped_branch=$(echo "$BUILDKITE_BRANCH" | tr -c "[:alnum:]" - | sed -r "s#(^-*|-*head-*|-*$)##g") +instance_prefix="testnet-live-sanity-$escaped_branch" +# ensure to delete leftover cluster +./net/gce.sh delete -p "$instance_prefix" || true +# only bootstrap, no normal validator +./net/gce.sh create -p "$instance_prefix" -n 0 +instance_ip=$(./net/gce.sh info | grep bootstrap-validator | awk '{print $3}') + +on_trap() { + if [[ -z $instance_deleted ]]; then + ( + set +e + upload-ci-artifact cluster-sanity/testnet-validator.log + upload-ci-artifact cluster-sanity/mainnet-beta-validator.log + _ ./net/gce.sh delete -p "$instance_prefix" + ) + fi +} +trap on_trap INT TERM EXIT + +_ cargo +"$rust_stable" build --bins --release +_ ./net/scp.sh ./target/release/solana-validator "$instance_ip":. 
+echo 500000 | ./net/ssh.sh "$instance_ip" sudo tee /proc/sys/vm/max_map_count > /dev/null + +test_with_live_cluster() { + cluster_label="$1" + shift + + echo "--- Starting validator $cluster_label" + + rm -rf cluster-sanity + mkdir cluster-sanity + ./net/ssh.sh "$instance_ip" rm -rf cluster-sanity + ./net/ssh.sh "$instance_ip" mkdir cluster-sanity + + validator_log="cluster-sanity/$cluster_label-validator.log" + (./net/ssh.sh "$instance_ip" -Llocalhost:18899:localhost:18899 ./solana-validator \ + --no-untrusted-rpc \ + --ledger cluster-sanity/ledger \ + --log - \ + --init-complete-file cluster-sanity/init-completed \ + --enable-rpc-exit \ + --private-rpc \ + --rpc-port 18899 \ + --rpc-bind-address localhost \ + --snapshot-interval-slots 0 \ + "$@" ) &> "$validator_log" & + ssh_pid=$! + tail -F "$validator_log" > cluster-sanity/log-tail 2> /dev/null & + tail_pid=$! + sleep 3 + + attempts=100 + while ! ./net/ssh.sh "$instance_ip" test -f cluster-sanity/init-completed &> /dev/null ; do + attempts=$((attempts - 1)) + if [[ (($attempts == 0)) || ! -d "/proc/$ssh_pid" ]]; then + set +e + kill $ssh_pid $tail_pid + wait $ssh_pid $tail_pid + echo "Error: validator failed to boot" + exit 1 + fi + + sleep 3 + echo "##### validator is starting... (until timeout: $attempts) #####" + if find cluster-sanity/log-tail -not -empty | grep ^ > /dev/null; then + echo "##### new log:" + timeout 1 cat cluster-sanity/log-tail | tail -n 3 || true + truncate --size 0 cluster-sanity/log-tail + echo + fi + done + + snapshot_slot=$(./net/ssh.sh "$instance_ip" ls -t cluster-sanity/ledger/snapshot* | + head -n 1 | + grep -o 'snapshot-[0-9]*-' | + grep -o '[0-9]*' + ) + + echo "--- Monitoring validator $cluster_label" + + attempts=100 + current_root=$snapshot_slot + goal_root=$((snapshot_slot + 100)) + while [[ $current_root -le $goal_root ]]; do + attempts=$((attempts - 1)) + if [[ (($attempts == 0)) || ! 
-d "/proc/$ssh_pid" ]]; then + set +e + kill $ssh_pid $tail_pid + wait $ssh_pid $tail_pid + echo "Error: validator failed to boot" + exit 1 + fi + + sleep 3 + current_root=$(./target/release/solana --url http://localhost:18899 slot --commitment root) + echo "##### validator is running ($current_root/$goal_root)... (until timeout: $attempts) #####" + if find cluster-sanity/log-tail -not -empty | grep ^ > /dev/null; then + echo "##### new log:" + timeout 1 cat cluster-sanity/log-tail | tail -n 3 || true + truncate --size 0 cluster-sanity/log-tail + echo + fi + done + + _ curl \ + -X POST \ + -H 'Content-Type: application/json' \ + -d '{"jsonrpc":"2.0","id":1, "method":"validatorExit"}' \ + http://localhost:18899 + sleep 10 + + (sleep 3 && kill "$tail_pid") & + kill_pid=$! + wait "$ssh_pid" "$tail_pid" "$kill_pid" + + upload-ci-artifact "$validator_log" +} + +test_with_live_cluster "mainnet-beta" \ + --trusted-validator 7Np41oeYqPefeNQEHSv1UDhYrehxin3NStELsSKCT4K2 \ + --trusted-validator GdnSyH3YtwcxFvQrVVJMm1JhTS4QVX7MFsX56uJLUfiZ \ + --trusted-validator DE1bawNcRJB9rVm3buyMVfr8mBEoyyu73NBovf2oXJsJ \ + --entrypoint mainnet-beta.solana.com:8001 \ + --expected-genesis-hash 5eykt4UsFv8P8NJdTREpY1vzqKqZKvdpKuc147dw2N9d \ + --expected-shred-version 64864 + +test_with_live_cluster "testnet" \ + --trusted-validator 5D1fNXzvv5NjV1ysLjirC4WY92RNsVH18vjmcszZd8on \ + --entrypoint 35.203.170.30:8001 \ + --expected-genesis-hash 4uhcVJyU9pJkvQyS88uRDiswHXSCkY3zQawwpjk2NsNY \ + --expected-shred-version 1579 \ + +./net/gce.sh delete -p "$instance_prefix" && instance_deleted=yes From 250029f52a3be010b2e2ef7bff953cfb0e9ab326 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Fri, 18 Sep 2020 21:45:20 +0900 Subject: [PATCH 02/57] Fix typo to trigger CI --- runtime/src/bank.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/runtime/src/bank.rs b/runtime/src/bank.rs index 8cb3149ec63ad0..bd7797a950fef8 100644 --- a/runtime/src/bank.rs +++ b/runtime/src/bank.rs @@ 
-3898,8 +3898,8 @@ impl Bank { let cycle_params = self.determine_collection_cycle_params(epoch); let (_, _, in_multi_epoch_cycle, _, _, partition_count) = cycle_params; - // use common codepath for both very likely and very unlikely for the sake of minimized - // risk of any miscalculation instead of negligibly faster computation per slot for the + // use common code-path for both very-likely and very-unlikely for the sake of minimized + // risk of any mis-calculation instead of negligible faster computation per slot for the // likely case. let mut start_partition_index = Self::partition_index_from_slot_index(start_slot_index, cycle_params); @@ -3911,7 +3911,7 @@ impl Bank { let in_middle_of_cycle = start_partition_index > 0; if in_multi_epoch_cycle && is_special_new_epoch && in_middle_of_cycle { // Adjust slot indexes so that the final partition ranges are continuous! - // This is need because the caller gives us off-by-one indexes when + // This is needed because the caller gives us off-by-one indexes when // an epoch boundary is crossed. // Usually there is no need for this adjustment because cycles are aligned // with epochs. 
But for multi-epoch cycles, adjust the indexes if it From 7e3d007204b1b0372dfad586d6337dd3149bec58 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Fri, 18 Sep 2020 21:57:19 +0900 Subject: [PATCH 03/57] Enable shellcheck --- ci/buildkite-pipeline.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/buildkite-pipeline.sh b/ci/buildkite-pipeline.sh index a0be208ef58889..c9e3c97530442f 100755 --- a/ci/buildkite-pipeline.sh +++ b/ci/buildkite-pipeline.sh @@ -234,8 +234,8 @@ pull_or_push_steps() { # Check for any .sh file changes if affects .sh$; then - #command_step shellcheck "ci/shellcheck.sh" 5 - #wait_step + command_step shellcheck "ci/shellcheck.sh" 5 + wait_step true fi From a147f0a763d3974360ce70d615febff92f0eac02 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Fri, 18 Sep 2020 21:59:54 +0900 Subject: [PATCH 04/57] clean up --- ci/buildkite-pipeline.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/ci/buildkite-pipeline.sh b/ci/buildkite-pipeline.sh index c9e3c97530442f..ad13876a17e67b 100755 --- a/ci/buildkite-pipeline.sh +++ b/ci/buildkite-pipeline.sh @@ -236,7 +236,6 @@ pull_or_push_steps() { if affects .sh$; then command_step shellcheck "ci/shellcheck.sh" 5 wait_step - true fi # Run the full test suite by default, skipping only if modifications are local From 3e53824df6cf226a10eb02846e17b77e7f555115 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Fri, 18 Sep 2020 22:14:16 +0900 Subject: [PATCH 05/57] Sync cluster info with docs --- ci/live-cluster-sanity.sh | 16 +++++++++++----- docs/src/clusters.md | 37 +++++++++++++++++++------------------ 2 files changed, 30 insertions(+), 23 deletions(-) diff --git a/ci/live-cluster-sanity.sh b/ci/live-cluster-sanity.sh index 68589ce4c047ac..4b2bb2d0baa930 100755 --- a/ci/live-cluster-sanity.sh +++ b/ci/live-cluster-sanity.sh @@ -125,18 +125,24 @@ test_with_live_cluster() { upload-ci-artifact "$validator_log" } +# UPDATE docs/src/clusters.md TOO!! 
test_with_live_cluster "mainnet-beta" \ + --entrypoint mainnet-beta.solana.com:8001 \ --trusted-validator 7Np41oeYqPefeNQEHSv1UDhYrehxin3NStELsSKCT4K2 \ --trusted-validator GdnSyH3YtwcxFvQrVVJMm1JhTS4QVX7MFsX56uJLUfiZ \ --trusted-validator DE1bawNcRJB9rVm3buyMVfr8mBEoyyu73NBovf2oXJsJ \ - --entrypoint mainnet-beta.solana.com:8001 \ + --trusted-validator CakcnaRDHka2gXyfbEd2d3xsvkJkqsLw2akB3zsN1D2S \ --expected-genesis-hash 5eykt4UsFv8P8NJdTREpY1vzqKqZKvdpKuc147dw2N9d \ - --expected-shred-version 64864 + --expected-shred-version 64864 \ + # for your pain-less copy-paste +# UPDATE docs/src/clusters.md TOO!! test_with_live_cluster "testnet" \ + --entrypoint entrypoint.testnet.solana.com:8001 \ --trusted-validator 5D1fNXzvv5NjV1ysLjirC4WY92RNsVH18vjmcszZd8on \ - --entrypoint 35.203.170.30:8001 \ - --expected-genesis-hash 4uhcVJyU9pJkvQyS88uRDiswHXSCkY3zQawwpjk2NsNY \ - --expected-shred-version 1579 \ + --trusted-validator ta1Uvfb7W5BRPrdGnhP9RmeCGKzBySGM1hTE4rBRy6T \ + --trusted-validator Ft5fbkqNa76vnsjYNwjDZUXoTWpP7VYm3mtsaQckQADN \ + --trusted-validator 9QxCLckBiJc783jnMvXZubK4wH86Eqqvashtrwvcsgkv \ + # for your pain-less copy-paste ./net/gce.sh delete -p "$instance_prefix" && instance_deleted=yes diff --git a/docs/src/clusters.md b/docs/src/clusters.md index 09e5f935ed89b3..eb6e28ea718a07 100644 --- a/docs/src/clusters.md +++ b/docs/src/clusters.md @@ -42,15 +42,13 @@ solana config set --url https://api.devnet.solana.com ```bash $ solana-validator \ - --identity validator-keypair.json \ - --vote-account vote-account-keypair.json \ --trusted-validator dv1LfzJvDF7S1fBKpFgKoKXK5yoSosmkAdfbxBo1GqJ \ + --identity ~/validator-keypair.json \ + --vote-account ~/vote-account-keypair.json \ --no-untrusted-rpc \ --ledger ledger \ --rpc-port 8899 \ --dynamic-port-range 8000-8010 \ - --entrypoint entrypoint.devnet.solana.com:8001 \ - --expected-genesis-hash EtWTRABZaYq6iMfeYKouRu166VU2xqa1wcaWoxPkrZBG \ --wal-recovery-mode skip_any_corrupted_record \ --limit-ledger-size ``` @@ 
-87,22 +85,23 @@ solana config set --url https://api.testnet.solana.com ##### Example `solana-validator` command-line +[comment]: <> (UPDATE ci/live-cluster-sanity.sh TOO!) ```bash $ solana-validator \ - --identity validator-keypair.json \ - --vote-account vote-account-keypair.json \ + --entrypoint entrypoint.testnet.solana.com:8001 \ + --entrypoint entrypoint2.testnet.solana.com:8001 \ + --entrypoint entrypoint3.testnet.solana.com:8001 \ --trusted-validator 5D1fNXzvv5NjV1ysLjirC4WY92RNsVH18vjmcszZd8on \ --trusted-validator 7XSY3MrYnK8vq693Rju17bbPkCN3Z7KvvfvJx4kdrsSY \ --trusted-validator Ft5fbkqNa76vnsjYNwjDZUXoTWpP7VYm3mtsaQckQADN \ --trusted-validator 9QxCLckBiJc783jnMvXZubK4wH86Eqqvashtrwvcsgkv \ + --expected-genesis-hash 4uhcVJyU9pJkvQyS88uRDiswHXSCkY3zQawwpjk2NsNY \ + --identity ~/validator-keypair.json \ + --vote-account ~/vote-account-keypair.json \ --no-untrusted-rpc \ --ledger ledger \ --rpc-port 8899 \ --dynamic-port-range 8000-8010 \ - --entrypoint entrypoint.testnet.solana.com:8001 \ - --entrypoint entrypoint2.testnet.solana.com:8001 \ - --entrypoint entrypoint3.testnet.solana.com:8001 \ - --expected-genesis-hash 4uhcVJyU9pJkvQyS88uRDiswHXSCkY3zQawwpjk2NsNY \ --wal-recovery-mode skip_any_corrupted_record \ --limit-ledger-size ``` @@ -142,25 +141,27 @@ solana config set --url https://api.mainnet-beta.solana.com ##### Example `solana-validator` command-line +[comment]: <> (UPDATE ci/live-cluster-sanity.sh TOO!) 
```bash $ solana-validator \ - --identity ~/validator-keypair.json \ - --vote-account ~/vote-account-keypair.json \ + --entrypoint entrypoint.mainnet-beta.solana.com:8001 \ + --entrypoint entrypoint2.mainnet-beta.solana.com:8001 \ + --entrypoint entrypoint3.mainnet-beta.solana.com:8001 \ + --entrypoint entrypoint4.mainnet-beta.solana.com:8001 \ + --entrypoint entrypoint5.mainnet-beta.solana.com:8001 \ --trusted-validator 7Np41oeYqPefeNQEHSv1UDhYrehxin3NStELsSKCT4K2 \ --trusted-validator GdnSyH3YtwcxFvQrVVJMm1JhTS4QVX7MFsX56uJLUfiZ \ --trusted-validator DE1bawNcRJB9rVm3buyMVfr8mBEoyyu73NBovf2oXJsJ \ --trusted-validator CakcnaRDHka2gXyfbEd2d3xsvkJkqsLw2akB3zsN1D2S \ + --expected-genesis-hash 5eykt4UsFv8P8NJdTREpY1vzqKqZKvdpKuc147dw2N9d \ + --expected-shred-version 64864 \ + --identity ~/validator-keypair.json \ + --vote-account ~/vote-account-keypair.json \ --no-untrusted-rpc \ --ledger ledger \ --rpc-port 8899 \ --private-rpc \ --dynamic-port-range 8000-8010 \ - --entrypoint entrypoint.mainnet-beta.solana.com:8001 \ - --entrypoint entrypoint2.mainnet-beta.solana.com:8001 \ - --entrypoint entrypoint3.mainnet-beta.solana.com:8001 \ - --entrypoint entrypoint4.mainnet-beta.solana.com:8001 \ - --entrypoint entrypoint5.mainnet-beta.solana.com:8001 \ - --expected-genesis-hash 5eykt4UsFv8P8NJdTREpY1vzqKqZKvdpKuc147dw2N9d \ --wal-recovery-mode skip_any_corrupted_record \ --limit-ledger-size ``` From b5358773ba986bca4029dbfa761e3d104ddbef60 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Fri, 18 Sep 2020 22:18:12 +0900 Subject: [PATCH 06/57] Bad copy pasta... 
--- ci/live-cluster-sanity.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/live-cluster-sanity.sh b/ci/live-cluster-sanity.sh index 4b2bb2d0baa930..8c4cc799203178 100755 --- a/ci/live-cluster-sanity.sh +++ b/ci/live-cluster-sanity.sh @@ -143,6 +143,7 @@ test_with_live_cluster "testnet" \ --trusted-validator ta1Uvfb7W5BRPrdGnhP9RmeCGKzBySGM1hTE4rBRy6T \ --trusted-validator Ft5fbkqNa76vnsjYNwjDZUXoTWpP7VYm3mtsaQckQADN \ --trusted-validator 9QxCLckBiJc783jnMvXZubK4wH86Eqqvashtrwvcsgkv \ + --expected-genesis-hash 4uhcVJyU9pJkvQyS88uRDiswHXSCkY3zQawwpjk2NsNY \ # for your pain-less copy-paste ./net/gce.sh delete -p "$instance_prefix" && instance_deleted=yes From 4d1463ae365776f6a74c469ba1c613c889929d78 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Fri, 18 Sep 2020 22:34:38 +0900 Subject: [PATCH 07/57] Just use artifact_paths --- ci/buildkite-pipeline.sh | 4 ++-- ci/live-cluster-sanity.sh | 7 +------ 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/ci/buildkite-pipeline.sh b/ci/buildkite-pipeline.sh index ad13876a17e67b..daee2a31458949 100755 --- a/ci/buildkite-pipeline.sh +++ b/ci/buildkite-pipeline.sh @@ -174,8 +174,8 @@ all_test_steps() { - "queue=cuda" - command: "ci/live-cluster-sanity.sh" name: "live-cluster-sanity" - timeout_in_minutes: 40 - artifact_paths: "log-*.txt" + timeout_in_minutes: 20 + artifact_paths: "*-validator.log" agents: - "queue=gce-deploy" EOF diff --git a/ci/live-cluster-sanity.sh b/ci/live-cluster-sanity.sh index 8c4cc799203178..177ced845fced1 100755 --- a/ci/live-cluster-sanity.sh +++ b/ci/live-cluster-sanity.sh @@ -4,7 +4,6 @@ cd "$(dirname "$0")/.." 
source ci/_ source ci/rust-version.sh stable -source ci/upload-ci-artifact.sh escaped_branch=$(echo "$BUILDKITE_BRANCH" | tr -c "[:alnum:]" - | sed -r "s#(^-*|-*head-*|-*$)##g") instance_prefix="testnet-live-sanity-$escaped_branch" @@ -18,8 +17,6 @@ on_trap() { if [[ -z $instance_deleted ]]; then ( set +e - upload-ci-artifact cluster-sanity/testnet-validator.log - upload-ci-artifact cluster-sanity/mainnet-beta-validator.log _ ./net/gce.sh delete -p "$instance_prefix" ) fi @@ -41,7 +38,7 @@ test_with_live_cluster() { ./net/ssh.sh "$instance_ip" rm -rf cluster-sanity ./net/ssh.sh "$instance_ip" mkdir cluster-sanity - validator_log="cluster-sanity/$cluster_label-validator.log" + validator_log="$cluster_label-validator.log" (./net/ssh.sh "$instance_ip" -Llocalhost:18899:localhost:18899 ./solana-validator \ --no-untrusted-rpc \ --ledger cluster-sanity/ledger \ @@ -121,8 +118,6 @@ test_with_live_cluster() { (sleep 3 && kill "$tail_pid") & kill_pid=$! wait "$ssh_pid" "$tail_pid" "$kill_pid" - - upload-ci-artifact "$validator_log" } # UPDATE docs/src/clusters.md TOO!! From f8706be4117fe6ea9392fc78a7d09d8eb3785260 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Fri, 18 Sep 2020 22:59:55 +0900 Subject: [PATCH 08/57] Minor polishments --- ci/live-cluster-sanity.sh | 10 +++++----- docs/src/clusters.md | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/ci/live-cluster-sanity.sh b/ci/live-cluster-sanity.sh index 177ced845fced1..de88dea5b63bf3 100755 --- a/ci/live-cluster-sanity.sh +++ b/ci/live-cluster-sanity.sh @@ -24,7 +24,7 @@ on_trap() { trap on_trap INT TERM EXIT _ cargo +"$rust_stable" build --bins --release -_ ./net/scp.sh ./target/release/solana-validator "$instance_ip":. +_ ./net/scp.sh ./target/release/solana-validator "$instance_ip:." 
echo 500000 | ./net/ssh.sh "$instance_ip" sudo tee /proc/sys/vm/max_map_count > /dev/null test_with_live_cluster() { @@ -39,7 +39,7 @@ test_with_live_cluster() { ./net/ssh.sh "$instance_ip" mkdir cluster-sanity validator_log="$cluster_label-validator.log" - (./net/ssh.sh "$instance_ip" -Llocalhost:18899:localhost:18899 ./solana-validator \ + ./net/ssh.sh "$instance_ip" -Llocalhost:18899:localhost:18899 ./solana-validator \ --no-untrusted-rpc \ --ledger cluster-sanity/ledger \ --log - \ @@ -49,7 +49,7 @@ test_with_live_cluster() { --rpc-port 18899 \ --rpc-bind-address localhost \ --snapshot-interval-slots 0 \ - "$@" ) &> "$validator_log" & + "$@" &> "$validator_log" & ssh_pid=$! tail -F "$validator_log" > cluster-sanity/log-tail 2> /dev/null & tail_pid=$! @@ -70,7 +70,7 @@ test_with_live_cluster() { echo "##### validator is starting... (until timeout: $attempts) #####" if find cluster-sanity/log-tail -not -empty | grep ^ > /dev/null; then echo "##### new log:" - timeout 1 cat cluster-sanity/log-tail | tail -n 3 || true + timeout 1 cat cluster-sanity/log-tail | tail -n 3 | cut -c 1-200 || true truncate --size 0 cluster-sanity/log-tail echo fi @@ -102,7 +102,7 @@ test_with_live_cluster() { echo "##### validator is running ($current_root/$goal_root)... (until timeout: $attempts) #####" if find cluster-sanity/log-tail -not -empty | grep ^ > /dev/null; then echo "##### new log:" - timeout 1 cat cluster-sanity/log-tail | tail -n 3 || true + timeout 1 cat cluster-sanity/log-tail | tail -n 3 | cut -c 1-200 || true truncate --size 0 cluster-sanity/log-tail echo fi diff --git a/docs/src/clusters.md b/docs/src/clusters.md index eb6e28ea718a07..b6cfe59f420e95 100644 --- a/docs/src/clusters.md +++ b/docs/src/clusters.md @@ -85,7 +85,7 @@ solana config set --url https://api.testnet.solana.com ##### Example `solana-validator` command-line -[comment]: <> (UPDATE ci/live-cluster-sanity.sh TOO!) +[comment]: <> (UPDATE ci/live-cluster-sanity.sh TOO!!) 
```bash $ solana-validator \ --entrypoint entrypoint.testnet.solana.com:8001 \ @@ -141,7 +141,7 @@ solana config set --url https://api.mainnet-beta.solana.com ##### Example `solana-validator` command-line -[comment]: <> (UPDATE ci/live-cluster-sanity.sh TOO!) +[comment]: <> (UPDATE ci/live-cluster-sanity.sh TOO!!) ```bash $ solana-validator \ --entrypoint entrypoint.mainnet-beta.solana.com:8001 \ From 87e19edb778e4d84f80927c643723d7ff1b71252 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Fri, 18 Sep 2020 23:11:36 +0900 Subject: [PATCH 09/57] Remove needless sleep and display longer line more --- ci/live-cluster-sanity.sh | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/ci/live-cluster-sanity.sh b/ci/live-cluster-sanity.sh index de88dea5b63bf3..ebfd8cf6a0a3e4 100755 --- a/ci/live-cluster-sanity.sh +++ b/ci/live-cluster-sanity.sh @@ -53,7 +53,6 @@ test_with_live_cluster() { ssh_pid=$! tail -F "$validator_log" > cluster-sanity/log-tail 2> /dev/null & tail_pid=$! - sleep 3 attempts=100 while ! ./net/ssh.sh "$instance_ip" test -f cluster-sanity/init-completed &> /dev/null ; do @@ -70,7 +69,7 @@ test_with_live_cluster() { echo "##### validator is starting... (until timeout: $attempts) #####" if find cluster-sanity/log-tail -not -empty | grep ^ > /dev/null; then echo "##### new log:" - timeout 1 cat cluster-sanity/log-tail | tail -n 3 | cut -c 1-200 || true + timeout 1 cat cluster-sanity/log-tail | tail -n 3 | cut -c 1-300 || true truncate --size 0 cluster-sanity/log-tail echo fi @@ -102,7 +101,7 @@ test_with_live_cluster() { echo "##### validator is running ($current_root/$goal_root)... 
(until timeout: $attempts) #####" if find cluster-sanity/log-tail -not -empty | grep ^ > /dev/null; then echo "##### new log:" - timeout 1 cat cluster-sanity/log-tail | tail -n 3 | cut -c 1-200 || true + timeout 1 cat cluster-sanity/log-tail | tail -n 3 | cut -c 1-300 || true truncate --size 0 cluster-sanity/log-tail echo fi @@ -113,7 +112,6 @@ test_with_live_cluster() { -H 'Content-Type: application/json' \ -d '{"jsonrpc":"2.0","id":1, "method":"validatorExit"}' \ http://localhost:18899 - sleep 10 (sleep 3 && kill "$tail_pid") & kill_pid=$! From 9e4c48fe2c0df6be21d46d44e25585c7f5735b69 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Fri, 18 Sep 2020 23:19:30 +0900 Subject: [PATCH 10/57] Extract some --- ci/live-cluster-sanity.sh | 44 +++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/ci/live-cluster-sanity.sh b/ci/live-cluster-sanity.sh index ebfd8cf6a0a3e4..56e62697d8e262 100755 --- a/ci/live-cluster-sanity.sh +++ b/ci/live-cluster-sanity.sh @@ -27,6 +27,24 @@ _ cargo +"$rust_stable" build --bins --release _ ./net/scp.sh ./target/release/solana-validator "$instance_ip:." echo 500000 | ./net/ssh.sh "$instance_ip" sudo tee /proc/sys/vm/max_map_count > /dev/null +on_error() { + status=$1 + set +e + kill $ssh_pid $tail_pid + wait $ssh_pid $tail_pid + echo "Error: validator failed to $status" + exit 1 +} + +show_log() { + if find cluster-sanity/log-tail -not -empty | grep ^ > /dev/null; then + echo "##### new log:" + timeout 1 cat cluster-sanity/log-tail | tail -n 3 | cut -c 1-300 || true + truncate --size 0 cluster-sanity/log-tail + echo + fi +} + test_with_live_cluster() { cluster_label="$1" shift @@ -58,21 +76,12 @@ test_with_live_cluster() { while ! ./net/ssh.sh "$instance_ip" test -f cluster-sanity/init-completed &> /dev/null ; do attempts=$((attempts - 1)) if [[ (($attempts == 0)) || ! 
-d "/proc/$ssh_pid" ]]; then - set +e - kill $ssh_pid $tail_pid - wait $ssh_pid $tail_pid - echo "Error: validator failed to boot" - exit 1 + on_error "start" fi sleep 3 echo "##### validator is starting... (until timeout: $attempts) #####" - if find cluster-sanity/log-tail -not -empty | grep ^ > /dev/null; then - echo "##### new log:" - timeout 1 cat cluster-sanity/log-tail | tail -n 3 | cut -c 1-300 || true - truncate --size 0 cluster-sanity/log-tail - echo - fi + show_log done snapshot_slot=$(./net/ssh.sh "$instance_ip" ls -t cluster-sanity/ledger/snapshot* | @@ -89,22 +98,13 @@ test_with_live_cluster() { while [[ $current_root -le $goal_root ]]; do attempts=$((attempts - 1)) if [[ (($attempts == 0)) || ! -d "/proc/$ssh_pid" ]]; then - set +e - kill $ssh_pid $tail_pid - wait $ssh_pid $tail_pid - echo "Error: validator failed to boot" - exit 1 + on_error "root new slots" fi sleep 3 current_root=$(./target/release/solana --url http://localhost:18899 slot --commitment root) echo "##### validator is running ($current_root/$goal_root)... 
(until timeout: $attempts) #####" - if find cluster-sanity/log-tail -not -empty | grep ^ > /dev/null; then - echo "##### new log:" - timeout 1 cat cluster-sanity/log-tail | tail -n 3 | cut -c 1-300 || true - truncate --size 0 cluster-sanity/log-tail - echo - fi + show_log done _ curl \ From 2f141087902cd66522ea9759ae59d03a384f0348 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Fri, 18 Sep 2020 23:23:27 +0900 Subject: [PATCH 11/57] Reorder a bit --- ci/live-cluster-sanity.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ci/live-cluster-sanity.sh b/ci/live-cluster-sanity.sh index 56e62697d8e262..dd2584c208bf31 100755 --- a/ci/live-cluster-sanity.sh +++ b/ci/live-cluster-sanity.sh @@ -84,17 +84,17 @@ test_with_live_cluster() { show_log done + echo "--- Monitoring validator $cluster_label" + snapshot_slot=$(./net/ssh.sh "$instance_ip" ls -t cluster-sanity/ledger/snapshot* | head -n 1 | grep -o 'snapshot-[0-9]*-' | grep -o '[0-9]*' ) - - echo "--- Monitoring validator $cluster_label" - - attempts=100 current_root=$snapshot_slot goal_root=$((snapshot_slot + 100)) + + attempts=100 while [[ $current_root -le $goal_root ]]; do attempts=$((attempts - 1)) if [[ (($attempts == 0)) || ! 
-d "/proc/$ssh_pid" ]]; then From 553b17eb4ab23c22ecb922b6aa5f540085d3a6ac Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Fri, 18 Sep 2020 23:24:42 +0900 Subject: [PATCH 12/57] Fix shellcheck --- ci/live-cluster-sanity.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/live-cluster-sanity.sh b/ci/live-cluster-sanity.sh index dd2584c208bf31..65a15e04ff5519 100755 --- a/ci/live-cluster-sanity.sh +++ b/ci/live-cluster-sanity.sh @@ -30,8 +30,8 @@ echo 500000 | ./net/ssh.sh "$instance_ip" sudo tee /proc/sys/vm/max_map_count > on_error() { status=$1 set +e - kill $ssh_pid $tail_pid - wait $ssh_pid $tail_pid + kill "$ssh_pid" "$tail_pid" + wait "$ssh_pid" "$tail_pid" echo "Error: validator failed to $status" exit 1 } From 764ccc4823723357f2762b562ba64db387e4d508 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Thu, 24 Sep 2020 15:57:10 +0900 Subject: [PATCH 13/57] Use more compatible commenting hack? --- docs/src/clusters.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/src/clusters.md b/docs/src/clusters.md index b6cfe59f420e95..23d805c57dbf9d 100644 --- a/docs/src/clusters.md +++ b/docs/src/clusters.md @@ -85,7 +85,8 @@ solana config set --url https://api.testnet.solana.com ##### Example `solana-validator` command-line -[comment]: <> (UPDATE ci/live-cluster-sanity.sh TOO!!) +[comment]: # (UPDATE ci/live-cluster-sanity.sh TOO!!) + ```bash $ solana-validator \ --entrypoint entrypoint.testnet.solana.com:8001 \ @@ -141,7 +142,8 @@ solana config set --url https://api.mainnet-beta.solana.com ##### Example `solana-validator` command-line -[comment]: <> (UPDATE ci/live-cluster-sanity.sh TOO!!) +[comment]: # (UPDATE ci/live-cluster-sanity.sh TOO!!) 
+ ```bash $ solana-validator \ --entrypoint entrypoint.mainnet-beta.solana.com:8001 \ From 5fc203e3d185e4275c2808152a451a1b75d51526 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Thu, 24 Sep 2020 16:49:29 +0900 Subject: [PATCH 14/57] Minor review comments --- ci/live-cluster-sanity.sh | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/ci/live-cluster-sanity.sh b/ci/live-cluster-sanity.sh index 65a15e04ff5519..30786e0aae14c0 100755 --- a/ci/live-cluster-sanity.sh +++ b/ci/live-cluster-sanity.sh @@ -5,8 +5,13 @@ cd "$(dirname "$0")/.." source ci/_ source ci/rust-version.sh stable -escaped_branch=$(echo "$BUILDKITE_BRANCH" | tr -c "[:alnum:]" - | sed -r "s#(^-*|-*head-*|-*$)##g") -instance_prefix="testnet-live-sanity-$escaped_branch" +if [[ -n $CI ]]; then + escaped_branch=$(echo "$BUILDKITE_BRANCH" | tr -c "[:alnum:]" - | sed -r "s#(^-*|-*head-*|-*$)##g") + instance_prefix="testnet-live-sanity-$escaped_branch" +else + instance_prefix="testnet-live-sanity-$(whoami)" --self-destruct-hours 1 +fi + # ensure to delete leftover cluster ./net/gce.sh delete -p "$instance_prefix" || true # only bootstrap, no normal validator @@ -14,12 +19,8 @@ instance_prefix="testnet-live-sanity-$escaped_branch" instance_ip=$(./net/gce.sh info | grep bootstrap-validator | awk '{print $3}') on_trap() { - if [[ -z $instance_deleted ]]; then - ( - set +e - _ ./net/gce.sh delete -p "$instance_prefix" - ) - fi + set +e + _ ./net/gce.sh delete -p "$instance_prefix" } trap on_trap INT TERM EXIT @@ -115,7 +116,7 @@ test_with_live_cluster() { (sleep 3 && kill "$tail_pid") & kill_pid=$! - wait "$ssh_pid" "$tail_pid" "$kill_pid" + timeout 30 wait "$ssh_pid" "$tail_pid" "$kill_pid" } # UPDATE docs/src/clusters.md TOO!! 
@@ -138,5 +139,3 @@ test_with_live_cluster "testnet" \ --trusted-validator 9QxCLckBiJc783jnMvXZubK4wH86Eqqvashtrwvcsgkv \ --expected-genesis-hash 4uhcVJyU9pJkvQyS88uRDiswHXSCkY3zQawwpjk2NsNY \ # for your pain-less copy-paste - -./net/gce.sh delete -p "$instance_prefix" && instance_deleted=yes From 69dd2b21d2311cc217eba2024b47b6f6f53aca50 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Thu, 24 Sep 2020 16:51:20 +0900 Subject: [PATCH 15/57] Wrong place... --- ci/live-cluster-sanity.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/live-cluster-sanity.sh b/ci/live-cluster-sanity.sh index 30786e0aae14c0..dc31e0336ea925 100755 --- a/ci/live-cluster-sanity.sh +++ b/ci/live-cluster-sanity.sh @@ -9,13 +9,13 @@ if [[ -n $CI ]]; then escaped_branch=$(echo "$BUILDKITE_BRANCH" | tr -c "[:alnum:]" - | sed -r "s#(^-*|-*head-*|-*$)##g") instance_prefix="testnet-live-sanity-$escaped_branch" else - instance_prefix="testnet-live-sanity-$(whoami)" --self-destruct-hours 1 + instance_prefix="testnet-live-sanity-$(whoami)" fi # ensure to delete leftover cluster ./net/gce.sh delete -p "$instance_prefix" || true # only bootstrap, no normal validator -./net/gce.sh create -p "$instance_prefix" -n 0 +./net/gce.sh create -p "$instance_prefix" -n 0 --self-destruct-hours 1 instance_ip=$(./net/gce.sh info | grep bootstrap-validator | awk '{print $3}') on_trap() { From 570887e44ead6acb227e830d1992d82aaeaa9e23 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Thu, 24 Sep 2020 17:17:14 +0900 Subject: [PATCH 16/57] Extract into new remote shell --- ci/live-cluster-sanity.sh | 99 +++----------------------------- ci/remote-live-cluster-sanity.sh | 90 +++++++++++++++++++++++++++++ 2 files changed, 97 insertions(+), 92 deletions(-) create mode 100755 ci/remote-live-cluster-sanity.sh diff --git a/ci/live-cluster-sanity.sh b/ci/live-cluster-sanity.sh index dc31e0336ea925..8877ae62109d59 100755 --- a/ci/live-cluster-sanity.sh +++ b/ci/live-cluster-sanity.sh @@ -13,9 +13,9 @@ else 
fi # ensure to delete leftover cluster -./net/gce.sh delete -p "$instance_prefix" || true +_ ./net/gce.sh delete -p "$instance_prefix" || true # only bootstrap, no normal validator -./net/gce.sh create -p "$instance_prefix" -n 0 --self-destruct-hours 1 +_ ./net/gce.sh create -p "$instance_prefix" -n 0 --self-destruct-hours 1 instance_ip=$(./net/gce.sh info | grep bootstrap-validator | awk '{print $3}') on_trap() { @@ -25,98 +25,13 @@ on_trap() { trap on_trap INT TERM EXIT _ cargo +"$rust_stable" build --bins --release -_ ./net/scp.sh ./target/release/solana-validator "$instance_ip:." -echo 500000 | ./net/ssh.sh "$instance_ip" sudo tee /proc/sys/vm/max_map_count > /dev/null - -on_error() { - status=$1 - set +e - kill "$ssh_pid" "$tail_pid" - wait "$ssh_pid" "$tail_pid" - echo "Error: validator failed to $status" - exit 1 -} - -show_log() { - if find cluster-sanity/log-tail -not -empty | grep ^ > /dev/null; then - echo "##### new log:" - timeout 1 cat cluster-sanity/log-tail | tail -n 3 | cut -c 1-300 || true - truncate --size 0 cluster-sanity/log-tail - echo - fi -} +_ ./net/scp.sh \ + ./ci/remote-live-cluster-sanity.sh \ + ./target/release/{solana,solana-validator,solana-sys-tuner} \ + "$instance_ip:." test_with_live_cluster() { - cluster_label="$1" - shift - - echo "--- Starting validator $cluster_label" - - rm -rf cluster-sanity - mkdir cluster-sanity - ./net/ssh.sh "$instance_ip" rm -rf cluster-sanity - ./net/ssh.sh "$instance_ip" mkdir cluster-sanity - - validator_log="$cluster_label-validator.log" - ./net/ssh.sh "$instance_ip" -Llocalhost:18899:localhost:18899 ./solana-validator \ - --no-untrusted-rpc \ - --ledger cluster-sanity/ledger \ - --log - \ - --init-complete-file cluster-sanity/init-completed \ - --enable-rpc-exit \ - --private-rpc \ - --rpc-port 18899 \ - --rpc-bind-address localhost \ - --snapshot-interval-slots 0 \ - "$@" &> "$validator_log" & - ssh_pid=$! - tail -F "$validator_log" > cluster-sanity/log-tail 2> /dev/null & - tail_pid=$! 
- - attempts=100 - while ! ./net/ssh.sh "$instance_ip" test -f cluster-sanity/init-completed &> /dev/null ; do - attempts=$((attempts - 1)) - if [[ (($attempts == 0)) || ! -d "/proc/$ssh_pid" ]]; then - on_error "start" - fi - - sleep 3 - echo "##### validator is starting... (until timeout: $attempts) #####" - show_log - done - - echo "--- Monitoring validator $cluster_label" - - snapshot_slot=$(./net/ssh.sh "$instance_ip" ls -t cluster-sanity/ledger/snapshot* | - head -n 1 | - grep -o 'snapshot-[0-9]*-' | - grep -o '[0-9]*' - ) - current_root=$snapshot_slot - goal_root=$((snapshot_slot + 100)) - - attempts=100 - while [[ $current_root -le $goal_root ]]; do - attempts=$((attempts - 1)) - if [[ (($attempts == 0)) || ! -d "/proc/$ssh_pid" ]]; then - on_error "root new slots" - fi - - sleep 3 - current_root=$(./target/release/solana --url http://localhost:18899 slot --commitment root) - echo "##### validator is running ($current_root/$goal_root)... (until timeout: $attempts) #####" - show_log - done - - _ curl \ - -X POST \ - -H 'Content-Type: application/json' \ - -d '{"jsonrpc":"2.0","id":1, "method":"validatorExit"}' \ - http://localhost:18899 - - (sleep 3 && kill "$tail_pid") & - kill_pid=$! - timeout 30 wait "$ssh_pid" "$tail_pid" "$kill_pid" + ./net/ssh.sh "$instance_ip" ./remote-live-cluster-sanity.sh "$@" } # UPDATE docs/src/clusters.md TOO!! 
diff --git a/ci/remote-live-cluster-sanity.sh b/ci/remote-live-cluster-sanity.sh new file mode 100755 index 00000000000000..4a0dcfed3790f3 --- /dev/null +++ b/ci/remote-live-cluster-sanity.sh @@ -0,0 +1,90 @@ +#!/usr/bin/env bash + +handle_error() { + action=$1 + set +e + kill "$ssh_pid" "$tail_pid" + wait "$ssh_pid" "$tail_pid" + echo "--- Error: validator failed to $action" + exit 1 +} + +show_log() { + if find cluster-sanity/log-tail -not -empty | grep ^ > /dev/null; then + echo "##### new log:" + timeout 1 cat cluster-sanity/log-tail | tail -n 3 | cut -c 1-300 || true + truncate --size 0 cluster-sanity/log-tail + echo + fi +} + +rm -rf cluster-sanity +mkdir cluster-sanity + +cluster_label="$1" +shift + +echo "--- Starting validator $cluster_label" + +validator_log="$cluster_label-validator.log" +./solana-validator \ + --no-untrusted-rpc \ + --ledger cluster-sanity/ledger \ + --log - \ + --init-complete-file cluster-sanity/init-completed \ + --enable-rpc-exit \ + --private-rpc \ + --rpc-port 8899 \ + --rpc-bind-address localhost \ + --snapshot-interval-slots 0 \ + "$@" &> "$validator_log" + +validator_pid=$! +tail -F "$validator_log" > cluster-sanity/log-tail 2> /dev/null & +tail_pid=$! + +attempts=100 +while ! [[ -f cluster-sanity/init-completed ]]; do + attempts=$((attempts - 1)) + if [[ (($attempts == 0)) || ! -d "/proc/$validator_pid" ]]; then + handle_error "start" + fi + + sleep 3 + echo "##### validator is starting... (until timeout: $attempts) #####" + show_log +done + +echo "--- Monitoring validator $cluster_label" + +snapshot_slot=$(ls -t cluster-sanity/ledger/snapshot* | + head -n 1 | + grep -o 'snapshot-[0-9]*-' | + grep -o '[0-9]*' +) +current_root=$snapshot_slot +goal_root=$((snapshot_slot + 100)) + +attempts=100 +while [[ $current_root -le $goal_root ]]; do + attempts=$((attempts - 1)) + if [[ (($attempts == 0)) || ! 
-d "/proc/$validator_pid" ]]; then + handle_error "root new slots" + fi + + sleep 3 + current_root=$(./solana --url http://localhost:8899 slot --commitment root) + echo "##### validator is running ($current_root/$goal_root)... (until timeout: $attempts) #####" + show_log +done + +curl \ + -X POST \ + -H 'Content-Type: application/json' \ + -d '{"jsonrpc":"2.0","id":1, "method":"validatorExit"}' \ + http://localhost:8899 + +(sleep 3 && kill "$tail_pid") & +(sleep 30 && kill -KILL "$validator_pid" "$tail_pid") & +kill_pid=$! +wait "$validator_pid" "$tail_pid" "$kill_pid" From 2999edafc570b56a9eff7077eaca451bff97a30f Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Thu, 24 Sep 2020 17:26:08 +0900 Subject: [PATCH 17/57] Fix shellcheck --- ci/remote-live-cluster-sanity.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ci/remote-live-cluster-sanity.sh b/ci/remote-live-cluster-sanity.sh index 4a0dcfed3790f3..e6837555559126 100755 --- a/ci/remote-live-cluster-sanity.sh +++ b/ci/remote-live-cluster-sanity.sh @@ -3,8 +3,8 @@ handle_error() { action=$1 set +e - kill "$ssh_pid" "$tail_pid" - wait "$ssh_pid" "$tail_pid" + kill "$validator_pid" "$tail_pid" + wait "$validator_pid" "$tail_pid" echo "--- Error: validator failed to $action" exit 1 } @@ -57,6 +57,7 @@ done echo "--- Monitoring validator $cluster_label" +# shellcheck disable=SC2012 snapshot_slot=$(ls -t cluster-sanity/ledger/snapshot* | head -n 1 | grep -o 'snapshot-[0-9]*-' | From bfd3c9667f8dba26a8c571831a6d9520b2eac39b Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Thu, 24 Sep 2020 17:33:58 +0900 Subject: [PATCH 18/57] Forgot the &... 
--- ci/remote-live-cluster-sanity.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/remote-live-cluster-sanity.sh b/ci/remote-live-cluster-sanity.sh index e6837555559126..4ecb47793d14d0 100755 --- a/ci/remote-live-cluster-sanity.sh +++ b/ci/remote-live-cluster-sanity.sh @@ -37,7 +37,7 @@ validator_log="$cluster_label-validator.log" --rpc-port 8899 \ --rpc-bind-address localhost \ --snapshot-interval-slots 0 \ - "$@" &> "$validator_log" + "$@" &> "$validator_log" & validator_pid=$! tail -F "$validator_log" > cluster-sanity/log-tail 2> /dev/null & From 80b3009e9b440263e742a3de687cb18b1793c38f Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Thu, 24 Sep 2020 17:56:19 +0900 Subject: [PATCH 19/57] Collect logs and run sys-tuner --- ci/buildkite-pipeline.sh | 4 +++- ci/live-cluster-sanity.sh | 9 ++++++++- ci/remote-live-cluster-sanity.sh | 11 +++++++---- 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/ci/buildkite-pipeline.sh b/ci/buildkite-pipeline.sh index daee2a31458949..5a01a20dfeacec 100755 --- a/ci/buildkite-pipeline.sh +++ b/ci/buildkite-pipeline.sh @@ -175,7 +175,9 @@ all_test_steps() { - command: "ci/live-cluster-sanity.sh" name: "live-cluster-sanity" timeout_in_minutes: 20 - artifact_paths: "*-validator.log" + artifact_paths: + - "*-validator.log" + - "*-sys-tuner.log" agents: - "queue=gce-deploy" EOF diff --git a/ci/live-cluster-sanity.sh b/ci/live-cluster-sanity.sh index 8877ae62109d59..fe5bef10553835 100755 --- a/ci/live-cluster-sanity.sh +++ b/ci/live-cluster-sanity.sh @@ -31,7 +31,14 @@ _ ./net/scp.sh \ "$instance_ip:." test_with_live_cluster() { - ./net/ssh.sh "$instance_ip" ./remote-live-cluster-sanity.sh "$@" + cluster_label="$1" + + _ ./net/ssh.sh "$instance_ip" ./remote-live-cluster-sanity.sh "$@" + + # good it existed successfully; let's collect logs for profit! + for log in $(./net/ssh.sh ls -l '*.log'); do + _ ./net/scp.sh "$instance_ip:$log" . + done } # UPDATE docs/src/clusters.md TOO!! 
diff --git a/ci/remote-live-cluster-sanity.sh b/ci/remote-live-cluster-sanity.sh index 4ecb47793d14d0..9841057a9f1352 100755 --- a/ci/remote-live-cluster-sanity.sh +++ b/ci/remote-live-cluster-sanity.sh @@ -27,6 +27,10 @@ shift echo "--- Starting validator $cluster_label" validator_log="$cluster_label-validator.log" +sys_tuner_log="$cluster_label-sys-tuner.log" +sudo ./solana-sys-tuner --user $(whoami) > "$sys_tuner_log" & +sys_tuner_pid=$! + ./solana-validator \ --no-untrusted-rpc \ --ledger cluster-sanity/ledger \ @@ -38,7 +42,6 @@ validator_log="$cluster_label-validator.log" --rpc-bind-address localhost \ --snapshot-interval-slots 0 \ "$@" &> "$validator_log" & - validator_pid=$! tail -F "$validator_log" > cluster-sanity/log-tail 2> /dev/null & tail_pid=$! @@ -85,7 +88,7 @@ curl \ -d '{"jsonrpc":"2.0","id":1, "method":"validatorExit"}' \ http://localhost:8899 -(sleep 3 && kill "$tail_pid") & -(sleep 30 && kill -KILL "$validator_pid" "$tail_pid") & +(sleep 3 && kill "$tail_pid" && sudo kill "$sys_tuner_pid") & kill_pid=$! -wait "$validator_pid" "$tail_pid" "$kill_pid" + +wait "$validator_pid" "$sys_tuner_pid" "$tail_pid" "$kill_pid" From 5e8368948148a4ba7ae9ae0d1d16cec5ee90eb55 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Thu, 24 Sep 2020 18:00:31 +0900 Subject: [PATCH 20/57] Fix shellcheck --- ci/remote-live-cluster-sanity.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/remote-live-cluster-sanity.sh b/ci/remote-live-cluster-sanity.sh index 9841057a9f1352..2157f672781b31 100755 --- a/ci/remote-live-cluster-sanity.sh +++ b/ci/remote-live-cluster-sanity.sh @@ -28,7 +28,7 @@ echo "--- Starting validator $cluster_label" validator_log="$cluster_label-validator.log" sys_tuner_log="$cluster_label-sys-tuner.log" -sudo ./solana-sys-tuner --user $(whoami) > "$sys_tuner_log" & +sudo ./solana-sys-tuner --user "$(whoami)" > "$sys_tuner_log" & sys_tuner_pid=$! 
./solana-validator \ From 3ed39019d16b03dd13a0c852c8a7d858d0c2448e Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Thu, 24 Sep 2020 18:04:59 +0900 Subject: [PATCH 21/57] Really fix shellcheck --- ci/live-cluster-sanity.sh | 2 -- ci/remote-live-cluster-sanity.sh | 5 +++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/ci/live-cluster-sanity.sh b/ci/live-cluster-sanity.sh index fe5bef10553835..5440db408978cc 100755 --- a/ci/live-cluster-sanity.sh +++ b/ci/live-cluster-sanity.sh @@ -31,8 +31,6 @@ _ ./net/scp.sh \ "$instance_ip:." test_with_live_cluster() { - cluster_label="$1" - _ ./net/ssh.sh "$instance_ip" ./remote-live-cluster-sanity.sh "$@" # good it existed successfully; let's collect logs for profit! diff --git a/ci/remote-live-cluster-sanity.sh b/ci/remote-live-cluster-sanity.sh index 2157f672781b31..ac56392b7923fb 100755 --- a/ci/remote-live-cluster-sanity.sh +++ b/ci/remote-live-cluster-sanity.sh @@ -28,7 +28,8 @@ echo "--- Starting validator $cluster_label" validator_log="$cluster_label-validator.log" sys_tuner_log="$cluster_label-sys-tuner.log" -sudo ./solana-sys-tuner --user "$(whoami)" > "$sys_tuner_log" & +# shellcheck disable=SC2024 # create log as non-root user +sudo ./solana-sys-tuner --user "$(whoami)" &> "$sys_tuner_log" & sys_tuner_pid=$! 
./solana-validator \ @@ -60,7 +61,7 @@ done echo "--- Monitoring validator $cluster_label" -# shellcheck disable=SC2012 +# shellcheck disable=SC2012 # ls here is handy for sorted snapshots snapshot_slot=$(ls -t cluster-sanity/ledger/snapshot* | head -n 1 | grep -o 'snapshot-[0-9]*-' | From acdedc126a3949b3698a39f60cadfaaa4ab4c87b Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Thu, 24 Sep 2020 18:23:05 +0900 Subject: [PATCH 22/57] Fix killing tuner and really collect logs for profit --- ci/live-cluster-sanity.sh | 2 +- ci/remote-live-cluster-sanity.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/live-cluster-sanity.sh b/ci/live-cluster-sanity.sh index 5440db408978cc..56ebf84b818465 100755 --- a/ci/live-cluster-sanity.sh +++ b/ci/live-cluster-sanity.sh @@ -34,7 +34,7 @@ test_with_live_cluster() { _ ./net/ssh.sh "$instance_ip" ./remote-live-cluster-sanity.sh "$@" # good it existed successfully; let's collect logs for profit! - for log in $(./net/ssh.sh ls -l '*.log'); do + for log in $(./net/ssh.sh "$instance_ip" ls -l '*.log'); do _ ./net/scp.sh "$instance_ip:$log" . done } diff --git a/ci/remote-live-cluster-sanity.sh b/ci/remote-live-cluster-sanity.sh index ac56392b7923fb..faf2da496d1f8b 100755 --- a/ci/remote-live-cluster-sanity.sh +++ b/ci/remote-live-cluster-sanity.sh @@ -89,7 +89,7 @@ curl \ -d '{"jsonrpc":"2.0","id":1, "method":"validatorExit"}' \ http://localhost:8899 -(sleep 3 && kill "$tail_pid" && sudo kill "$sys_tuner_pid") & +(set -x && sleep 3 && sudo kill "$sys_tuner_pid" && kill "$tail_pid") & kill_pid=$! 
wait "$validator_pid" "$sys_tuner_pid" "$tail_pid" "$kill_pid" From 199954497c764fbd5e5085ae67bc9c72d330c352 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Thu, 24 Sep 2020 19:04:46 +0900 Subject: [PATCH 23/57] Really kill --- ci/live-cluster-sanity.sh | 2 +- ci/remote-live-cluster-sanity.sh | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/ci/live-cluster-sanity.sh b/ci/live-cluster-sanity.sh index 56ebf84b818465..d35ab42e131766 100755 --- a/ci/live-cluster-sanity.sh +++ b/ci/live-cluster-sanity.sh @@ -34,7 +34,7 @@ test_with_live_cluster() { _ ./net/ssh.sh "$instance_ip" ./remote-live-cluster-sanity.sh "$@" # good it existed successfully; let's collect logs for profit! - for log in $(./net/ssh.sh "$instance_ip" ls -l '*.log'); do + for log in $(./net/ssh.sh "$instance_ip" ls '*.log'); do _ ./net/scp.sh "$instance_ip:$log" . done } diff --git a/ci/remote-live-cluster-sanity.sh b/ci/remote-live-cluster-sanity.sh index faf2da496d1f8b..031ef11a6b262b 100755 --- a/ci/remote-live-cluster-sanity.sh +++ b/ci/remote-live-cluster-sanity.sh @@ -89,7 +89,8 @@ curl \ -d '{"jsonrpc":"2.0","id":1, "method":"validatorExit"}' \ http://localhost:8899 -(set -x && sleep 3 && sudo kill "$sys_tuner_pid" && kill "$tail_pid") & +ps auxf +(set -x && sleep 3 && sudo kill "$sys_tuner_pid" && kill "$tail_pid" && sudo pkill sys-tuner) & kill_pid=$! wait "$validator_pid" "$sys_tuner_pid" "$tail_pid" "$kill_pid" From 2cf13182f964e1b87244005aaa92ce3b894ed7a3 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Fri, 25 Sep 2020 13:47:52 +0900 Subject: [PATCH 24/57] Really kill sys-tuner... 
--- ci/remote-live-cluster-sanity.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/remote-live-cluster-sanity.sh b/ci/remote-live-cluster-sanity.sh index 031ef11a6b262b..12ebf1929bcede 100755 --- a/ci/remote-live-cluster-sanity.sh +++ b/ci/remote-live-cluster-sanity.sh @@ -90,7 +90,7 @@ curl \ http://localhost:8899 ps auxf -(set -x && sleep 3 && sudo kill "$sys_tuner_pid" && kill "$tail_pid" && sudo pkill sys-tuner) & +(set -x && sleep 3 && kill "$tail_pid" && sudo pkill -f solana-sys-tuner) & kill_pid=$! wait "$validator_pid" "$sys_tuner_pid" "$tail_pid" "$kill_pid" From ce3db631e085959de115bc664dac08be54623b0d Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Fri, 25 Sep 2020 14:30:32 +0900 Subject: [PATCH 25/57] Collect logs even if failed, enable metrics, snapshot upload --- ci/live-cluster-sanity.sh | 11 ++++++++--- ci/remote-live-cluster-sanity.sh | 7 +++++-- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/ci/live-cluster-sanity.sh b/ci/live-cluster-sanity.sh index d35ab42e131766..0cca97762e3e08 100755 --- a/ci/live-cluster-sanity.sh +++ b/ci/live-cluster-sanity.sh @@ -31,12 +31,17 @@ _ ./net/scp.sh \ "$instance_ip:." test_with_live_cluster() { - _ ./net/ssh.sh "$instance_ip" ./remote-live-cluster-sanity.sh "$@" + validator_failed= + _ ./net/ssh.sh "$instance_ip" ./remote-live-cluster-sanity.sh "$@" || validator_failed=$? - # good it existed successfully; let's collect logs for profit! - for log in $(./net/ssh.sh "$instance_ip" ls '*.log'); do + # let's collect logs for profit! + for log in $(./net/ssh.sh "$instance_ip" ls '*.log' 'cluster-sanity/ledger/snapshot-*.tar.*'); do _ ./net/scp.sh "$instance_ip:$log" . done + + if [[ -n $validator_failed ]]; then + (exit "$validator_failed") + fi } # UPDATE docs/src/clusters.md TOO!! 
diff --git a/ci/remote-live-cluster-sanity.sh b/ci/remote-live-cluster-sanity.sh index 12ebf1929bcede..f55107a5aa705d 100755 --- a/ci/remote-live-cluster-sanity.sh +++ b/ci/remote-live-cluster-sanity.sh @@ -28,6 +28,9 @@ echo "--- Starting validator $cluster_label" validator_log="$cluster_label-validator.log" sys_tuner_log="$cluster_label-sys-tuner.log" +metrics_host="https://metrics.solana.com:8086" +export SOLANA_METRICS_CONFIG="host=$metrics_host,db=testnet-live-cluster,u=scratch_writer,p=topsecret" + # shellcheck disable=SC2024 # create log as non-root user sudo ./solana-sys-tuner --user "$(whoami)" &> "$sys_tuner_log" & sys_tuner_pid=$! @@ -62,7 +65,7 @@ done echo "--- Monitoring validator $cluster_label" # shellcheck disable=SC2012 # ls here is handy for sorted snapshots -snapshot_slot=$(ls -t cluster-sanity/ledger/snapshot* | +snapshot_slot=$(ls -t cluster-sanity/ledger/snapshot-*.tar.* | head -n 1 | grep -o 'snapshot-[0-9]*-' | grep -o '[0-9]*' @@ -89,7 +92,7 @@ curl \ -d '{"jsonrpc":"2.0","id":1, "method":"validatorExit"}' \ http://localhost:8899 -ps auxf +# well, kill $sys_tuner_pid didn't work for some reason, maybe sudo doen't relay signals? (set -x && sleep 3 && kill "$tail_pid" && sudo pkill -f solana-sys-tuner) & kill_pid=$! 
From 6011e5a17a85b8f99a11127f14fcb4b4c9fe470a Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Fri, 25 Sep 2020 14:51:24 +0900 Subject: [PATCH 26/57] Adjust log file path and really upload snapshot --- ci/buildkite-pipeline.sh | 5 +++-- ci/live-cluster-sanity.sh | 8 ++++++-- ci/remote-live-cluster-sanity.sh | 4 ++-- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/ci/buildkite-pipeline.sh b/ci/buildkite-pipeline.sh index 5a01a20dfeacec..07d8296efde212 100755 --- a/ci/buildkite-pipeline.sh +++ b/ci/buildkite-pipeline.sh @@ -176,8 +176,9 @@ all_test_steps() { name: "live-cluster-sanity" timeout_in_minutes: 20 artifact_paths: - - "*-validator.log" - - "*-sys-tuner.log" + - "*/validator.log" + - "*/sys-tuner.log" + - "*/snapshot-*.tar.*" agents: - "queue=gce-deploy" EOF diff --git a/ci/live-cluster-sanity.sh b/ci/live-cluster-sanity.sh index 0cca97762e3e08..be29f0980ef76b 100755 --- a/ci/live-cluster-sanity.sh +++ b/ci/live-cluster-sanity.sh @@ -31,12 +31,16 @@ _ ./net/scp.sh \ "$instance_ip:." test_with_live_cluster() { + cluster_label="$1" + rm -rf "./$cluster_label" + mkdir "./$cluster_label" + validator_failed= _ ./net/ssh.sh "$instance_ip" ./remote-live-cluster-sanity.sh "$@" || validator_failed=$? # let's collect logs for profit! - for log in $(./net/ssh.sh "$instance_ip" ls '*.log' 'cluster-sanity/ledger/snapshot-*.tar.*'); do - _ ./net/scp.sh "$instance_ip:$log" . 
+ for log in $(./net/ssh.sh "$instance_ip" ls 'cluster-sanity/'{'*.log','ledger/snapshot-*.tar.*'}); do + _ ./net/scp.sh "$instance_ip:$log" "./$cluster_label" done if [[ -n $validator_failed ]]; then diff --git a/ci/remote-live-cluster-sanity.sh b/ci/remote-live-cluster-sanity.sh index f55107a5aa705d..75fb6513835a9e 100755 --- a/ci/remote-live-cluster-sanity.sh +++ b/ci/remote-live-cluster-sanity.sh @@ -26,8 +26,8 @@ shift echo "--- Starting validator $cluster_label" -validator_log="$cluster_label-validator.log" -sys_tuner_log="$cluster_label-sys-tuner.log" +validator_log="cluster-sanity/validator.log" +sys_tuner_log="cluster-sanity/sys-tuner.log" metrics_host="https://metrics.solana.com:8086" export SOLANA_METRICS_CONFIG="host=$metrics_host,db=testnet-live-cluster,u=scratch_writer,p=topsecret" From aa148273e54402e8baabd91bfbc358823d04198b Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Fri, 25 Sep 2020 15:11:16 +0900 Subject: [PATCH 27/57] Don't always upload snapshots --- ci/live-cluster-sanity.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/ci/live-cluster-sanity.sh b/ci/live-cluster-sanity.sh index be29f0980ef76b..e8e5bf2250ad9d 100755 --- a/ci/live-cluster-sanity.sh +++ b/ci/live-cluster-sanity.sh @@ -39,11 +39,16 @@ test_with_live_cluster() { _ ./net/ssh.sh "$instance_ip" ./remote-live-cluster-sanity.sh "$@" || validator_failed=$? # let's collect logs for profit! 
- for log in $(./net/ssh.sh "$instance_ip" ls 'cluster-sanity/'{'*.log','ledger/snapshot-*.tar.*'}); do + for log in $(./net/ssh.sh "$instance_ip" ls 'cluster-sanity/*.log'); do _ ./net/scp.sh "$instance_ip:$log" "./$cluster_label" done if [[ -n $validator_failed ]]; then + # let's even collect snapshot for diagnostics + for log in $(./net/ssh.sh "$instance_ip" ls 'cluster-sanity/ledger/snapshot-*.tar.*'); do + _ ./net/scp.sh "$instance_ip:$log" "./$cluster_label" + done + (exit "$validator_failed") fi } From 9c80451f4e29f184986ad568e1c4911ad6b391f5 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Thu, 1 Oct 2020 15:15:53 +0900 Subject: [PATCH 28/57] Revert comment out --- ci/buildkite-pipeline.sh | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/ci/buildkite-pipeline.sh b/ci/buildkite-pipeline.sh index 07d8296efde212..5773e0ef102022 100755 --- a/ci/buildkite-pipeline.sh +++ b/ci/buildkite-pipeline.sh @@ -125,9 +125,8 @@ wait_step() { } all_test_steps() { - #command_step checks ". ci/rust-version.sh; ci/docker-run.sh \$\$rust_nightly_docker_image ci/test-checks.sh" 20 - #wait_step - true + command_step checks ". ci/rust-version.sh; ci/docker-run.sh \$\$rust_nightly_docker_image ci/test-checks.sh" 20 + wait_step # Coverage... if affects \ @@ -138,18 +137,16 @@ all_test_steps() { ^ci/test-coverage.sh \ ^scripts/coverage.sh \ ; then - #command_step coverage ". ci/rust-version.sh; ci/docker-run.sh \$\$rust_nightly_docker_image ci/test-coverage.sh" 30 - #wait_step - true + command_step coverage ". ci/rust-version.sh; ci/docker-run.sh \$\$rust_nightly_docker_image ci/test-coverage.sh" 30 + wait_step else annotate --style info --context test-coverage \ "Coverage skipped as no .rs files were modified" fi # Full test suite - # command_step stable ". ci/rust-version.sh; ci/docker-run.sh \$\$rust_stable_docker_image ci/test-stable.sh" 60 - #wait_step - true + command_step stable ". 
ci/rust-version.sh; ci/docker-run.sh \$\$rust_stable_docker_image ci/test-stable.sh" 60 + wait_step # Perf test suite if affects \ @@ -232,8 +229,8 @@ EOF } pull_or_push_steps() { - #command_step sanity "ci/test-sanity.sh" 5 - #wait_step + command_step sanity "ci/test-sanity.sh" 5 + wait_step # Check for any .sh file changes if affects .sh$; then From f6ea13ffeae391c57e82c8ea0fb7ea617dd8af44 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Thu, 1 Oct 2020 15:24:36 +0900 Subject: [PATCH 29/57] Rename to nicely align with local-cluster --- ci/buildkite-pipeline.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/buildkite-pipeline.sh b/ci/buildkite-pipeline.sh index 5773e0ef102022..616cf8a3b787f3 100755 --- a/ci/buildkite-pipeline.sh +++ b/ci/buildkite-pipeline.sh @@ -170,7 +170,7 @@ all_test_steps() { agents: - "queue=cuda" - command: "ci/live-cluster-sanity.sh" - name: "live-cluster-sanity" + name: "live-cluster" timeout_in_minutes: 20 artifact_paths: - "*/validator.log" From b00613ef3c2df5197d4f9adfd218437ead80e533 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Thu, 1 Oct 2020 18:30:48 +0900 Subject: [PATCH 30/57] Increase duration of monitoring phase --- ci/remote-live-cluster-sanity.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/remote-live-cluster-sanity.sh b/ci/remote-live-cluster-sanity.sh index 75fb6513835a9e..ccc3dd8e534644 100755 --- a/ci/remote-live-cluster-sanity.sh +++ b/ci/remote-live-cluster-sanity.sh @@ -71,7 +71,7 @@ snapshot_slot=$(ls -t cluster-sanity/ledger/snapshot-*.tar.* | grep -o '[0-9]*' ) current_root=$snapshot_slot -goal_root=$((snapshot_slot + 100)) +goal_root=$((snapshot_slot + 400)) attempts=100 while [[ $current_root -le $goal_root ]]; do From 0edc8ce858fc8294f7dfc199f2e808c824c80645 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Mon, 5 Oct 2020 15:15:49 +0900 Subject: [PATCH 31/57] Run ledger-tool verify too --- ci/live-cluster-sanity.sh | 2 +- ci/remote-live-cluster-sanity.sh | 26 
++++++++++++++++++++++---- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/ci/live-cluster-sanity.sh b/ci/live-cluster-sanity.sh index e8e5bf2250ad9d..64556b2064c7c9 100755 --- a/ci/live-cluster-sanity.sh +++ b/ci/live-cluster-sanity.sh @@ -27,7 +27,7 @@ trap on_trap INT TERM EXIT _ cargo +"$rust_stable" build --bins --release _ ./net/scp.sh \ ./ci/remote-live-cluster-sanity.sh \ - ./target/release/{solana,solana-validator,solana-sys-tuner} \ + ./target/release/{solana,solana-validator,solana-ledger-tool,solana-sys-tuner} \ "$instance_ip:." test_with_live_cluster() { diff --git a/ci/remote-live-cluster-sanity.sh b/ci/remote-live-cluster-sanity.sh index ccc3dd8e534644..335461c4fdb667 100755 --- a/ci/remote-live-cluster-sanity.sh +++ b/ci/remote-live-cluster-sanity.sh @@ -35,9 +35,10 @@ export SOLANA_METRICS_CONFIG="host=$metrics_host,db=testnet-live-cluster,u=scrat sudo ./solana-sys-tuner --user "$(whoami)" &> "$sys_tuner_log" & sys_tuner_pid=$! -./solana-validator \ - --no-untrusted-rpc \ +( + ./solana-validator \ --ledger cluster-sanity/ledger \ + --no-untrusted-rpc \ --log - \ --init-complete-file cluster-sanity/init-completed \ --enable-rpc-exit \ @@ -45,7 +46,12 @@ sys_tuner_pid=$! --rpc-port 8899 \ --rpc-bind-address localhost \ --snapshot-interval-slots 0 \ - "$@" &> "$validator_log" & + "$@" && + ./solana-ledger-tool \ + --ledger cluster-sanity/ledger \ + verify +) &> "$validator_log" & + validator_pid=$! tail -F "$validator_log" > cluster-sanity/log-tail 2> /dev/null & tail_pid=$! @@ -71,7 +77,7 @@ snapshot_slot=$(ls -t cluster-sanity/ledger/snapshot-*.tar.* | grep -o '[0-9]*' ) current_root=$snapshot_slot -goal_root=$((snapshot_slot + 400)) +goal_root=$((snapshot_slot + 100)) attempts=100 while [[ $current_root -le $goal_root ]]; do @@ -92,6 +98,18 @@ curl \ -d '{"jsonrpc":"2.0","id":1, "method":"validatorExit"}' \ http://localhost:8899 +attempts=100 +while true; do + attempts=$((attempts - 1)) + if [[ (($attempts == 0)) || ! 
-d "/proc/$validator_pid" ]]; then + handle_error "ledger tool" + fi + + sleep 3 + echo "##### ledger-tool is running... (until timeout: $attempts) #####" + show_log +done + # well, kill $sys_tuner_pid didn't work for some reason, maybe sudo doen't relay signals? (set -x && sleep 3 && kill "$tail_pid" && sudo pkill -f solana-sys-tuner) & kill_pid=$! From 389fb2aaf8b7d0d6e6dc0218aca3e35d3e0baccf Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Mon, 5 Oct 2020 17:37:13 +0900 Subject: [PATCH 32/57] Maybe bpf_loader.so needed only for `ledger-tool`? --- ci/live-cluster-sanity.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ci/live-cluster-sanity.sh b/ci/live-cluster-sanity.sh index 64556b2064c7c9..be28876a0d50e4 100755 --- a/ci/live-cluster-sanity.sh +++ b/ci/live-cluster-sanity.sh @@ -29,6 +29,8 @@ _ ./net/scp.sh \ ./ci/remote-live-cluster-sanity.sh \ ./target/release/{solana,solana-validator,solana-ledger-tool,solana-sys-tuner} \ "$instance_ip:." +_ ./net/ssh.sh "$instance_ip" mkdir deps +_ ./net/scp.sh ./target/release/deps/libsolana_bpf_loader_program.so "$instance_ip:./deps/" test_with_live_cluster() { cluster_label="$1" From 0fdde8a7e31244f83a2e66b12689c46f03304456 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Sat, 10 Oct 2020 01:30:38 +0900 Subject: [PATCH 33/57] Well, this shouldn't needed anymore --- ci/live-cluster-sanity.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/ci/live-cluster-sanity.sh b/ci/live-cluster-sanity.sh index be28876a0d50e4..64556b2064c7c9 100755 --- a/ci/live-cluster-sanity.sh +++ b/ci/live-cluster-sanity.sh @@ -29,8 +29,6 @@ _ ./net/scp.sh \ ./ci/remote-live-cluster-sanity.sh \ ./target/release/{solana,solana-validator,solana-ledger-tool,solana-sys-tuner} \ "$instance_ip:." 
-_ ./net/ssh.sh "$instance_ip" mkdir deps -_ ./net/scp.sh ./target/release/deps/libsolana_bpf_loader_program.so "$instance_ip:./deps/" test_with_live_cluster() { cluster_label="$1" From 70d68266c0fe0f5348304e92f267176c4424fdbf Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Sat, 10 Oct 2020 02:43:31 +0900 Subject: [PATCH 34/57] Silly me. --- ci/remote-live-cluster-sanity.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/remote-live-cluster-sanity.sh b/ci/remote-live-cluster-sanity.sh index 335461c4fdb667..8a32a45ee953c4 100755 --- a/ci/remote-live-cluster-sanity.sh +++ b/ci/remote-live-cluster-sanity.sh @@ -101,7 +101,7 @@ curl \ attempts=100 while true; do attempts=$((attempts - 1)) - if [[ (($attempts == 0)) || ! -d "/proc/$validator_pid" ]]; then + if [[ (($attempts == 0)) ]]; then handle_error "ledger tool" fi From 956af9fec571ada945f0f1e487d036b1dee73075 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Sat, 10 Oct 2020 11:04:51 +0900 Subject: [PATCH 35/57] meh... --- ci/remote-live-cluster-sanity.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ci/remote-live-cluster-sanity.sh b/ci/remote-live-cluster-sanity.sh index 8a32a45ee953c4..d11a6919df9f7c 100755 --- a/ci/remote-live-cluster-sanity.sh +++ b/ci/remote-live-cluster-sanity.sh @@ -67,6 +67,7 @@ while ! [[ -f cluster-sanity/init-completed ]]; do echo "##### validator is starting... (until timeout: $attempts) #####" show_log done +echo "##### validator finished starting! #####" echo "--- Monitoring validator $cluster_label" @@ -91,6 +92,7 @@ while [[ $current_root -le $goal_root ]]; do echo "##### validator is running ($current_root/$goal_root)... (until timeout: $attempts) #####" show_log done +echo "##### validator finished running! 
#####" curl \ -X POST \ @@ -99,7 +101,7 @@ curl \ http://localhost:8899 attempts=100 -while true; do +while [[ -d "/proc/$validator_pid" ]]; do attempts=$((attempts - 1)) if [[ (($attempts == 0)) ]]; then handle_error "ledger tool" @@ -109,6 +111,7 @@ while true; do echo "##### ledger-tool is running... (until timeout: $attempts) #####" show_log done +echo "##### ledger-tool finished running! #####" # well, kill $sys_tuner_pid didn't work for some reason, maybe sudo doen't relay signals? (set -x && sleep 3 && kill "$tail_pid" && sudo pkill -f solana-sys-tuner) & From 5350ca0a9accae362e667c69d9b24adc7769d9bd Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Tue, 13 Oct 2020 13:36:45 +0900 Subject: [PATCH 36/57] Reduce rooted slots also rename confusing var --- ci/remote-live-cluster-sanity.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ci/remote-live-cluster-sanity.sh b/ci/remote-live-cluster-sanity.sh index d11a6919df9f7c..50076aac3864af 100755 --- a/ci/remote-live-cluster-sanity.sh +++ b/ci/remote-live-cluster-sanity.sh @@ -52,14 +52,14 @@ sys_tuner_pid=$! verify ) &> "$validator_log" & -validator_pid=$! +validator_and_ledger_tool_pid=$! tail -F "$validator_log" > cluster-sanity/log-tail 2> /dev/null & tail_pid=$! attempts=100 while ! [[ -f cluster-sanity/init-completed ]]; do attempts=$((attempts - 1)) - if [[ (($attempts == 0)) || ! -d "/proc/$validator_pid" ]]; then + if [[ (($attempts == 0)) || ! -d "/proc/$validator_and_ledger_tool_pid" ]]; then handle_error "start" fi @@ -78,12 +78,12 @@ snapshot_slot=$(ls -t cluster-sanity/ledger/snapshot-*.tar.* | grep -o '[0-9]*' ) current_root=$snapshot_slot -goal_root=$((snapshot_slot + 100)) +goal_root=$((snapshot_slot + 50)) attempts=100 while [[ $current_root -le $goal_root ]]; do attempts=$((attempts - 1)) - if [[ (($attempts == 0)) || ! -d "/proc/$validator_pid" ]]; then + if [[ (($attempts == 0)) || ! 
-d "/proc/$validator_and_ledger_tool_pid" ]]; then handle_error "root new slots" fi @@ -101,7 +101,7 @@ curl \ http://localhost:8899 attempts=100 -while [[ -d "/proc/$validator_pid" ]]; do +while [[ -d "/proc/$validator_and_ledger_tool_pid" ]]; do attempts=$((attempts - 1)) if [[ (($attempts == 0)) ]]; then handle_error "ledger tool" @@ -117,4 +117,4 @@ echo "##### ledger-tool finished running! #####" (set -x && sleep 3 && kill "$tail_pid" && sudo pkill -f solana-sys-tuner) & kill_pid=$! -wait "$validator_pid" "$sys_tuner_pid" "$tail_pid" "$kill_pid" +wait "$validator_and_ledger_tool_pid" "$sys_tuner_pid" "$tail_pid" "$kill_pid" From ec336b524697806a882c0859150d981918f378c1 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Tue, 13 Oct 2020 13:45:28 +0900 Subject: [PATCH 37/57] more var renaming fix.... --- ci/remote-live-cluster-sanity.sh | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/ci/remote-live-cluster-sanity.sh b/ci/remote-live-cluster-sanity.sh index 50076aac3864af..f645d986f43f2a 100755 --- a/ci/remote-live-cluster-sanity.sh +++ b/ci/remote-live-cluster-sanity.sh @@ -3,8 +3,8 @@ handle_error() { action=$1 set +e - kill "$validator_pid" "$tail_pid" - wait "$validator_pid" "$tail_pid" + kill "$validator_then_ledger_tool_pid" "$tail_pid" + wait "$validator_then_ledger_tool_pid" "$tail_pid" echo "--- Error: validator failed to $action" exit 1 } @@ -52,14 +52,14 @@ sys_tuner_pid=$! verify ) &> "$validator_log" & -validator_and_ledger_tool_pid=$! +validator_then_ledger_tool_pid=$! tail -F "$validator_log" > cluster-sanity/log-tail 2> /dev/null & tail_pid=$! attempts=100 while ! [[ -f cluster-sanity/init-completed ]]; do attempts=$((attempts - 1)) - if [[ (($attempts == 0)) || ! -d "/proc/$validator_and_ledger_tool_pid" ]]; then + if [[ (($attempts == 0)) || ! 
-d "/proc/$validator_then_ledger_tool_pid" ]]; then handle_error "start" fi @@ -83,7 +83,7 @@ goal_root=$((snapshot_slot + 50)) attempts=100 while [[ $current_root -le $goal_root ]]; do attempts=$((attempts - 1)) - if [[ (($attempts == 0)) || ! -d "/proc/$validator_and_ledger_tool_pid" ]]; then + if [[ (($attempts == 0)) || ! -d "/proc/$validator_then_ledger_tool_pid" ]]; then handle_error "root new slots" fi @@ -101,7 +101,7 @@ curl \ http://localhost:8899 attempts=100 -while [[ -d "/proc/$validator_and_ledger_tool_pid" ]]; do +while [[ -d "/proc/$validator_then_ledger_tool_pid" ]]; do attempts=$((attempts - 1)) if [[ (($attempts == 0)) ]]; then handle_error "ledger tool" @@ -117,4 +117,4 @@ echo "##### ledger-tool finished running! #####" (set -x && sleep 3 && kill "$tail_pid" && sudo pkill -f solana-sys-tuner) & kill_pid=$! -wait "$validator_and_ledger_tool_pid" "$sys_tuner_pid" "$tail_pid" "$kill_pid" +wait "$validator_then_ledger_tool_pid" "$sys_tuner_pid" "$tail_pid" "$kill_pid" From c85aa0550f093908781b7b29a7d9a7e6e0a3829b Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Mon, 26 Oct 2020 15:20:30 +0900 Subject: [PATCH 38/57] Double timeout (testnet is slow for some reason) --- ci/remote-live-cluster-sanity.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ci/remote-live-cluster-sanity.sh b/ci/remote-live-cluster-sanity.sh index f645d986f43f2a..277f8e39412200 100755 --- a/ci/remote-live-cluster-sanity.sh +++ b/ci/remote-live-cluster-sanity.sh @@ -56,7 +56,7 @@ validator_then_ledger_tool_pid=$! tail -F "$validator_log" > cluster-sanity/log-tail 2> /dev/null & tail_pid=$! -attempts=100 +attempts=200 while ! [[ -f cluster-sanity/init-completed ]]; do attempts=$((attempts - 1)) if [[ (($attempts == 0)) || ! 
-d "/proc/$validator_then_ledger_tool_pid" ]]; then @@ -80,7 +80,7 @@ snapshot_slot=$(ls -t cluster-sanity/ledger/snapshot-*.tar.* | current_root=$snapshot_slot goal_root=$((snapshot_slot + 50)) -attempts=100 +attempts=200 while [[ $current_root -le $goal_root ]]; do attempts=$((attempts - 1)) if [[ (($attempts == 0)) || ! -d "/proc/$validator_then_ledger_tool_pid" ]]; then @@ -100,7 +100,7 @@ curl \ -d '{"jsonrpc":"2.0","id":1, "method":"validatorExit"}' \ http://localhost:8899 -attempts=100 +attempts=200 while [[ -d "/proc/$validator_then_ledger_tool_pid" ]]; do attempts=$((attempts - 1)) if [[ (($attempts == 0)) ]]; then From 29fa355b8ec81b40b96eea8fe26db41830c8d918 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Wed, 28 Oct 2020 15:33:47 +0900 Subject: [PATCH 39/57] Increase timeout... --- ci/buildkite-pipeline.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/buildkite-pipeline.sh b/ci/buildkite-pipeline.sh index 616cf8a3b787f3..4c15c457ebf34b 100755 --- a/ci/buildkite-pipeline.sh +++ b/ci/buildkite-pipeline.sh @@ -171,7 +171,7 @@ all_test_steps() { - "queue=cuda" - command: "ci/live-cluster-sanity.sh" name: "live-cluster" - timeout_in_minutes: 20 + timeout_in_minutes: 30 artifact_paths: - "*/validator.log" - "*/sys-tuner.log" From dd85437bc74f86313e157beee238932fc34d00c4 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Wed, 28 Oct 2020 17:59:53 +0900 Subject: [PATCH 40/57] Tooooo much log --- ci/remote-live-cluster-sanity.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/remote-live-cluster-sanity.sh b/ci/remote-live-cluster-sanity.sh index 277f8e39412200..b0e06484021b4c 100755 --- a/ci/remote-live-cluster-sanity.sh +++ b/ci/remote-live-cluster-sanity.sh @@ -30,6 +30,7 @@ validator_log="cluster-sanity/validator.log" sys_tuner_log="cluster-sanity/sys-tuner.log" metrics_host="https://metrics.solana.com:8086" export SOLANA_METRICS_CONFIG="host=$metrics_host,db=testnet-live-cluster,u=scratch_writer,p=topsecret" +export RUST_LOG=warn # 
shellcheck disable=SC2024 # create log as non-root user sudo ./solana-sys-tuner --user "$(whoami)" &> "$sys_tuner_log" & From 7078191f16003876882e607d1a21ca50bb69634d Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Fri, 30 Oct 2020 19:37:41 +0900 Subject: [PATCH 41/57] Cherry-pick bank frozen INFO message --- ci/remote-live-cluster-sanity.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/remote-live-cluster-sanity.sh b/ci/remote-live-cluster-sanity.sh index b0e06484021b4c..810f98e7c74331 100755 --- a/ci/remote-live-cluster-sanity.sh +++ b/ci/remote-live-cluster-sanity.sh @@ -30,7 +30,7 @@ validator_log="cluster-sanity/validator.log" sys_tuner_log="cluster-sanity/sys-tuner.log" metrics_host="https://metrics.solana.com:8086" export SOLANA_METRICS_CONFIG="host=$metrics_host,db=testnet-live-cluster,u=scratch_writer,p=topsecret" -export RUST_LOG=warn +export RUST_LOG="warn,solana_runtime::bank=info" # shellcheck disable=SC2024 # create log as non-root user sudo ./solana-sys-tuner --user "$(whoami)" &> "$sys_tuner_log" & From f39821ba45cc943cc01e6a6b70f6c6255f2fe16a Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Fri, 30 Oct 2020 21:30:32 +0900 Subject: [PATCH 42/57] Cherry-pick more logs.
--- ci/remote-live-cluster-sanity.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/remote-live-cluster-sanity.sh b/ci/remote-live-cluster-sanity.sh index 810f98e7c74331..6da654a766e2a4 100755 --- a/ci/remote-live-cluster-sanity.sh +++ b/ci/remote-live-cluster-sanity.sh @@ -30,7 +30,7 @@ validator_log="cluster-sanity/validator.log" sys_tuner_log="cluster-sanity/sys-tuner.log" metrics_host="https://metrics.solana.com:8086" export SOLANA_METRICS_CONFIG="host=$metrics_host,db=testnet-live-cluster,u=scratch_writer,p=topsecret" -export RUST_LOG="warn,solana_runtime::bank=info" +export RUST_LOG="warn,solana_runtime::bank=info,solana_validator=info,solana_core=info,solana_ledger=info" # shellcheck disable=SC2024 # create log as non-root user sudo ./solana-sys-tuner --user "$(whoami)" &> "$sys_tuner_log" & From 1f4ea036427fc5ecceb5706a3292e895f7ad362d Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Sat, 31 Oct 2020 02:50:46 +0900 Subject: [PATCH 43/57] less log --- ci/remote-live-cluster-sanity.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/remote-live-cluster-sanity.sh b/ci/remote-live-cluster-sanity.sh index 6da654a766e2a4..bc23aea02aa0be 100755 --- a/ci/remote-live-cluster-sanity.sh +++ b/ci/remote-live-cluster-sanity.sh @@ -30,7 +30,7 @@ validator_log="cluster-sanity/validator.log" sys_tuner_log="cluster-sanity/sys-tuner.log" metrics_host="https://metrics.solana.com:8086" export SOLANA_METRICS_CONFIG="host=$metrics_host,db=testnet-live-cluster,u=scratch_writer,p=topsecret" -export RUST_LOG="warn,solana_runtime::bank=info,solana_validator=info,solana_core=info,solana_ledger=info" +export RUST_LOG="warn,solana_runtime::bank=info,solana_validator=info,solana_core=info,solana_ledger=info,solana_core::repair_service=warn" # shellcheck disable=SC2024 # create log as non-root user sudo ./solana-sys-tuner --user "$(whoami)" &> "$sys_tuner_log" & From a3f2739c901016ec44059449a74ef16026800d4c Mon Sep 17 00:00:00 2001 From: Ryo 
Onodera Date: Sat, 31 Oct 2020 02:51:57 +0900 Subject: [PATCH 44/57] Restore --expected-shred-version for faster boot? --- ci/live-cluster-sanity.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/live-cluster-sanity.sh b/ci/live-cluster-sanity.sh index 64556b2064c7c9..f333b38b6196aa 100755 --- a/ci/live-cluster-sanity.sh +++ b/ci/live-cluster-sanity.sh @@ -72,4 +72,5 @@ test_with_live_cluster "testnet" \ --trusted-validator Ft5fbkqNa76vnsjYNwjDZUXoTWpP7VYm3mtsaQckQADN \ --trusted-validator 9QxCLckBiJc783jnMvXZubK4wH86Eqqvashtrwvcsgkv \ --expected-genesis-hash 4uhcVJyU9pJkvQyS88uRDiswHXSCkY3zQawwpjk2NsNY \ + --expected-shred-version 1579 \ # for your pain-less copy-paste From da49d67420a12fc7f8dd293dd689ff71e38fd9b2 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Sat, 31 Oct 2020 16:00:44 +0900 Subject: [PATCH 45/57] longer timeout for ledger-tool and high-level logs --- ci/remote-live-cluster-sanity.sh | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/ci/remote-live-cluster-sanity.sh b/ci/remote-live-cluster-sanity.sh index bc23aea02aa0be..e18feeab1783d8 100755 --- a/ci/remote-live-cluster-sanity.sh +++ b/ci/remote-live-cluster-sanity.sh @@ -12,7 +12,7 @@ handle_error() { show_log() { if find cluster-sanity/log-tail -not -empty | grep ^ > /dev/null; then echo "##### new log:" - timeout 1 cat cluster-sanity/log-tail | tail -n 3 | cut -c 1-300 || true + timeout 0.01 cat cluster-sanity/log-tail | tail -n 3 | cut -c 1-300 || true truncate --size 0 cluster-sanity/log-tail echo fi @@ -37,6 +37,7 @@ sudo ./solana-sys-tuner --user "$(whoami)" &> "$sys_tuner_log" & sys_tuner_pid=$! ( + echo "$(date): VALIDATOR STARTED." && ./solana-validator \ --ledger cluster-sanity/ledger \ --no-untrusted-rpc \ @@ -48,9 +49,11 @@ sys_tuner_pid=$! --rpc-bind-address localhost \ --snapshot-interval-slots 0 \ "$@" && + echo "$(date): VALIDATOR FINISHED AND LEDGER-TOOL STARTED."
&& ./solana-ledger-tool \ --ledger cluster-sanity/ledger \ - verify + verify && + echo "$(date): LEDGER-TOOL FINISHED." ) &> "$validator_log" & validator_then_ledger_tool_pid=$! @@ -101,7 +104,7 @@ curl \ -d '{"jsonrpc":"2.0","id":1, "method":"validatorExit"}' \ http://localhost:8899 -attempts=200 +attempts=400 while [[ -d "/proc/$validator_then_ledger_tool_pid" ]]; do attempts=$((attempts - 1)) if [[ (($attempts == 0)) ]]; then From 2902804e5e45b0c6712a9542a068e0e4693b8e52 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Sat, 31 Oct 2020 16:25:53 +0900 Subject: [PATCH 46/57] disable audit --- ci/buildkite-pipeline.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/buildkite-pipeline.sh b/ci/buildkite-pipeline.sh index 4c15c457ebf34b..e22cab3df8e657 100755 --- a/ci/buildkite-pipeline.sh +++ b/ci/buildkite-pipeline.sh @@ -125,8 +125,8 @@ wait_step() { } all_test_steps() { - command_step checks ". ci/rust-version.sh; ci/docker-run.sh \$\$rust_nightly_docker_image ci/test-checks.sh" 20 - wait_step + #command_step checks ". ci/rust-version.sh; ci/docker-run.sh \$\$rust_nightly_docker_image ci/test-checks.sh" 20 + #wait_step # Coverage... 
if affects \ From 8a1ccdccc45378c20eadc3ea500d699de5bf9d52 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Sat, 31 Oct 2020 17:34:06 +0900 Subject: [PATCH 47/57] longer --- ci/buildkite-pipeline.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/buildkite-pipeline.sh b/ci/buildkite-pipeline.sh index e22cab3df8e657..b8cd5831cf5996 100755 --- a/ci/buildkite-pipeline.sh +++ b/ci/buildkite-pipeline.sh @@ -171,7 +171,7 @@ all_test_steps() { - "queue=cuda" - command: "ci/live-cluster-sanity.sh" name: "live-cluster" - timeout_in_minutes: 30 + timeout_in_minutes: 40 artifact_paths: - "*/validator.log" - "*/sys-tuner.log" From 056e1da49089bef3c4658f00942920c20f491ea0 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Mon, 2 Nov 2020 15:22:23 +0900 Subject: [PATCH 48/57] Revert "disable audit" This reverts commit ae24ab6b28e5a52b1e8ee0b934413a78a4601198. --- ci/buildkite-pipeline.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/buildkite-pipeline.sh b/ci/buildkite-pipeline.sh index b8cd5831cf5996..80a4f8006deacc 100755 --- a/ci/buildkite-pipeline.sh +++ b/ci/buildkite-pipeline.sh @@ -125,8 +125,8 @@ wait_step() { } all_test_steps() { - #command_step checks ". ci/rust-version.sh; ci/docker-run.sh \$\$rust_nightly_docker_image ci/test-checks.sh" 20 - #wait_step + command_step checks ". ci/rust-version.sh; ci/docker-run.sh \$\$rust_nightly_docker_image ci/test-checks.sh" 20 + wait_step # Coverage... if affects \ From 31c4b1abe421dfbab2feb911eec91afde9c35ad0 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Sun, 15 Nov 2020 13:51:27 +0900 Subject: [PATCH 49/57] Remove expected shred version? 
--- ci/live-cluster-sanity.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/ci/live-cluster-sanity.sh b/ci/live-cluster-sanity.sh index f333b38b6196aa..6889543be9c473 100755 --- a/ci/live-cluster-sanity.sh +++ b/ci/live-cluster-sanity.sh @@ -61,7 +61,6 @@ test_with_live_cluster "mainnet-beta" \ --trusted-validator DE1bawNcRJB9rVm3buyMVfr8mBEoyyu73NBovf2oXJsJ \ --trusted-validator CakcnaRDHka2gXyfbEd2d3xsvkJkqsLw2akB3zsN1D2S \ --expected-genesis-hash 5eykt4UsFv8P8NJdTREpY1vzqKqZKvdpKuc147dw2N9d \ - --expected-shred-version 64864 \ # for your pain-less copy-paste # UPDATE docs/src/clusters.md TOO!! @@ -72,5 +71,4 @@ test_with_live_cluster "testnet" \ --trusted-validator Ft5fbkqNa76vnsjYNwjDZUXoTWpP7VYm3mtsaQckQADN \ --trusted-validator 9QxCLckBiJc783jnMvXZubK4wH86Eqqvashtrwvcsgkv \ --expected-genesis-hash 4uhcVJyU9pJkvQyS88uRDiswHXSCkY3zQawwpjk2NsNY \ - --expected-shred-version 1579 \ # for your pain-less copy-paste From dfc13bcd99e94ad0725e51a0212b4bb4d626b007 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Sun, 13 Dec 2020 14:17:22 +0900 Subject: [PATCH 50/57] Update remote-live-cluster-sanity.sh --- ci/remote-live-cluster-sanity.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/remote-live-cluster-sanity.sh b/ci/remote-live-cluster-sanity.sh index e18feeab1783d8..1ff8174a63820a 100755 --- a/ci/remote-live-cluster-sanity.sh +++ b/ci/remote-live-cluster-sanity.sh @@ -104,7 +104,7 @@ curl \ -d '{"jsonrpc":"2.0","id":1, "method":"validatorExit"}' \ http://localhost:8899 -attempts=400 +attempts=4000 while [[ -d "/proc/$validator_then_ledger_tool_pid" ]]; do attempts=$((attempts - 1)) if [[ (($attempts == 0)) ]]; then From d6e8c02ff075ee0573866d8e8ff8084a25e6813c Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Sun, 13 Dec 2020 14:17:45 +0900 Subject: [PATCH 51/57] Update buildkite-pipeline.sh --- ci/buildkite-pipeline.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/buildkite-pipeline.sh b/ci/buildkite-pipeline.sh 
index 80a4f8006deacc..cd38e9237eca0c 100755 --- a/ci/buildkite-pipeline.sh +++ b/ci/buildkite-pipeline.sh @@ -171,7 +171,7 @@ all_test_steps() { - "queue=cuda" - command: "ci/live-cluster-sanity.sh" name: "live-cluster" - timeout_in_minutes: 40 + timeout_in_minutes: 60 artifact_paths: - "*/validator.log" - "*/sys-tuner.log" From d2a9e33fdeb86f7c25c7b6414e61f72aa2be84e1 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Sun, 7 Mar 2021 14:16:37 +0900 Subject: [PATCH 52/57] Update to new validator subcommands --- ci/remote-live-cluster-sanity.sh | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/ci/remote-live-cluster-sanity.sh b/ci/remote-live-cluster-sanity.sh index 1ff8174a63820a..0f5cb703c4fbd9 100755 --- a/ci/remote-live-cluster-sanity.sh +++ b/ci/remote-live-cluster-sanity.sh @@ -43,7 +43,6 @@ sys_tuner_pid=$! --no-untrusted-rpc \ --log - \ --init-complete-file cluster-sanity/init-completed \ - --enable-rpc-exit \ --private-rpc \ --rpc-port 8899 \ --rpc-bind-address localhost \ @@ -98,11 +97,9 @@ while [[ $current_root -le $goal_root ]]; do done echo "##### validator finished running! 
#####" -curl \ - -X POST \ - -H 'Content-Type: application/json' \ - -d '{"jsonrpc":"2.0","id":1, "method":"validatorExit"}' \ - http://localhost:8899 +./solana-validator \ + --ledger cluster-sanity/ledger \ + exit attempts=4000 while [[ -d "/proc/$validator_then_ledger_tool_pid" ]]; do From 7f44ee5d388c1e6087514ae0b5fe30abfd35c497 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Tue, 9 Mar 2021 00:04:16 +0900 Subject: [PATCH 53/57] Add --identity --- ci/live-cluster-sanity.sh | 2 +- ci/remote-live-cluster-sanity.sh | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/ci/live-cluster-sanity.sh b/ci/live-cluster-sanity.sh index 6889543be9c473..4387335fedbc9e 100755 --- a/ci/live-cluster-sanity.sh +++ b/ci/live-cluster-sanity.sh @@ -27,7 +27,7 @@ trap on_trap INT TERM EXIT _ cargo +"$rust_stable" build --bins --release _ ./net/scp.sh \ ./ci/remote-live-cluster-sanity.sh \ - ./target/release/{solana,solana-validator,solana-ledger-tool,solana-sys-tuner} \ + ./target/release/{solana,solana-keygen,solana-validator,solana-ledger-tool,solana-sys-tuner} \ "$instance_ip:." test_with_live_cluster() { diff --git a/ci/remote-live-cluster-sanity.sh b/ci/remote-live-cluster-sanity.sh index 0f5cb703c4fbd9..5fa3a3fddd25ae 100755 --- a/ci/remote-live-cluster-sanity.sh +++ b/ci/remote-live-cluster-sanity.sh @@ -38,11 +38,13 @@ sys_tuner_pid=$! ( echo "$(date): VALIDATOR STARTED." && + ./solana-keygen new --no-passphrase -so ./identity.json && ./solana-validator \ - --ledger cluster-sanity/ledger \ + --identity ./identity.json \ + --ledger ./cluster-sanity/ledger \ --no-untrusted-rpc \ --log - \ - --init-complete-file cluster-sanity/init-completed \ + --init-complete-file ./cluster-sanity/init-completed \ --private-rpc \ --rpc-port 8899 \ --rpc-bind-address localhost \ From f116cabc75ea77e716aed569de4e7e35a393de06 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Tue, 9 Mar 2021 01:42:23 +0900 Subject: [PATCH 54/57] Add --no-poh-speed-test.... 
--- ci/remote-live-cluster-sanity.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/remote-live-cluster-sanity.sh b/ci/remote-live-cluster-sanity.sh index 5fa3a3fddd25ae..b522eb1dfe724b 100755 --- a/ci/remote-live-cluster-sanity.sh +++ b/ci/remote-live-cluster-sanity.sh @@ -43,6 +43,7 @@ sys_tuner_pid=$! --identity ./identity.json \ --ledger ./cluster-sanity/ledger \ --no-untrusted-rpc \ + --no-poh-speed-test \ --log - \ --init-complete-file ./cluster-sanity/init-completed \ --private-rpc \ From 157960cae98b872c11acd15106355703a0c5556e Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Tue, 9 Mar 2021 01:44:09 +0900 Subject: [PATCH 55/57] Add more entrypoints --- ci/live-cluster-sanity.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ci/live-cluster-sanity.sh b/ci/live-cluster-sanity.sh index 4387335fedbc9e..75f1af733c25b3 100755 --- a/ci/live-cluster-sanity.sh +++ b/ci/live-cluster-sanity.sh @@ -56,6 +56,10 @@ test_with_live_cluster() { # UPDATE docs/src/clusters.md TOO!! test_with_live_cluster "mainnet-beta" \ --entrypoint mainnet-beta.solana.com:8001 \ + --entrypoint entrypoint2.mainnet-beta.solana.com:8001 \ + --entrypoint entrypoint3.mainnet-beta.solana.com:8001 \ + --entrypoint entrypoint4.mainnet-beta.solana.com:8001 \ + --entrypoint entrypoint5.mainnet-beta.solana.com:8001 \ --trusted-validator 7Np41oeYqPefeNQEHSv1UDhYrehxin3NStELsSKCT4K2 \ --trusted-validator GdnSyH3YtwcxFvQrVVJMm1JhTS4QVX7MFsX56uJLUfiZ \ --trusted-validator DE1bawNcRJB9rVm3buyMVfr8mBEoyyu73NBovf2oXJsJ \ From ca1a3c03c17ce562d8d4e772e1e314300bede09c Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Tue, 9 Mar 2021 09:54:15 +0900 Subject: [PATCH 56/57] Add --force..... 
--- ci/remote-live-cluster-sanity.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/remote-live-cluster-sanity.sh b/ci/remote-live-cluster-sanity.sh index b522eb1dfe724b..bfabfb30cffe56 100755 --- a/ci/remote-live-cluster-sanity.sh +++ b/ci/remote-live-cluster-sanity.sh @@ -38,7 +38,7 @@ sys_tuner_pid=$! ( echo "$(date): VALIDATOR STARTED." && - ./solana-keygen new --no-passphrase -so ./identity.json && + ./solana-keygen new --force --no-passphrase --silent --outfile ./identity.json && ./solana-validator \ --identity ./identity.json \ --ledger ./cluster-sanity/ledger \ From f74ea2c3b7fcca12e663df107e8d60b9aaa0b0c1 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Thu, 25 Mar 2021 12:03:32 +0900 Subject: [PATCH 57/57] Add --force --- ci/remote-live-cluster-sanity.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/remote-live-cluster-sanity.sh b/ci/remote-live-cluster-sanity.sh index bfabfb30cffe56..26145724c35bad 100755 --- a/ci/remote-live-cluster-sanity.sh +++ b/ci/remote-live-cluster-sanity.sh @@ -102,7 +102,7 @@ echo "##### validator finished running! #####" ./solana-validator \ --ledger cluster-sanity/ledger \ - exit + exit --force attempts=4000 while [[ -d "/proc/$validator_then_ledger_tool_pid" ]]; do