-
Notifications
You must be signed in to change notification settings - Fork 4.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Introduce sanity/compatibility test for live clusters #12175
Changes from all commits
350ff35
250029f
7e3d007
a147f0a
3e53824
b535877
4d1463a
f8706be
87e19ed
9e4c48f
2f14108
553b17e
764ccc4
5fc203e
69dd2b2
570887e
2999eda
bfd3c96
80b3009
5e83689
3ed3901
acdedc1
1999544
2cf1318
ce3db63
6011e5a
aa14827
9c80451
f6ea13f
b00613e
0edc8ce
389fb2a
0fdde8a
70d6826
956af9f
5350ca0
ec336b5
c85aa05
29fa355
dd85437
7078191
f39821b
1f4ea03
a3f2739
da49d67
2902804
8a1ccdc
056e1da
31c4b1a
dfc13bc
d6e8c02
d2a9e33
7f44ee5
f116cab
157960c
ca1a3c0
f74ea2c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
#!/usr/bin/env bash | ||
# Creates a single-node GCE cluster, copies release binaries and | ||
# ci/remote-live-cluster-sanity.sh to it, and runs that script once for each | ||
# live cluster listed at the bottom of this file. | ||
set -e | ||
# work from the parent of the directory that contains this script | ||
cd "$(dirname "$0")/.." | ||
|
||
# presumably ci/_ defines the `_` command wrapper and ci/rust-version.sh sets | ||
# $rust_stable, both of which are used below -- TODO confirm | ||
source ci/_ | ||
source ci/rust-version.sh stable | ||
|
||
# On CI, name the instances after the branch: every non-alphanumeric character | ||
# becomes '-', then leading/trailing dashes and 'head' segments are stripped. | ||
# Locally, name them after the current user. | ||
if [[ -n $CI ]]; then | ||
escaped_branch=$(echo "$BUILDKITE_BRANCH" | tr -c "[:alnum:]" - | sed -r "s#(^-*|-*head-*|-*$)##g") | ||
instance_prefix="testnet-live-sanity-$escaped_branch" | ||
else | ||
instance_prefix="testnet-live-sanity-$(whoami)" | ||
fi | ||
|
||
# delete any cluster left over from a previous run; a failure here is ignored | ||
_ ./net/gce.sh delete -p "$instance_prefix" || true | ||
# only bootstrap, no normal validator | ||
_ ./net/gce.sh create -p "$instance_prefix" -n 0 --self-destruct-hours 1 | ||
# third column of the bootstrap-validator line printed by `gce.sh info`; | ||
# presumably the instance's IP address -- TODO confirm | ||
instance_ip=$(./net/gce.sh info | grep bootstrap-validator | awk '{print $3}') | ||
|
||
# Cleanup handler: deletes the cluster named by $instance_prefix. | ||
# errexit is switched off first so a failing delete cannot abort the handler. | ||
on_trap() { | ||
set +e | ||
_ ./net/gce.sh delete -p "$instance_prefix" | ||
} | ||
# run on_trap on interrupt, on termination, and on every exit of this script | ||
trap on_trap INT TERM EXIT | ||
|
||
# build the release binaries, then copy the remote driver script and the five | ||
# binaries it uses to '.' on the instance | ||
_ cargo +"$rust_stable" build --bins --release | ||
_ ./net/scp.sh \ | ||
./ci/remote-live-cluster-sanity.sh \ | ||
./target/release/{solana,solana-keygen,solana-validator,solana-ledger-tool,solana-sys-tuner} \ | ||
"$instance_ip:." | ||
|
||
# Runs remote-live-cluster-sanity.sh on the instance for one live cluster and | ||
# copies the resulting logs back. | ||
# Arguments: $1 - cluster label, also used as the local directory for logs; | ||
#            every argument (label included) is forwarded to the remote script. | ||
# Returns:   the remote script's exit status when it failed, after the logs and | ||
#            snapshot archives have been collected. | ||
test_with_live_cluster() { | ||
mvines marked this conversation as resolved.
Show resolved
Hide resolved
|
||
cluster_label="$1" | ||
rm -rf "./$cluster_label" | ||
mkdir "./$cluster_label" | ||
|
||
# record the remote exit status instead of letting errexit stop the script, | ||
# so that the logs can still be collected below | ||
validator_failed= | ||
_ ./net/ssh.sh "$instance_ip" ./remote-live-cluster-sanity.sh "$@" || validator_failed=$? | ||
|
||
# copy every remote cluster-sanity/*.log into ./$cluster_label | ||
for log in $(./net/ssh.sh "$instance_ip" ls 'cluster-sanity/*.log'); do | ||
_ ./net/scp.sh "$instance_ip:$log" "./$cluster_label" | ||
done | ||
|
||
if [[ -n $validator_failed ]]; then | ||
# on failure, also copy the snapshot archives for diagnostics | ||
for log in $(./net/ssh.sh "$instance_ip" ls 'cluster-sanity/ledger/snapshot-*.tar.*'); do | ||
_ ./net/scp.sh "$instance_ip:$log" "./$cluster_label" | ||
done | ||
|
||
# the subshell exits with the recorded status, which makes this function fail | ||
(exit "$validator_failed") | ||
fi | ||
} | ||
|
||
# Keep these arguments in sync with the mainnet-beta example in | ||
# docs/src/clusters.md! | ||
test_with_live_cluster "mainnet-beta" \ | ||
--entrypoint mainnet-beta.solana.com:8001 \ | ||
--entrypoint entrypoint2.mainnet-beta.solana.com:8001 \ | ||
--entrypoint entrypoint3.mainnet-beta.solana.com:8001 \ | ||
--entrypoint entrypoint4.mainnet-beta.solana.com:8001 \ | ||
--entrypoint entrypoint5.mainnet-beta.solana.com:8001 \ | ||
--trusted-validator 7Np41oeYqPefeNQEHSv1UDhYrehxin3NStELsSKCT4K2 \ | ||
--trusted-validator GdnSyH3YtwcxFvQrVVJMm1JhTS4QVX7MFsX56uJLUfiZ \ | ||
--trusted-validator DE1bawNcRJB9rVm3buyMVfr8mBEoyyu73NBovf2oXJsJ \ | ||
--trusted-validator CakcnaRDHka2gXyfbEd2d3xsvkJkqsLw2akB3zsN1D2S \ | ||
--expected-genesis-hash 5eykt4UsFv8P8NJdTREpY1vzqKqZKvdpKuc147dw2N9d \ | ||
# this comment line absorbs the trailing backslash above, for painless copy-paste | ||
|
||
# Keep these arguments in sync with the testnet example in | ||
# docs/src/clusters.md! | ||
test_with_live_cluster "testnet" \ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. When backporting to v1.2, I'll remove this line. |
||
--entrypoint entrypoint.testnet.solana.com:8001 \ | ||
--trusted-validator 5D1fNXzvv5NjV1ysLjirC4WY92RNsVH18vjmcszZd8on \ | ||
--trusted-validator ta1Uvfb7W5BRPrdGnhP9RmeCGKzBySGM1hTE4rBRy6T \ | ||
--trusted-validator Ft5fbkqNa76vnsjYNwjDZUXoTWpP7VYm3mtsaQckQADN \ | ||
--trusted-validator 9QxCLckBiJc783jnMvXZubK4wH86Eqqvashtrwvcsgkv \ | ||
--expected-genesis-hash 4uhcVJyU9pJkvQyS88uRDiswHXSCkY3zQawwpjk2NsNY \ | ||
# this comment line absorbs the trailing backslash above, for painless copy-paste |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,124 @@ | ||
#!/usr/bin/env bash | ||
|
||
# Aborts the run after a failure. | ||
# Arguments: $1 - short description of what the validator failed to do; it is | ||
#            inserted into the error message. | ||
# Globals:   validator_then_ledger_tool_pid, tail_pid (read) | ||
# Kills and reaps the validator/ledger-tool job and the log tail, prints the | ||
# error line, and exits the script with status 1. | ||
handle_error() { | ||
action=$1 | ||
# keep going even if kill/wait fail | ||
set +e | ||
kill "$validator_then_ledger_tool_pid" "$tail_pid" | ||
wait "$validator_then_ledger_tool_pid" "$tail_pid" | ||
echo "--- Error: validator failed to $action" | ||
exit 1 | ||
} | ||
|
||
# Prints the newest validator log output, if there is any. | ||
# When cluster-sanity/log-tail is non-empty, prints a header followed by the | ||
# last 3 lines of the file (each cut to 300 characters), then truncates the | ||
# file so that the next call only shows output written after this one. | ||
show_log() { | ||
# find prints the path only when the file is not empty; `grep ^` turns | ||
# "printed something" into a success status | ||
if find cluster-sanity/log-tail -not -empty | grep ^ > /dev/null; then | ||
echo "##### new log:" | ||
# the read is capped at 0.01s, presumably because the file is still being | ||
# appended to -- TODO confirm; a failure of this pipeline is ignored | ||
timeout 0.01 cat cluster-sanity/log-tail | tail -n 3 | cut -c 1-300 || true | ||
truncate --size 0 cluster-sanity/log-tail | ||
echo | ||
fi | ||
} | ||
|
||
# start from an empty cluster-sanity working directory | ||
rm -rf cluster-sanity | ||
mkdir cluster-sanity | ||
|
||
# the first argument is the cluster label (used in messages only); the | ||
# remaining arguments are handed to solana-validator further down | ||
cluster_label="$1" | ||
shift | ||
|
||
echo "--- Starting validator $cluster_label" | ||
|
||
validator_log="cluster-sanity/validator.log" | ||
sys_tuner_log="cluster-sanity/sys-tuner.log" | ||
metrics_host="https://metrics.solana.com:8086" | ||
# exported so that the binaries started below inherit them | ||
export SOLANA_METRICS_CONFIG="host=$metrics_host,db=testnet-live-cluster,u=scratch_writer,p=topsecret" | ||
export RUST_LOG="warn,solana_runtime::bank=info,solana_validator=info,solana_core=info,solana_ledger=info,solana_core::repair_service=warn" | ||
|
||
# start the system tuner as root in the background, logging to $sys_tuner_log | ||
# shellcheck disable=SC2024 # create log as non-root user | ||
sudo ./solana-sys-tuner --user "$(whoami)" &> "$sys_tuner_log" & | ||
sys_tuner_pid=$! | ||
|
||
# Background job, chained with && so each step runs only if the previous one | ||
# succeeded: create a fresh identity keypair, run the validator with the | ||
# caller-supplied arguments, then verify the ledger with solana-ledger-tool. | ||
# All output of the job goes to $validator_log. | ||
( | ||
echo "$(date): VALIDATOR STARTED." && | ||
./solana-keygen new --force --no-passphrase --silent --outfile ./identity.json && | ||
./solana-validator \ | ||
--identity ./identity.json \ | ||
--ledger ./cluster-sanity/ledger \ | ||
--no-untrusted-rpc \ | ||
--no-poh-speed-test \ | ||
--log - \ | ||
--init-complete-file ./cluster-sanity/init-completed \ | ||
--private-rpc \ | ||
--rpc-port 8899 \ | ||
--rpc-bind-address localhost \ | ||
--snapshot-interval-slots 0 \ | ||
"$@" && | ||
echo "$(date): VALIDATOR FINISHED AND LEDGER-TOOL STARTED." && | ||
./solana-ledger-tool \ | ||
--ledger cluster-sanity/ledger \ | ||
verify && | ||
echo "$(date): LEDGER-TOOL FINISHED." | ||
) &> "$validator_log" & | ||
|
||
validator_then_ledger_tool_pid=$! | ||
# follow the validator log into cluster-sanity/log-tail, the file that | ||
# show_log prints and truncates | ||
tail -F "$validator_log" > cluster-sanity/log-tail 2> /dev/null & | ||
tail_pid=$! | ||
|
||
# Poll every 3 seconds until the validator creates | ||
# cluster-sanity/init-completed (the path given to --init-complete-file), | ||
# giving up once the 200-attempt budget is used up. | ||
attempts=200 | ||
while ! [[ -f cluster-sanity/init-completed ]]; do | ||
attempts=$((attempts - 1)) | ||
# fail when no attempts remain or the background job's /proc entry is gone; | ||
# inside [[ ]] the doubled parentheses are plain grouping, so `==` here is a | ||
# string/pattern comparison rather than an arithmetic one | ||
if [[ (($attempts == 0)) || ! -d "/proc/$validator_then_ledger_tool_pid" ]]; then | ||
handle_error "start" | ||
fi | ||
|
||
sleep 3 | ||
echo "##### validator is starting... (until timeout: $attempts) #####" | ||
show_log | ||
done | ||
echo "##### validator finished starting! #####" | ||
|
||
echo "--- Monitoring validator $cluster_label" | ||
|
||
# slot number taken from the file name of the most recently modified snapshot | ||
# archive in the ledger directory (`ls -t` lists newest first) | ||
# shellcheck disable=SC2012 # ls here is handy for sorted snapshots | ||
snapshot_slot=$(ls -t cluster-sanity/ledger/snapshot-*.tar.* | | ||
head -n 1 | | ||
grep -o 'snapshot-[0-9]*-' | | ||
grep -o '[0-9]*' | ||
) | ||
current_root=$snapshot_slot | ||
goal_root=$((snapshot_slot + 50)) | ||
|
||
# Poll the local RPC endpoint every 3 seconds until the reported root slot is | ||
# greater than the snapshot slot plus 50; fail when the 200-attempt budget is | ||
# used up or the background job's /proc entry is gone. | ||
attempts=200 | ||
while [[ $current_root -le $goal_root ]]; do | ||
attempts=$((attempts - 1)) | ||
if [[ (($attempts == 0)) || ! -d "/proc/$validator_then_ledger_tool_pid" ]]; then | ||
handle_error "root new slots" | ||
fi | ||
|
||
sleep 3 | ||
current_root=$(./solana --url http://localhost:8899 slot --commitment root) | ||
echo "##### validator is running ($current_root/$goal_root)... (until timeout: $attempts) #####" | ||
show_log | ||
done | ||
echo "##### validator finished running! #####" | ||
|
||
# presumably asks the validator running on this ledger to shut down, which | ||
# lets the background job move on to solana-ledger-tool -- TODO confirm | ||
./solana-validator \ | ||
--ledger cluster-sanity/ledger \ | ||
exit --force | ||
|
||
# Poll every 3 seconds until the background job's /proc entry disappears, | ||
# i.e. the job has ended; fail when the 4000-attempt budget is used up. | ||
attempts=4000 | ||
while [[ -d "/proc/$validator_then_ledger_tool_pid" ]]; do | ||
attempts=$((attempts - 1)) | ||
if [[ (($attempts == 0)) ]]; then | ||
handle_error "ledger tool" | ||
fi | ||
|
||
sleep 3 | ||
echo "##### ledger-tool is running... (until timeout: $attempts) #####" | ||
show_log | ||
done | ||
echo "##### ledger-tool finished running! #####" | ||
|
||
# `kill $sys_tuner_pid` didn't work for some reason (maybe sudo doesn't relay | ||
# signals?), so after a short delay stop the log tail and pkill the tuner by | ||
# name instead. | ||
(set -x && sleep 3 && kill "$tail_pid" && sudo pkill -f solana-sys-tuner) & | ||
kill_pid=$! | ||
|
||
# `wait` with several ids only reports the status of the last id, so reap the | ||
# validator/ledger-tool job on its own; otherwise a failed validator or a | ||
# failed `solana-ledger-tool verify` would be masked by the cleanup job. | ||
validator_then_ledger_tool_status=0 | ||
wait "$validator_then_ledger_tool_pid" || validator_then_ledger_tool_status=$? | ||
cleanup_status=0 | ||
wait "$sys_tuner_pid" "$tail_pid" "$kill_pid" || cleanup_status=$? | ||
|
||
if [[ $validator_then_ledger_tool_status -ne 0 ]]; then | ||
echo "--- Error: validator or ledger-tool exited with status $validator_then_ledger_tool_status" | ||
exit "$validator_then_ledger_tool_status" | ||
fi | ||
# otherwise keep the previous behaviour: exit with the cleanup job's status | ||
exit "$cleanup_status" |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -42,15 +42,13 @@ solana config set --url https://api.devnet.solana.com | |
|
||
```bash | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The doc/ and There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. There is no strong reason to create separate PRs. I just thought it wasn't worth its own PR. |
||
$ solana-validator \ | ||
--identity validator-keypair.json \ | ||
--vote-account vote-account-keypair.json \ | ||
--trusted-validator dv1LfzJvDF7S1fBKpFgKoKXK5yoSosmkAdfbxBo1GqJ \ | ||
--identity ~/validator-keypair.json \ | ||
--vote-account ~/vote-account-keypair.json \ | ||
--no-untrusted-rpc \ | ||
--ledger ledger \ | ||
--rpc-port 8899 \ | ||
--dynamic-port-range 8000-8010 \ | ||
--entrypoint entrypoint.devnet.solana.com:8001 \ | ||
--expected-genesis-hash EtWTRABZaYq6iMfeYKouRu166VU2xqa1wcaWoxPkrZBG \ | ||
--wal-recovery-mode skip_any_corrupted_record \ | ||
--limit-ledger-size | ||
``` | ||
|
@@ -87,22 +85,24 @@ solana config set --url https://api.testnet.solana.com | |
|
||
##### Example `solana-validator` command-line | ||
|
||
[comment]: # (UPDATE ci/live-cluster-sanity.sh TOO!!) | ||
|
||
```bash | ||
$ solana-validator \ | ||
--identity validator-keypair.json \ | ||
--vote-account vote-account-keypair.json \ | ||
--entrypoint entrypoint.testnet.solana.com:8001 \ | ||
--entrypoint entrypoint2.testnet.solana.com:8001 \ | ||
--entrypoint entrypoint3.testnet.solana.com:8001 \ | ||
--trusted-validator 5D1fNXzvv5NjV1ysLjirC4WY92RNsVH18vjmcszZd8on \ | ||
--trusted-validator 7XSY3MrYnK8vq693Rju17bbPkCN3Z7KvvfvJx4kdrsSY \ | ||
--trusted-validator Ft5fbkqNa76vnsjYNwjDZUXoTWpP7VYm3mtsaQckQADN \ | ||
--trusted-validator 9QxCLckBiJc783jnMvXZubK4wH86Eqqvashtrwvcsgkv \ | ||
--expected-genesis-hash 4uhcVJyU9pJkvQyS88uRDiswHXSCkY3zQawwpjk2NsNY \ | ||
--identity ~/validator-keypair.json \ | ||
--vote-account ~/vote-account-keypair.json \ | ||
--no-untrusted-rpc \ | ||
--ledger ledger \ | ||
--rpc-port 8899 \ | ||
--dynamic-port-range 8000-8010 \ | ||
--entrypoint entrypoint.testnet.solana.com:8001 \ | ||
--entrypoint entrypoint2.testnet.solana.com:8001 \ | ||
--entrypoint entrypoint3.testnet.solana.com:8001 \ | ||
--expected-genesis-hash 4uhcVJyU9pJkvQyS88uRDiswHXSCkY3zQawwpjk2NsNY \ | ||
--wal-recovery-mode skip_any_corrupted_record \ | ||
--limit-ledger-size | ||
``` | ||
|
@@ -142,25 +142,28 @@ solana config set --url https://api.mainnet-beta.solana.com | |
|
||
##### Example `solana-validator` command-line | ||
|
||
[comment]: # (UPDATE ci/live-cluster-sanity.sh TOO!!) | ||
|
||
```bash | ||
$ solana-validator \ | ||
--identity ~/validator-keypair.json \ | ||
--vote-account ~/vote-account-keypair.json \ | ||
--entrypoint entrypoint.mainnet-beta.solana.com:8001 \ | ||
--entrypoint entrypoint2.mainnet-beta.solana.com:8001 \ | ||
--entrypoint entrypoint3.mainnet-beta.solana.com:8001 \ | ||
--entrypoint entrypoint4.mainnet-beta.solana.com:8001 \ | ||
--entrypoint entrypoint5.mainnet-beta.solana.com:8001 \ | ||
--trusted-validator 7Np41oeYqPefeNQEHSv1UDhYrehxin3NStELsSKCT4K2 \ | ||
--trusted-validator GdnSyH3YtwcxFvQrVVJMm1JhTS4QVX7MFsX56uJLUfiZ \ | ||
--trusted-validator DE1bawNcRJB9rVm3buyMVfr8mBEoyyu73NBovf2oXJsJ \ | ||
--trusted-validator CakcnaRDHka2gXyfbEd2d3xsvkJkqsLw2akB3zsN1D2S \ | ||
--expected-genesis-hash 5eykt4UsFv8P8NJdTREpY1vzqKqZKvdpKuc147dw2N9d \ | ||
--expected-shred-version 64864 \ | ||
--identity ~/validator-keypair.json \ | ||
--vote-account ~/vote-account-keypair.json \ | ||
--no-untrusted-rpc \ | ||
--ledger ledger \ | ||
--rpc-port 8899 \ | ||
--private-rpc \ | ||
--dynamic-port-range 8000-8010 \ | ||
--entrypoint entrypoint.mainnet-beta.solana.com:8001 \ | ||
--entrypoint entrypoint2.mainnet-beta.solana.com:8001 \ | ||
--entrypoint entrypoint3.mainnet-beta.solana.com:8001 \ | ||
--entrypoint entrypoint4.mainnet-beta.solana.com:8001 \ | ||
--entrypoint entrypoint5.mainnet-beta.solana.com:8001 \ | ||
--expected-genesis-hash 5eykt4UsFv8P8NJdTREpY1vzqKqZKvdpKuc147dw2N9d \ | ||
--wal-recovery-mode skip_any_corrupted_record \ | ||
--limit-ledger-size | ||
``` | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we need to run this on every PR? It seems like a nightly would be sufficient.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, I think this is worth running on every PR. These are some reasons:
`local-cluster`
is the longest at this pipeline phase...)
`live-cluster`
occupies `queue=gce-deploy`,
which isn't so crowded compared to `queue=default`.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I believe there are only one or two agents running
gce-deploy
ATM. So we'll want to bump that up first. It should just be a matter of ensuring the gcloud CLI tools are installed and pointed at the correct project, then adding a systemd service for the new agent