From 3d6c0e24a6a96c361ec85bcf926373263550125c Mon Sep 17 00:00:00 2001 From: ClaytonNorthey92 Date: Tue, 19 Nov 2024 11:37:59 -0500 Subject: [PATCH 1/2] op-conductor robustness + bootstrap/startup script improvement op-conductor robustness fixes in localnet modified "setup-raft.bash" script to allow configuration of op-conductor raft setup from environment --- .github/workflows/localnet-test.yml | 4 +- e2e/docker-compose.yml | 68 +++++++++++++++++++++-------- e2e/entrypointl2.sh | 1 - e2e/optimism-stack.Dockerfile | 2 + e2e/setup-raft.bash | 48 ++++++++++++++------ 5 files changed, 88 insertions(+), 35 deletions(-) diff --git a/.github/workflows/localnet-test.yml b/.github/workflows/localnet-test.yml index 7b797a80..1faa5b7d 100644 --- a/.github/workflows/localnet-test.yml +++ b/.github/workflows/localnet-test.yml @@ -41,8 +41,8 @@ jobs: - name: "run localnet" run: docker compose -f ./e2e/docker-compose.yml up -d - - name: "kill an op-node after a minute" - run: sleep 60 && docker compose -f ./e2e/docker-compose.yml down op-node + - name: "kill an op-node after 15 seconds, then wait 3 minutes (the healthcheck interval + time for another sequencer to take over)" + run: sleep 15 && docker compose -f ./e2e/docker-compose.yml down op-node && sleep 180 - name: "get localnet stats" working-directory: ./e2e/monitor diff --git a/e2e/docker-compose.yml b/e2e/docker-compose.yml index 5ae36284..81586f3c 100644 --- a/e2e/docker-compose.yml +++ b/e2e/docker-compose.yml @@ -209,8 +209,10 @@ services: depends_on: - "geth-l1" healthcheck: - test: ["CMD-SHELL", "ls /l2configs/rollup.json"] - timeout: 60s + test: ["CMD-SHELL", "nc -w 1 -vz 0.0.0.0 30303"] + timeout: 1s + retries: 300 + interval: 1s environment: ADMIN_PRIVATE_KEY: "${ADMIN_PRIVATE_KEY}" OP_GETH_L1_RPC: "http://geth-l1:8545" @@ -240,8 +242,10 @@ services: op-geth-l2: condition: "service_healthy" healthcheck: - test: ["CMD-SHELL", "ls /l2configs/rollup.json"] - timeout: 60s + test: ["CMD-SHELL", "nc -w 1 -vz 0.0.0.0 30303"] + timeout: 1s + retries: 300 + interval: 1s environment: ADMIN_PRIVATE_KEY: "${ADMIN_PRIVATE_KEY}" OP_GETH_L1_RPC: "http://geth-l1:8545" @@ -273,8 +277,10 @@ services: op-geth-l2: condition: "service_healthy" healthcheck: - test: ["CMD-SHELL", "ls /l2configs/rollup.json"] - timeout: 60s + test: ["CMD-SHELL", "nc -w 1 -vz 0.0.0.0 30303"] + timeout: 1s + retries: 300 + interval: 1s environment: ADMIN_PRIVATE_KEY: "${ADMIN_PRIVATE_KEY}" OP_GETH_L1_RPC: "http://geth-l1:8545" @@ -310,6 +316,11 @@ services: condition: "service_started" op-geth-l2: condition: "service_healthy" + healthcheck: + test: ["CMD-SHELL", "nc -w 1 -vz 0.0.0.0 9222"] + timeout: 1s + retries: 300 + interval: 1s environment: OP_NODE_BSS_WS: "http://bssd:8081/v1/ws" command: @@ -329,7 +340,7 @@ services: - "--l1.trustrpc" - "--log.level=info" - "--l1.trustrpc=true" - - "--l1.http-poll-interval=6s" + - "--l1.http-poll-interval=1s" - "--p2p.no-discovery" - "--p2p.priv.path=/tmp/op-node-priv-key.txt" - "--p2p.sequencer.key=${ADMIN_PRIVATE_KEY}" @@ -364,6 +375,11 @@ services: condition: "service_started" op-geth-l2-2: condition: "service_healthy" + healthcheck: + test: ["CMD-SHELL", "nc -w 1 -vz 0.0.0.0 9222"] + timeout: 1s + retries: 300 + interval: 1s environment: OP_NODE_BSS_WS: "http://bssd:8081/v1/ws" command: @@ -415,6 +431,11 @@ services: condition: "service_started" op-geth-l2-3: condition: "service_healthy" + healthcheck: + test: ["CMD-SHELL", "nc -w 1 -vz 0.0.0.0 9222"] + timeout: 1s + retries: 300 + interval: 1s environment: OP_NODE_BSS_WS: "http://bssd:8081/v1/ws" command: @@ -575,6 +596,8 @@ services: - "--l2oo-address=${L2OO_ADDRESS}" - "--private-key=${ADMIN_PRIVATE_KEY}" - "--l1-eth-rpc=http://geth-l1:8545" + networks: + e2e: op-proposer-2: build: @@ -634,19 +657,20 @@ services: - "op-conductor/bin/op-conductor" - "--consensus.addr=op-conductor" - "--consensus.port=50050" - - "--raft.server.id=op-conductor-1" + - "--raft.server.id=op-conductor-1:50050" - "--raft.storage.dir=/tmp/raft" - "--raft.bootstrap" - "--node.rpc=http://op-node:8547" - "--execution.rpc=http://op-geth-l2:8546" - - "--healthcheck.unsafe-interval=10" + - "--healthcheck.unsafe-interval=12" - "--healthcheck.safe-interval=200" - "--healthcheck.min-peer-count=1" - - "--healthcheck.interval=60" + - "--healthcheck.interval=120" - "--rollup.config=/l2configs/rollup.json" - "--log.format=terminal" - "--rpc.addr=0.0.0.0" - "--rpc.port=8547" + - "--paused" volumes: - "l2configs:/l2configs" - "./jwt.txt:/tmp/jwt.txt" @@ -655,7 +679,7 @@ services: e2e: depends_on: op-node: - condition: "service_started" + condition: "service_healthy" op-geth-l2: condition: "service_healthy" ports: @@ -669,18 +693,19 @@ services: - "op-conductor/bin/op-conductor" - "--consensus.addr=op-conductor-2" - "--consensus.port=50051" - - "--raft.server.id=op-conductor-2" + - "--raft.server.id=op-conductor-2:50051" - "--raft.storage.dir=/tmp/raft" - "--node.rpc=http://op-node-2:8547" - "--execution.rpc=http://op-geth-l2-2:8546" - - "--healthcheck.unsafe-interval=10" + - "--healthcheck.unsafe-interval=12" - "--healthcheck.safe-interval=200" - "--healthcheck.min-peer-count=1" - - "--healthcheck.interval=60" + - "--healthcheck.interval=120" - "--rollup.config=/l2configs/rollup.json" - "--log.format=terminal" - "--rpc.addr=0.0.0.0" - "--rpc.port=8547" + - "--paused" volumes: - "l2configs:/l2configs" - "./jwt.txt:/tmp/jwt.txt" @@ -689,7 +714,7 @@ services: e2e: depends_on: op-node-2: - condition: "service_started" + condition: "service_healthy" op-geth-l2-2: condition: "service_healthy" ports: @@ -703,18 +728,19 @@ services: - "op-conductor/bin/op-conductor" - "--consensus.addr=op-conductor-3" - "--consensus.port=50052" - - "--raft.server.id=op-conductor-3" + - "--raft.server.id=op-conductor-3:50052" - "--raft.storage.dir=/tmp/raft" - "--node.rpc=http://op-node-3:8547" - "--execution.rpc=http://op-geth-l2-3:8546" - - "--healthcheck.unsafe-interval=10" + - "--healthcheck.unsafe-interval=12" - "--healthcheck.safe-interval=200" - "--healthcheck.min-peer-count=1" - - "--healthcheck.interval=60" + - "--healthcheck.interval=120" - "--rollup.config=/l2configs/rollup.json" - "--log.format=terminal" - "--rpc.addr=0.0.0.0" - "--rpc.port=8547" + - "--paused" volumes: - "l2configs:/l2configs" - "./jwt.txt:/tmp/jwt.txt" @@ -723,7 +749,7 @@ services: e2e: depends_on: op-node-3: - condition: "service_started" + condition: "service_healthy" op-geth-l2-3: condition: "service_healthy" ports: @@ -735,6 +761,10 @@ services: context: "." entrypoint: - "bash" + environment: + OPNODE_RPCS: 'http://op-node:8547,http://op-node-2:8547,http://op-node-3:8547' + OPCONDUCTOR_RPCS: 'http://op-conductor:8547,http://op-conductor-2:8547,http://op-conductor-3:8547' + OPCONDUCTOR_RAFT_VOTERS: 'op-conductor:50050,op-conductor-2:50051,op-conductor-3:50052' command: - "/tmp/setup-raft.bash" depends_on: diff --git a/e2e/entrypointl2.sh b/e2e/entrypointl2.sh index 96838425..07a60017 100644 --- a/e2e/entrypointl2.sh +++ b/e2e/entrypointl2.sh @@ -41,7 +41,6 @@ fi --authrpc.addr=0.0.0.0 \ --authrpc.port=8551 \ --authrpc.jwtsecret=/tmp/jwt.txt \ - --verbosity=5 \ --gpo.maxprice=1 \ --tbc.network=localnet \ --tbc.initheight=1 \ diff --git a/e2e/optimism-stack.Dockerfile b/e2e/optimism-stack.Dockerfile index f5bca118..06876024 100644 --- a/e2e/optimism-stack.Dockerfile +++ b/e2e/optimism-stack.Dockerfile @@ -61,3 +61,5 @@ RUN forge build WORKDIR /git/optimism RUN make devnet-allocs + +RUN apt-get install -y netcat-openbsd \ No newline at end of file diff --git a/e2e/setup-raft.bash b/e2e/setup-raft.bash index 42b8bed8..86788573 100644 --- a/e2e/setup-raft.bash +++ b/e2e/setup-raft.bash @@ -1,20 +1,42 @@ #! /bin/bash -set -ev +set -evx -curl -X POST -H "Content-Type: application/json" --data '{"jsonrpc":"2.0","method":"conductor_pause","params":[],"id":4}' http://op-conductor:8547 -curl -X POST -H "Content-Type: application/json" --data '{"jsonrpc":"2.0","method":"conductor_pause","params":[],"id":4}' http://op-conductor-2:8547 -curl -X POST -H "Content-Type: application/json" --data '{"jsonrpc":"2.0","method":"conductor_pause","params":[],"id":4}' http://op-conductor-3:8547 -curl -X POST -H "Content-Type: application/json" --data "{\"jsonrpc\":\"2.0\",\"method\":\"admin_stopSequencer\",\"params\":[],\"id\":3}" http://op-node:8547 +IFS=',' read -ra conductor_rpcs <<< "$OPCONDUCTOR_RPCS" +IFS=',' read -ra conductor_rafts <<< "$OPCONDUCTOR_RAFT_VOTERS" +IFS=',' read -ra opnode_rpcs <<< "$OPNODE_RPCS" +opnode_rpc= +# find the leader +for i in "${!conductor_rpcs[@]}"; do + is_leader=$(curl -X POST -H "Content-Type: application/json" --data '{"jsonrpc":"2.0","method":"conductor_leader","params":[],"id":4}' "${conductor_rpcs[$i]}" | jq '.result') + if [ "$is_leader" = 'true' ]; then + opnode_rpc=${opnode_rpcs[$i]} + fi +done -curl -X POST -H "Content-Type: application/json" --data '{"jsonrpc":"2.0","method":"optimism_syncStatus","params":[],"id":1}' http://op-node:8547 -curl -X POST -H "Content-Type: application/json" --data '{"jsonrpc":"2.0","method":"conductor_addServerAsVoter","params":["op-conductor-2", "op-conductor-2:50051"],"id":4}' http://op-conductor:8547 -curl -X POST -H "Content-Type: application/json" --data '{"jsonrpc":"2.0","method":"conductor_addServerAsVoter","params":["op-conductor-3", "op-conductor-3:50052"],"id":4}' http://op-conductor:8547 -curl -X POST -H "Content-Type: application/json" --data '{"jsonrpc":"2.0","method":"conductor_resume","params":[],"id":4}' http://op-conductor:8547 -curl -X POST -H "Content-Type: application/json" --data '{"jsonrpc":"2.0","method":"conductor_resume","params":[],"id":4}' http://op-conductor-2:8547 -curl -X POST -H "Content-Type: application/json" --data '{"jsonrpc":"2.0","method":"conductor_resume","params":[],"id":4}' http://op-conductor-3:8547 -unsafe_head=$(curl -X POST -H "Content-Type: application/json" --data '{"jsonrpc":"2.0","method":"optimism_syncStatus","params":[],"id":2}' http://op-node:8547 | jq '.result.unsafe_l2.hash' ) +if [ "$opnode_rpc" = '' ]; then + echo "could not find leader, aborting" + exit 1 +fi + +# pause each conductor so we can modify state +for rpc in "${conductor_rpcs[@]}"; do + curl -X POST -H "Content-Type: application/json" --data '{"jsonrpc":"2.0","method":"conductor_pause","params":[],"id":4}' $rpc +done + +# for each raft port in op-conductor, add as a voter. this may error when adding self as a voter with the leader, but that's ok, the others should succeed +for raft in "${conductor_rafts[@]}"; do + curl -X POST -H "Content-Type: application/json" --data "{\"jsonrpc\":\"2.0\",\"method\":\"conductor_addServerAsVoter\",\"params\":[\"$raft\", \"$raft\"],\"id\":4}" ${conductor_rpcs[0]} +done + +# resume the conductors +for rpc in "${conductor_rpcs[@]}"; do + curl -X POST -H "Content-Type: application/json" --data '{"jsonrpc":"2.0","method":"conductor_resume","params":[],"id":4}' $rpc +done + +# restart the sequencer using the unsafe head from the leader's sync status +unsafe_head=$(curl -X POST -H "Content-Type: application/json" --data '{"jsonrpc":"2.0","method":"optimism_syncStatus","params":[],"id":2}' $opnode_rpc | jq '.result.unsafe_l2.hash' ) echo "unsafe_head=$unsafe_head" -curl -X POST -H "Content-Type: application/json" --data "{\"jsonrpc\":\"2.0\",\"method\":\"admin_startSequencer\",\"params\":[$unsafe_head],\"id\":3}" http://op-node:8547 +curl -X POST -H "Content-Type: application/json" --data "{\"jsonrpc\":\"2.0\",\"method\":\"admin_startSequencer\",\"params\":[$unsafe_head],\"id\":3}" $opnode_rpc From 4b3fb1894eadbcb3dbac0a287beb9bf7d9f7a8db Mon Sep 17 00:00:00 2001 From: ClaytonNorthey92 Date: Wed, 20 Nov 2024 13:54:52 -0500 Subject: [PATCH 2/2] stop sequencers as well --- e2e/setup-raft.bash | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/e2e/setup-raft.bash b/e2e/setup-raft.bash index 86788573..60c9cc57 100644 --- a/e2e/setup-raft.bash +++ b/e2e/setup-raft.bash @@ -26,6 +26,11 @@ for rpc in "${conductor_rpcs[@]}"; do curl -X POST -H "Content-Type: application/json" --data '{"jsonrpc":"2.0","method":"conductor_pause","params":[],"id":4}' $rpc done +for rpc in "${opnode_rpcs[@]}"; do + curl -X POST -H "Content-Type: application/json" --data "{\"jsonrpc\":\"2.0\",\"method\":\"admin_stopSequencer\",\"params\":[],\"id\":3}" $rpc +done + + # for each raft port in op-conductor, add as a voter. this may error when adding self as a voter with the leader, but that's ok, the others should succeed for raft in "${conductor_rafts[@]}"; do curl -X POST -H "Content-Type: application/json" --data "{\"jsonrpc\":\"2.0\",\"method\":\"conductor_addServerAsVoter\",\"params\":[\"$raft\", \"$raft\"],\"id\":4}" ${conductor_rpcs[0]}