diff --git a/.github/workflows/replay-verify.yaml b/.github/workflows/replay-verify.yaml index ae44009412ac8..ac498ad123fc4 100644 --- a/.github/workflows/replay-verify.yaml +++ b/.github/workflows/replay-verify.yaml @@ -32,7 +32,6 @@ on: paths: - ".github/workflows/replay-verify.yaml" - ".github/workflows/workflow-run-replay-verify.yaml" - - "testsuite/replay_verify.py" schedule: - cron: "0 22 * * 0,2,4" # The main branch cadence. This runs every Sun,Tues,Thurs diff --git a/.github/workflows/workflow-run-replay-verify.yaml b/.github/workflows/workflow-run-replay-verify.yaml index cb7af2922a65d..40788a9fa0e28 100644 --- a/.github/workflows/workflow-run-replay-verify.yaml +++ b/.github/workflows/workflow-run-replay-verify.yaml @@ -110,7 +110,6 @@ jobs: # which cleans up the target directory in its post action path: | aptos-debugger - testsuite/replay_verify.py key: aptos-debugger-${{ inputs.GIT_SHA || github.sha }} - name: Prepare for build if not cached @@ -185,12 +184,11 @@ jobs: matrix: job_id: ${{ fromJson(needs.prepare.outputs.job_ids) }} steps: - - name: Load cached aptos-debugger binary and replay_verify.py script + - name: Load cached aptos-debugger binary uses: actions/cache/restore@v4 with: path: | aptos-debugger - testsuite/replay_verify.py key: aptos-debugger-${{ inputs.GIT_SHA || github.sha }} fail-on-cache-miss: true diff --git a/testsuite/replay_verify.py b/testsuite/replay_verify.py deleted file mode 100755 index 2ef81c92bfa4f..0000000000000 --- a/testsuite/replay_verify.py +++ /dev/null @@ -1,250 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright © Aptos Foundation -# SPDX-License-Identifier: Apache-2.0 - -import os -import shutil -import subprocess -import sys -from collections import deque -from multiprocessing import Pool, freeze_support -from typing import List, Tuple - -from verify_core.common import clear_artifacts, warm_cache_and_get_latest_backup_version - -TESTNET_RANGES: List[Tuple[int, int]] = [ - (862_000_000, 878_000_000), - (894_000_000, 910_000_000), - (942_000_000, 958_000_000), - (974_000_000, 990_000_000), - (1_006_000_000, 1_022_000_000), - (1_038_000_000, 1_054_000_000), - (1_070_000_000, 1_086_000_000), - (1_102_000_000, 1_115_000_000), - (1_128_000_000, 1_141_000_000), - (1_154_000_000, 1_167_000_000), - (5_495_000_000, 5_520_000_000), - (5_520_000_000, 5_545_000_000), - (5_600_000_000, 5_625_000_000), - (5_650_000_000, 5_675_000_000), - (5_675_000_000, 5_700_000_000), - (5_765_000_000, 5_785_000_000), - (5_922_000_000, 5_935_000_000), - (5_935_000_000, 5_950_000_000), - (5_950_000_000, sys.maxsize), -] - -MAINNET_RANGES: List[Tuple[int, int]] = [ - (518_000_000, 534_000_000), - (534_000_000, 550_000_000), - (550_000_000, 566_000_000), - (566_000_000, 581_000_000), - (581_000_000, 597_000_000), - (597_000_000, 613_000_000), - (613_000_000, 629_000_000), - (629_000_000, 640_000_000), - # Skip tapos range - (949_000_000, 954_000_000), - (954_000_000, 969_000_000), - (969_000_000, 984_000_000), - (984_000_000, 1_000_000_000), - (1_000_000_000, 1_020_000_000), - (1_020_000_000, 1_040_000_000), - (1_040_000_000, 1_060_000_000), - (1_060_000_000, 1_085_000_000), - # Skip tapos2 range - (1_635_000_000, 1_655_000_000), - (1_655_000_000, 1_675_000_000), - (1_675_000_000, sys.maxsize), -] - - -# retry the replay_verify_partition if it fails -def retry_replay_verify_partition(func, *args, **kwargs) -> Tuple[int, int, bytes]: - (partition_number, code, msg) = (0, 0, b"") - NUM_OF_RETRIES = 6 - for i in range(1, NUM_OF_RETRIES + 1): - print(f"try {i}") - 
(partition_number, code, msg) = func(*args, **kwargs) - # let's only not retry on txn error and success case, - if code == 2 or code == 0: - break - return (partition_number, code, msg) - - -def replay_verify_partition( - n: int, - N: int, - history_start: int, - per_partition: int, - latest_version: int, - txns_to_skip: Tuple[int], - backup_config_template_path: str, -) -> Tuple[int, int, bytes]: - """ - Run replay-verify for a partition of the backup, returning a tuple of the (partition number, return code) - - n: partition number - N: total number of partitions - history_start: start version of the history to verify - per_partition: number of versions per partition - latest_version: last version to verify - txns_to_skip: list of transactions to skip - backup_config_template_path: path to the backup config template - """ - end = history_start + n * per_partition - if n == N and end < latest_version: - end = latest_version - - start = end - per_partition - partition_name = f"run_{n}_{start}_{end}" - - print(f"[partition {n}] spawning {partition_name}") - if not os.path.exists(partition_name): - os.mkdir(partition_name) - # the metadata cache is shared across partitions and downloaded when querying the latest version. - shutil.copytree("metadata-cache", f"{partition_name}/metadata-cache") - - txns_to_skip_args = [f"--txns-to-skip={txn}" for txn in txns_to_skip] - - # run and print output - process = subprocess.Popen( - [ - "target/release/aptos-debugger", - "aptos-db", - "replay-verify", - # "--enable-storage-sharding", - *txns_to_skip_args, - "--concurrent-downloads", - "8", - "--replay-concurrency-level", - "2", - "--metadata-cache-dir", - f"./{partition_name}/metadata-cache", - "--target-db-dir", - f"./{partition_name}/db", - "--start-version", - str(start), - "--end-version", - str(end), - "--lazy-quit", - "--command-adapter-config", - backup_config_template_path, - ], - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, # redirect stderr to stdout - ) - if process.stdout is None: - raise Exception(f"[partition {n}] stdout is None") - last_lines = deque(maxlen=10) - for line in iter(process.stdout.readline, b""): - print(f"[partition {n}] {line}", flush=True) - last_lines.append(line) - process.communicate() - - return (n, process.returncode, b"\n".join(last_lines)) - - -def main(runner_no=None, runner_cnt=None, start_version=None, end_version=None): - # collect all required ENV variables - REQUIRED_ENVS = [ - "BUCKET", - "SUB_DIR", - "HISTORY_START", - "TXNS_TO_SKIP", - "BACKUP_CONFIG_TEMPLATE_PATH", - ] - - if not all(env in os.environ for env in REQUIRED_ENVS): - raise Exception("Missing required ENV variables") - - # the runner may have small overlap at the boundary to prevent missing any transactions - runner_mapping = ( - TESTNET_RANGES if "testnet" in os.environ["BUCKET"] else MAINNET_RANGES - ) - - # by default we only have 1 runner - if runner_no is None or runner_cnt is None: - runner_no = 0 - runner_cnt = 1 - runner_mapping = [[runner_mapping[0][0], runner_mapping[-1][1]]] - - assert ( - runner_no >= 0 and runner_no < runner_cnt - ), "runner_no must be between 0 and runner_cnt" - - TXNS_TO_SKIP = [int(txn) for txn in os.environ["TXNS_TO_SKIP"].split(" ")] - BACKUP_CONFIG_TEMPLATE_PATH = os.environ["BACKUP_CONFIG_TEMPLATE_PATH"] - - if not os.path.exists(BACKUP_CONFIG_TEMPLATE_PATH): - raise Exception("BACKUP_CONFIG_TEMPLATE_PATH does not exist") - with open(BACKUP_CONFIG_TEMPLATE_PATH, "r") as f: - config = f.read() - if "aws" in config and shutil.which("aws") is None: - 
raise Exception("Missing required AWS CLI for pulling backup data from S3") - - if os.environ.get("REUSE_BACKUP_ARTIFACTS", "true") != "true": - print("[main process] clearing existing backup artifacts") - clear_artifacts() - else: - print("[main process] skipping clearing backup artifacts") - - assert runner_cnt == len( - runner_mapping - ), "runner_cnt must match the number of runners in the mapping" - runner_start = runner_mapping[runner_no][0] - runner_end = runner_mapping[runner_no][1] - latest_version = warm_cache_and_get_latest_backup_version( - BACKUP_CONFIG_TEMPLATE_PATH - ) - if runner_no == runner_cnt - 1: - runner_end = min(runner_end, latest_version) - print("runner start %d end %d" % (runner_start, runner_end)) - if start_version is not None and end_version is not None: - runner_start = start_version - runner_end = end_version - - # run replay-verify in parallel - N = 16 - PER_PARTITION = (runner_end - runner_start) // N - - with Pool(N) as p: - all_partitions = p.starmap( - retry_replay_verify_partition, - [ - ( - replay_verify_partition, - n, - N, - runner_start, - PER_PARTITION, - runner_end, - TXNS_TO_SKIP, - BACKUP_CONFIG_TEMPLATE_PATH, - ) - for n in range(1, N + 1) - ], - ) - - print("[main process] finished") - - err = False - for partition_num, return_code, msg in all_partitions: - if return_code != 0: - print("======== ERROR ========") - print( - f"ERROR: partition {partition_num} failed with exit status {return_code}, {msg})" - ) - err = True - - if err: - sys.exit(1) - - -if __name__ == "__main__": - freeze_support() - (runner_no, runner_cnt) = ( - (int(sys.argv[1]), int(sys.argv[2])) if len(sys.argv) > 2 else (None, None) - ) - main(runner_no, runner_cnt) diff --git a/testsuite/replay_verify_run_local.py b/testsuite/replay_verify_run_local.py deleted file mode 100755 index 1872278a5c6ce..0000000000000 --- a/testsuite/replay_verify_run_local.py +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright © Aptos Foundation -# SPDX-License-Identifier: Apache-2.0 - -# Test replay-verify by running it on a public testnet backup -# While the replay-verify composite Github Action is meant to run with aptos-core checked out in the current -# working directory, this test script is meant to be run from this separate repo. The environment variable APTOS_CORE_PATH -# is required to be set to the path of your local checkout of aptos-core, which will be used to build and copy over test dependencies. 
- -import os -import subprocess - -import replay_verify - - -def local_setup(): - # Take these from the expected replay verify run - envs = { - "TIMEOUT_MINUTES": "5", - "BUCKET": "aptos-testnet-backup", - "SUB_DIR": "e1", - "HISTORY_START": "350000000", - "TXNS_TO_SKIP": "0", # 46874937 151020059 should be excluded - "BACKUP_CONFIG_TEMPLATE_PATH": "terraform/helm/fullnode/files/backup/gcs.yaml", - "REUSE_BACKUP_ARTIFACTS": "true", - } - - # build backup tools - subprocess.run( - [ - "cargo", - "build", - "--release", - "-p", - "aptos-debugger", - ], - check=True, - ) - - # write to environment variables - for key, value in envs.items(): - os.environ[key] = value - - -if __name__ == "__main__": - local_setup() - replay_verify.main( - runner_no=None, runner_cnt=None, start_version=291217350, end_version=292975771 - ) diff --git a/testsuite/replay_verify_test.py b/testsuite/replay_verify_test.py deleted file mode 100644 index aa751e1ab3452..0000000000000 --- a/testsuite/replay_verify_test.py +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright © Aptos Foundation -# SPDX-License-Identifier: Apache-2.0 - -import os -import unittest -import subprocess - -from verify_core.common import find_latest_version_from_db_backup_output - - -class ReplayVerifyHarnessTests(unittest.TestCase): - def testFindLatestVersionFromDbBackupOutput(self) -> None: - proc = subprocess.Popen( - f"cat {os.path.dirname(__file__)}/fixtures/backup_oneshot.fixture", - shell=True, - stdout=subprocess.PIPE, - ) - if proc.stdout is None: - raise Exception("Failed to get test fixture contents") - latest_version = find_latest_version_from_db_backup_output(proc.stdout) - self.assertEqual(latest_version, 417000000) - proc.communicate()
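
For reference, the removed replay_verify.py split each runner's version range into 16 partitions, and the replay_verify_partition docstring describes how a partition's (start, end) window is derived from history_start, per_partition, and latest_version. Below is a minimal sketch of that boundary arithmetic only; partition_bounds is a hypothetical helper name used for illustration and was not part of the deleted script.

    #!/usr/bin/env python3
    # Sketch of the partition-boundary arithmetic from the removed replay_verify.py.
    # `partition_bounds` is a hypothetical helper name, used only for illustration.
    from typing import Tuple


    def partition_bounds(
        n: int, total: int, history_start: int, per_partition: int, latest_version: int
    ) -> Tuple[int, int]:
        """Return the (start, end) version window for partition n of `total` partitions."""
        end = history_start + n * per_partition
        # The last partition is stretched to cover everything up to the latest backup version.
        if n == total and end < latest_version:
            end = latest_version
        start = end - per_partition
        return start, end


    if __name__ == "__main__":
        # Example: 16 partitions over versions 1_000_000_000..1_020_000_000
        # (one of the MAINNET_RANGES entries in the removed script).
        per_partition = (1_020_000_000 - 1_000_000_000) // 16
        for n in (1, 16):
            print(n, partition_bounds(n, 16, 1_000_000_000, per_partition, 1_020_000_000))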