Script that recalibrates performance benchmarks (#15446)

## Description
Move performance benchmark values into TSV files, and add a script that updates them.

## How Has This Been Tested?
Used the script to update values based on stable runs over the holidays.
aptos-labs · Dec 3, 2024 · 4815045 · 4815045
1 parent b011781
commit 4815045
Show file tree

Hide file tree

Showing 7 changed files with 325 additions and 96 deletions.
diff --git a/aptos-move/e2e-benchmark/data/calibration_values.tsv b/aptos-move/e2e-benchmark/data/calibration_values.tsv
@@ -0,0 +1,26 @@
+Loop { loop_count: Some(100000), loop_type: NoOp }	60	0.960	1.119	42122.6
+Loop { loop_count: Some(10000), loop_type: Arithmetic }	60	0.956	1.074	26240.7
+CreateObjects { num_objects: 10, object_payload_size: 0 }	60	0.938	1.168	156.6
+CreateObjects { num_objects: 10, object_payload_size: 10240 }	60	0.924	1.086	9713.2
+CreateObjects { num_objects: 100, object_payload_size: 0 }	60	0.922	1.275	1577.0
+CreateObjects { num_objects: 100, object_payload_size: 10240 }	60	0.935	1.070	11728.7
+InitializeVectorPicture { length: 128 }	60	0.926	1.069	169.4
+VectorPicture { length: 128 }	60	0.908	1.075	50.2
+VectorPictureRead { length: 128 }	60	0.919	1.059	48.0
+InitializeVectorPicture { length: 30720 }	60	0.939	1.127	28404.4
+VectorPicture { length: 30720 }	60	0.936	1.095	6935.6
+VectorPictureRead { length: 30720 }	60	0.939	1.093	6948.2
+SmartTablePicture { length: 30720, num_points_per_txn: 200 }	60	0.947	1.080	43673.3
+SmartTablePicture { length: 1048576, num_points_per_txn: 300 }	60	0.947	1.111	74145.8
+ResourceGroupsSenderWriteTag { string_length: 1024 }	60	0.918	1.075	15.8
+ResourceGroupsSenderMultiChange { string_length: 1024 }	60	0.909	1.169	32.9
+TokenV1MintAndTransferFT	60	0.953	1.069	384.6
+TokenV1MintAndTransferNFTSequential	60	0.938	1.064	600.3
+TokenV2AmbassadorMint { numbered: true }	60	0.951	1.057	516.6
+LiquidityPoolSwap { is_stable: true }	60	0.961	1.139	582.6
+LiquidityPoolSwap { is_stable: false }	60	0.929	1.099	563.0
+CoinInitAndMint	60	0.928	1.130	205.0
+FungibleAssetMint	60	0.930	1.098	235.8
+IncGlobalMilestoneAggV2 { milestone_every: 1 }	60	0.914	1.051	33.5
+IncGlobalMilestoneAggV2 { milestone_every: 2 }	60	0.914	1.105	19.0
+EmitEvents { count: 1000 }	60	0.937	1.158	8818.7
diff --git a/aptos-move/e2e-benchmark/src/main.rs b/aptos-move/e2e-benchmark/src/main.rs
@@ -15,7 +15,7 @@ use aptos_transaction_generator_lib::{
 use aptos_types::{account_address::AccountAddress, transaction::TransactionPayload};
 use rand::{rngs::StdRng, SeedableRng};
 use serde_json::json;
-use std::{collections::HashMap, process::exit};
+use std::{collections::HashMap, fs, process::exit};
 
 // bump after a bigger test or perf change, so you can easily distinguish runs
 // that are on top of this commit
@@ -85,42 +85,16 @@ const ALLOWED_REGRESSION: f64 = 0.15;
 const ALLOWED_IMPROVEMENT: f64 = 0.15;
 const ABSOLUTE_BUFFER_US: f64 = 2.0;
 
-const CALIBRATION_VALUES: &str = "
-Loop { loop_count: Some(100000), loop_type: NoOp }	60	0.955	1.074	41893.7
-Loop { loop_count: Some(10000), loop_type: Arithmetic }	60	0.965	1.078	25915.0
-CreateObjects { num_objects: 10, object_payload_size: 0 }	60	0.924	1.082	158.1
-CreateObjects { num_objects: 10, object_payload_size: 10240 }	60	0.951	1.118	9356.2
-CreateObjects { num_objects: 100, object_payload_size: 0 }	60	0.926	1.082	1574.2
-CreateObjects { num_objects: 100, object_payload_size: 10240 }	60	0.952	1.092	11541.9
-InitializeVectorPicture { length: 128 }	10	0.965	1.038	163.3
-VectorPicture { length: 128 }	10	0.938	1.060	48.8
-VectorPictureRead { length: 128 }	10	0.977	1.077	46.4
-InitializeVectorPicture { length: 30720 }	60	0.948	1.123	27893.4
-VectorPicture { length: 30720 }	60	0.931	1.125	6923.1
-VectorPictureRead { length: 30720 }	60	0.934	1.102	6923.1
-SmartTablePicture { length: 30720, num_points_per_txn: 200 }	60	0.952	1.109	43594.7
-SmartTablePicture { length: 1048576, num_points_per_txn: 300 }	60	0.957	1.120	73865.4
-ResourceGroupsSenderWriteTag { string_length: 1024 }	60	0.934	1.134	15.0
-ResourceGroupsSenderMultiChange { string_length: 1024 }	60	0.929	1.122	32.3
-TokenV1MintAndTransferFT	60	0.958	1.093	385.2
-TokenV1MintAndTransferNFTSequential	60	0.973	1.139	588.1
-TokenV2AmbassadorMint { numbered: true }	60	0.960	1.141	512.5
-LiquidityPoolSwap { is_stable: true }	60	0.961	1.103	590.3
-LiquidityPoolSwap { is_stable: false }	60	0.954	1.134	552.2
-CoinInitAndMint	10	0.975	1.043	199.6
-FungibleAssetMint	10	0.954	1.038	236.3
-IncGlobalMilestoneAggV2 { milestone_every: 1 }	10	0.960	1.047	32.9
-IncGlobalMilestoneAggV2 { milestone_every: 2 }	10	0.971	1.066	18.1
-EmitEvents { count: 1000 }	10	0.969	1.052	8615.5
-";
-
 struct CalibrationInfo {
     // count: usize,
     expected_time_micros: f64,
 }
 
 fn get_parsed_calibration_values() -> HashMap<String, CalibrationInfo> {
-    CALIBRATION_VALUES
+    let calibration_values =
+        fs::read_to_string("aptos-move/e2e-benchmark/data/calibration_values.tsv")
+            .expect("Unable to read file");
+    calibration_values
         .trim()
         .split('\n')
         .map(|line| {

diff --git a/testsuite/forge.py b/testsuite/forge.py
@@ -619,16 +619,19 @@ def format_pre_comment(context: ForgeContext) -> str:
         context.forge_namespace,
     )
 
-    return textwrap.dedent(
-        f"""
+    return (
+        textwrap.dedent(
+            f"""
             ### Forge is running suite `{context.forge_test_suite}` on {get_testsuite_images(context)}
             * [Grafana dashboard (auto-refresh)]({dashboard_link})
             * [Humio Logs]({humio_logs_link})
             * [Axiom Logs]({axiom_logs_link})
             * [Validator CPU Profile]({validator_cpu_profile_link})
             * [Fullnode CPU Profile]({fullnode_cpu_profile_link})
             """
-    ).lstrip() + format_github_info(context)
+        ).lstrip()
+        + format_github_info(context)
+    )
 
 
 def format_comment(context: ForgeContext, result: ForgeResult) -> str:

diff --git a/testsuite/replay-verify/main.py b/testsuite/replay-verify/main.py
@@ -28,6 +28,7 @@
 
 REPLAY_CONCURRENCY_LEVEL = 1
 
+
 class Network(Enum):
     TESTNET = 1
     MAINNET = 2
@@ -241,6 +242,7 @@ def get_pod_status(self):
     def get_humio_log_link(self):
         return construct_humio_url(self.label, self.name, self.start_time, time.time())
 
+
 class ReplayConfig:
     def __init__(self, network):
         if network == Network.TESTNET:
@@ -253,9 +255,10 @@ def __init__(self, network):
             self.concurrent_replayer = 18
             self.pvc_number = 8
             self.min_range_size = 10_000
-            self.range_size = 2_000_000 
+            self.range_size = 2_000_000
             self.timeout_secs = 400
 
+
 class TaskStats:
     def __init__(self, name):
         self.name = name
@@ -308,7 +311,7 @@ def __init__(
         self.image = image
         self.pvcs = []
         self.config = replay_config
-        
+
     def __str__(self):
         return f"""ReplayScheduler:
             id: {self.id}
@@ -360,7 +363,11 @@ def create_pvc_from_snapshot(self):
             else MAINNET_SNAPSHOT_NAME
         )
         pvcs = create_pvcs_from_snapshot(
-            self.id, snapshot_name, self.namespace, self.config.pvc_number, self.get_label()
+            self.id,
+            snapshot_name,
+            self.namespace,
+            self.config.pvc_number,
+            self.get_label(),
         )
         assert len(pvcs) == self.config.pvc_number, "failed to create all pvcs"
         self.pvcs = pvcs
@@ -504,12 +511,16 @@ def get_image(image_tag=None):
     shell = forge.LocalShell()
     git = forge.Git(shell)
     image_name = "tools"
-    default_latest_image = forge.find_recent_images(
-        shell,
-        git,
-        1,
-        image_name=image_name,
-    )[0] if image_tag is None else image_tag
+    default_latest_image = (
+        forge.find_recent_images(
+            shell,
+            git,
+            1,
+            image_name=image_name,
+        )[0]
+        if image_tag is None
+        else image_tag
+    )
     full_image = f"{forge.GAR_REPO_NAME}/{image_name}:{default_latest_image}"
     return full_image
 
@@ -546,7 +557,7 @@ def print_logs(failed_workpod_logs, txn_mismatch_logs):
         range_size=range_size,
         image=image,
         replay_config=config,
-        network= network,
+        network=network,
         namespace=args.namespace,
     )
     logger.info(f"scheduler: {scheduler}")

diff --git a/testsuite/single_node_performance.py b/testsuite/single_node_performance.py
@@ -166,52 +166,9 @@ class RunGroupConfig:
 CALIBRATION_SEPARATOR = "	"
 
 # transaction_type	module_working_set_size	executor_type	count	min_ratio	max_ratio	median
-CALIBRATION = """
-no-op	1	VM	6	0.938	1.019	38925.3
-no-op	1000	VM	6	0.943	1.019	36444.6
-apt-fa-transfer	1	VM	6	0.927	1.018	26954.7
-apt-fa-transfer	1	NativeVM	6	0.927	1.018	35259.7
-account-generation	1	VM	6	0.96	1.02	20606.2
-account-generation	1	NativeVM	6	0.96	1.02	28216.2
-account-resource32-b	1	VM	6	0.94	1.026	34260.4
-modify-global-resource	1	VM	6	0.993	1.021	2260.5
-modify-global-resource	100	VM	6	0.982	1.02	33129.7
-publish-package	1	VM	6	0.983	1.012	1672.6
-mix_publish_transfer	1	VM	6	0.972	1.044	20832.8
-batch100-transfer	1	VM	6	0.953	1.024	645.1
-batch100-transfer	1	NativeVM	6	0.953	1.024	1437.0
-vector-picture30k	1	VM	6	0.992	1.039	103.6
-vector-picture30k	100	VM	6	0.913	1.015	1831.5
-smart-table-picture30-k-with200-change	1	VM	6	0.976	1.034	16.1
-smart-table-picture30-k-with200-change	100	VM	6	0.985	1.018	212.9
-modify-global-resource-agg-v2	1	VM	6	0.976	1.035	33992.5
-modify-global-flag-agg-v2	1	VM	6	0.986	1.016	4224
-modify-global-bounded-agg-v2	1	VM	6	0.964	1.047	7661.6
-modify-global-milestone-agg-v2	1	VM	6	0.973	1.017	25187.1
-resource-groups-global-write-tag1-kb	1	VM	6	0.989	1.03	9215.7
-resource-groups-global-write-and-read-tag1-kb	1	VM	6	0.982	1.018	5538.3
-resource-groups-sender-write-tag1-kb	1	VM	6	0.985	1.059	20084.2
-resource-groups-sender-multi-change1-kb	1	VM	6	0.968	1.034	16400.4
-token-v1ft-mint-and-transfer	1	VM	6	0.987	1.022	1156.3
-token-v1ft-mint-and-transfer	100	VM	6	0.964	1.024	17842.6
-token-v1nft-mint-and-transfer-sequential	1	VM	6	0.984	1.017	735.7
-token-v1nft-mint-and-transfer-sequential	100	VM	6	0.966	1.017	12819.7
-coin-init-and-mint	1	VM	6	0.95	1.024	26906.4
-coin-init-and-mint	100	VM	6	0.985	1.022	22312.6
-fungible-asset-mint	1	VM	6	0.955	1.013	23001.6
-fungible-asset-mint	100	VM	6	0.955	1.015	19973.5
-no-op5-signers	1	VM	6	0.934	1.016	38708.6
-token-v2-ambassador-mint	1	VM	6	0.975	1.008	15179.3
-token-v2-ambassador-mint	100	VM	6	0.985	1.007	15150.8
-liquidity-pool-swap	1	VM	6	0.987	1.018	805.5
-liquidity-pool-swap	100	VM	6	0.993	1.02	11156.3
-liquidity-pool-swap-stable	1	VM	6	0.985	1.017	778.7
-liquidity-pool-swap-stable	100	VM	6	0.982	1.009	11056.6
-deserialize-u256	1	VM	6	0.968	1.026	36444.6
-no-op-fee-payer	1	VM	6	0.994	1.026	2046
-no-op-fee-payer	100	VM	6	0.96	1.014	32866.5
-simple-script	1	VM	6	0.941	1.012	38206.1
-"""
+with open('testsuite/single_node_performance_values.tsv', 'r') as file:
+    CALIBRATION = file.read()
+
 
 # When adding a new test, add an estimated expected_tps to it, as well as waived=True.
 # Then, after a day or two, add a calibration result for it to testsuite/single_node_performance_values.tsv and remove the expected_tps/waived fields.
@@ -232,7 +189,7 @@ class RunGroupConfig:
     RunGroupConfig(key=RunGroupKey("mix_publish_transfer"), key_extra=RunGroupKeyExtra(
         transaction_type_override="publish-package apt-fa-transfer",
         transaction_weights_override="1 100",
-    ), included_in=LAND_BLOCKING_AND_C, waived=True),
+    ), included_in=LAND_BLOCKING_AND_C),
     RunGroupConfig(key=RunGroupKey("batch100-transfer"), included_in=LAND_BLOCKING_AND_C),
     RunGroupConfig(key=RunGroupKey("batch100-transfer", executor_type="NativeVM"), included_in=Flow.CONTINUOUS),
 
@@ -294,7 +251,7 @@ class RunGroupConfig:
     # fee payer sequentializes transactions today. in these tests module publisher is the fee payer, so larger number of modules tests throughput with multiple fee payers
     RunGroupConfig(key=RunGroupKey("no-op-fee-payer"), included_in=LAND_BLOCKING_AND_C),
     RunGroupConfig(key=RunGroupKey("no-op-fee-payer", module_working_set_size=DEFAULT_MODULE_WORKING_SET_SIZE), included_in=Flow.CONTINUOUS),
-    RunGroupConfig(key=RunGroupKey("simple-script"), included_in=LAND_BLOCKING_AND_C, waived=True),
+    RunGroupConfig(key=RunGroupKey("simple-script"), included_in=LAND_BLOCKING_AND_C),
 
     RunGroupConfig(expected_tps=50000, key=RunGroupKey("coin_transfer_connected_components", executor_type="sharded"), key_extra=RunGroupKeyExtra(sharding_traffic_flags="--connected-tx-grps 5000", transaction_type_override=""), included_in=Flow.REPRESENTATIVE, waived=True),
     RunGroupConfig(expected_tps=50000, key=RunGroupKey("coin_transfer_hotspot", executor_type="sharded"), key_extra=RunGroupKeyExtra(sharding_traffic_flags="--hotspot-probability 0.8", transaction_type_override=""), included_in=Flow.REPRESENTATIVE, waived=True),
@@ -1067,16 +1024,24 @@ def print_table(
         """If you expect your PR to change the performance, you need to recalibrate the values.
 To do so, you should run the test on your branch 6 times
 (https://github.com/aptos-labs/aptos-core/actions/workflows/workflow-run-execution-performance.yaml ; remember to select CONTINUOUS).
-Then go to Humio calibration link (https://gist.github.com/igor-aptos/7b12ca28de03894cddda8e415f37889e),
-update it to your branch, and export values as CSV, and then open and copy values inside
-testsuite/single_node_performance.py testsuite), and add Blockchain oncall as the reviewer.
+Then run the script locally `./testsuite/single_node_performance_calibration.py --branch=YOUR_BRANCH` to update calibration values
+and add Blockchain oncall as the reviewer.
 """
     )
     exit(1)
 
 if move_e2e_benchmark_failed:
     print(
-        "Move e2e benchmark failed, failing the job. See logs at the beginning for more details."
+        """
+Move e2e benchmark failed, failing the job. See logs at the beginning for more details.
+
+If you expect your PR to change the performance, you need to recalibrate the values.
+To do so, you should run the test on your branch 6 times
+(https://github.com/aptos-labs/aptos-core/actions/workflows/workflow-run-execution-performance.yaml ; remember to select CONTINUOUS,
+and don't select to skip move-only e2e tests).
+Then run the script locally `./testsuite/single_node_performance_calibration.py --branch=YOUR_BRANCH --move-e2e` to update calibration values
+and add Blockchain oncall as the reviewer.
+"""
     )
     exit(1)