[quorum store] reduce backpressure significantly for more TPS (#13558)

## Description

As Quorum Store batches are bucketed, and we are looking to increase block limits, now is the time to reduce Quorum Store backpressure. We now allow 36K outstanding transactions; at 12K TPS, this is approximately 3 seconds' worth of batches. For forge tests, much of the queuing shifts from mempool to the PoS-to-Proposal stage (`QsPosToProposal`), so the limits need to be adjusted accordingly.
aptos-labs · Jun 7, 2024 · 1a47323 · 1a47323
1 parent 0594ca5
commit 1a47323
Show file tree

Hide file tree

Showing 2 changed files with 11 additions and 11 deletions.
diff --git a/config/src/config/quorum_store_config.rs b/config/src/config/quorum_store_config.rs
@@ -29,14 +29,14 @@ impl Default for QuorumStoreBackPressureConfig {
         QuorumStoreBackPressureConfig {
             // QS will be backpressured if the remaining total txns is more than this number
             // Roughly, target TPS * commit latency seconds
-            backlog_txn_limit_count: 12_000,
+            backlog_txn_limit_count: 36_000,
             // QS will create batches at the max rate until this number is reached
-            backlog_per_validator_batch_limit_count: 4,
+            backlog_per_validator_batch_limit_count: 20,
             decrease_duration_ms: 1000,
             increase_duration_ms: 1000,
             decrease_fraction: 0.5,
             dynamic_min_txn_per_s: 160,
-            dynamic_max_txn_per_s: 4000,
+            dynamic_max_txn_per_s: 12000,
         }
     }
 }

diff --git a/testsuite/forge-cli/src/main.rs b/testsuite/forge-cli/src/main.rs
@@ -1112,10 +1112,10 @@ fn realistic_env_workload_sweep_test() -> ForgeConfig {
         ]),
         // Investigate/improve to make latency more predictable on different workloads
         criteria: [
-            (7700, 100, 0.3, 0.3, 0.5, 0.5),
-            (7000, 100, 0.3, 0.3, 0.5, 0.5),
-            (2000, 300, 0.3, 0.8, 0.6, 0.7),
-            (3200, 500, 0.3, 0.4, 0.7, 1.0),
+            (7700, 100, 0.3, 0.5, 0.5, 0.5),
+            (7000, 100, 0.3, 0.5, 0.5, 0.5),
+            (2000, 300, 0.3, 1.0, 0.6, 1.0),
+            (3200, 500, 0.3, 1.5, 0.7, 0.7),
             // (150, 0.5, 1.0, 1.5, 0.65),
         ]
         .into_iter()
@@ -1950,9 +1950,9 @@ fn realistic_env_max_load_test(
         .add_system_metrics_threshold(SystemMetricsThreshold::new(
             // Check that we don't use more than 18 CPU cores for 10% of the time.
             MetricsThreshold::new(18.0, 10),
-            // Memory starts around 3GB, and grows around 1.2GB/hr in this test.
+            // Memory starts around 3.5GB, and grows around 1.4GB/hr in this test.
             // Check that we don't use more than final expected memory for more than 10% of the time.
-            MetricsThreshold::new_gb(3.3 + 1.4 * (duration_secs as f64 / 3600.0), 10),
+            MetricsThreshold::new_gb(3.5 + 1.4 * (duration_secs as f64 / 3600.0), 10),
         ))
         .add_no_restarts()
         .add_wait_for_catchup_s(
@@ -1970,8 +1970,8 @@ fn realistic_env_max_load_test(
             LatencyBreakdownThreshold::new_with_breach_pct(
                 vec![
                     (LatencyBreakdownSlice::QsBatchToPos, 0.35),
-                    // only reaches close to threshold during epoch change
-                    (LatencyBreakdownSlice::QsPosToProposal, 0.6),
+                    // quorum store backpressure is relaxed, so queueing happens here
+                    (LatencyBreakdownSlice::QsPosToProposal, 2.5),
                     // can be adjusted down if less backpressure
                     (LatencyBreakdownSlice::ConsensusProposalToOrdered, 0.85),
                     // can be adjusted down if less backpressure