From ca1c3eaf46b1c88380c2c94e4edabcc066443bcf Mon Sep 17 00:00:00 2001
From: Josh Lind
Date: Wed, 18 Oct 2023 18:59:22 -0400
Subject: [PATCH] [State Sync] Tweak configs for performance.

---
 config/src/config/network_config.rs    |  2 +-
 config/src/config/state_sync_config.rs | 34 +++++++++++++-------------
 network/framework/src/constants.rs     |  2 +-
 storage/storage-interface/src/lib.rs   |  2 +-
 testsuite/forge-cli/src/main.rs        | 12 +++++++
 5 files changed, 32 insertions(+), 20 deletions(-)

diff --git a/config/src/config/network_config.rs b/config/src/config/network_config.rs
index f1e5838ecccb93..8c7a64387a6b83 100644
--- a/config/src/config/network_config.rs
+++ b/config/src/config/network_config.rs
@@ -48,7 +48,7 @@ pub const MAX_MESSAGE_METADATA_SIZE: usize = 128 * 1024; /* 128 KiB: a buffer fo
 pub const MESSAGE_PADDING_SIZE: usize = 2 * 1024 * 1024; /* 2 MiB: a safety buffer to allow messages to get larger during serialization */
 pub const MAX_APPLICATION_MESSAGE_SIZE: usize =
     (MAX_MESSAGE_SIZE - MAX_MESSAGE_METADATA_SIZE) - MESSAGE_PADDING_SIZE; /* The message size that applications should check against */
-pub const MAX_FRAME_SIZE: usize = 4 * 1024 * 1024; /* 4 MiB large messages will be chunked into multiple frames and streamed */
+pub const MAX_FRAME_SIZE: usize = 60 * 1024 * 1024; /* 60 MiB large messages will be chunked into multiple frames and streamed */
 pub const MAX_MESSAGE_SIZE: usize = 64 * 1024 * 1024; /* 64 MiB */
 pub const CONNECTION_BACKOFF_BASE: u64 = 2;
 pub const IP_BYTE_BUCKET_RATE: usize = 102400 /* 100 KiB */;
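
Note (illustration, not part of the patch): the 4 MiB -> 60 MiB frame size bump is safe only because it stays under the 64 MiB wire message cap, and the derived application budget (~61.9 MiB) still leaves room for the 60 MiB state sync message cap introduced in the next file. A minimal standalone Rust sketch of that arithmetic; the constants are copied from the patched files, `STATE_SYNC_MAX_MESSAGE_SIZE` is a renamed stand-in for the private `MAX_MESSAGE_SIZE` in state_sync_config.rs, and `main` is just a harness:

    // Constants from config/src/config/network_config.rs (post-patch).
    const MAX_MESSAGE_METADATA_SIZE: usize = 128 * 1024; // 128 KiB
    const MESSAGE_PADDING_SIZE: usize = 2 * 1024 * 1024; // 2 MiB
    const MAX_MESSAGE_SIZE: usize = 64 * 1024 * 1024; // 64 MiB
    const MAX_APPLICATION_MESSAGE_SIZE: usize =
        (MAX_MESSAGE_SIZE - MAX_MESSAGE_METADATA_SIZE) - MESSAGE_PADDING_SIZE;
    const MAX_FRAME_SIZE: usize = 60 * 1024 * 1024; // 60 MiB (was 4 MiB)
    // Stand-in for the private MAX_MESSAGE_SIZE in state_sync_config.rs.
    const STATE_SYNC_MAX_MESSAGE_SIZE: usize = 60 * 1024 * 1024;

    fn main() {
        // A frame must still fit into a single wire message.
        assert!(MAX_FRAME_SIZE <= MAX_MESSAGE_SIZE);
        // A maximal state sync message must fit into the application budget:
        // 64 MiB - 128 KiB - 2 MiB ~= 61.9 MiB >= 60 MiB.
        assert!(STATE_SYNC_MAX_MESSAGE_SIZE <= MAX_APPLICATION_MESSAGE_SIZE);
    }
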
diff --git a/config/src/config/state_sync_config.rs b/config/src/config/state_sync_config.rs
index 2fe87a7801a183..ee3a4f4fc63a92 100644
--- a/config/src/config/state_sync_config.rs
+++ b/config/src/config/state_sync_config.rs
@@ -11,13 +11,13 @@ use serde::{Deserialize, Serialize};
 use serde_yaml::Value;
 
 // The maximum message size per state sync message
-const MAX_MESSAGE_SIZE: usize = 4 * 1024 * 1024; /* 4 MiB */
+const MAX_MESSAGE_SIZE: usize = 60 * 1024 * 1024; /* 60 MiB */
 
 // The maximum chunk sizes for data client requests and response
 const MAX_EPOCH_CHUNK_SIZE: u64 = 200;
 const MAX_STATE_CHUNK_SIZE: u64 = 4000;
-const MAX_TRANSACTION_CHUNK_SIZE: u64 = 2000;
-const MAX_TRANSACTION_OUTPUT_CHUNK_SIZE: u64 = 1000;
+const MAX_TRANSACTION_CHUNK_SIZE: u64 = 20_000;
+const MAX_TRANSACTION_OUTPUT_CHUNK_SIZE: u64 = 20_000;
 
 // The maximum number of concurrent requests to send
 const MAX_CONCURRENT_REQUESTS: u64 = 6;
@@ -127,16 +127,16 @@ pub struct StateSyncDriverConfig {
 impl Default for StateSyncDriverConfig {
     fn default() -> Self {
         Self {
-            bootstrapping_mode: BootstrappingMode::ExecuteOrApplyFromGenesis,
+            bootstrapping_mode: BootstrappingMode::ApplyTransactionOutputsFromGenesis,
             commit_notification_timeout_ms: 5000,
-            continuous_syncing_mode: ContinuousSyncingMode::ExecuteTransactionsOrApplyOutputs,
+            continuous_syncing_mode: ContinuousSyncingMode::ApplyTransactionOutputs,
             enable_auto_bootstrapping: false,
             fallback_to_output_syncing_secs: 180, // 3 minutes
             progress_check_interval_ms: 50,
             max_connection_deadline_secs: 10,
-            max_consecutive_stream_notifications: 10,
+            max_consecutive_stream_notifications: 100,
             max_num_stream_timeouts: 12,
-            max_pending_data_chunks: 100,
+            max_pending_data_chunks: 500,
             max_stream_wait_time_ms: 5000,
             mempool_commit_ack_timeout_ms: 5000, // 5 seconds
             num_versions_to_skip_snapshot_sync: 100_000_000, // At 5k TPS, this allows a node to fail for about 6 hours.
@@ -188,10 +188,10 @@ impl Default for StorageServiceConfig {
             max_lru_cache_size: 500, // At ~0.6MiB per chunk, this should take no more than 0.5GiB
             max_network_channel_size: 4000,
             max_network_chunk_bytes: MAX_MESSAGE_SIZE as u64,
-            max_num_active_subscriptions: 30,
-            max_optimistic_fetch_period_ms: 5000, // 5 seconds
+            max_num_active_subscriptions: 40,
+            max_optimistic_fetch_period_ms: 10_000, // 10 seconds
             max_state_chunk_size: MAX_STATE_CHUNK_SIZE,
-            max_subscription_period_ms: 30_000, // 30 seconds
+            max_subscription_period_ms: 60_000, // 60 seconds
             max_transaction_chunk_size: MAX_TRANSACTION_CHUNK_SIZE,
             max_transaction_output_chunk_size: MAX_TRANSACTION_OUTPUT_CHUNK_SIZE,
             min_time_to_ignore_peers_secs: 300, // 5 minutes
@@ -240,7 +240,7 @@ pub struct DataStreamingServiceConfig {
 impl Default for DataStreamingServiceConfig {
     fn default() -> Self {
         Self {
-            enable_subscription_streaming: true,
+            enable_subscription_streaming: false,
             global_summary_refresh_interval_ms: 50,
             max_concurrent_requests: MAX_CONCURRENT_REQUESTS,
             max_concurrent_state_requests: MAX_CONCURRENT_STATE_REQUESTS,
@@ -333,17 +333,17 @@ impl Default for AptosDataClientConfig {
             latency_monitor_loop_interval_ms: 100,
             max_epoch_chunk_size: MAX_EPOCH_CHUNK_SIZE,
             max_num_output_reductions: 0,
-            max_optimistic_fetch_lag_secs: 30, // 30 seconds
-            max_response_timeout_ms: 60_000, // 60 seconds
+            max_optimistic_fetch_lag_secs: 120, // 120 seconds
+            max_response_timeout_ms: 240_000, // 240 seconds
             max_state_chunk_size: MAX_STATE_CHUNK_SIZE,
-            max_subscription_lag_secs: 30, // 30 seconds
+            max_subscription_lag_secs: 120, // 120 seconds
             max_transaction_chunk_size: MAX_TRANSACTION_CHUNK_SIZE,
             max_transaction_output_chunk_size: MAX_TRANSACTION_OUTPUT_CHUNK_SIZE,
             min_peer_ratio_for_latency_filtering: 5, // Only filter if we have at least 5 potential peers per request
             min_peers_for_latency_filtering: 10, // Only filter if we have at least 10 total peers
-            optimistic_fetch_timeout_ms: 5000, // 5 seconds
-            response_timeout_ms: 10_000, // 10 seconds
-            subscription_response_timeout_ms: 20_000, // 20 seconds (must be longer than a regular timeout because of pre-fetching)
+            optimistic_fetch_timeout_ms: 10_000, // 10 seconds
+            response_timeout_ms: 120_000, // 120 seconds
+            subscription_response_timeout_ms: 60_000, // 60 seconds (must be longer than a regular timeout because of pre-fetching)
             use_compression: true,
         }
     }
diff --git a/network/framework/src/constants.rs b/network/framework/src/constants.rs
index 6cdcc779f4194f..2d5de7e06a2f31 100644
--- a/network/framework/src/constants.rs
+++ b/network/framework/src/constants.rs
@@ -9,7 +9,7 @@
 // with your use-case. If you do change a value, please add a comment linking to the PR which
 // advocated the change.
 /// The timeout for any inbound RPC call before it's cut off
-pub const INBOUND_RPC_TIMEOUT_MS: u64 = 10_000;
+pub const INBOUND_RPC_TIMEOUT_MS: u64 = 60_000;
 /// Limit on concurrent Outbound RPC requests before backpressure is applied
 pub const MAX_CONCURRENT_OUTBOUND_RPCS: u32 = 100;
 /// Limit on concurrent Inbound RPC requests before backpressure is applied
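
Note (illustration, not part of the patch): every client-side deadline above was lengthened to tolerate the larger chunks, but the orderings between them still matter: optimistic fetches give up quickly so a stream can fall back to explicit requests, subscriptions get extra headroom for server-side pre-fetching, and `max_response_timeout_ms` presumably caps any escalation of the regular timeout across retries. A small Rust sketch that pins down those orderings; the struct and asserts are ours (not aptos-core APIs), with values copied from the patched defaults:

    // Hypothetical container for the tuned AptosDataClientConfig deadlines.
    struct DataClientTimeouts {
        optimistic_fetch_timeout_ms: u64,
        subscription_response_timeout_ms: u64,
        response_timeout_ms: u64,
        max_response_timeout_ms: u64,
    }

    fn main() {
        let t = DataClientTimeouts {
            optimistic_fetch_timeout_ms: 10_000,
            subscription_response_timeout_ms: 60_000,
            response_timeout_ms: 120_000,
            max_response_timeout_ms: 240_000,
        };
        // Optimistic fetches fail fast relative to regular responses.
        assert!(t.optimistic_fetch_timeout_ms < t.response_timeout_ms);
        // Subscriptions get more headroom than optimistic fetches
        // (pre-fetching makes the first response slower to arrive).
        assert!(t.optimistic_fetch_timeout_ms < t.subscription_response_timeout_ms);
        // Retried responses never exceed the hard cap.
        assert!(t.response_timeout_ms <= t.max_response_timeout_ms);
    }
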
diff --git a/storage/storage-interface/src/lib.rs b/storage/storage-interface/src/lib.rs
index a4efbb8d188a23..2eb3c43043c1a6 100644
--- a/storage/storage-interface/src/lib.rs
+++ b/storage/storage-interface/src/lib.rs
@@ -53,7 +53,7 @@ pub use executed_trees::ExecutedTrees;
 
 // This is last line of defense against large queries slipping through external facing interfaces,
 // like the API and State Sync, etc.
-pub const MAX_REQUEST_LIMIT: u64 = 10000;
+pub const MAX_REQUEST_LIMIT: u64 = 100_000;
 
 pub trait StateSnapshotReceiver<K, V>: Send {
     fn add_chunk(&mut self, chunk: Vec<(K, V)>, proof: SparseMerkleRangeProof) -> Result<()>;
diff --git a/testsuite/forge-cli/src/main.rs b/testsuite/forge-cli/src/main.rs
index 1bb081dd4fda83..417775c4dcc4fe 100644
--- a/testsuite/forge-cli/src/main.rs
+++ b/testsuite/forge-cli/src/main.rs
@@ -1856,6 +1856,18 @@ fn realistic_network_tuned_for_throughput_test() -> ForgeConfig {
             if USE_CRAZY_MACHINES {
                 config.execution.concurrency_level = 48;
             }
+        }))
+        .with_fullnode_override_node_config_fn(Arc::new(|config, _| {
+            // Mempool config optimizations
+            mempool_config_practically_non_expiring(&mut config.mempool);
+
+            // Higher concurrency level
+            if USE_CRAZY_MACHINES {
+                config.execution.concurrency_level = 48;
+            }
+
+            // Experimental storage optimizations
+            config.storage.rocksdb_configs.enable_storage_sharding = true;
         }));
 
     if ENABLE_VFNS {
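
Note (illustration, not part of the patch): the storage-interface change keeps the "last line of defense" consistent with the new chunk sizes: the cap must admit the largest chunk State Sync may now request, which is why it grew 10x alongside the 10x/20x chunk-size increases. The forge change simply mirrors the validator overrides onto fullnodes and additionally enables experimental storage sharding there. A tiny Rust sketch of the invariant, with the constants copied from the patch and the asserts ours:

    const MAX_REQUEST_LIMIT: u64 = 100_000; // storage-interface (was 10,000)
    const MAX_TRANSACTION_CHUNK_SIZE: u64 = 20_000; // state sync (was 2,000)
    const MAX_TRANSACTION_OUTPUT_CHUNK_SIZE: u64 = 20_000; // state sync (was 1,000)

    fn main() {
        // Storage's global request cap must not reject a maximal chunk.
        assert!(MAX_TRANSACTION_CHUNK_SIZE <= MAX_REQUEST_LIMIT);
        assert!(MAX_TRANSACTION_OUTPUT_CHUNK_SIZE <= MAX_REQUEST_LIMIT);
    }
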