Skip to content

Commit

Permalink
fix(db): Fix write stalls in RocksDB (again) (#265)
Browse files Browse the repository at this point in the history
## What ❔

RocksDB write stalls are still happening, this time for a different
reason. Previously, they were caused by too many immutable memtables;
this time, they are caused by too many level-0 SST files. This PR:

- Tunes RocksDB options some more (the main tuning point is [optimizing
level-style
compaction](https://docs.rs/rocksdb/latest/rocksdb/struct.Options.html#method.optimize_level_style_compaction)).
- Increases the number of retries on stall and introduces exponential
backoff.
- Introduces a dozen RocksDB metrics that should help monitor
RocksDB health.

## Why ❔

Having write stalls leads to panics and is obviously bad.

## Checklist

- [x] PR title corresponds to the body of PR (we generate changelog
entries from PRs).
- [x] Tests for the changes have been added / updated.
- [x] Documentation comments have been added / updated.
- [x] Code has been formatted via `zk fmt` and `zk lint`.

---------

Co-authored-by: AnastasiiaVashchuk <[email protected]>
  • Loading branch information
slowli and AnastasiiaVashchuk authored Oct 23, 2023
1 parent 8928a41 commit 7b23ab0
Show file tree
Hide file tree
Showing 12 changed files with 258 additions and 53 deletions.
14 changes: 14 additions & 0 deletions core/bin/external_node/src/config/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,11 @@ pub struct OptionalENConfig {
#[serde(default = "OptionalENConfig::default_merkle_tree_block_cache_size_mb")]
merkle_tree_block_cache_size_mb: usize,

/// Capacity of memtables (recent, non-persisted changes to RocksDB), in megabytes. Setting this to a reasonably
/// large value (order of 512 MiB) is helpful for large DBs that experience write stalls.
#[serde(default = "OptionalENConfig::default_merkle_tree_memtable_capacity_mb")]
merkle_tree_memtable_capacity_mb: usize,

// Other config settings
/// Port on which the Prometheus exporter server is listening.
pub prometheus_port: Option<u16>,
Expand Down Expand Up @@ -274,6 +279,10 @@ impl OptionalENConfig {
128
}

/// Default memtable capacity for the Merkle tree, in megabytes.
///
/// Referenced by name from the `#[serde(default = ...)]` attribute on the
/// `merkle_tree_memtable_capacity_mb` field.
const fn default_merkle_tree_memtable_capacity_mb() -> usize {
    const DEFAULT_CAPACITY_MB: usize = 256;
    DEFAULT_CAPACITY_MB
}

/// Fallback value for the fee history limit.
// NOTE(review): presumably wired up as a serde default like the sibling `default_*`
// functions; the field attribute is not visible here, so confirm.
const fn default_fee_history_limit() -> u64 {
    const DEFAULT_LIMIT: u64 = 1_024;
    DEFAULT_LIMIT
}
Expand Down Expand Up @@ -318,6 +327,11 @@ impl OptionalENConfig {
self.merkle_tree_block_cache_size_mb * BYTES_IN_MEGABYTE
}

/// Converts the configured Merkle tree memtable capacity from megabytes to bytes.
pub fn merkle_tree_memtable_capacity(&self) -> usize {
    let capacity_mb = self.merkle_tree_memtable_capacity_mb;
    capacity_mb * BYTES_IN_MEGABYTE
}

pub fn api_namespaces(&self) -> Vec<Namespace> {
self.api_namespaces
.clone()
Expand Down
1 change: 1 addition & 0 deletions core/bin/external_node/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@ async fn init_tasks(
max_l1_batches_per_iter: config.optional.max_l1_batches_per_tree_iter,
multi_get_chunk_size: config.optional.merkle_tree_multi_get_chunk_size,
block_cache_capacity: config.optional.merkle_tree_block_cache_size(),
memtable_capacity: config.optional.merkle_tree_memtable_capacity(),
})
.await;
healthchecks.push(Box::new(metadata_calculator.tree_health_check()));
Expand Down
14 changes: 14 additions & 0 deletions core/lib/config/src/configs/database.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,10 @@ pub struct MerkleTreeConfig {
/// The default value is 128 MB.
#[serde(default = "MerkleTreeConfig::default_block_cache_size_mb")]
pub block_cache_size_mb: usize,
/// Capacity of memtables (recent, non-persisted changes to RocksDB), in megabytes. Setting this to a reasonably
/// large value (order of 512 MiB) is helpful for large DBs that experience write stalls.
/// The default value is 256 MB.
#[serde(default = "MerkleTreeConfig::default_memtable_capacity_mb")]
pub memtable_capacity_mb: usize,
/// Maximum number of L1 batches to be processed by the Merkle tree at a time.
#[serde(default = "MerkleTreeConfig::default_max_l1_batches_per_iter")]
pub max_l1_batches_per_iter: usize,
Expand All @@ -51,6 +55,7 @@ impl Default for MerkleTreeConfig {
mode: MerkleTreeMode::default(),
multi_get_chunk_size: Self::default_multi_get_chunk_size(),
block_cache_size_mb: Self::default_block_cache_size_mb(),
memtable_capacity_mb: Self::default_memtable_capacity_mb(),
max_l1_batches_per_iter: Self::default_max_l1_batches_per_iter(),
}
}
Expand All @@ -73,6 +78,10 @@ impl MerkleTreeConfig {
128
}

/// Default memtable capacity, in megabytes.
///
/// Referenced by name from the `#[serde(default = ...)]` attribute on the
/// `memtable_capacity_mb` field and used by the `Default` implementation.
const fn default_memtable_capacity_mb() -> usize {
    const DEFAULT_CAPACITY_MB: usize = 256;
    DEFAULT_CAPACITY_MB
}

/// Default for the maximum number of L1 batches processed by the Merkle tree at a time.
///
/// Referenced by name from the `#[serde(default = ...)]` attribute on the
/// `max_l1_batches_per_iter` field and used by the `Default` implementation.
const fn default_max_l1_batches_per_iter() -> usize {
    const DEFAULT_BATCH_COUNT: usize = 20;
    DEFAULT_BATCH_COUNT
}
Expand All @@ -81,6 +90,11 @@ impl MerkleTreeConfig {
pub fn block_cache_size(&self) -> usize {
    // The setting is stored in megabytes; scale it to bytes for the caller.
    let cache_size_mb = self.block_cache_size_mb;
    cache_size_mb * super::BYTES_IN_MEGABYTE
}

/// Converts the configured memtable capacity from megabytes to bytes.
pub fn memtable_capacity(&self) -> usize {
    let capacity_mb = self.memtable_capacity_mb;
    capacity_mb * super::BYTES_IN_MEGABYTE
}
}

/// Database configuration.
Expand Down
17 changes: 10 additions & 7 deletions core/lib/merkle_tree/examples/loadtest/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ use zksync_crypto::hasher::blake2::Blake2Hasher;
use zksync_merkle_tree::{
Database, HashTree, MerkleTree, MerkleTreePruner, PatchSet, RocksDBWrapper, TreeInstruction,
};
use zksync_storage::RocksDB;
use zksync_storage::{RocksDB, RocksDBOptions};
use zksync_types::{AccountTreeId, Address, StorageKey, H256, U256};

mod batch;
Expand Down Expand Up @@ -90,12 +90,15 @@ impl Cli {
"Created temp dir for RocksDB: {}",
dir.path().to_string_lossy()
);
rocksdb = if let Some(block_cache_capacity) = self.block_cache {
let db = RocksDB::with_cache(dir.path(), Some(block_cache_capacity));
RocksDBWrapper::from(db)
} else {
RocksDBWrapper::new(dir.path())
};
let db = RocksDB::with_options(
dir.path(),
RocksDBOptions {
block_cache_capacity: self.block_cache,
..RocksDBOptions::default()
},
);
rocksdb = RocksDBWrapper::from(db);

if let Some(chunk_size) = self.chunk_size {
rocksdb.set_multi_get_chunk_size(chunk_size);
}
Expand Down
16 changes: 9 additions & 7 deletions core/lib/merkle_tree/examples/recovery.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ use zksync_merkle_tree::{
recovery::{MerkleTreeRecovery, RecoveryEntry},
HashTree, Key, PatchSet, PruneDatabase, RocksDBWrapper, ValueHash,
};
use zksync_storage::RocksDB;
use zksync_storage::{RocksDB, RocksDBOptions};

/// CLI for load-testing Merkle tree recovery.
#[derive(Debug, Parser)]
Expand Down Expand Up @@ -60,12 +60,14 @@ impl Cli {
"Created temp dir for RocksDB: {}",
dir.path().to_string_lossy()
);
rocksdb = if let Some(block_cache_capacity) = self.block_cache {
let db = RocksDB::with_cache(dir.path(), Some(block_cache_capacity));
RocksDBWrapper::from(db)
} else {
RocksDBWrapper::new(dir.path())
};
let db = RocksDB::with_options(
dir.path(),
RocksDBOptions {
block_cache_capacity: self.block_cache,
..RocksDBOptions::default()
},
);
rocksdb = RocksDBWrapper::from(db);
_temp_dir = Some(dir);
&mut rocksdb
};
Expand Down
4 changes: 4 additions & 0 deletions core/lib/merkle_tree/src/storage/rocksdb.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,10 @@ impl NamedColumnFamily for MerkleTreeColumnFamily {
Self::StaleKeys => "stale_keys",
}
}

// Only the `Tree` column family reports that it requires tuning; every other
// variant returns `false`.
// NOTE(review): "tuning" presumably refers to the RocksDB option tuning introduced
// for write stalls; confirm against the `NamedColumnFamily` trait docs.
fn requires_tuning(&self) -> bool {
matches!(self, Self::Tree)
}
}

/// Main [`Database`] implementation wrapping a [`RocksDB`] reference.
Expand Down
Loading

0 comments on commit 7b23ab0

Please sign in to comment.