From ab3c84cb0a51f1ff9cdb5798463865bbe9963eef Mon Sep 17 00:00:00 2001 From: Guoteng Rao <3603304+grao1991@users.noreply.github.com> Date: Mon, 26 Sep 2022 17:33:32 -0700 Subject: [PATCH] [Forge] Add working_dir param to support running node on checkpoint dir, so that the existing data on disk is preserved. --- aptos-node/src/lib.rs | 52 +++++++++++++++++-- config/src/config/mod.rs | 9 ++++ config/src/config/storage_config.rs | 2 + consensus/src/consensusdb/mod.rs | 16 ++++++ consensus/src/lib.rs | 2 + docker/compose/aptos-node/validator.yaml | 2 +- .../state-sync-driver/src/metadata_storage.rs | 14 +++++ storage/aptosdb/src/lib.rs | 20 ++++--- .../files/configs/validator-base.yaml | 2 +- testsuite/forge/src/backend/local/node.rs | 2 +- 10 files changed, 107 insertions(+), 14 deletions(-) diff --git a/aptos-node/src/lib.rs b/aptos-node/src/lib.rs index fbafd7de65e52..f8111c81f6116 100644 --- a/aptos-node/src/lib.rs +++ b/aptos-node/src/lib.rs @@ -11,7 +11,9 @@ use aptos_build_info::build_information; use aptos_config::{ config::{ AptosDataClientConfig, BaseConfig, DataStreamingServiceConfig, NetworkConfig, NodeConfig, - PersistableConfig, StorageServiceConfig, + PersistableConfig, RocksdbConfigs, StorageServiceConfig, + DEFAULT_MAX_NUM_NODES_PER_LRU_CACHE_SHARD, NO_OP_STORAGE_PRUNER_CONFIG, + TARGET_SNAPSHOT_SIZE, }, network_id::NetworkId, utils::get_genesis_txn, @@ -26,6 +28,7 @@ use aptos_types::{ account_config::CORE_CODE_ADDRESS, account_view::AccountView, chain_id::ChainId, on_chain_config::ON_CHAIN_CONFIG_REGISTRY, waypoint::Waypoint, }; + use aptos_vm::AptosVM; use aptosdb::AptosDB; use backup_service::start_backup_service; @@ -53,8 +56,9 @@ use state_sync_driver::{ use std::{ boxed::Box, collections::{HashMap, HashSet}, + fs, io::Write, - path::PathBuf, + path::{Path, PathBuf}, sync::{ atomic::{AtomicBool, AtomicUsize, Ordering}, Arc, @@ -570,8 +574,43 @@ fn bootstrap_indexer( Ok(None) } +fn create_checkpoint_and_change_working_dir( + node_config: &mut 
NodeConfig, + working_dir: impl AsRef<Path>, +) { + let source_dir = node_config.storage.dir(); + node_config.set_data_dir(working_dir.as_ref().to_path_buf()); + let checkpoint_dir = node_config.storage.dir(); + + assert!(source_dir != checkpoint_dir); + + // Create rocksdb checkpoint. + fs::create_dir_all(&checkpoint_dir).unwrap(); + + AptosDB::open( + &source_dir, + false, /* readonly */ + NO_OP_STORAGE_PRUNER_CONFIG, /* pruner */ + RocksdbConfigs::default(), + false, + TARGET_SNAPSHOT_SIZE, + DEFAULT_MAX_NUM_NODES_PER_LRU_CACHE_SHARD, + ) + .expect("AptosDB open failure.") + .create_checkpoint(&checkpoint_dir) + .expect("AptosDB checkpoint creation failed."); + + consensus::create_checkpoint(&source_dir, &checkpoint_dir) + .expect("ConsensusDB checkpoint creation failed."); + let state_sync_db = + state_sync_driver::metadata_storage::PersistentMetadataStorage::new(&source_dir); + state_sync_db + .create_checkpoint(&checkpoint_dir) + .expect("StateSyncDB checkpoint creation failed."); +} + pub fn setup_environment( - node_config: NodeConfig, + mut node_config: NodeConfig, remote_log_rx: Option>, logger_filter_update_job: Option, ) -> anyhow::Result { @@ -581,6 +620,13 @@ pub fn setup_environment( inspection_service::inspection_service::start_inspection_service(node_config_clone) }); + // If working_dir is provided, we will make RocksDB checkpoints of consensus_db, + // state_sync_db, ledger_db and state_merkle_db in the checkpoint_path, and run the node + // on the new path, so that the existing data won't change. For now this is a test-only feature. 
+ if let Some(working_dir) = node_config.base.working_dir.clone() { + create_checkpoint_and_change_working_dir(&mut node_config, working_dir); + } + // Open the database let mut instant = Instant::now(); let (aptos_db, db_rw) = DbReaderWriter::wrap( diff --git a/config/src/config/mod.rs b/config/src/config/mod.rs index 060d4dd691921..060ff7f7eb54c 100644 --- a/config/src/config/mod.rs +++ b/config/src/config/mod.rs @@ -101,6 +101,7 @@ pub struct NodeConfig { #[serde(default, deny_unknown_fields)] pub struct BaseConfig { pub data_dir: PathBuf, + pub working_dir: Option<PathBuf>, pub role: RoleType, pub waypoint: WaypointConfig, } @@ -109,6 +110,7 @@ impl Default for BaseConfig { fn default() -> BaseConfig { BaseConfig { data_dir: PathBuf::from("/opt/aptos/data"), + working_dir: None, role: RoleType::Validator, waypoint: WaypointConfig::None, } @@ -266,6 +268,13 @@ impl NodeConfig { &self.base.data_dir } + pub fn working_dir(&self) -> &Path { + match &self.base.working_dir { + Some(working_dir) => working_dir, + None => &self.base.data_dir, + } + } + pub fn set_data_dir(&mut self, data_dir: PathBuf) { self.base.data_dir = data_dir.clone(); self.consensus.set_data_dir(data_dir.clone()); diff --git a/config/src/config/storage_config.rs b/config/src/config/storage_config.rs index 5f70780426084..316704d2d7979 100644 --- a/config/src/config/storage_config.rs +++ b/config/src/config/storage_config.rs @@ -87,6 +87,7 @@ pub struct StorageConfig { /// since genesis. To recover operation after data loss, or to bootstrap a node in fast sync /// mode, the indexer db needs to be copied in from another node. 
pub enable_indexer: bool, + pub working_dir: Option<PathBuf>, } pub const NO_OP_STORAGE_PRUNER_CONFIG: PrunerConfig = PrunerConfig { @@ -236,6 +237,7 @@ impl Default for StorageConfig { enable_indexer: false, target_snapshot_size: TARGET_SNAPSHOT_SIZE, max_num_nodes_per_lru_cache_shard: DEFAULT_MAX_NUM_NODES_PER_LRU_CACHE_SHARD, + working_dir: None, } } } diff --git a/consensus/src/consensusdb/mod.rs b/consensus/src/consensusdb/mod.rs index 6c80df4b4e86d..68376da86aabd 100644 --- a/consensus/src/consensusdb/mod.rs +++ b/consensus/src/consensusdb/mod.rs @@ -24,6 +24,22 @@ use std::{collections::HashMap, iter::Iterator, path::Path, time::Instant}; /// The name of the consensus db file pub const CONSENSUS_DB_NAME: &str = "consensus_db"; +/// Creates new physical DB checkpoint in directory specified by `checkpoint_path`. +pub fn create_checkpoint<P: AsRef<Path> + Clone>(db_path: P, checkpoint_path: P) -> Result<()> { + let start = Instant::now(); + let consensus_db_checkpoint_path = checkpoint_path.as_ref().join(CONSENSUS_DB_NAME); + std::fs::remove_dir_all(&consensus_db_checkpoint_path).unwrap_or(()); + ConsensusDB::new(db_path) + .db + .create_checkpoint(&consensus_db_checkpoint_path)?; + info!( + path = consensus_db_checkpoint_path, + time_ms = %start.elapsed().as_millis(), + "Made ConsensusDB checkpoint." 
+ ); + Ok(()) +} + pub struct ConsensusDB { db: DB, } diff --git a/consensus/src/lib.rs b/consensus/src/lib.rs index ab80cfb69970c..d13e9b8c8e014 100644 --- a/consensus/src/lib.rs +++ b/consensus/src/lib.rs @@ -48,6 +48,8 @@ pub mod network_interface; /// Required by the smoke tests pub use consensusdb::CONSENSUS_DB_NAME; +pub use consensusdb::create_checkpoint; + #[cfg(feature = "fuzzing")] pub use round_manager::round_manager_fuzzing; diff --git a/docker/compose/aptos-node/validator.yaml b/docker/compose/aptos-node/validator.yaml index b5e91f06b9b6d..7df4ba8deaacb 100644 --- a/docker/compose/aptos-node/validator.yaml +++ b/docker/compose/aptos-node/validator.yaml @@ -10,7 +10,7 @@ consensus: type: "local" backend: type: "on_disk_storage" - path: /opt/aptos/data/secure-data.json + path: secure-data.json namespace: ~ initial_safety_rules_config: from_file: diff --git a/state-sync/state-sync-v2/state-sync-driver/src/metadata_storage.rs b/state-sync/state-sync-v2/state-sync-driver/src/metadata_storage.rs index d04883d8424c0..ed9896ee176ef 100644 --- a/state-sync/state-sync-v2/state-sync-driver/src/metadata_storage.rs +++ b/state-sync/state-sync-v2/state-sync-driver/src/metadata_storage.rs @@ -161,6 +161,20 @@ impl PersistentMetadataStorage { )) }) } + + /// Creates new physical DB checkpoint in directory specified by `path`. + pub fn create_checkpoint<P: AsRef<Path>>(&self, path: P) -> Result<()> { + let start = Instant::now(); + let state_sync_db_path = path.as_ref().join(STATE_SYNC_DB_NAME); + std::fs::remove_dir_all(&state_sync_db_path).unwrap_or(()); + self.database.create_checkpoint(&state_sync_db_path)?; + info!( + path = state_sync_db_path, + time_ms = %start.elapsed().as_millis(), + "Made StateSyncDB checkpoint." 
+ ); + Ok(()) + } } impl MetadataStorageInterface for PersistentMetadataStorage { diff --git a/storage/aptosdb/src/lib.rs b/storage/aptosdb/src/lib.rs index 1985c4d444861..d5891a583e68b 100644 --- a/storage/aptosdb/src/lib.rs +++ b/storage/aptosdb/src/lib.rs @@ -64,8 +64,6 @@ use aptos_crypto::hash::HashValue; use aptos_infallible::Mutex; use aptos_logger::prelude::*; use aptos_rocksdb_options::gen_rocksdb_options; -use aptos_types::proof::TransactionAccumulatorSummary; -use aptos_types::state_store::state_storage_usage::StateStorageUsage; use aptos_types::{ account_address::AccountAddress, account_config::{new_block_event_key, NewBlockEvent}, @@ -76,12 +74,13 @@ use aptos_types::{ ledger_info::LedgerInfoWithSignatures, proof::{ accumulator::InMemoryAccumulator, AccumulatorConsistencyProof, SparseMerkleProofExt, - TransactionInfoListWithProof, + TransactionAccumulatorSummary, TransactionInfoListWithProof, }, state_proof::StateProof, state_store::{ state_key::StateKey, state_key_prefix::StateKeyPrefix, + state_storage_usage::StateStorageUsage, state_value::{StateValue, StateValueChunkWithProof}, table::{TableHandle, TableInfo}, }, @@ -107,12 +106,15 @@ use std::{ time::{Duration, Instant}, }; -use crate::pruner::{ - ledger_pruner_manager::LedgerPrunerManager, ledger_store::ledger_store_pruner::LedgerPruner, - state_pruner_manager::StatePrunerManager, state_store::StateMerklePruner, +use crate::{ + pruner::{ + ledger_pruner_manager::LedgerPrunerManager, + ledger_store::ledger_store_pruner::LedgerPruner, state_pruner_manager::StatePrunerManager, + state_store::StateMerklePruner, + }, + stale_node_index::StaleNodeIndexSchema, + stale_node_index_cross_epoch::StaleNodeIndexCrossEpochSchema, }; -use crate::stale_node_index::StaleNodeIndexSchema; -use crate::stale_node_index_cross_epoch::StaleNodeIndexCrossEpochSchema; use storage_interface::{ state_delta::StateDelta, state_view::DbStateView, DbReader, DbWriter, ExecutedTrees, Order, StateSnapshotReceiver, @@ -658,6 +660,8 
@@ impl AptosDB { let start = Instant::now(); let ledger_db_path = path.as_ref().join(LEDGER_DB_NAME); let state_merkle_db_path = path.as_ref().join(STATE_MERKLE_DB_NAME); + std::fs::remove_dir_all(&ledger_db_path).unwrap_or(()); + std::fs::remove_dir_all(&state_merkle_db_path).unwrap_or(()); self.ledger_db.create_checkpoint(&ledger_db_path)?; self.state_merkle_db .create_checkpoint(&state_merkle_db_path)?; diff --git a/terraform/helm/aptos-node/files/configs/validator-base.yaml b/terraform/helm/aptos-node/files/configs/validator-base.yaml index 8a3fc7c8fd04a..c7a3e146a34bf 100644 --- a/terraform/helm/aptos-node/files/configs/validator-base.yaml +++ b/terraform/helm/aptos-node/files/configs/validator-base.yaml @@ -13,7 +13,7 @@ consensus: type: "local" backend: type: "on_disk_storage" - path: /opt/aptos/data/secure-data.json + path: secure-data.json namespace: ~ initial_safety_rules_config: from_file: diff --git a/testsuite/forge/src/backend/local/node.rs b/testsuite/forge/src/backend/local/node.rs index 1f261b1d8ea96..090f369a7a5e2 100644 --- a/testsuite/forge/src/backend/local/node.rs +++ b/testsuite/forge/src/backend/local/node.rs @@ -277,7 +277,7 @@ impl Node for LocalNode { let node_config = self.config(); let ledger_db_path = node_config.storage.dir().join(LEDGER_DB_NAME); let state_db_path = node_config.storage.dir().join(STATE_MERKLE_DB_NAME); - let secure_storage_path = node_config.base.data_dir.join("secure_storage.json"); + let secure_storage_path = node_config.working_dir().join("secure_storage.json"); let state_sync_db_path = node_config.storage.dir().join(STATE_SYNC_DB_NAME); debug!(