diff --git a/Cargo.lock b/Cargo.lock index 74665ebd271..86612a16ef4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -895,6 +895,7 @@ dependencies = [ "environment", "error-chain", "eth1", + "eth2", "eth2_config", "eth2_libp2p", "eth2_ssz 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)", @@ -909,6 +910,7 @@ dependencies = [ "parking_lot", "prometheus", "reqwest", + "sensitive_url", "serde", "serde_derive", "serde_yaml", @@ -938,17 +940,6 @@ dependencies = [ "cc", ] -[[package]] -name = "colored" -version = "1.9.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4ffc801dacf156c5854b9df4f425a626539c3a6ef7893cc0c5084a23f0b6c59" -dependencies = [ - "atty", - "lazy_static", - "winapi", -] - [[package]] name = "compare_fields" version = "0.2.0" @@ -1715,6 +1706,7 @@ dependencies = [ "sensitive_url", "serde", "serde_json", + "store", "types", "zeroize", ] @@ -3066,6 +3058,7 @@ dependencies = [ "deposit_contract", "directory", "dirs", + "env_logger 0.9.0", "environment", "eth1_test_rig", "eth2_keystore", @@ -3082,8 +3075,8 @@ dependencies = [ "regex", "sensitive_url", "serde", + "serde_json", "serde_yaml", - "simple_logger", "state_processing", "tokio", "tree_hash 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)", @@ -5635,19 +5628,6 @@ dependencies = [ "rand_core 0.6.3", ] -[[package]] -name = "simple_logger" -version = "1.13.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7de33c687404ec3045d4a0d437580455257c0436f858d702f244e7d652f9f07" -dependencies = [ - "atty", - "chrono", - "colored", - "log", - "winapi", -] - [[package]] name = "simulator" version = "0.2.0" @@ -6800,6 +6780,7 @@ dependencies = [ "serde_json", "serde_yaml", "slog", + "state_processing", "superstruct", "swap_or_not_shuffle", "tempfile", diff --git a/beacon_node/beacon_chain/src/beacon_chain.rs b/beacon_node/beacon_chain/src/beacon_chain.rs index 074321885e9..7236e639664 100644 --- 
a/beacon_node/beacon_chain/src/beacon_chain.rs +++ b/beacon_node/beacon_chain/src/beacon_chain.rs @@ -14,6 +14,7 @@ use crate::errors::{BeaconChainError as Error, BlockProductionError}; use crate::eth1_chain::{Eth1Chain, Eth1ChainBackend}; use crate::events::ServerSentEventHandler; use crate::head_tracker::HeadTracker; +use crate::historical_blocks::HistoricalBlockError; use crate::migrate::BackgroundMigrator; use crate::naive_aggregation_pool::{ AggregatedAttestationMap, Error as NaiveAggregationError, NaiveAggregationPool, @@ -431,10 +432,23 @@ impl BeaconChain { /// - Skipped slots contain the root of the closest prior /// non-skipped slot (identical to the way they are stored in `state.block_roots`). /// - Iterator returns `(Hash256, Slot)`. + /// + /// Will return a `BlockOutOfRange` error if the requested start slot is before the period of + /// history for which we have blocks stored. See `get_oldest_block_slot`. pub fn forwards_iter_block_roots( &self, start_slot: Slot, ) -> Result>, Error> { + let oldest_block_slot = self.store.get_oldest_block_slot(); + if start_slot < oldest_block_slot { + return Err(Error::HistoricalBlockError( + HistoricalBlockError::BlockOutOfRange { + slot: start_slot, + oldest_block_slot, + }, + )); + } + let local_head = self.head()?; let iter = HotColdDB::forwards_block_roots_iterator( @@ -620,6 +634,12 @@ impl BeaconChain { return Ok(Some(self.genesis_state_root)); } + // Check limits w.r.t historic state bounds. + let (historic_lower_limit, historic_upper_limit) = self.store.get_historic_state_limits(); + if request_slot > historic_lower_limit && request_slot < historic_upper_limit { + return Ok(None); + } + // Try an optimized path of reading the root directly from the head state. let fast_lookup: Option = self.with_head(|head| { if head.beacon_block.slot() <= request_slot { @@ -657,7 +677,8 @@ impl BeaconChain { /// ## Notes /// /// - Use the `skips` parameter to define the behaviour when `request_slot` is a skipped slot. 
- /// - Returns `Ok(None)` for any slot higher than the current wall-clock slot. + /// - Returns `Ok(None)` for any slot higher than the current wall-clock slot, or less than + /// the oldest known block slot. pub fn block_root_at_slot( &self, request_slot: Slot, @@ -667,6 +688,10 @@ impl BeaconChain { WhenSlotSkipped::None => self.block_root_at_slot_skips_none(request_slot), WhenSlotSkipped::Prev => self.block_root_at_slot_skips_prev(request_slot), } + .or_else(|e| match e { + Error::HistoricalBlockError(_) => Ok(None), + e => Err(e), + }) } /// Returns the block root at the given slot, if any. Only returns roots in the canonical chain. diff --git a/beacon_node/beacon_chain/src/block_verification.rs b/beacon_node/beacon_chain/src/block_verification.rs index 151db27269e..5ddeafa4595 100644 --- a/beacon_node/beacon_chain/src/block_verification.rs +++ b/beacon_node/beacon_chain/src/block_verification.rs @@ -501,6 +501,9 @@ impl GossipVerifiedBlock { let block_root = get_block_root(&block); + // Disallow blocks that conflict with the anchor (weak subjectivity checkpoint), if any. + check_block_against_anchor_slot(block.message(), chain)?; + // Do not gossip a block from a finalized slot. check_block_against_finalized_slot(block.message(), chain)?; @@ -708,6 +711,9 @@ impl SignatureVerifiedBlock { .fork_name(&chain.spec) .map_err(BlockError::InconsistentFork)?; + // Check the anchor slot before loading the parent, to avoid spurious lookups. + check_block_against_anchor_slot(block.message(), chain)?; + let (mut parent, block) = load_parent(block, chain)?; // Reject any block that exceeds our limit on skipped slots. @@ -1115,6 +1121,19 @@ fn check_block_skip_slots( Ok(()) } +/// Returns `Ok(())` if the block's slot is greater than the anchor block's slot (if any). 
+fn check_block_against_anchor_slot( + block: BeaconBlockRef<'_, T::EthSpec>, + chain: &BeaconChain, +) -> Result<(), BlockError> { + if let Some(anchor_slot) = chain.store.get_anchor_slot() { + if block.slot() <= anchor_slot { + return Err(BlockError::WeakSubjectivityConflict); + } + } + Ok(()) +} + /// Returns `Ok(())` if the block is later than the finalized slot on `chain`. /// /// Returns an error if the block is earlier or equal to the finalized slot, or there was an error diff --git a/beacon_node/beacon_chain/src/builder.rs b/beacon_node/beacon_chain/src/builder.rs index 925d6f5da29..6d4b9225d7e 100644 --- a/beacon_node/beacon_chain/src/builder.rs +++ b/beacon_node/beacon_chain/src/builder.rs @@ -28,8 +28,8 @@ use std::time::Duration; use store::{Error as StoreError, HotColdDB, ItemStore}; use task_executor::ShutdownReason; use types::{ - BeaconBlock, BeaconState, ChainSpec, EthSpec, Graffiti, Hash256, PublicKeyBytes, Signature, - SignedBeaconBlock, Slot, + BeaconBlock, BeaconState, ChainSpec, Checkpoint, EthSpec, Graffiti, Hash256, PublicKeyBytes, + Signature, SignedBeaconBlock, Slot, }; /// An empty struct used to "witness" all the `BeaconChainTypes` traits. It has no user-facing @@ -282,12 +282,19 @@ where Ok(self) } - /// Starts a new chain from a genesis state. - pub fn genesis_state( + /// Store the genesis state & block in the DB. + /// + /// Do *not* initialize fork choice, or do anything that assumes starting from genesis. + /// + /// Return the `BeaconSnapshot` representing genesis as well as the mutated builder. 
+ fn set_genesis_state( mut self, mut beacon_state: BeaconState, - ) -> Result { - let store = self.store.clone().ok_or("genesis_state requires a store")?; + ) -> Result<(BeaconSnapshot, Self), String> { + let store = self + .store + .clone() + .ok_or("set_genesis_state requires a store")?; let beacon_block = genesis_block(&mut beacon_state, &self.spec)?; @@ -298,9 +305,6 @@ where let beacon_state_root = beacon_block.message().state_root(); let beacon_block_root = beacon_block.canonical_root(); - self.genesis_state_root = Some(beacon_state_root); - self.genesis_block_root = Some(beacon_block_root); - store .put_state(&beacon_state_root, &beacon_state) .map_err(|e| format!("Failed to store genesis state: {:?}", e))?; @@ -318,11 +322,26 @@ where ) })?; - let genesis = BeaconSnapshot { - beacon_block, - beacon_block_root, - beacon_state, - }; + self.genesis_state_root = Some(beacon_state_root); + self.genesis_block_root = Some(beacon_block_root); + self.genesis_time = Some(beacon_state.genesis_time()); + + Ok(( + BeaconSnapshot { + beacon_block_root, + beacon_block, + beacon_state, + }, + self, + )) + } + + /// Starts a new chain from a genesis state. + pub fn genesis_state(mut self, beacon_state: BeaconState) -> Result { + let store = self.store.clone().ok_or("genesis_state requires a store")?; + + let (genesis, updated_builder) = self.set_genesis_state(beacon_state)?; + self = updated_builder; let fc_store = BeaconForkChoiceStore::get_forkchoice_store(store, &genesis); @@ -332,10 +351,115 @@ where &genesis.beacon_block, &genesis.beacon_state, ) - .map_err(|e| format!("Unable to build initialize ForkChoice: {:?}", e))?; + .map_err(|e| format!("Unable to initialize ForkChoice: {:?}", e))?; + + self.fork_choice = Some(fork_choice); + + Ok(self.empty_op_pool()) + } + + /// Start the chain from a weak subjectivity state. 
+ pub fn weak_subjectivity_state( + mut self, + mut weak_subj_state: BeaconState, + weak_subj_block: SignedBeaconBlock, + genesis_state: BeaconState, + ) -> Result { + let store = self.store.clone().ok_or("weak_subjectivity_state requires a store")?; + + let weak_subj_slot = weak_subj_state.slot(); + let weak_subj_block_root = weak_subj_block.canonical_root(); + let weak_subj_state_root = weak_subj_block.state_root(); + + // Check that the given block lies on an epoch boundary. Due to the database only storing + // full states on epoch boundaries and at restore points it would be difficult to support + // starting from a mid-epoch state. + if weak_subj_slot % TEthSpec::slots_per_epoch() != 0 { + return Err(format!( + "Checkpoint block at slot {} is not aligned to epoch start. \ + Please supply an aligned checkpoint with block.slot % SLOTS_PER_EPOCH == 0", + weak_subj_block.slot(), + )); + } + + // Check that the block and state have consistent slots and state roots. + if weak_subj_state.slot() != weak_subj_block.slot() { + return Err(format!( + "Slot of snapshot block ({}) does not match snapshot state ({})", + weak_subj_block.slot(), + weak_subj_state.slot(), + )); + } + + let computed_state_root = weak_subj_state + .update_tree_hash_cache() + .map_err(|e| format!("Error computing checkpoint state root: {:?}", e))?; + + if weak_subj_state_root != computed_state_root { + return Err(format!( + "Snapshot state root does not match block, expected: {:?}, got: {:?}", + weak_subj_state_root, computed_state_root + )); + } + + // Check that the checkpoint state is for the same network as the genesis state. + // This check doesn't do much for security but should prevent mistakes. + if weak_subj_state.genesis_validators_root() != genesis_state.genesis_validators_root() { + return Err(format!( + "Snapshot state appears to be from the wrong network. 
Genesis validators root \ + is {:?} but should be {:?}", + weak_subj_state.genesis_validators_root(), + genesis_state.genesis_validators_root() + )); + } + + // Set the store's split point *before* storing genesis so that genesis is stored + // immediately in the freezer DB. + store.set_split(weak_subj_slot, weak_subj_state_root); + store + .store_split() + .map_err(|e| format!("Error storing DB split point: {:?}", e))?; + + let (_, updated_builder) = self.set_genesis_state(genesis_state)?; + self = updated_builder; + + store + .put_state(&weak_subj_state_root, &weak_subj_state) + .map_err(|e| format!("Failed to store weak subjectivity state: {:?}", e))?; + store + .put_block(&weak_subj_block_root, weak_subj_block.clone()) + .map_err(|e| format!("Failed to store weak subjectivity block: {:?}", e))?; + + // Store anchor info (context for weak subj sync). + store + .init_anchor_info(weak_subj_block.message()) + .map_err(|e| format!("Failed to initialize anchor info: {:?}", e))?; + + // Store pruning checkpoint to prevent attempting to prune before the anchor state. 
+ store + .store_pruning_checkpoint(Checkpoint { + root: weak_subj_block_root, + epoch: weak_subj_state.slot().epoch(TEthSpec::slots_per_epoch()), + }) + .map_err(|e| format!("Failed to write pruning checkpoint: {:?}", e))?; + + let snapshot = BeaconSnapshot { + beacon_block_root: weak_subj_block_root, + beacon_block: weak_subj_block, + beacon_state: weak_subj_state, + }; + + let fc_store = BeaconForkChoiceStore::get_forkchoice_store(store, &snapshot); + + let fork_choice = ForkChoice::from_anchor( + fc_store, + snapshot.beacon_block_root, + &snapshot.beacon_block, + &snapshot.beacon_state, + ) + .map_err(|e| format!("Unable to initialize ForkChoice: {:?}", e))?; self.fork_choice = Some(fork_choice); - self.genesis_time = Some(genesis.beacon_state.genesis_time()); Ok(self.empty_op_pool()) } @@ -520,12 +644,13 @@ where let fc_finalized = fork_choice.finalized_checkpoint(); let head_finalized = canonical_head.beacon_state.finalized_checkpoint(); if fc_finalized != head_finalized { - if head_finalized.root == Hash256::zero() + let is_genesis = head_finalized.root.is_zero() && head_finalized.epoch == fc_finalized.epoch - && fc_finalized.root == genesis_block_root - { - // This is a legal edge-case encountered during genesis. - } else { + && fc_finalized.root == genesis_block_root; + let is_wss = store.get_anchor_slot().map_or(false, |anchor_slot| { + fc_finalized.epoch == anchor_slot.epoch(TEthSpec::slots_per_epoch()) + }); + if !is_genesis && !is_wss { return Err(format!( "Database corrupt: fork choice is finalized at {:?} whilst head is finalized at \ {:?}", @@ -654,6 +779,11 @@ where "head_slot" => format!("{}", head.beacon_block.slot()), ); + // Check for states to reconstruct (in the background). 
+ if beacon_chain.config.reconstruct_historic_states { + beacon_chain.store_migrator.process_reconstruction(); + } + Ok(beacon_chain) } } diff --git a/beacon_node/beacon_chain/src/chain_config.rs b/beacon_node/beacon_chain/src/chain_config.rs index 53e3f9e5e1e..9d020032300 100644 --- a/beacon_node/beacon_chain/src/chain_config.rs +++ b/beacon_node/beacon_chain/src/chain_config.rs @@ -12,6 +12,8 @@ pub struct ChainConfig { /// /// If `None`, there is no weak subjectivity verification. pub weak_subjectivity_checkpoint: Option, + /// Determine whether to reconstruct historic states, usually after a checkpoint sync. + pub reconstruct_historic_states: bool, } impl Default for ChainConfig { @@ -19,6 +21,7 @@ impl Default for ChainConfig { Self { import_max_skip_slots: None, weak_subjectivity_checkpoint: None, + reconstruct_historic_states: false, } } } diff --git a/beacon_node/beacon_chain/src/errors.rs b/beacon_node/beacon_chain/src/errors.rs index 543f4222269..65b07d87f12 100644 --- a/beacon_node/beacon_chain/src/errors.rs +++ b/beacon_node/beacon_chain/src/errors.rs @@ -2,6 +2,7 @@ use crate::attester_cache::Error as AttesterCacheError; use crate::beacon_chain::ForkChoiceError; use crate::beacon_fork_choice_store::Error as ForkChoiceStoreError; use crate::eth1_chain::Error as Eth1ChainError; +use crate::historical_blocks::HistoricalBlockError; use crate::migrate::PruningError; use crate::naive_aggregation_pool::Error as NaiveAggregationError; use crate::observed_aggregates::Error as ObservedAttestationsError; @@ -117,6 +118,7 @@ pub enum BeaconChainError { block_slot: Slot, state_slot: Slot, }, + HistoricalBlockError(HistoricalBlockError), InvalidStateForShuffling { state_epoch: Epoch, shuffling_epoch: Epoch, @@ -150,6 +152,7 @@ easy_from_to!(BlockSignatureVerifierError, BeaconChainError); easy_from_to!(PruningError, BeaconChainError); easy_from_to!(ArithError, BeaconChainError); easy_from_to!(ForkChoiceStoreError, BeaconChainError); 
+easy_from_to!(HistoricalBlockError, BeaconChainError); easy_from_to!(StateAdvanceError, BeaconChainError); #[derive(Debug)] diff --git a/beacon_node/beacon_chain/src/historical_blocks.rs b/beacon_node/beacon_chain/src/historical_blocks.rs new file mode 100644 index 00000000000..63e83ce3694 --- /dev/null +++ b/beacon_node/beacon_chain/src/historical_blocks.rs @@ -0,0 +1,195 @@ +use crate::{errors::BeaconChainError as Error, BeaconChain, BeaconChainTypes}; +use itertools::Itertools; +use slog::debug; +use state_processing::{ + per_block_processing::ParallelSignatureSets, + signature_sets::{block_proposal_signature_set_from_parts, Error as SignatureSetError}, +}; +use std::borrow::Cow; +use std::iter; +use std::time::Duration; +use store::{chunked_vector::BlockRoots, AnchorInfo, ChunkWriter, KeyValueStore}; +use types::{Hash256, SignedBeaconBlock, Slot}; + +/// Use a longer timeout on the pubkey cache. +/// +/// It's ok if historical sync is stalled due to writes from forwards block processing. +const PUBKEY_CACHE_LOCK_TIMEOUT: Duration = Duration::from_secs(30); + +#[derive(Debug)] +pub enum HistoricalBlockError { + /// Block is not available (only returned when fetching historic blocks). + BlockOutOfRange { slot: Slot, oldest_block_slot: Slot }, + /// Block root mismatch, caller should retry with different blocks. + MismatchedBlockRoot { + block_root: Hash256, + expected_block_root: Hash256, + }, + /// Bad signature, caller should retry with different blocks. + SignatureSet(SignatureSetError), + /// Bad signature, caller should retry with different blocks. + InvalidSignature, + /// Transitory error, caller should retry with the same blocks. + ValidatorPubkeyCacheTimeout, + /// No historical sync needed. + NoAnchorInfo, + /// Logic error: should never occur. + IndexOutOfBounds, +} + +impl BeaconChain { + /// Store a batch of historical blocks in the database. + /// + /// The `blocks` should be given in slot-ascending order. 
One of the blocks should have a block + /// root corresponding to the `oldest_block_parent` from the store's `AnchorInfo`. + /// + /// The block roots and proposer signatures are verified. If any block doesn't match the parent + /// root listed in its successor, then the whole batch will be discarded and + /// `MismatchedBlockRoot` will be returned. If any proposer signature is invalid then + /// `SignatureSet` or `InvalidSignature` will be returned. + /// + /// To align with sync we allow some excess blocks with slots greater than or equal to + /// `oldest_block_slot` to be provided. They will be ignored without being checked. + /// + /// This function should not be called concurrently with any other function that mutates + /// the anchor info (including this function itself). If a concurrent mutation occurs that + /// would violate consistency then an `AnchorInfoConcurrentMutation` error will be returned. + /// + /// Return the number of blocks successfully imported. + pub fn import_historical_block_batch( + &self, + blocks: &[SignedBeaconBlock], + ) -> Result { + let anchor_info = self + .store + .get_anchor_info() + .ok_or(HistoricalBlockError::NoAnchorInfo)?; + + // Take all blocks with slots less than the oldest block slot. 
+ let num_relevant = + blocks.partition_point(|block| block.slot() < anchor_info.oldest_block_slot); + let blocks_to_import = &blocks + .get(..num_relevant) + .ok_or(HistoricalBlockError::IndexOutOfBounds)?; + + if blocks_to_import.len() != blocks.len() { + debug!( + self.log, + "Ignoring some historic blocks"; + "oldest_block_slot" => anchor_info.oldest_block_slot, + "total_blocks" => blocks.len(), + "ignored" => blocks.len().saturating_sub(blocks_to_import.len()), + ); + } + + if blocks_to_import.is_empty() { + return Ok(0); + } + + let mut expected_block_root = anchor_info.oldest_block_parent; + let mut prev_block_slot = anchor_info.oldest_block_slot; + let mut chunk_writer = + ChunkWriter::::new(&self.store.cold_db, prev_block_slot.as_usize())?; + + let mut cold_batch = Vec::with_capacity(blocks.len()); + let mut hot_batch = Vec::with_capacity(blocks.len()); + + for block in blocks_to_import.iter().rev() { + // Check chain integrity. + let block_root = block.canonical_root(); + + if block_root != expected_block_root { + return Err(HistoricalBlockError::MismatchedBlockRoot { + block_root, + expected_block_root, + } + .into()); + } + + // Store block in the hot database. + hot_batch.push(self.store.block_as_kv_store_op(&block_root, block)); + + // Store block roots, including at all skip slots in the freezer DB. + for slot in (block.slot().as_usize()..prev_block_slot.as_usize()).rev() { + chunk_writer.set(slot, block_root, &mut cold_batch)?; + } + + prev_block_slot = block.slot(); + expected_block_root = block.message().parent_root(); + + // If we've reached genesis, add the genesis block root to the batch and set the + // oldest block slot to the genesis slot to indicate completion. 
+ if expected_block_root == self.genesis_block_root { + let genesis_slot = self.spec.genesis_slot; + chunk_writer.set( + genesis_slot.as_usize(), + self.genesis_block_root, + &mut cold_batch, + )?; + prev_block_slot = genesis_slot; + expected_block_root = Hash256::zero(); + break; + } + } + chunk_writer.write(&mut cold_batch)?; + + // Verify signatures in one batch, holding the pubkey cache lock for the shortest duration + // possible. For each block fetch the parent root from its successor. Slicing from index 1 + // is safe because we've already checked that `blocks_to_import` is non-empty. + let pubkey_cache = self + .validator_pubkey_cache + .try_read_for(PUBKEY_CACHE_LOCK_TIMEOUT) + .ok_or(HistoricalBlockError::ValidatorPubkeyCacheTimeout)?; + let block_roots = blocks_to_import + .get(1..) + .ok_or(HistoricalBlockError::IndexOutOfBounds)? + .iter() + .map(|block| block.parent_root()) + .chain(iter::once(anchor_info.oldest_block_parent)); + let signature_set = blocks_to_import + .iter() + .zip_eq(block_roots) + .map(|(block, block_root)| { + block_proposal_signature_set_from_parts( + block, + Some(block_root), + block.message().proposer_index(), + &self.spec.fork_at_epoch(block.message().epoch()), + self.genesis_validators_root, + |validator_index| pubkey_cache.get(validator_index).map(Cow::Borrowed), + &self.spec, + ) + }) + .collect::, _>>() + .map_err(HistoricalBlockError::SignatureSet) + .map(ParallelSignatureSets::from)?; + if !signature_set.verify() { + return Err(HistoricalBlockError::InvalidSignature.into()); + } + drop(pubkey_cache); + + // Write the I/O batches to disk, writing the blocks themselves first, as it's better + // for the hot DB to contain extra blocks than for the cold DB to point to blocks that + // do not exist. + self.store.hot_db.do_atomically(hot_batch)?; + self.store.cold_db.do_atomically(cold_batch)?; + + // Update the anchor. 
+ let new_anchor = AnchorInfo { + oldest_block_slot: prev_block_slot, + oldest_block_parent: expected_block_root, + ..anchor_info + }; + let backfill_complete = new_anchor.block_backfill_complete(); + self.store + .compare_and_set_anchor_info(Some(anchor_info), Some(new_anchor))?; + + // If backfill has completed and the chain is configured to reconstruct historic states, + // send a message to the background migrator instructing it to begin reconstruction. + if backfill_complete && self.config.reconstruct_historic_states { + self.store_migrator.process_reconstruction(); + } + + Ok(blocks_to_import.len()) + } +} diff --git a/beacon_node/beacon_chain/src/lib.rs b/beacon_node/beacon_chain/src/lib.rs index 5efcc34009e..9796c65d1da 100644 --- a/beacon_node/beacon_chain/src/lib.rs +++ b/beacon_node/beacon_chain/src/lib.rs @@ -13,6 +13,7 @@ pub mod eth1_chain; pub mod events; pub mod fork_revert; mod head_tracker; +pub mod historical_blocks; mod metrics; pub mod migrate; mod naive_aggregation_pool; @@ -39,6 +40,7 @@ pub use self::beacon_chain::{ pub use self::beacon_snapshot::BeaconSnapshot; pub use self::chain_config::ChainConfig; pub use self::errors::{BeaconChainError, BlockProductionError}; +pub use self::historical_blocks::HistoricalBlockError; pub use attestation_verification::Error as AttestationError; pub use beacon_fork_choice_store::{BeaconForkChoiceStore, Error as ForkChoiceStoreError}; pub use block_verification::{BlockError, GossipVerifiedBlock}; diff --git a/beacon_node/beacon_chain/src/migrate.rs b/beacon_node/beacon_chain/src/migrate.rs index cf2fb6484fc..b2a925bb779 100644 --- a/beacon_node/beacon_chain/src/migrate.rs +++ b/beacon_node/beacon_chain/src/migrate.rs @@ -30,7 +30,7 @@ const COMPACTION_FINALITY_DISTANCE: u64 = 1024; pub struct BackgroundMigrator, Cold: ItemStore> { db: Arc>, #[allow(clippy::type_complexity)] - tx_thread: Option, thread::JoinHandle<()>)>>, + tx_thread: Option, thread::JoinHandle<()>)>>, /// Genesis block root, for persisting 
the `PersistedBeaconChain`. genesis_block_root: Hash256, log: Logger, @@ -73,7 +73,12 @@ pub enum PruningError { } /// Message sent to the migration thread containing the information it needs to run. -pub struct MigrationNotification { +pub enum Notification { + Finalization(FinalizationNotification), + Reconstruction, +} + +pub struct FinalizationNotification { finalized_state_root: BeaconStateHash, finalized_checkpoint: Checkpoint, head_tracker: Arc, @@ -112,13 +117,46 @@ impl, Cold: ItemStore> BackgroundMigrator, ) -> Result<(), BeaconChainError> { - let notif = MigrationNotification { + let notif = FinalizationNotification { finalized_state_root, finalized_checkpoint, head_tracker, genesis_block_root: self.genesis_block_root, }; + // Send to background thread if configured, otherwise run in foreground. + if let Some(Notification::Finalization(notif)) = + self.send_background_notification(Notification::Finalization(notif)) + { + Self::run_migration(self.db.clone(), notif, &self.log); + } + + Ok(()) + } + + pub fn process_reconstruction(&self) { + if let Some(Notification::Reconstruction) = + self.send_background_notification(Notification::Reconstruction) + { + Self::run_reconstruction(self.db.clone(), &self.log); + } + } + + pub fn run_reconstruction(db: Arc>, log: &Logger) { + if let Err(e) = db.reconstruct_historic_states() { + error!( + log, + "State reconstruction failed"; + "error" => ?e, + ); + } + } + + /// If configured to run in the background, send `notif` to the background thread. + /// + /// Return `None` if the message was sent to the background thread, `Some(notif)` otherwise. + #[must_use = "Message is not processed when this function returns `Some`"] + fn send_background_notification(&self, notif: Notification) -> Option { // Async path, on the background thread. 
if let Some(tx_thread) = &self.tx_thread { let (ref mut tx, ref mut thread) = *tx_thread.lock(); @@ -143,17 +181,21 @@ impl, Cold: ItemStore> BackgroundMigrator>, notif: MigrationNotification, log: &Logger) { + fn run_migration( + db: Arc>, + notif: FinalizationNotification, + log: &Logger, + ) { + debug!(log, "Database consolidation started"); + let finalized_state_root = notif.finalized_state_root; let finalized_state = match db.get_state(&finalized_state_root.into(), None) { @@ -223,31 +265,44 @@ impl, Cold: ItemStore> BackgroundMigrator format!("{:?}", e)); } + + debug!(log, "Database consolidation complete"); } /// Spawn a new child thread to run the migration process. /// - /// Return a channel handle for sending new finalized states to the thread. + /// Return a channel handle for sending requests to the thread. fn spawn_thread( db: Arc>, log: Logger, - ) -> (mpsc::Sender, thread::JoinHandle<()>) { + ) -> (mpsc::Sender, thread::JoinHandle<()>) { let (tx, rx) = mpsc::channel(); let thread = thread::spawn(move || { while let Ok(notif) = rx.recv() { - // Read the rest of the messages in the channel, ultimately choosing the `notif` - // with the highest finalized epoch. - let notif = rx - .try_iter() - .fold(notif, |best, other: MigrationNotification| { - if other.finalized_checkpoint.epoch > best.finalized_checkpoint.epoch { - other - } else { - best - } - }); + // Read the rest of the messages in the channel, preferring any reconstruction + // notification, or the finalization notification with the greatest finalized epoch. 
+ let notif = + rx.try_iter() + .fold(notif, |best, other: Notification| match (&best, &other) { + (Notification::Reconstruction, _) + | (_, Notification::Reconstruction) => Notification::Reconstruction, + ( + Notification::Finalization(fin1), + Notification::Finalization(fin2), + ) => { + if fin2.finalized_checkpoint.epoch > fin1.finalized_checkpoint.epoch + { + other + } else { + best + } + } + }); - Self::run_migration(db.clone(), notif, &log); + match notif { + Notification::Reconstruction => Self::run_reconstruction(db.clone(), &log), + Notification::Finalization(fin) => Self::run_migration(db.clone(), fin, &log), + } } }); (tx, thread) diff --git a/beacon_node/beacon_chain/src/schema_change.rs b/beacon_node/beacon_chain/src/schema_change.rs index a96e1e7c3ce..ec92b7c8ac0 100644 --- a/beacon_node/beacon_chain/src/schema_change.rs +++ b/beacon_node/beacon_chain/src/schema_change.rs @@ -2,12 +2,15 @@ use crate::beacon_chain::{BeaconChainTypes, OP_POOL_DB_KEY}; use crate::validator_pubkey_cache::ValidatorPubkeyCache; use operation_pool::{PersistedOperationPool, PersistedOperationPoolBase}; +use ssz::{Decode, Encode}; +use ssz_derive::{Decode, Encode}; use std::fs; use std::path::Path; use std::sync::Arc; +use store::config::OnDiskStoreConfig; use store::hot_cold_store::{HotColdDB, HotColdDBError}; -use store::metadata::{SchemaVersion, CURRENT_SCHEMA_VERSION}; -use store::Error as StoreError; +use store::metadata::{SchemaVersion, CONFIG_KEY, CURRENT_SCHEMA_VERSION}; +use store::{DBColumn, Error as StoreError, ItemStore, StoreItem}; const PUBKEY_CACHE_FILENAME: &str = "pubkey_cache.ssz"; @@ -73,6 +76,23 @@ pub fn migrate_schema( Ok(()) } + // Migration for weak subjectivity sync support and clean up of `OnDiskStoreConfig` (#1784). + (SchemaVersion(4), SchemaVersion(5)) => { + if let Some(OnDiskStoreConfigV4 { + slots_per_restore_point, + .. + }) = db.hot_db.get(&CONFIG_KEY)? 
+ { + let new_config = OnDiskStoreConfig { + slots_per_restore_point, + }; + db.hot_db.put(&CONFIG_KEY, &new_config)?; + } + + db.store_schema_version(to)?; + + Ok(()) + } // Anything else is an error. (_, _) => Err(HotColdDBError::UnsupportedSchemaVersion { target_version: to, @@ -81,3 +101,24 @@ pub fn migrate_schema( .into()), } } + +// Store config used in v4 schema and earlier. +#[derive(Debug, Clone, PartialEq, Eq, Encode, Decode)] +pub struct OnDiskStoreConfigV4 { + pub slots_per_restore_point: u64, + pub _block_cache_size: usize, +} + +impl StoreItem for OnDiskStoreConfigV4 { + fn db_column() -> DBColumn { + DBColumn::BeaconMeta + } + + fn as_store_bytes(&self) -> Vec { + self.as_ssz_bytes() + } + + fn from_store_bytes(bytes: &[u8]) -> Result { + Ok(Self::from_ssz_bytes(bytes)?) + } +} diff --git a/beacon_node/beacon_chain/src/test_utils.rs b/beacon_node/beacon_chain/src/test_utils.rs index 41747c3c073..2b452f57791 100644 --- a/beacon_node/beacon_chain/src/test_utils.rs +++ b/beacon_node/beacon_chain/src/test_utils.rs @@ -12,7 +12,7 @@ use crate::{ }; use bls::get_withdrawal_credentials; use futures::channel::mpsc::Receiver; -use genesis::interop_genesis_state; +pub use genesis::interop_genesis_state; use int_to_bytes::int_to_bytes32; use merkle_proof::MerkleTree; use parking_lot::Mutex; diff --git a/beacon_node/beacon_chain/tests/store_tests.rs b/beacon_node/beacon_chain/tests/store_tests.rs index bff9627367f..a04d2b9a44b 100644 --- a/beacon_node/beacon_chain/tests/store_tests.rs +++ b/beacon_node/beacon_chain/tests/store_tests.rs @@ -1,11 +1,16 @@ #![cfg(not(debug_assertions))] use beacon_chain::attestation_verification::Error as AttnError; +use beacon_chain::builder::BeaconChainBuilder; use beacon_chain::test_utils::{ test_logger, test_spec, AttestationStrategy, BeaconChainHarness, BlockStrategy, DiskHarnessType, HARNESS_SLOT_TIME, }; -use beacon_chain::{BeaconChain, BeaconChainTypes, BeaconSnapshot, ChainConfig}; +use beacon_chain::{ + 
historical_blocks::HistoricalBlockError, migrate::MigratorConfig, BeaconChain, + BeaconChainError, BeaconChainTypes, BeaconSnapshot, ChainConfig, ServerSentEventHandler, + WhenSlotSkipped, +}; use lazy_static::lazy_static; use maplit::hashset; use rand::Rng; @@ -558,7 +563,7 @@ fn multiple_attestations_per_block() { let harness = get_harness(store, HIGH_VALIDATOR_COUNT); harness.extend_chain( - MainnetEthSpec::slots_per_epoch() as usize * 3, + E::slots_per_epoch() as usize * 3, BlockStrategy::OnCanonicalHead, AttestationStrategy::AllValidators, ); @@ -1741,6 +1746,173 @@ fn garbage_collect_temp_states_from_failed_block() { assert_eq!(store.iter_temporary_state_roots().count(), 0); } +#[test] +fn weak_subjectivity_sync() { + // Build an initial chain on one harness, representing a synced node with full history. + let num_initial_blocks = E::slots_per_epoch() * 11; + let num_final_blocks = E::slots_per_epoch() * 2; + + let temp1 = tempdir().unwrap(); + let full_store = get_store(&temp1); + let harness = get_harness(full_store.clone(), LOW_VALIDATOR_COUNT); + + harness.extend_chain( + num_initial_blocks as usize, + BlockStrategy::OnCanonicalHead, + AttestationStrategy::AllValidators, + ); + + let genesis_state = full_store + .get_state(&harness.chain.genesis_state_root, Some(Slot::new(0))) + .unwrap() + .unwrap(); + let wss_checkpoint = harness.chain.head_info().unwrap().finalized_checkpoint; + let wss_block = harness.get_block(wss_checkpoint.root.into()).unwrap(); + let wss_state = full_store + .get_state(&wss_block.state_root(), None) + .unwrap() + .unwrap(); + let wss_slot = wss_block.slot(); + + // Add more blocks that advance finalization further. 
+ harness.advance_slot(); + harness.extend_chain( + num_final_blocks as usize, + BlockStrategy::OnCanonicalHead, + AttestationStrategy::AllValidators, + ); + + let (shutdown_tx, _shutdown_rx) = futures::channel::mpsc::channel(1); + let log = test_logger(); + let temp2 = tempdir().unwrap(); + let store = get_store(&temp2); + + // Initialise a new beacon chain from the finalized checkpoint + let beacon_chain = BeaconChainBuilder::new(MinimalEthSpec) + .store(store.clone()) + .custom_spec(test_spec::()) + .weak_subjectivity_state(wss_state, wss_block.clone(), genesis_state) + .unwrap() + .logger(log.clone()) + .store_migrator_config(MigratorConfig::default().blocking()) + .dummy_eth1_backend() + .expect("should build dummy backend") + .testing_slot_clock(HARNESS_SLOT_TIME) + .expect("should configure testing slot clock") + .shutdown_sender(shutdown_tx) + .chain_config(ChainConfig::default()) + .event_handler(Some(ServerSentEventHandler::new_with_capacity( + log.clone(), + 1, + ))) + .monitor_validators(true, vec![], log) + .build() + .expect("should build"); + + // Apply blocks forward to reach head. + let chain_dump = harness.chain.chain_dump().unwrap(); + let new_blocks = &chain_dump[wss_slot.as_usize() + 1..]; + + assert_eq!(new_blocks[0].beacon_block.slot(), wss_slot + 1); + + for snapshot in new_blocks { + let block = &snapshot.beacon_block; + beacon_chain.slot_clock.set_slot(block.slot().as_u64()); + beacon_chain.process_block(block.clone()).unwrap(); + beacon_chain.fork_choice().unwrap(); + + // Check that the new block's state can be loaded correctly. + let state_root = block.state_root(); + let mut state = beacon_chain + .store + .get_state(&state_root, Some(block.slot())) + .unwrap() + .unwrap(); + assert_eq!(state.update_tree_hash_cache().unwrap(), state_root); + } + + // Forwards iterator from 0 should fail as we lack blocks. 
+ assert!(matches!( + beacon_chain.forwards_iter_block_roots(Slot::new(0)), + Err(BeaconChainError::HistoricalBlockError( + HistoricalBlockError::BlockOutOfRange { .. } + )) + )); + + // Simulate processing of a `StatusMessage` with an older finalized epoch by calling + // `block_root_at_slot` with an old slot for which we don't know the block root. It should + // return `None` rather than erroring. + assert_eq!( + beacon_chain + .block_root_at_slot(Slot::new(1), WhenSlotSkipped::None) + .unwrap(), + None + ); + + // Simulate querying the API for a historic state that is unknown. It should also return + // `None` rather than erroring. + assert_eq!(beacon_chain.state_root_at_slot(Slot::new(1)).unwrap(), None); + + // Supply blocks backwards to reach genesis. Omit the genesis block to check genesis handling. + let historical_blocks = chain_dump[..wss_block.slot().as_usize()] + .iter() + .filter(|s| s.beacon_block.slot() != 0) + .map(|s| s.beacon_block.clone()) + .collect::>(); + beacon_chain + .import_historical_block_batch(&historical_blocks) + .unwrap(); + assert_eq!(beacon_chain.store.get_oldest_block_slot(), 0); + + // Resupplying the blocks should not fail, they can be safely ignored. + beacon_chain + .import_historical_block_batch(&historical_blocks) + .unwrap(); + + // The forwards iterator should now match the original chain + let forwards = beacon_chain + .forwards_iter_block_roots(Slot::new(0)) + .unwrap() + .map(Result::unwrap) + .collect::>(); + let expected = harness + .chain + .forwards_iter_block_roots(Slot::new(0)) + .unwrap() + .map(Result::unwrap) + .collect::>(); + assert_eq!(forwards, expected); + + // All blocks can be loaded. + for (block_root, slot) in beacon_chain + .forwards_iter_block_roots(Slot::new(0)) + .unwrap() + .map(Result::unwrap) + { + let block = store.get_block(&block_root).unwrap().unwrap(); + assert_eq!(block.slot(), slot); + } + + // All states from the oldest state slot can be loaded. 
+ let (_, oldest_state_slot) = store.get_historic_state_limits(); + for (state_root, slot) in beacon_chain + .forwards_iter_state_roots(oldest_state_slot) + .unwrap() + .map(Result::unwrap) + { + let state = store.get_state(&state_root, Some(slot)).unwrap().unwrap(); + assert_eq!(state.slot(), slot); + assert_eq!(state.canonical_root(), state_root); + } + + // Anchor slot is still set to the starting slot. + assert_eq!(store.get_anchor_slot(), Some(wss_slot)); + + // Reconstruct states. + store.clone().reconstruct_historic_states().unwrap(); + assert_eq!(store.get_anchor_slot(), None); +} + #[test] fn finalizes_after_resuming_from_db() { let validator_count = 16; diff --git a/beacon_node/client/Cargo.toml b/beacon_node/client/Cargo.toml index 25d51e89e5b..dfc969f740d 100644 --- a/beacon_node/client/Cargo.toml +++ b/beacon_node/client/Cargo.toml @@ -32,6 +32,8 @@ futures = "0.3.7" reqwest = { version = "0.11.0", features = ["native-tls-vendored"] } url = "2.1.1" eth1 = { path = "../eth1" } +eth2 = { path = "../../common/eth2" } +sensitive_url = { path = "../../common/sensitive_url" } genesis = { path = "../genesis" } task_executor = { path = "../../common/task_executor" } environment = { path = "../../lighthouse/environment" } diff --git a/beacon_node/client/src/builder.rs b/beacon_node/client/src/builder.rs index d7de3e0d5df..7f19dbb85d1 100644 --- a/beacon_node/client/src/builder.rs +++ b/beacon_node/client/src/builder.rs @@ -12,6 +12,10 @@ use beacon_chain::{ }; use environment::RuntimeContext; use eth1::{Config as Eth1Config, Service as Eth1Service}; +use eth2::{ + types::{BlockId, StateId}, + BeaconNodeHttpClient, Error as ApiError, Timeouts, +}; use eth2_libp2p::NetworkGlobals; use genesis::{interop_genesis_state, Eth1GenesisService}; use monitoring_api::{MonitoringHttpClient, ProcessType}; @@ -25,11 +29,16 @@ use std::sync::Arc; use std::time::Duration; use timer::spawn_timer; use tokio::sync::{mpsc::UnboundedSender, oneshot}; -use 
types::{test_utils::generate_deterministic_keypairs, BeaconState, ChainSpec, EthSpec}; +use types::{ + test_utils::generate_deterministic_keypairs, BeaconState, ChainSpec, EthSpec, SignedBeaconBlock, +}; /// Interval between polling the eth1 node for genesis information. pub const ETH1_GENESIS_UPDATE_INTERVAL_MILLIS: u64 = 7_000; +/// Timeout for checkpoint sync HTTP requests. +pub const CHECKPOINT_SYNC_HTTP_TIMEOUT: Duration = Duration::from_secs(60); + /// Builds a `Client` instance. /// /// ## Notes @@ -168,11 +177,22 @@ where // // Alternatively, if there's a beacon chain in the database then always resume // using it. - let client_genesis = if client_genesis == ClientGenesis::FromStore && !chain_exists { + let client_genesis = if matches!(client_genesis, ClientGenesis::FromStore) && !chain_exists + { info!(context.log(), "Defaulting to deposit contract genesis"); ClientGenesis::DepositContract } else if chain_exists { + if matches!(client_genesis, ClientGenesis::WeakSubjSszBytes { .. }) + || matches!(client_genesis, ClientGenesis::CheckpointSyncUrl { .. }) + { + info!( + context.log(), + "Refusing to checkpoint sync"; + "msg" => "database already exists, use --purge-db to force checkpoint sync" + ); + } + ClientGenesis::FromStore } else { client_genesis @@ -200,6 +220,103 @@ where builder.genesis_state(genesis_state).map(|v| (v, None))? 
} + ClientGenesis::WeakSubjSszBytes { + anchor_state_bytes, + anchor_block_bytes, + genesis_state_bytes, + } => { + info!(context.log(), "Starting checkpoint sync"); + + let anchor_state = BeaconState::from_ssz_bytes(&anchor_state_bytes, &spec) + .map_err(|e| format!("Unable to parse weak subj state SSZ: {:?}", e))?; + let anchor_block = SignedBeaconBlock::from_ssz_bytes(&anchor_block_bytes, &spec) + .map_err(|e| format!("Unable to parse weak subj block SSZ: {:?}", e))?; + let genesis_state = BeaconState::from_ssz_bytes(&genesis_state_bytes, &spec) + .map_err(|e| format!("Unable to parse genesis state SSZ: {:?}", e))?; + + builder + .weak_subjectivity_state(anchor_state, anchor_block, genesis_state) + .map(|v| (v, None))? + } + ClientGenesis::CheckpointSyncUrl { + genesis_state_bytes, + url, + } => { + info!( + context.log(), + "Starting checkpoint sync"; + "remote_url" => %url, + ); + + let remote = + BeaconNodeHttpClient::new(url, Timeouts::set_all(CHECKPOINT_SYNC_HTTP_TIMEOUT)); + let slots_per_epoch = TEthSpec::slots_per_epoch(); + + // Find a suitable finalized block on an epoch boundary. + let mut block = remote + .get_beacon_blocks_ssz::(BlockId::Finalized, &spec) + .await + .map_err(|e| match e { + ApiError::InvalidSsz(e) => format!( + "Unable to parse SSZ: {:?}. Ensure the checkpoint-sync-url refers to a \ + node for the correct network", + e + ), + e => format!("Error fetching finalized block from remote: {:?}", e), + })? + .ok_or("Finalized block missing from remote, it returned 404")?; + + let mut block_slot = block.slot(); + + while block.slot() % slots_per_epoch != 0 { + block_slot = (block_slot / slots_per_epoch - 1) * slots_per_epoch; + + debug!( + context.log(), + "Searching for aligned checkpoint block"; + "block_slot" => block_slot, + ); + + if let Some(found_block) = remote + .get_beacon_blocks_ssz::(BlockId::Slot(block_slot), &spec) + .await + .map_err(|e| { + format!("Error fetching block at slot {}: {:?}", block_slot, e) + })? 
+ { + block = found_block; + } + } + + let state_root = block.state_root(); + let state = remote + .get_debug_beacon_states_ssz::(StateId::Root(state_root), &spec) + .await + .map_err(|e| { + format!( + "Error loading checkpoint state from remote {:?}: {:?}", + state_root, e + ) + })? + .ok_or_else(|| { + format!("Checkpoint state missing from remote: {:?}", state_root) + })?; + + let genesis_state = BeaconState::from_ssz_bytes(&genesis_state_bytes, &spec) + .map_err(|e| format!("Unable to parse genesis state SSZ: {:?}", e))?; + + info!( + context.log(), + "Loaded checkpoint block and state"; + "slot" => block.slot(), + "block_root" => ?block.canonical_root(), + "state_root" => ?state_root, + ); + + builder + .weak_subjectivity_state(state, block, genesis_state) + .map(|v| (v, None))? + } ClientGenesis::DepositContract => { info!( context.log(), diff --git a/beacon_node/client/src/config.rs b/beacon_node/client/src/config.rs index 043d7d6fae7..40e13898b96 100644 --- a/beacon_node/client/src/config.rs +++ b/beacon_node/client/src/config.rs @@ -1,5 +1,6 @@ use directory::DEFAULT_ROOT_DIR; use network::NetworkConfig; +use sensitive_url::SensitiveUrl; use serde_derive::{Deserialize, Serialize}; use std::fs; use std::path::PathBuf; @@ -9,7 +10,7 @@ use types::{Graffiti, PublicKeyBytes}; const DEFAULT_FREEZER_DB_DIR: &str = "freezer_db"; /// Defines how the client should initialize the `BeaconChain` and other components. -#[derive(PartialEq, Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] pub enum ClientGenesis { /// Creates a genesis state as per the 2019 Canada interop specifications. Interop { @@ -26,6 +27,15 @@ pub enum ClientGenesis { /// We include the bytes instead of the `BeaconState` because the `EthSpec` type /// parameter would be very annoying. 
SszBytes { genesis_state_bytes: Vec }, + WeakSubjSszBytes { + genesis_state_bytes: Vec, + anchor_state_bytes: Vec, + anchor_block_bytes: Vec, + }, + CheckpointSyncUrl { + genesis_state_bytes: Vec, + url: SensitiveUrl, + }, } impl Default for ClientGenesis { diff --git a/beacon_node/client/src/notifier.rs b/beacon_node/client/src/notifier.rs index 82ac3a30465..31fb6740639 100644 --- a/beacon_node/client/src/notifier.rs +++ b/beacon_node/client/src/notifier.rs @@ -1,6 +1,6 @@ use crate::metrics; use beacon_chain::{BeaconChain, BeaconChainTypes}; -use eth2_libp2p::NetworkGlobals; +use eth2_libp2p::{types::SyncState, NetworkGlobals}; use parking_lot::Mutex; use slog::{debug, error, info, warn, Logger}; use slot_clock::SlotClock; @@ -19,6 +19,9 @@ const MINUTES_PER_HOUR: i64 = 60; /// The number of historical observations that should be used to determine the average sync time. const SPEEDO_OBSERVATIONS: usize = 4; +/// The number of slots between logs that give detail about backfill process. +const BACKFILL_LOG_INTERVAL: u64 = 5; + /// Spawns a notifier service which periodically logs information about the node. pub fn spawn_notifier( executor: task_executor::TaskExecutor, @@ -42,6 +45,16 @@ pub fn spawn_notifier( let log = executor.log().clone(); let mut interval = tokio::time::interval_at(start_instant, interval_duration); + // Keep track of sync state and reset the speedo on specific sync state changes. + // Specifically, if we switch between a sync and a backfill sync, reset the speedo. + let mut current_sync_state = network.sync_state(); + + // Store info if we are required to do a backfill sync. + let original_anchor_slot = beacon_chain + .store + .get_anchor_info() + .map(|ai| ai.oldest_block_slot); + let interval_future = async move { // Perform pre-genesis logging. loop { @@ -63,11 +76,30 @@ pub fn spawn_notifier( } // Perform post-genesis logging. 
+ let mut last_backfill_log_slot = None; loop { interval.tick().await; let connected_peer_count = network.connected_peers(); let sync_state = network.sync_state(); + // Determine if we have switched syncing chains + if sync_state != current_sync_state { + match (current_sync_state, &sync_state) { + (_, SyncState::BackFillSyncing { .. }) => { + // We have transitioned to a backfill sync. Reset the speedo. + let mut speedo = speedo.lock(); + speedo.clear(); + } + (SyncState::BackFillSyncing { .. }, _) => { + // We have transitioned from a backfill sync, reset the speedo + let mut speedo = speedo.lock(); + speedo.clear(); + } + (_, _) => {} + } + current_sync_state = sync_state; + } + let head_info = match beacon_chain.head_info() { Ok(head_info) => head_info, Err(e) => { @@ -97,17 +129,46 @@ pub fn spawn_notifier( let finalized_root = head_info.finalized_checkpoint.root; let head_root = head_info.block_root; + // The default is for regular sync but this gets modified if backfill sync is in + // progress. + let mut sync_distance = current_slot - head_slot; + let mut speedo = speedo.lock(); - speedo.observe(head_slot, Instant::now()); + match current_sync_state { + SyncState::BackFillSyncing { .. } => { + // Observe backfilling sync info. + if let Some(oldest_slot) = original_anchor_slot { + if let Some(current_anchor_slot) = beacon_chain + .store + .get_anchor_info() + .map(|ai| ai.oldest_block_slot) + { + sync_distance = current_anchor_slot; + speedo + // For backfill sync use a fake slot which is the distance we've progressed from the starting `oldest_block_slot`. + .observe( + oldest_slot.saturating_sub(current_anchor_slot), + Instant::now(), + ); + } + } + } + SyncState::SyncingFinalized { .. } + | SyncState::SyncingHead { .. } + | SyncState::SyncTransition => { + speedo.observe(head_slot, Instant::now()); + } + SyncState::Stalled | SyncState::Synced => {} + } + // NOTE: This is going to change based on which sync we are currently performing. 
A + // backfill sync should process slots significantly faster than the other sync + // processes. metrics::set_gauge( &metrics::SYNC_SLOTS_PER_SECOND, speedo.slots_per_second().unwrap_or(0_f64) as i64, ); - // The next two lines take advantage of saturating subtraction on `Slot`. - let head_distance = current_slot - head_slot; - if connected_peer_count <= WARN_PEER_COUNT { warn!(log, "Low peer count"; "peer_count" => peer_count_pretty(connected_peer_count)); } @@ -121,16 +182,57 @@ pub fn spawn_notifier( "head_block" => format!("{}", head_root), "head_slot" => head_slot, "current_slot" => current_slot, - "sync_state" =>format!("{}", sync_state) + "sync_state" =>format!("{}", current_sync_state) ); + // Log if we are backfilling. + let is_backfilling = matches!(current_sync_state, SyncState::BackFillSyncing { .. }); + if is_backfilling + && last_backfill_log_slot + .map_or(true, |slot| slot + BACKFILL_LOG_INTERVAL <= current_slot) + { + last_backfill_log_slot = Some(current_slot); + + let distance = format!( + "{} slots ({})", + sync_distance.as_u64(), + slot_distance_pretty(sync_distance, slot_duration) + ); + + let speed = speedo.slots_per_second(); + let display_speed = speed.map_or(false, |speed| speed != 0.0); + + if display_speed { + info!( + log, + "Downloading historical blocks"; + "distance" => distance, + "speed" => sync_speed_pretty(speed), + "est_time" => estimated_time_pretty(speedo.estimated_time_till_slot(original_anchor_slot.unwrap_or(current_slot))), + ); + } else { + info!( + log, + "Downloading historical blocks"; + "distance" => distance, + "est_time" => estimated_time_pretty(speedo.estimated_time_till_slot(original_anchor_slot.unwrap_or(current_slot))), + ); + } + } else if !is_backfilling && last_backfill_log_slot.is_some() { + last_backfill_log_slot = None; + info!( + log, + "Historical block download complete"; + ); + } + // Log if we are syncing - if sync_state.is_syncing() { + if current_sync_state.is_syncing() { 
metrics::set_gauge(&metrics::IS_SYNCED, 0); let distance = format!( "{} slots ({})", - head_distance.as_u64(), - slot_distance_pretty(head_distance, slot_duration) + sync_distance.as_u64(), + slot_distance_pretty(sync_distance, slot_duration) ); let speed = speedo.slots_per_second(); @@ -154,7 +256,7 @@ pub fn spawn_notifier( "est_time" => estimated_time_pretty(speedo.estimated_time_till_slot(current_slot)), ); } - } else if sync_state.is_synced() { + } else if current_sync_state.is_synced() { metrics::set_gauge(&metrics::IS_SYNCED, 1); let block_info = if current_slot > head_slot { " … empty".to_string() @@ -397,4 +499,9 @@ impl Speedo { None } } + + /// Clears all past observations to be used for an alternative sync (i.e backfill sync). + pub fn clear(&mut self) { + self.0.clear() + } } diff --git a/beacon_node/eth2_libp2p/src/behaviour/mod.rs b/beacon_node/eth2_libp2p/src/behaviour/mod.rs index 316917ac5ac..8e29ce6d925 100644 --- a/beacon_node/eth2_libp2p/src/behaviour/mod.rs +++ b/beacon_node/eth2_libp2p/src/behaviour/mod.rs @@ -513,7 +513,7 @@ impl Behaviour { } /// Inform the peer that their request produced an error. - pub fn _send_error_reponse( + pub fn send_error_reponse( &mut self, peer_id: PeerId, id: PeerRequestId, diff --git a/beacon_node/eth2_libp2p/src/peer_manager/mod.rs b/beacon_node/eth2_libp2p/src/peer_manager/mod.rs index 34ba564d612..a2c8adbe514 100644 --- a/beacon_node/eth2_libp2p/src/peer_manager/mod.rs +++ b/beacon_node/eth2_libp2p/src/peer_manager/mod.rs @@ -551,8 +551,17 @@ impl PeerManager { RPCResponseErrorCode::Unknown => PeerAction::HighToleranceError, RPCResponseErrorCode::ResourceUnavailable => { // NOTE: This error only makes sense for the `BlocksByRange` and `BlocksByRoot` - // protocols. For the time being, there is no reason why a peer should send - // this error. + // protocols. + // + // If we are syncing, there is no point keeping these peers around and + // continually failing to request blocks. 
We instantly ban them and hope that + // by the time the ban lifts, the peers will have completed their backfill + // sync. + // + // TODO: Potentially a more graceful way of handling such peers, would be to + // implement a new sync type which tracks these peers and prevents the sync + // algorithms from requesting blocks from them (at least for a set period of + // time, multiple failures would then lead to a ban). PeerAction::Fatal } RPCResponseErrorCode::ServerError => PeerAction::MidToleranceError, diff --git a/beacon_node/eth2_libp2p/src/rpc/codec/base.rs b/beacon_node/eth2_libp2p/src/rpc/codec/base.rs index 8b2df43ef92..eca05787853 100644 --- a/beacon_node/eth2_libp2p/src/rpc/codec/base.rs +++ b/beacon_node/eth2_libp2p/src/rpc/codec/base.rs @@ -211,16 +211,13 @@ mod tests { let _ = snappy_buf.split_to(1); // decode message just as snappy message - let snappy_decoded_message = snappy_outbound_codec.decode(&mut snappy_buf).unwrap(); + let _snappy_decoded_message = snappy_outbound_codec.decode(&mut snappy_buf).unwrap(); // build codecs for entire chunk let mut snappy_base_outbound_codec = BaseOutboundCodec::new(snappy_outbound_codec); // decode message as ssz snappy chunk - let snappy_decoded_chunk = snappy_base_outbound_codec.decode(&mut buf).unwrap(); - - dbg!(snappy_decoded_message); - dbg!(snappy_decoded_chunk); + let _snappy_decoded_chunk = snappy_base_outbound_codec.decode(&mut buf).unwrap(); } #[test] diff --git a/beacon_node/eth2_libp2p/src/service.rs b/beacon_node/eth2_libp2p/src/service.rs index 3c5ee5938b9..0914307fd08 100644 --- a/beacon_node/eth2_libp2p/src/service.rs +++ b/beacon_node/eth2_libp2p/src/service.rs @@ -274,7 +274,7 @@ impl Service { ) { self.swarm .behaviour_mut() - ._send_error_reponse(peer_id, id, error, reason); + .send_error_reponse(peer_id, id, error, reason); } /// Report a peer's action. 
diff --git a/beacon_node/eth2_libp2p/src/types/globals.rs b/beacon_node/eth2_libp2p/src/types/globals.rs index 4055e53b205..5c3b0690d8c 100644 --- a/beacon_node/eth2_libp2p/src/types/globals.rs +++ b/beacon_node/eth2_libp2p/src/types/globals.rs @@ -1,7 +1,7 @@ //! A collection of variables that are accessible outside of the network thread itself. use crate::peer_manager::PeerDB; use crate::rpc::MetaData; -use crate::types::SyncState; +use crate::types::{BackFillState, SyncState}; use crate::Client; use crate::EnrExt; use crate::{Enr, GossipTopic, Multiaddr, PeerId}; @@ -29,6 +29,8 @@ pub struct NetworkGlobals { pub gossipsub_subscriptions: RwLock>, /// The current sync status of the node. pub sync_state: RwLock, + /// The current state of the backfill sync. + pub backfill_state: RwLock, } impl NetworkGlobals { @@ -50,6 +52,7 @@ impl NetworkGlobals { peers: RwLock::new(PeerDB::new(trusted_peers, log)), gossipsub_subscriptions: RwLock::new(HashSet::new()), sync_state: RwLock::new(SyncState::Stalled), + backfill_state: RwLock::new(BackFillState::NotRequired), } } @@ -104,6 +107,11 @@ impl NetworkGlobals { self.sync_state.read().clone() } + /// Returns the current backfill state. + pub fn backfill_state(&self) -> BackFillState { + self.backfill_state.read().clone() + } + /// Returns a `Client` type if one is known for the `PeerId`. 
pub fn client(&self, peer_id: &PeerId) -> Client { self.peers diff --git a/beacon_node/eth2_libp2p/src/types/mod.rs b/beacon_node/eth2_libp2p/src/types/mod.rs index 1d045bb38d6..ad02e07fb70 100644 --- a/beacon_node/eth2_libp2p/src/types/mod.rs +++ b/beacon_node/eth2_libp2p/src/types/mod.rs @@ -15,5 +15,5 @@ pub type Enr = discv5::enr::Enr; pub use globals::NetworkGlobals; pub use pubsub::{PubsubMessage, SnappyTransform}; pub use subnet::{Subnet, SubnetDiscovery}; -pub use sync_state::SyncState; +pub use sync_state::{BackFillState, SyncState}; pub use topics::{subnet_from_topic_hash, GossipEncoding, GossipKind, GossipTopic, CORE_TOPICS}; diff --git a/beacon_node/eth2_libp2p/src/types/sync_state.rs b/beacon_node/eth2_libp2p/src/types/sync_state.rs index 6fdcc2d0871..ce03f61ffe6 100644 --- a/beacon_node/eth2_libp2p/src/types/sync_state.rs +++ b/beacon_node/eth2_libp2p/src/types/sync_state.rs @@ -10,8 +10,13 @@ pub enum SyncState { /// The node is performing a long-range (batch) sync over one or many head chains. /// In this state parent lookups are disabled. SyncingHead { start_slot: Slot, target_slot: Slot }, - /// The node has identified the need for is sync operations and is transitioning to a syncing - /// state. + /// The node is undertaking a backfill sync. This occurs when a user has specified a trusted + /// state. The node first syncs "forward" by downloading blocks up to the current head as + /// specified by its peers. Once completed, the node enters this sync state and attempts to + /// download all required historical blocks to complete its chain. + BackFillSyncing { completed: usize, remaining: usize }, + /// The node has completed syncing a finalized chain and is in the process of re-evaluating + /// which sync state to progress to. SyncTransition, /// The node is up to date with all known peers and is connected to at least one /// fully synced peer. In this state, parent lookups are enabled. 
@@ -21,6 +26,21 @@ pub enum SyncState { Stalled, } +#[derive(PartialEq, Debug, Clone, Serialize, Deserialize)] +/// The state of the backfill sync. +pub enum BackFillState { + /// The sync is partially completed and currently paused. + Paused, + /// We are currently backfilling. + Syncing, + /// A backfill sync has completed. + Completed, + /// A backfill sync is not required. + NotRequired, + /// Too many failed attempts at backfilling. Consider it failed. + Failed, +} + impl PartialEq for SyncState { fn eq(&self, other: &Self) -> bool { matches!( @@ -32,6 +52,10 @@ impl PartialEq for SyncState { | (SyncState::Synced, SyncState::Synced) | (SyncState::Stalled, SyncState::Stalled) | (SyncState::SyncTransition, SyncState::SyncTransition) + | ( + SyncState::BackFillSyncing { .. }, + SyncState::BackFillSyncing { .. } + ) ) } } @@ -43,14 +67,18 @@ impl SyncState { SyncState::SyncingFinalized { .. } => true, SyncState::SyncingHead { .. } => true, SyncState::SyncTransition => true, + // Backfill doesn't affect any logic; we consider this state not syncing. + SyncState::BackFillSyncing { .. } => false, SyncState::Synced => false, SyncState::Stalled => false, } } /// Returns true if the node is synced. + /// + /// NOTE: We consider the node synced if it is fetching old historical blocks. pub fn is_synced(&self) -> bool { - matches!(self, SyncState::Synced) + matches!(self, SyncState::Synced | SyncState::BackFillSyncing { .. }) } } @@ -61,7 +89,8 @@ impl std::fmt::Display for SyncState { SyncState::SyncingHead { .. } => write!(f, "Syncing Head Chain"), SyncState::Synced { .. } => write!(f, "Synced"), SyncState::Stalled { .. } => write!(f, "Stalled"), - SyncState::SyncTransition => write!(f, "Searching syncing peers"), + SyncState::SyncTransition => write!(f, "Evaluating known peers"), + SyncState::BackFillSyncing { .. 
} => write!(f, "Syncing Historical Blocks"), } } } diff --git a/beacon_node/http_api/src/database.rs b/beacon_node/http_api/src/database.rs new file mode 100644 index 00000000000..e911883349d --- /dev/null +++ b/beacon_node/http_api/src/database.rs @@ -0,0 +1,33 @@ +use beacon_chain::store::{metadata::CURRENT_SCHEMA_VERSION, AnchorInfo}; +use beacon_chain::{BeaconChain, BeaconChainTypes}; +use eth2::lighthouse::DatabaseInfo; +use std::sync::Arc; +use types::SignedBeaconBlock; + +pub fn info( + chain: Arc>, +) -> Result { + let store = &chain.store; + let split = store.get_split_info(); + let anchor = store.get_anchor_info(); + + Ok(DatabaseInfo { + schema_version: CURRENT_SCHEMA_VERSION.as_u64(), + split, + anchor, + }) +} + +pub fn historical_blocks( + chain: Arc>, + blocks: Vec>, +) -> Result { + chain + .import_historical_block_batch(&blocks) + .map_err(warp_utils::reject::beacon_chain_error)?; + + let anchor = chain.store.get_anchor_info().ok_or_else(|| { + warp_utils::reject::custom_bad_request("node is not checkpoint synced".to_string()) + })?; + Ok(anchor) +} diff --git a/beacon_node/http_api/src/lib.rs b/beacon_node/http_api/src/lib.rs index da3bb595c21..bb9ee822fa7 100644 --- a/beacon_node/http_api/src/lib.rs +++ b/beacon_node/http_api/src/lib.rs @@ -7,6 +7,7 @@ mod attester_duties; mod block_id; +mod database; mod metrics; mod proposer_duties; mod state_id; @@ -349,7 +350,7 @@ pub fn serve( ))) } } - SyncState::SyncingHead { .. } | SyncState::SyncTransition => Ok(()), + SyncState::SyncingHead { .. } | SyncState::SyncTransition | SyncState::BackFillSyncing { .. } => Ok(()), SyncState::Synced => Ok(()), SyncState::Stalled => Err(warp_utils::reject::not_synced( "sync is stalled".to_string(), @@ -1579,7 +1580,8 @@ pub fn serve( blocking_task(move || match *network_globals.sync_state.read() { SyncState::SyncingFinalized { .. } | SyncState::SyncingHead { .. 
} - | SyncState::SyncTransition => Ok(warp::reply::with_status( + | SyncState::SyncTransition + | SyncState::BackFillSyncing { .. } => Ok(warp::reply::with_status( warp::reply(), warp::http::StatusCode::PARTIAL_CONTENT, )), @@ -2040,7 +2042,7 @@ pub fn serve( .and(warp::path("validator")) .and(warp::path("contribution_and_proofs")) .and(warp::path::end()) - .and(not_while_syncing_filter) + .and(not_while_syncing_filter.clone()) .and(chain_filter.clone()) .and(warp::body::json()) .and(network_tx_filter.clone()) @@ -2399,6 +2401,49 @@ pub fn serve( }) }); + let database_path = warp::path("lighthouse").and(warp::path("database")); + + // GET lighthouse/database/info + let get_lighthouse_database_info = database_path + .and(warp::path("info")) + .and(warp::path::end()) + .and(chain_filter.clone()) + .and_then(|chain: Arc>| blocking_json_task(move || database::info(chain))); + + // POST lighthouse/database/reconstruct + let post_lighthouse_database_reconstruct = database_path + .and(warp::path("reconstruct")) + .and(warp::path::end()) + .and(not_while_syncing_filter) + .and(chain_filter.clone()) + .and_then(|chain: Arc>| { + blocking_json_task(move || { + chain.store_migrator.process_reconstruction(); + Ok("success") + }) + }); + + // POST lighthouse/database/historical_blocks + let post_lighthouse_database_historical_blocks = database_path + .and(warp::path("historical_blocks")) + .and(warp::path::end()) + .and(warp::body::json()) + .and(chain_filter.clone()) + .and(log_filter.clone()) + .and_then( + |blocks: Vec>, + chain: Arc>, + log: Logger| { + info!( + log, + "Importing historical blocks"; + "count" => blocks.len(), + "source" => "http_api" + ); + blocking_json_task(move || database::historical_blocks(chain, blocks)) + }, + ); + let get_events = eth1_v1 .and(warp::path("events")) .and(warp::path::end()) @@ -2510,6 +2555,7 @@ pub fn serve( .or(get_lighthouse_eth1_deposit_cache.boxed()) .or(get_lighthouse_beacon_states_ssz.boxed()) 
.or(get_lighthouse_staking.boxed()) + .or(get_lighthouse_database_info.boxed()) .or(get_events.boxed()), ) .or(warp::post().and( @@ -2526,7 +2572,9 @@ pub fn serve( .or(post_validator_contribution_and_proofs.boxed()) .or(post_validator_beacon_committee_subscriptions.boxed()) .or(post_validator_sync_committee_subscriptions.boxed()) - .or(post_lighthouse_liveness.boxed()), + .or(post_lighthouse_liveness.boxed()) + .or(post_lighthouse_database_reconstruct.boxed()) + .or(post_lighthouse_database_historical_blocks.boxed()), )) .recover(warp_utils::reject::handle_rejection) .with(slog_logging(log.clone())) diff --git a/beacon_node/http_api/tests/tests.rs b/beacon_node/http_api/tests/tests.rs index 87f3ef4f62e..5ccd672ca6b 100644 --- a/beacon_node/http_api/tests/tests.rs +++ b/beacon_node/http_api/tests/tests.rs @@ -2101,6 +2101,29 @@ impl ApiTester { self } + pub async fn test_get_lighthouse_database_info(self) -> Self { + let info = self.client.get_lighthouse_database_info().await.unwrap(); + + assert_eq!(info.anchor, self.chain.store.get_anchor_info()); + assert_eq!(info.split, self.chain.store.get_split_info()); + assert_eq!( + info.schema_version, + store::metadata::CURRENT_SCHEMA_VERSION.as_u64() + ); + + self + } + + pub async fn test_post_lighthouse_database_reconstruct(self) -> Self { + let response = self + .client + .post_lighthouse_database_reconstruct() + .await + .unwrap(); + assert_eq!(response, "success"); + self + } + pub async fn test_post_lighthouse_liveness(self) -> Self { let epoch = self.chain.epoch().unwrap(); let head_state = self.chain.head_beacon_state().unwrap(); @@ -2653,6 +2676,10 @@ async fn lighthouse_endpoints() { .await .test_get_lighthouse_staking() .await + .test_get_lighthouse_database_info() + .await + .test_post_lighthouse_database_reconstruct() + .await .test_post_lighthouse_liveness() .await; } diff --git a/beacon_node/network/src/beacon_processor/worker/rpc_methods.rs b/beacon_node/network/src/beacon_processor/worker/rpc_methods.rs 
index e1e51ef2c6b..da66e89f5d6 100644 --- a/beacon_node/network/src/beacon_processor/worker/rpc_methods.rs +++ b/beacon_node/network/src/beacon_processor/worker/rpc_methods.rs @@ -2,7 +2,7 @@ use crate::beacon_processor::worker::FUTURE_SLOT_TOLERANCE; use crate::service::NetworkMessage; use crate::status::ToStatusMessage; use crate::sync::SyncMessage; -use beacon_chain::{BeaconChainError, BeaconChainTypes, WhenSlotSkipped}; +use beacon_chain::{BeaconChainError, BeaconChainTypes, HistoricalBlockError, WhenSlotSkipped}; use eth2_libp2p::rpc::StatusMessage; use eth2_libp2p::rpc::*; use eth2_libp2p::{PeerId, PeerRequestId, ReportSource, Response, SyncInfo}; @@ -38,6 +38,21 @@ impl Worker { }) } + pub fn send_error_response( + &self, + peer_id: PeerId, + error: RPCResponseErrorCode, + reason: String, + id: PeerRequestId, + ) { + self.send_network_message(NetworkMessage::SendErrorResponse { + peer_id, + error, + reason, + id, + }) + } + /* Processing functions */ /// Process a `Status` message to determine if a peer is relevant to us. 
If the peer is @@ -163,6 +178,20 @@ impl Worker { .forwards_iter_block_roots(Slot::from(req.start_slot)) { Ok(iter) => iter, + Err(BeaconChainError::HistoricalBlockError( + HistoricalBlockError::BlockOutOfRange { + slot, + oldest_block_slot, + }, + )) => { + debug!(self.log, "Range request failed during backfill"; "requested_slot" => slot, "oldest_known_slot" => oldest_block_slot); + return self.send_error_response( + peer_id, + RPCResponseErrorCode::ResourceUnavailable, + "Backfilling".into(), + request_id, + ); + } Err(e) => return error!(self.log, "Unable to obtain root iter"; "error" => ?e), }; diff --git a/beacon_node/network/src/beacon_processor/worker/sync_methods.rs b/beacon_node/network/src/beacon_processor/worker/sync_methods.rs index db2b8db75ed..ff2739d5091 100644 --- a/beacon_node/network/src/beacon_processor/worker/sync_methods.rs +++ b/beacon_node/network/src/beacon_processor/worker/sync_methods.rs @@ -2,9 +2,11 @@ use super::{super::work_reprocessing_queue::ReprocessQueueMessage, Worker}; use crate::beacon_processor::worker::FUTURE_SLOT_TOLERANCE; use crate::beacon_processor::BlockResultSender; use crate::metrics; -use crate::sync::manager::SyncMessage; +use crate::sync::manager::{SyncMessage, SyncRequestType}; use crate::sync::{BatchProcessResult, ChainId}; -use beacon_chain::{BeaconChainTypes, BlockError, ChainSegmentResult}; +use beacon_chain::{ + BeaconChainError, BeaconChainTypes, BlockError, ChainSegmentResult, HistoricalBlockError, +}; use eth2_libp2p::PeerId; use slog::{crit, debug, error, info, trace, warn}; use tokio::sync::mpsc; @@ -15,6 +17,8 @@ use types::{Epoch, Hash256, SignedBeaconBlock}; pub enum ProcessId { /// Processing Id of a range syncing batch. RangeBatchId(ChainId, Epoch), + /// Processing ID for a backfill syncing batch. + BackSyncBatchId(Epoch), /// Processing Id of the parent lookup of a block. 
ParentLookup(PeerId, Hash256), } @@ -99,11 +103,40 @@ impl Worker { } }; - self.send_sync_message(SyncMessage::BatchProcessed { - chain_id, - epoch, - result, - }); + let sync_type = SyncRequestType::RangeSync(epoch, chain_id); + + self.send_sync_message(SyncMessage::BatchProcessed { sync_type, result }); + } + // this is a request from the Backfill sync + ProcessId::BackSyncBatchId(epoch) => { + let start_slot = downloaded_blocks.first().map(|b| b.slot().as_u64()); + let end_slot = downloaded_blocks.last().map(|b| b.slot().as_u64()); + let sent_blocks = downloaded_blocks.len(); + + let result = match self.process_backfill_blocks(&downloaded_blocks) { + (_, Ok(_)) => { + debug!(self.log, "Backfill batch processed"; + "batch_epoch" => epoch, + "first_block_slot" => start_slot, + "last_block_slot" => end_slot, + "processed_blocks" => sent_blocks, + "service"=> "sync"); + BatchProcessResult::Success(sent_blocks > 0) + } + (_, Err(e)) => { + debug!(self.log, "Backfill batch processing failed"; + "batch_epoch" => epoch, + "first_block_slot" => start_slot, + "last_block_slot" => end_slot, + "error" => e, + "service" => "sync"); + BatchProcessResult::Failed(false) + } + }; + + let sync_type = SyncRequestType::BackFillSync(epoch); + + self.send_sync_message(SyncMessage::BatchProcessed { sync_type, result }); } // this is a parent lookup request from the sync manager ProcessId::ParentLookup(peer_id, chain_head) => { @@ -160,6 +193,80 @@ impl Worker { } } + /// Helper function to process backfill block batches which only consumes the chain and blocks to process.
+ fn process_backfill_blocks( + &self, + blocks: &[SignedBeaconBlock], + ) -> (usize, Result<(), String>) { + match self.chain.import_historical_block_batch(blocks) { + Ok(imported_blocks) => { + metrics::inc_counter( + &metrics::BEACON_PROCESSOR_BACKFILL_CHAIN_SEGMENT_SUCCESS_TOTAL, + ); + + (imported_blocks, Ok(())) + } + Err(error) => { + metrics::inc_counter( + &metrics::BEACON_PROCESSOR_BACKFILL_CHAIN_SEGMENT_FAILED_TOTAL, + ); + let err = match error { + // Handle the historical block errors specifically + BeaconChainError::HistoricalBlockError(e) => match e { + HistoricalBlockError::MismatchedBlockRoot { + block_root, + expected_block_root, + } => { + debug!( + self.log, + "Backfill batch processing error"; + "error" => "mismatched_block_root", + "block_root" => ?block_root, + "expected_root" => ?expected_block_root + ); + String::from("mismatched_block_root") + } + HistoricalBlockError::InvalidSignature + | HistoricalBlockError::SignatureSet(_) => { + warn!( + self.log, + "Backfill batch processing error"; + "error" => ?e + ); + "invalid_signature".into() + } + HistoricalBlockError::ValidatorPubkeyCacheTimeout => { + warn!( + self.log, + "Backfill batch processing error"; + "error" => "pubkey_cache_timeout" + ); + "pubkey_cache_timeout".into() + } + HistoricalBlockError::NoAnchorInfo => { + warn!(self.log, "Backfill not required"); + String::from("no_anchor_info") + } + HistoricalBlockError::IndexOutOfBounds + | HistoricalBlockError::BlockOutOfRange { .. } => { + error!( + self.log, + "Backfill batch processing error"; + "error" => ?e, + ); + String::from("logic_error") + } + }, + other => { + warn!(self.log, "Backfill batch processing error"; "error" => ?other); + format!("{:?}", other) + } + }; + (0, Err(err)) + } + } + } + /// Runs fork-choice on a given chain. This is used during block processing after one successful /// block import. 
fn run_fork_choice(&self) { diff --git a/beacon_node/network/src/metrics.rs b/beacon_node/network/src/metrics.rs index 8204978553d..e57a8883547 100644 --- a/beacon_node/network/src/metrics.rs +++ b/beacon_node/network/src/metrics.rs @@ -338,10 +338,18 @@ lazy_static! { "beacon_processor_chain_segment_success_total", "Total number of chain segments successfully processed." ); + pub static ref BEACON_PROCESSOR_BACKFILL_CHAIN_SEGMENT_SUCCESS_TOTAL: Result = try_create_int_counter( + "beacon_processor_backfill_chain_segment_success_total", + "Total number of backfill chain segments successfully processed." + ); pub static ref BEACON_PROCESSOR_CHAIN_SEGMENT_FAILED_TOTAL: Result = try_create_int_counter( "beacon_processor_chain_segment_failed_total", "Total number of chain segments that failed processing." ); + pub static ref BEACON_PROCESSOR_BACKFILL_CHAIN_SEGMENT_FAILED_TOTAL: Result = try_create_int_counter( + "beacon_processor_backfill_chain_segment_failed_total", + "Total number of backfill chain segments that failed processing." + ); // Unaggregated attestations.
pub static ref BEACON_PROCESSOR_UNAGGREGATED_ATTESTATION_QUEUE_TOTAL: Result = try_create_int_gauge( "beacon_processor_unaggregated_attestation_queue_total", diff --git a/beacon_node/network/src/router/processor.rs b/beacon_node/network/src/router/processor.rs index 103ab85dc22..01ea98948ba 100644 --- a/beacon_node/network/src/router/processor.rs +++ b/beacon_node/network/src/router/processor.rs @@ -418,7 +418,7 @@ impl HandlerNetworkContext { error: RPCResponseErrorCode, reason: String, ) { - self.inform_network(NetworkMessage::SendError { + self.inform_network(NetworkMessage::SendErrorResponse { peer_id, error, id, diff --git a/beacon_node/network/src/service.rs b/beacon_node/network/src/service.rs index 6dd0b58d1e2..8fd2d75b836 100644 --- a/beacon_node/network/src/service.rs +++ b/beacon_node/network/src/service.rs @@ -61,10 +61,8 @@ pub enum NetworkMessage { response: Response, id: PeerRequestId, }, - /// Respond to a peer's request with an error. - SendError { - // NOTE: Currently this is never used, we just say goodbye without nicely closing the - // stream assigned to the request + /// Sends an error response to an RPC request. + SendErrorResponse { peer_id: PeerId, error: RPCResponseErrorCode, reason: String, @@ -367,7 +365,7 @@ fn spawn_service( NetworkMessage::SendResponse{ peer_id, response, id } => { service.libp2p.send_response(peer_id, id, response); } - NetworkMessage::SendError{ peer_id, error, id, reason } => { + NetworkMessage::SendErrorResponse{ peer_id, error, id, reason } => { service.libp2p.respond_with_error(peer_id, id, error, reason); } NetworkMessage::UPnPMappingEstablished { tcp_socket, udp_socket} => { diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs new file mode 100644 index 00000000000..903b57f03a0 --- /dev/null +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -0,0 +1,1172 @@ +//! This module contains the logic for Lighthouse's backfill sync. +//! +//! 
This kind of sync occurs when a trusted state is provided to the client. The client +//! will perform a [`RangeSync`] to the latest head from the trusted state, such that the +//! client can perform its duties right away. Once completed, a backfill sync occurs, where all old +//! blocks (from genesis) are downloaded in order to keep a consistent history. +//! +//! If a batch fails, the backfill sync cannot progress. In this scenario, we mark the backfill +//! sync as failed, log an error and attempt to retry once a new peer joins the node. + +use super::RequestId; +use crate::beacon_processor::{ProcessId, WorkEvent as BeaconWorkEvent}; +use crate::sync::manager::BatchProcessResult; +use crate::sync::network_context::SyncNetworkContext; +use crate::sync::range_sync::{BatchConfig, BatchId, BatchInfo, BatchState}; +use beacon_chain::{BeaconChain, BeaconChainTypes}; +use eth2_libp2p::types::{BackFillState, NetworkGlobals}; +use eth2_libp2p::{PeerAction, PeerId}; +use rand::seq::SliceRandom; +use slog::{crit, debug, error, info, warn}; +use std::collections::{ + btree_map::{BTreeMap, Entry}, + HashMap, HashSet, +}; +use std::sync::Arc; +use tokio::sync::mpsc; +use types::{Epoch, EthSpec, SignedBeaconBlock}; + +/// Blocks are downloaded in batches from peers. This constant specifies how many epochs worth of +/// blocks per batch are requested _at most_. A batch may request fewer blocks to account for +/// already requested slots. There is a timeout for each batch request. If this value is too high, +/// we will negatively report peers with poor bandwidth. This can be set arbitrarily high, in which +/// case the responder will fill the response up to the max request size, assuming they have the +/// bandwidth to do so. +pub const BACKFILL_EPOCHS_PER_BATCH: u64 = 2; + +/// The maximum number of batches to queue before requesting more. +const BACKFILL_BATCH_BUFFER_SIZE: u8 = 20; + +/// The number of times to retry a batch before it is considered failed.
+const MAX_BATCH_DOWNLOAD_ATTEMPTS: u8 = 10; + +/// Invalid batches are attempted to be re-downloaded from other peers. If a batch cannot be processed +/// after `MAX_BATCH_PROCESSING_ATTEMPTS` times, it is considered faulty. +const MAX_BATCH_PROCESSING_ATTEMPTS: u8 = 10; + +/// Custom configuration for the batch object. +struct BackFillBatchConfig {} + +impl BatchConfig for BackFillBatchConfig { + fn max_batch_download_attempts() -> u8 { + MAX_BATCH_DOWNLOAD_ATTEMPTS + } + fn max_batch_processing_attempts() -> u8 { + MAX_BATCH_PROCESSING_ATTEMPTS + } +} + +/// Return type when attempting to start the backfill sync process. +pub enum SyncStart { + /// The chain started syncing or is already syncing. + Syncing { + /// The number of slots that have been processed so far. + completed: usize, + /// The number of slots still to be processed. + remaining: usize, + }, + /// The chain didn't start syncing. + NotSyncing, +} + +/// A standard result from calling public functions on [`BackFillSync`]. +pub enum ProcessResult { + /// The call was successful. + Successful, + /// The call resulted in completing the backfill sync. + SyncCompleted, +} + +/// The ways a backfill sync can fail. +#[derive(Debug)] +pub enum BackFillError { + /// A batch failed to be downloaded. + BatchDownloadFailed(BatchId), + /// A batch could not be processed. + BatchProcessingFailed(BatchId), + /// A batch entered an invalid state. + BatchInvalidState(BatchId, String), + /// The sync algorithm entered an invalid state. + InvalidSyncState(String), + /// The chain became paused. + Paused, +} + +pub struct BackFillSync { + /// The current state of the backfill sync. + state: BackFillState, + + /// Keeps track of the current progress of the backfill. + /// This only gets refreshed from the beacon chain if we enter a failed state. + current_start: BatchId, + + /// Starting epoch of the batch that needs to be processed next. + /// This is incremented as the chain advances. 
+ processing_target: BatchId, + + /// Starting epoch of the next batch that needs to be downloaded. + to_be_downloaded: BatchId, + + /// Keeps track if we have requested the final batch. + last_batch_downloaded: bool, + + /// Sorted map of batches undergoing some kind of processing. + batches: BTreeMap>, + + /// List of peers we are currently awaiting a response for. + active_requests: HashMap>, + + /// The current processing batch, if any. + current_processing_batch: Option, + + /// Batches validated by this chain. + validated_batches: u64, + + /// We keep track of peers that are participating in the backfill sync. Unlike RangeSync, + /// BackFillSync uses all synced peers to download the chain from. If BackFillSync fails, we don't + /// want to penalize all our synced peers, so we use this variable to keep track of peers that + /// have participated and only penalize these peers if backfill sync fails. + participating_peers: HashSet, + + /// When a backfill sync fails, we keep track of whether a new fully synced peer has joined. + /// This signifies that we are able to attempt to restart a failed chain. + restart_failed_sync: bool, + + /// Reference to the beacon chain to obtain initial starting points for the backfill sync. + beacon_chain: Arc>, + + /// Reference to the network globals in order to obtain valid peers to backfill blocks from + /// (i.e. synced peers). + network_globals: Arc>, + + /// A multi-threaded, non-blocking processor for processing batches in the beacon chain. + beacon_processor_send: mpsc::Sender>, + + /// A logger for backfill sync. + log: slog::Logger, +} + +impl BackFillSync { + pub fn new( + beacon_chain: Arc>, + network_globals: Arc>, + beacon_processor_send: mpsc::Sender>, + log: slog::Logger, + ) -> Self { + // Determine if backfill is enabled or not. + // Get the anchor info, if this returns None, then backfill is not required for this + // running instance.
+ // If, for some reason a backfill has already been completed (or we've used a trusted + // genesis root) then backfill has been completed. + + let (state, current_start) = if let Some(anchor_info) = beacon_chain.store.get_anchor_info() + { + if anchor_info.block_backfill_complete() { + (BackFillState::Completed, Epoch::new(0)) + } else { + ( + BackFillState::Paused, + anchor_info + .oldest_block_slot + .epoch(T::EthSpec::slots_per_epoch()), + ) + } + } else { + (BackFillState::NotRequired, Epoch::new(0)) + }; + + let bfs = BackFillSync { + state, + batches: BTreeMap::new(), + active_requests: HashMap::new(), + processing_target: current_start, + current_start, + last_batch_downloaded: false, + to_be_downloaded: current_start, + network_globals, + current_processing_batch: None, + validated_batches: 0, + participating_peers: HashSet::new(), + restart_failed_sync: false, + beacon_chain, + beacon_processor_send, + log, + }; + + // Update the global network state with the current backfill state. + bfs.update_global_state(); + bfs + } + + /// Pauses the backfill sync if it's currently syncing. + pub fn pause(&mut self) { + if let BackFillState::Syncing = self.state { + debug!(self.log, "Backfill sync paused"; "processed_epochs" => self.validated_batches, "to_be_processed" => self.current_start); + self.state = BackFillState::Paused; + } + } + + /// Starts or resumes syncing. + /// + /// If resuming is successful, reports back the current syncing metrics. + #[must_use = "A failure here indicates the backfill sync has failed and the global sync state should be updated"] + pub fn start( + &mut self, + network: &mut SyncNetworkContext, + ) -> Result { + match self.state { + BackFillState::Syncing => {} // already syncing ignore. + BackFillState::Paused => { + if self + .network_globals + .peers + .read() + .synced_peers() + .next() + .is_some() + { + // If there are peers to resume with, begin the resume. 
+ debug!(self.log, "Resuming backfill sync"; "start_epoch" => self.current_start, "awaiting_batches" => self.batches.len(), "processing_target" => self.processing_target); + self.state = BackFillState::Syncing; + // Resume any previously failed batches. + self.resume_batches(network)?; + // begin requesting blocks from the peer pool, until all peers are exhausted. + self.request_batches(network)?; + + // start processing batches if needed + self.process_completed_batches(network)?; + } else { + return Ok(SyncStart::NotSyncing); + } + } + BackFillState::Failed => { + // Attempt to recover from a failed sync. All local variables should be reset and + // cleared already for a fresh start. + // We only attempt to restart a failed backfill sync if a new synced peer has been + // added. + if !self.restart_failed_sync { + return Ok(SyncStart::NotSyncing); + } + + self.state = BackFillState::Syncing; + + // Obtain a new start slot, from the beacon chain and handle possible errors. + match self.reset_start_epoch() { + Err(ResetEpochError::SyncCompleted) => { + error!(self.log, "Backfill sync completed whilst in failed status"); + self.state = BackFillState::Completed; + self.update_global_state(); + return Err(BackFillError::InvalidSyncState(String::from( + "chain completed", + ))); + } + Err(ResetEpochError::NotRequired) => { + error!( + self.log, + "Backfill sync not required whilst in failed status" + ); + self.state = BackFillState::NotRequired; + self.update_global_state(); + return Err(BackFillError::InvalidSyncState(String::from( + "backfill not required", + ))); + } + Ok(_) => {} + } + + debug!(self.log, "Resuming a failed backfill sync"; "start_epoch" => self.current_start); + + // begin requesting blocks from the peer pool, until all peers are exhausted. 
+ self.request_batches(network)?; + } + BackFillState::Completed | BackFillState::NotRequired => { + return Ok(SyncStart::NotSyncing) + } + } + + self.update_global_state(); + + Ok(SyncStart::Syncing { + completed: (self.validated_batches + * BACKFILL_EPOCHS_PER_BATCH + * T::EthSpec::slots_per_epoch()) as usize, + remaining: self + .current_start + .start_slot(T::EthSpec::slots_per_epoch()) + .as_usize(), + }) + } + + /// A fully synced peer has joined us. + /// If we are in a failed state, update a local variable to indicate we are able to restart + /// the failed sync on the next attempt. + pub fn fully_synced_peer_joined(&mut self) { + if matches!(self.state, BackFillState::Failed) { + self.restart_failed_sync = true; + } + } + + /// A peer has disconnected. + /// If the peer has active batches, those are considered failed and re-requested. + #[must_use = "A failure here indicates the backfill sync has failed and the global sync state should be updated"] + pub fn peer_disconnected( + &mut self, + peer_id: &PeerId, + network: &mut SyncNetworkContext, + ) -> Result<(), BackFillError> { + if matches!( + self.state, + BackFillState::Failed | BackFillState::NotRequired + ) { + return Ok(()); + } + + if let Some(batch_ids) = self.active_requests.remove(peer_id) { + // fail the batches + for id in batch_ids { + if let Some(batch) = self.batches.get_mut(&id) { + match batch.download_failed(false) { + Ok(true) => { + self.fail_sync(BackFillError::BatchDownloadFailed(id))?; + } + Ok(false) => {} + Err(e) => { + self.fail_sync(BackFillError::BatchInvalidState(id, e.0))?; + } + } + // If we have run out of peers in which to retry this batch, the backfill state + // transitions to a paused state. 
+ self.retry_batch_download(network, id)?; + } else { + debug!(self.log, "Batch not found while removing peer"; + "peer" => %peer_id, "batch" => id) + } + } + } + + // Remove the peer from the participation list + self.participating_peers.remove(peer_id); + Ok(()) + } + + /// An RPC error has occurred. + /// + /// If the batch exists it is re-requested. + #[must_use = "A failure here indicates the backfill sync has failed and the global sync state should be updated"] + pub fn inject_error( + &mut self, + network: &mut SyncNetworkContext, + batch_id: BatchId, + peer_id: &PeerId, + request_id: RequestId, + ) -> Result<(), BackFillError> { + if let Some(batch) = self.batches.get_mut(&batch_id) { + // A batch could be retried without the peer failing the request (disconnecting/ + // sending an error /timeout) if the peer is removed from the chain for other + // reasons. Check that this block belongs to the expected peer + if !batch.is_expecting_block(peer_id, &request_id) { + return Ok(()); + } + debug!(self.log, "Batch failed"; "batch_epoch" => batch_id, "error" => "rpc_error"); + if let Some(active_requests) = self.active_requests.get_mut(peer_id) { + active_requests.remove(&batch_id); + } + match batch.download_failed(true) { + Err(e) => self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0)), + Ok(true) => self.fail_sync(BackFillError::BatchDownloadFailed(batch_id)), + Ok(false) => self.retry_batch_download(network, batch_id), + } + } else { + // this could be an error for an old batch, removed when the chain advances + Ok(()) + } + } + + /// A block has been received for a batch relating to this backfilling chain. + /// If the block correctly completes the batch it will be processed if possible. + /// If this returns an error, the backfill sync has failed and will be restarted once new peers + /// join the system. + /// The sync manager should update the global sync state on failure. 
+ #[must_use = "A failure here indicates the backfill sync has failed and the global sync state should be updated"] + pub fn on_block_response( + &mut self, + network: &mut SyncNetworkContext, + batch_id: BatchId, + peer_id: &PeerId, + request_id: RequestId, + beacon_block: Option>, + ) -> Result { + // check if we have this batch + let batch = match self.batches.get_mut(&batch_id) { + None => { + if !matches!(self.state, BackFillState::Failed) { + // A batch might get removed when the chain advances, so this is non fatal. + debug!(self.log, "Received a block for unknown batch"; "epoch" => batch_id); + } + return Ok(ProcessResult::Successful); + } + Some(batch) => { + // A batch could be retried without the peer failing the request (disconnecting/ + // sending an error /timeout) if the peer is removed from the chain for other + // reasons. Check that this block belongs to the expected peer, and that the + // request_id matches + if !batch.is_expecting_block(peer_id, &request_id) { + return Ok(ProcessResult::Successful); + } + batch + } + }; + + if let Some(block) = beacon_block { + // This is not a stream termination, simply add the block to the request + if let Err(e) = batch.add_block(block) { + self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0))?; + } + Ok(ProcessResult::Successful) + } else { + // A stream termination has been sent. This batch has ended. Process a completed batch. 
+ // Remove the request from the peer's active batches + self.active_requests + .get_mut(peer_id) + .map(|active_requests| active_requests.remove(&batch_id)); + + match batch.download_completed() { + Ok(received) => { + let awaiting_batches = + self.processing_target.saturating_sub(batch_id) / BACKFILL_EPOCHS_PER_BATCH; + debug!(self.log, "Completed batch received"; "epoch" => batch_id, "blocks" => received, "awaiting_batches" => awaiting_batches); + + // pre-emptively request more blocks from peers whilst we process current blocks, + self.request_batches(network)?; + self.process_completed_batches(network) + } + Err(result) => { + let (expected_boundary, received_boundary, is_failed) = match result { + Err(e) => { + return self + .fail_sync(BackFillError::BatchInvalidState(batch_id, e.0)) + .map(|_| ProcessResult::Successful); + } + Ok(v) => v, + }; + warn!(self.log, "Batch received out of range blocks"; "expected_boundary" => expected_boundary, "received_boundary" => received_boundary, + "peer_id" => %peer_id, batch); + + if is_failed { + error!(self.log, "Backfill failed"; "epoch" => batch_id, "received_boundary" => received_boundary, "expected_boundary" => expected_boundary); + return self + .fail_sync(BackFillError::BatchDownloadFailed(batch_id)) + .map(|_| ProcessResult::Successful); + } + // this batch can't be used, so we need to request it again. + self.retry_batch_download(network, batch_id) + .map(|_| ProcessResult::Successful) + } + } + } + } + + /// The syncing process has failed. + /// + /// This resets past variables, to allow for a fresh start when resuming. + fn fail_sync(&mut self, error: BackFillError) -> Result<(), BackFillError> { + // Some errors shouldn't fail the chain. + if matches!(error, BackFillError::Paused) { + return Ok(()); + } + + // Set the state + self.state = BackFillState::Failed; + // Remove all batches and active requests and participating peers. 
+ self.batches.clear(); + self.active_requests.clear(); + self.participating_peers.clear(); + self.restart_failed_sync = false; + + // Reset all downloading and processing targets + self.processing_target = self.current_start; + self.to_be_downloaded = self.current_start; + self.last_batch_downloaded = false; + self.current_processing_batch = None; + + // Keep the global network state up to date. + self.update_global_state(); + + // NOTE: Let's keep validated_batches for posterity + + // Emit the log here + error!(self.log, "Backfill sync failed"; "error" => ?error); + + // Return the error, kinda weird pattern, but I want to use + // `self.fail_sync(_)?` in other parts of the code. + Err(error) + } + + /// Processes the batch with the given id. + /// The batch must exist and be ready for processing + fn process_batch( + &mut self, + network: &mut SyncNetworkContext, + batch_id: BatchId, + ) -> Result { + // Only process batches if this chain is Syncing, and only one at a time + if self.state != BackFillState::Syncing || self.current_processing_batch.is_some() { + return Ok(ProcessResult::Successful); + } + + let batch = match self.batches.get_mut(&batch_id) { + Some(batch) => batch, + None => { + return self + .fail_sync(BackFillError::InvalidSyncState(format!( + "Trying to process a batch that does not exist: {}", + batch_id + ))) + .map(|_| ProcessResult::Successful); + } + }; + + // NOTE: We send empty batches to the processor in order to trigger the block processor + // result callback. This is done, because an empty batch could end a chain and the logic + // for removing chains and checking completion is in the callback.
+ + let blocks = match batch.start_processing() { + Err(e) => { + return self + .fail_sync(BackFillError::BatchInvalidState(batch_id, e.0)) + .map(|_| ProcessResult::Successful) + } + Ok(v) => v, + }; + + let process_id = ProcessId::BackSyncBatchId(batch_id); + self.current_processing_batch = Some(batch_id); + + if let Err(e) = self + .beacon_processor_send + .try_send(BeaconWorkEvent::chain_segment(process_id, blocks)) + { + crit!(self.log, "Failed to send backfill segment to processor."; "msg" => "process_batch", + "error" => %e, "batch" => self.processing_target); + // This is unlikely to happen but it would stall syncing since the batch now has no + // blocks to continue, and the chain is expecting a processing result that won't + // arrive. To mitigate this, (fake) fail this processing so that the batch is + // re-downloaded. + self.on_batch_process_result(network, batch_id, &BatchProcessResult::Failed(false)) + } else { + Ok(ProcessResult::Successful) + } + } + + /// The block processor has completed processing a batch. This function handles the result + /// of the batch processor. + /// If an error is returned the BackFill sync has failed. + #[must_use = "A failure here indicates the backfill sync has failed and the global sync state should be updated"] + pub fn on_batch_process_result( + &mut self, + network: &mut SyncNetworkContext, + batch_id: BatchId, + result: &BatchProcessResult, + ) -> Result { + // On each batch process, we update the global state. + self.update_global_state(); + + // The first two cases are possible in regular sync, should not occur in backfill, but we + // keep this logic for handling potential processing race conditions. 
+ // result + match &self.current_processing_batch { + Some(processing_id) if *processing_id != batch_id => { + debug!(self.log, "Unexpected batch result"; + "batch_epoch" => batch_id, "expected_batch_epoch" => processing_id); + return Ok(ProcessResult::Successful); + } + None => { + debug!(self.log, "Chain was not expecting a batch result"; + "batch_epoch" => batch_id); + return Ok(ProcessResult::Successful); + } + _ => { + // batch_id matches, continue + self.current_processing_batch = None; + } + } + + match result { + BatchProcessResult::Success(was_non_empty) => { + let batch = match self.batches.get_mut(&batch_id) { + Some(v) => v, + None => { + // This is an error. Fail the sync algorithm. + return self + .fail_sync(BackFillError::InvalidSyncState(format!( + "Current processing batch not found: {}", + batch_id + ))) + .map(|_| ProcessResult::Successful); + } + }; + + if let Err(e) = batch.processing_completed(true) { + self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0))?; + } + // If the processed batch was not empty, we can validate previous unvalidated + // blocks. 
+ if *was_non_empty { + self.advance_chain(network, batch_id); + } + + if batch_id == self.processing_target { + self.processing_target = self + .processing_target + .saturating_sub(BACKFILL_EPOCHS_PER_BATCH); + } + + // check if the chain has completed syncing + if self.check_completed() { + // chain is completed + info!(self.log, "Backfill sync completed"; "blocks_processed" => self.validated_batches * T::EthSpec::slots_per_epoch()); + self.state = BackFillState::Completed; + self.update_global_state(); + Ok(ProcessResult::SyncCompleted) + } else { + // chain is not completed + // attempt to request more batches + self.request_batches(network)?; + // attempt to process more batches + self.process_completed_batches(network) + } + } + BatchProcessResult::Failed(imported_blocks) => { + let batch = match self.batches.get_mut(&batch_id) { + Some(v) => v, + None => { + return self + .fail_sync(BackFillError::InvalidSyncState(format!( + "Batch not found for current processing target {}", + batch_id + ))) + .map(|_| ProcessResult::Successful) + } + }; + + let peer = match batch.current_peer() { + Some(v) => *v, + None => { + return self + .fail_sync(BackFillError::BatchInvalidState( + batch_id, + String::from("Peer does not exist"), + )) + .map(|_| ProcessResult::Successful) + } + }; + debug!(self.log, "Batch processing failed"; "imported_blocks" => imported_blocks, + "batch_epoch" => batch_id, "peer" => %peer, "client" => %network.client_type(&peer)); + match batch.processing_completed(false) { + Err(e) => { + // Batch was in the wrong state + self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0)) + .map(|_| ProcessResult::Successful) + } + Ok(true) => { + // check that we have not exceeded the re-process retry counter + // If a batch has exceeded the invalid batch lookup attempts limit, it means + // that it is likely all peers are sending invalid batches + // repeatedly and are either malicious or faulty. 
We stop the backfill sync and + // report all synced peers that have participated. + let action = PeerAction::LowToleranceError; + warn!(self.log, "Backfill batch failed to process. Penalizing peers"; + "score_adjustment" => %action, + "batch_epoch"=> batch_id); + for peer in self.participating_peers.drain() { + network.report_peer(peer, action); + } + self.fail_sync(BackFillError::BatchProcessingFailed(batch_id)) + .map(|_| ProcessResult::Successful) + } + + Ok(false) => { + // chain can continue. Check if it can be progressed + if *imported_blocks { + // At least one block was successfully verified and imported, then we can be sure all + // previous batches are valid and we only need to download the current failed + // batch. + self.advance_chain(network, batch_id); + } + // Handle this invalid batch, that is within the re-process retries limit. + self.handle_invalid_batch(network, batch_id) + .map(|_| ProcessResult::Successful) + } + } + } + } + } + + /// Processes the next ready batch. + fn process_completed_batches( + &mut self, + network: &mut SyncNetworkContext, + ) -> Result { + // Only process batches if backfill is syncing and only process one batch at a time + if self.state != BackFillState::Syncing || self.current_processing_batch.is_some() { + return Ok(ProcessResult::Successful); + } + + // Find the id of the batch we are going to process. + if let Some(batch) = self.batches.get(&self.processing_target) { + let state = batch.state(); + match state { + BatchState::AwaitingProcessing(..) => { + return self.process_batch(network, self.processing_target); + } + BatchState::Downloading(..) => { + // Batch is not ready, nothing to process + } + BatchState::Poisoned => unreachable!("Poisoned batch"), + BatchState::Failed | BatchState::AwaitingDownload | BatchState::Processing(_) => { + // these are all inconsistent states: + // - Failed -> non recoverable batch.
Chain should have been removed + // - AwaitingDownload -> A recoverable failed batch should have been + // re-requested. + // - Processing -> `self.current_processing_batch` is None + return self + .fail_sync(BackFillError::InvalidSyncState(String::from( + "Invalid expected batch state", + ))) + .map(|_| ProcessResult::Successful); + } + BatchState::AwaitingValidation(_) => { + // TODO: I don't think this state is possible, log a CRIT just in case. + // If this is not observed, add it to the failed state branch above. + crit!(self.log, "Chain encountered a robust batch awaiting validation"; "batch" => self.processing_target); + + self.processing_target -= BACKFILL_EPOCHS_PER_BATCH; + if self.to_be_downloaded >= self.processing_target { + self.to_be_downloaded = self.processing_target - BACKFILL_EPOCHS_PER_BATCH; + } + self.request_batches(network)?; + } + } + } else { + return self + .fail_sync(BackFillError::InvalidSyncState(format!( + "Batch not found for current processing target {}", + self.processing_target + ))) + .map(|_| ProcessResult::Successful); + } + Ok(ProcessResult::Successful) + } + + /// Removes any batches previous to the given `validating_epoch` and updates the current + /// boundaries of the chain. + /// + /// The `validating_epoch` must align with batch boundaries. + /// + /// If a previous batch has been validated and it had been re-processed, penalize the original + /// peer. + fn advance_chain( + &mut self, + network: &mut SyncNetworkContext, + validating_epoch: Epoch, + ) { + // make sure this epoch produces an advancement + if validating_epoch >= self.current_start { + return; + } + + // We can now validate higher batches that the current batch. Here we remove all + // batches that are higher than the current batch. We add on an extra + // `BACKFILL_EPOCHS_PER_BATCH` as `split_off` is inclusive. 
+ let removed_batches = self + .batches + .split_off(&(validating_epoch + BACKFILL_EPOCHS_PER_BATCH)); + + for (id, batch) in removed_batches.into_iter() { + self.validated_batches = self.validated_batches.saturating_add(1); + // only for batches awaiting validation can we be sure the last attempt is + // right, and thus, that any different attempt is wrong + match batch.state() { + BatchState::AwaitingValidation(ref processed_attempt) => { + for attempt in batch.attempts() { + // The validated batch has been re-processed + if attempt.hash != processed_attempt.hash { + // The re-downloaded version was different + if processed_attempt.peer_id != attempt.peer_id { + // A different peer sent the correct batch, the previous peer did not + // We negatively score the original peer. + let action = PeerAction::LowToleranceError; + debug!(self.log, "Re-processed batch validated. Scoring original peer"; + "batch_epoch" => id, "score_adjustment" => %action, + "original_peer" => %attempt.peer_id, "new_peer" => %processed_attempt.peer_id + ); + network.report_peer(attempt.peer_id, action); + } else { + // The same peer corrected it's previous mistake. There was an error, so we + // negative score the original peer. + let action = PeerAction::MidToleranceError; + debug!(self.log, "Re-processed batch validated by the same peer"; + "batch_epoch" => id, "score_adjustment" => %action, + "original_peer" => %attempt.peer_id, "new_peer" => %processed_attempt.peer_id + ); + network.report_peer(attempt.peer_id, action); + } + } + } + } + BatchState::Downloading(peer, ..) => { + // remove this batch from the peer's active requests + if let Some(active_requests) = self.active_requests.get_mut(peer) { + active_requests.remove(&id); + } + } + BatchState::Failed | BatchState::Poisoned | BatchState::AwaitingDownload => { + crit!( + self.log, + "batch indicates inconsistent chain state while advancing chain" + ) + } + BatchState::AwaitingProcessing(..) 
=> {} + BatchState::Processing(_) => { + debug!(self.log, "Advancing chain while processing a batch"; "batch" => id, batch); + if let Some(processing_id) = self.current_processing_batch { + if id >= processing_id { + self.current_processing_batch = None; + } + } + } + } + } + + self.processing_target = self.processing_target.min(validating_epoch); + self.current_start = validating_epoch; + self.to_be_downloaded = self.to_be_downloaded.min(validating_epoch); + if self.batches.contains_key(&self.to_be_downloaded) { + // if a chain is advanced by Range beyond the previous `self.to_be_downloaded`, we + // won't have this batch, so we need to request it. + self.to_be_downloaded -= BACKFILL_EPOCHS_PER_BATCH; + } + debug!(self.log, "Backfill advanced"; "validated_epoch" => validating_epoch, "processing_target" => self.processing_target); + } + + /// An invalid batch has been received that could not be processed, but that can be retried. + /// + /// These events occur when a peer has successfully responded with blocks, but the blocks we + /// have received are incorrect or invalid. This indicates the peer has not performed as + /// intended and can result in downvoting a peer. + fn handle_invalid_batch( + &mut self, + network: &mut SyncNetworkContext, + batch_id: BatchId, + ) -> Result<(), BackFillError> { + // The current batch could not be processed, indicating either the current or previous + // batches are invalid. + + // The previous batch could be incomplete due to the block sizes being too large to fit in + // a single RPC request or there could be consecutive empty batches which are not supposed + // to be there + + // The current (sub-optimal) strategy is to simply re-request all batches that could + // potentially be faulty. If a batch returns a different result than the original and + // results in successful processing, we downvote the original peer that sent us the batch. + + // this is our robust `processing_target`. 
All previous batches must be awaiting + // validation + let mut redownload_queue = Vec::new(); + + for (id, batch) in self + .batches + .iter_mut() + .filter(|(&id, _batch)| id > batch_id) + { + match batch + .validation_failed() + .map_err(|e| BackFillError::BatchInvalidState(batch_id, e.0))? + { + true => { + // Batch has failed and cannot be redownloaded. + return self.fail_sync(BackFillError::BatchProcessingFailed(batch_id)); + } + false => { + redownload_queue.push(*id); + } + } + } + + // no batch maxed out it process attempts, so now the chain's volatile progress must be + // reset + self.processing_target = self.current_start; + + for id in redownload_queue { + self.retry_batch_download(network, id)?; + } + // finally, re-request the failed batch. + self.retry_batch_download(network, batch_id) + } + + /// Sends and registers the request of a batch awaiting download. + fn retry_batch_download( + &mut self, + network: &mut SyncNetworkContext, + batch_id: BatchId, + ) -> Result<(), BackFillError> { + let batch = match self.batches.get_mut(&batch_id) { + Some(batch) => batch, + None => return Ok(()), + }; + + // Find a peer to request the batch + let failed_peers = batch.failed_peers(); + + let new_peer = { + let mut priorized_peers = self + .network_globals + .peers + .read() + .synced_peers() + .map(|peer| { + ( + failed_peers.contains(peer), + self.active_requests.get(peer).map(|v| v.len()).unwrap_or(0), + *peer, + ) + }) + .collect::>(); + // Sort peers prioritizing unrelated peers with less active requests. 
+ priorized_peers.sort_unstable(); + priorized_peers.get(0).map(|&(_, _, peer)| peer) + }; + + if let Some(peer) = new_peer { + self.participating_peers.insert(peer); + self.send_batch(network, batch_id, peer) + } else { + // If we are here the chain has no more synced peers + info!(self.log, "Backfill sync paused"; "reason" => "insufficient_synced_peers"); + self.state = BackFillState::Paused; + Err(BackFillError::Paused) + } + } + + /// Requests the batch assigned to the given id from a given peer. + fn send_batch( + &mut self, + network: &mut SyncNetworkContext, + batch_id: BatchId, + peer: PeerId, + ) -> Result<(), BackFillError> { + if let Some(batch) = self.batches.get_mut(&batch_id) { + let request = batch.to_blocks_by_range_request(); + match network.backfill_blocks_by_range_request(peer, request, batch_id) { + Ok(request_id) => { + // inform the batch about the new request + if let Err(e) = batch.start_downloading_from_peer(peer, request_id) { + return self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0)); + } + debug!(self.log, "Requesting batch"; "epoch" => batch_id, &batch); + + // register the batch for this peer + self.active_requests + .entry(peer) + .or_default() + .insert(batch_id); + return Ok(()); + } + Err(e) => { + // NOTE: under normal conditions this shouldn't happen but we handle it anyway + warn!(self.log, "Could not send batch request"; + "batch_id" => batch_id, "error" => e, &batch); + // register the failed download and check if the batch can be retried + if let Err(e) = batch.start_downloading_from_peer(peer, 1) { + return self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0)); + } + self.active_requests + .get_mut(&peer) + .map(|request| request.remove(&batch_id)); + + match batch.download_failed(true) { + Err(e) => { + self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0))? 
+ } + Ok(true) => self.fail_sync(BackFillError::BatchDownloadFailed(batch_id))?, + Ok(false) => return self.retry_batch_download(network, batch_id), + } + } + } + } + + Ok(()) + } + + /// When resuming a chain, this function searches for batches that need to be re-downloaded and + /// transitions their state to redownload the batch. + fn resume_batches( + &mut self, + network: &mut SyncNetworkContext, + ) -> Result<(), BackFillError> { + let batch_ids_to_retry = self + .batches + .iter() + .filter_map(|(batch_id, batch)| { + // In principle there should only ever be on of these, and we could terminate the + // loop early, however the processing is negligible and we continue the search + // for robustness to handle potential future modification + if matches!(batch.state(), BatchState::AwaitingDownload) { + Some(*batch_id) + } else { + None + } + }) + .collect::>(); + + for batch_id in batch_ids_to_retry { + self.retry_batch_download(network, batch_id)?; + } + Ok(()) + } + + /// Attempts to request the next required batches from the peer pool if the chain is syncing. It will exhaust the peer + /// pool and left over batches until the batch buffer is reached or all peers are exhausted. 
+ fn request_batches( + &mut self, + network: &mut SyncNetworkContext, + ) -> Result<(), BackFillError> { + if !matches!(self.state, BackFillState::Syncing) { + return Ok(()); + } + + // find the next pending batch and request it from the peer + + // randomize the peers for load balancing + let mut rng = rand::thread_rng(); + let mut idle_peers = self + .network_globals + .peers + .read() + .synced_peers() + .filter(|peer_id| { + self.active_requests + .get(peer_id) + .map(|requests| requests.is_empty()) + .unwrap_or(true) + }) + .cloned() + .collect::>(); + + idle_peers.shuffle(&mut rng); + + while let Some(peer) = idle_peers.pop() { + if let Some(batch_id) = self.include_next_batch() { + // send the batch + self.send_batch(network, batch_id, peer)?; + } else { + // No more batches, simply stop + return Ok(()); + } + } + Ok(()) + } + + /// Creates the next required batch from the chain. If there are no more batches required, + /// `false` is returned. + fn include_next_batch(&mut self) -> Option { + // don't request batches beyond genesis; + if self.last_batch_downloaded { + return None; + } + + // only request batches up to the buffer size limit + // NOTE: we don't count batches in the AwaitingValidation state, to prevent stalling sync + // if the current processing window is contained in a long range of skip slots. + let in_buffer = |batch: &BatchInfo| { + matches!( + batch.state(), + BatchState::Downloading(..) | BatchState::AwaitingProcessing(..) 
+ ) + }; + if self + .batches + .iter() + .filter(|&(_epoch, batch)| in_buffer(batch)) + .count() + > BACKFILL_BATCH_BUFFER_SIZE as usize + { + return None; + } + + let batch_id = self.to_be_downloaded; + // this batch could have been included already being an optimistic batch + match self.batches.entry(batch_id) { + Entry::Occupied(_) => { + // this batch doesn't need downloading, let this same function decide the next batch + if batch_id == 0 { + self.last_batch_downloaded = true; + } + + self.to_be_downloaded = self + .to_be_downloaded + .saturating_sub(BACKFILL_EPOCHS_PER_BATCH); + self.include_next_batch() + } + Entry::Vacant(entry) => { + entry.insert(BatchInfo::new(&batch_id, BACKFILL_EPOCHS_PER_BATCH)); + if batch_id == 0 { + self.last_batch_downloaded = true; + } + self.to_be_downloaded = self + .to_be_downloaded + .saturating_sub(BACKFILL_EPOCHS_PER_BATCH); + Some(batch_id) + } + } + } + + /// Resets the start epoch based on the beacon chain. + /// + /// This errors if the beacon chain indicates that backfill sync has already completed or is + /// not required. + fn reset_start_epoch(&mut self) -> Result<(), ResetEpochError> { + if let Some(anchor_info) = self.beacon_chain.store.get_anchor_info() { + if anchor_info.block_backfill_complete() { + Err(ResetEpochError::SyncCompleted) + } else { + self.current_start = anchor_info + .oldest_block_slot + .epoch(T::EthSpec::slots_per_epoch()); + Ok(()) + } + } else { + Err(ResetEpochError::NotRequired) + } + } + + /// Checks with the beacon chain if backfill sync has completed. 
+ fn check_completed(&mut self) -> bool { + if self.current_start == 0 { + // Check that the beacon chain agrees + + if let Some(anchor_info) = self.beacon_chain.store.get_anchor_info() { + // Conditions that we have completed a backfill sync + if anchor_info.block_backfill_complete() { + return true; + } else { + error!(self.log, "Backfill out of sync with beacon chain"); + } + } + } + false + } + + /// Updates the global network state indicating the current state of a backfill sync. + fn update_global_state(&self) { + *self.network_globals.backfill_state.write() = self.state.clone(); + } +} + +/// Error kind for attempting to restart the sync from beacon chain parameters. +enum ResetEpochError { + /// The chain has already completed. + SyncCompleted, + /// Backfill is not required. + NotRequired, +} diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index 2b678751ea5..90ff61b41fb 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -33,6 +33,7 @@ //! needs to be searched for (i.e if an attestation references an unknown block) this manager can //! search for the block and subsequently search for parents if needed. +use super::backfill_sync::{BackFillSync, ProcessResult, SyncStart}; use super::network_context::SyncNetworkContext; use super::peer_sync_info::{remote_sync_type, PeerSyncType}; use super::range_sync::{ChainId, RangeSync, RangeSyncType, EPOCHS_PER_BATCH}; @@ -77,14 +78,14 @@ pub enum SyncMessage { /// A useful peer has been discovered. AddPeer(PeerId, SyncInfo), - /// A `BlocksByRange` response has been received. + /// A [`BlocksByRange`] response has been received. BlocksByRangeResponse { peer_id: PeerId, request_id: RequestId, beacon_block: Option>>, }, - /// A `BlocksByRoot` response has been received. + /// A [`BlocksByRoot`] response has been received. 
BlocksByRootResponse { peer_id: PeerId, request_id: RequestId, @@ -106,8 +107,7 @@ pub enum SyncMessage { /// A batch has been processed by the block processor thread. BatchProcessed { - chain_id: ChainId, - epoch: Epoch, + sync_type: SyncRequestType, result: BatchProcessResult, }, @@ -120,6 +120,15 @@ pub enum SyncMessage { }, } +/// The type of sync request made +#[derive(Debug, Clone)] +pub enum SyncRequestType { + /// Request was from the backfill sync algorithm. + BackFillSync(Epoch), + /// The request was from a chain in the range sync algorithm. + RangeSync(Epoch, ChainId), +} + /// The result of processing a multiple blocks (a chain segment). #[derive(Debug)] pub enum BatchProcessResult { @@ -166,6 +175,9 @@ pub struct SyncManager { /// The object handling long-range batch load-balanced syncing. range_sync: RangeSync, + /// Backfill syncing. + backfill_sync: BackFillSync, + /// A collection of parent block lookups. parent_queue: SmallVec<[ParentRequests; 3]>, @@ -227,6 +239,12 @@ pub fn spawn( beacon_processor_send.clone(), log.clone(), ), + backfill_sync: BackFillSync::new( + beacon_chain.clone(), + network_globals.clone(), + beacon_processor_send.clone(), + log.clone(), + ), network: SyncNetworkContext::new(network_send, network_globals.clone(), log.clone()), chain: beacon_chain, network_globals, @@ -576,6 +594,7 @@ impl SyncManager { } } + /// Handles RPC errors related to requests that were emitted from the sync manager. fn inject_error(&mut self, peer_id: PeerId, request_id: RequestId) { trace!(self.log, "Sync manager received a failed RPC"); // remove any single block lookups @@ -597,14 +616,16 @@ impl SyncManager { return; } - // otherwise, this is a range sync issue, notify the range sync - self.range_sync - .inject_error(&mut self.network, peer_id, request_id); - self.update_sync_state(); + // Otherwise this error matches no known request. 
+ trace!(self.log, "Response/Error for non registered request"; "request_id" => request_id) } fn peer_disconnect(&mut self, peer_id: &PeerId) { self.range_sync.peer_disconnect(&mut self.network, peer_id); + // Regardless of the outcome, we update the sync status. + let _ = self + .backfill_sync + .peer_disconnected(peer_id, &mut self.network); self.update_sync_state(); } @@ -624,12 +645,18 @@ impl SyncManager { if let Some(peer_info) = self.network_globals.peers.write().peer_info_mut(peer_id) { let new_state = sync_type.as_sync_status(remote_sync_info); let rpr = new_state.as_str(); - let was_updated = peer_info.sync_status.update(new_state); + let was_updated = peer_info.sync_status.update(new_state.clone()); if was_updated { debug!(self.log, "Peer transitioned sync state"; "peer_id" => %peer_id, "new_state" => rpr, "our_head_slot" => local_sync_info.head_slot, "out_finalized_epoch" => local_sync_info.finalized_epoch, "their_head_slot" => remote_sync_info.head_slot, "their_finalized_epoch" => remote_sync_info.finalized_epoch, "is_connected" => peer_info.is_connected()); + + // A peer has transitioned its sync state. If the new state is "synced" we + // inform the backfill sync that a new synced peer has joined us. + if new_state.is_synced() { + self.backfill_sync.fully_synced_peer_joined(); + } } peer_info.is_connected() } else { @@ -638,7 +665,17 @@ impl SyncManager { } } - /// Updates the global sync state and logs any changes. + /// Updates the global sync state, optionally instigating or pausing a backfill sync as well as + /// logging any changes. + /// + /// The logic for which sync should be running is as follows: + /// - If there is a range-sync running (or required) pause any backfill and let range-sync + /// complete. + /// - If there is no current range sync, check for any requirement to backfill and either + /// start/resume a backfill sync if required. The global state will be BackFillSync if a + /// backfill sync is running. 
+ /// - If there is no range sync and no required backfill and we have synced up to the currently + /// known peers, we consider ourselves synced. fn update_sync_state(&mut self) { let new_state: SyncState = match self.range_sync.state() { Err(e) => { @@ -647,41 +684,75 @@ impl SyncManager { } Ok(state) => match state { None => { - // no range sync, decide if we are stalled or synced. + // No range sync, so we decide if we are stalled or synced. // For this we check if there is at least one advanced peer. An advanced peer // with Idle range is possible since a peer's status is updated periodically. // If we synced a peer between status messages, most likely the peer has // advanced and will produce a head chain on re-status. Otherwise it will shift // to being synced - let head = self.chain.best_slot().unwrap_or_else(|_| Slot::new(0)); - let current_slot = self.chain.slot().unwrap_or_else(|_| Slot::new(0)); - - let peers = self.network_globals.peers.read(); - if current_slot >= head - && current_slot.sub(head) <= (SLOT_IMPORT_TOLERANCE as u64) - && head > 0 - { - SyncState::Synced - } else if peers.advanced_peers().next().is_some() { - SyncState::SyncTransition - } else if peers.synced_peers().next().is_none() { - SyncState::Stalled - } else { - // There are no peers that require syncing and we have at least one synced - // peer - SyncState::Synced + let mut sync_state = { + let head = self.chain.best_slot().unwrap_or_else(|_| Slot::new(0)); + let current_slot = self.chain.slot().unwrap_or_else(|_| Slot::new(0)); + + let peers = self.network_globals.peers.read(); + if current_slot >= head + && current_slot.sub(head) <= (SLOT_IMPORT_TOLERANCE as u64) + && head > 0 + { + SyncState::Synced + } else if peers.advanced_peers().next().is_some() { + SyncState::SyncTransition + } else if peers.synced_peers().next().is_none() { + SyncState::Stalled + } else { + // There are no peers that require syncing and we have at least one synced + // peer + SyncState::Synced + } + }; + 
+ // If we would otherwise be synced, first check if we need to perform or + // complete a backfill sync. + if matches!(sync_state, SyncState::Synced) { + // Determine if we need to start/resume/restart a backfill sync. + match self.backfill_sync.start(&mut self.network) { + Ok(SyncStart::Syncing { + completed, + remaining, + }) => { + sync_state = SyncState::BackFillSyncing { + completed, + remaining, + }; + } + Ok(SyncStart::NotSyncing) => {} // Ignore updating the state if the backfill sync state didn't start. + Err(e) => { + error!(self.log, "Backfill sync failed to start"; "error" => ?e); + } + } } + + // Return the sync state if backfilling is not required. + sync_state } Some((RangeSyncType::Finalized, start_slot, target_slot)) => { + // If there is a backfill sync in progress pause it. + self.backfill_sync.pause(); + SyncState::SyncingFinalized { start_slot, target_slot, } } - Some((RangeSyncType::Head, start_slot, target_slot)) => SyncState::SyncingHead { - start_slot, - target_slot, - }, + Some((RangeSyncType::Head, start_slot, target_slot)) => { + // If there is a backfill sync in progress pause it. + self.backfill_sync.pause(); + + SyncState::SyncingHead { + start_slot, + target_slot, + } + } }, }; @@ -690,7 +761,14 @@ impl SyncManager { if !new_state.eq(&old_state) { info!(self.log, "Sync state updated"; "old_state" => %old_state, "new_state" => %new_state); // If we have become synced - Subscribe to all the core subnet topics - if new_state.is_synced() { + // We don't need to subscribe if the old state is a state that would have already + // invoked this call. + if new_state.is_synced() + && !matches!( + old_state, + SyncState::Synced { .. } | SyncState::BackFillSyncing { .. } + ) + { self.network.subscribe_core_topics(); } } @@ -828,14 +906,13 @@ impl SyncManager { // peer. We don't consider this chain a failure and prevent retries with another // peer. 
"too many failed attempts" - } else { - if !parent_request.downloaded_blocks.is_empty() { - self.failed_chains - .insert(parent_request.downloaded_blocks[0].canonical_root()); - } else { - crit!(self.log, "Parent lookup has no blocks"); - } + } else if !parent_request.downloaded_blocks.is_empty() { + self.failed_chains + .insert(parent_request.downloaded_blocks[0].canonical_root()); "reached maximum lookup-depth" + } else { + crit!(self.log, "Parent lookup has no blocks"); + "no blocks" }; debug!(self.log, "Parent import failed"; @@ -887,13 +964,44 @@ impl SyncManager { request_id, beacon_block, } => { - self.range_sync.blocks_by_range_response( - &mut self.network, - peer_id, - request_id, - beacon_block.map(|b| *b), - ); - self.update_sync_state(); + let beacon_block = beacon_block.map(|b| *b); + // Obtain which sync requested these blocks and divert accordingly. + match self + .network + .blocks_by_range_response(request_id, beacon_block.is_none()) + { + Some(SyncRequestType::RangeSync(batch_id, chain_id)) => { + self.range_sync.blocks_by_range_response( + &mut self.network, + peer_id, + chain_id, + batch_id, + request_id, + beacon_block, + ); + self.update_sync_state(); + } + Some(SyncRequestType::BackFillSync(batch_id)) => { + match self.backfill_sync.on_block_response( + &mut self.network, + batch_id, + &peer_id, + request_id, + beacon_block, + ) { + Ok(ProcessResult::SyncCompleted) => self.update_sync_state(), + Ok(ProcessResult::Successful) => {} + Err(_error) => { + // The backfill sync has failed, errors are reported + // within. 
+ self.update_sync_state(); + } + } + } + None => { + trace!(self.log, "Response/Error for non registered request"; "request_id" => request_id) + } + } } SyncMessage::BlocksByRootResponse { peer_id, @@ -913,21 +1021,63 @@ impl SyncManager { self.peer_disconnect(&peer_id); } SyncMessage::RPCError(peer_id, request_id) => { - self.inject_error(peer_id, request_id); - } - SyncMessage::BatchProcessed { - chain_id, - epoch, - result, - } => { - self.range_sync.handle_block_process_result( - &mut self.network, - chain_id, - epoch, - result, - ); - self.update_sync_state(); + // Redirect to a sync mechanism if the error is related to one of their + // requests. + match self.network.blocks_by_range_response(request_id, true) { + Some(SyncRequestType::RangeSync(batch_id, chain_id)) => { + self.range_sync.inject_error( + &mut self.network, + peer_id, + batch_id, + chain_id, + request_id, + ); + self.update_sync_state(); + } + Some(SyncRequestType::BackFillSync(batch_id)) => { + match self.backfill_sync.inject_error( + &mut self.network, + batch_id, + &peer_id, + request_id, + ) { + Ok(_) => {} + Err(_) => self.update_sync_state(), + } + } + None => { + // This is a request not belonging to a sync algorithm. + // Process internally. 
+ self.inject_error(peer_id, request_id); + } + } } + SyncMessage::BatchProcessed { sync_type, result } => match sync_type { + SyncRequestType::RangeSync(epoch, chain_id) => { + self.range_sync.handle_block_process_result( + &mut self.network, + chain_id, + epoch, + result, + ); + self.update_sync_state(); + } + SyncRequestType::BackFillSync(epoch) => { + match self.backfill_sync.on_batch_process_result( + &mut self.network, + epoch, + &result, + ) { + Ok(ProcessResult::Successful) => {} + Ok(ProcessResult::SyncCompleted) => self.update_sync_state(), + Err(error) => { + error!(self.log, "Backfill sync failed"; "error" => ?error); + // Update the global status + self.update_sync_state(); + } + } + } + }, SyncMessage::ParentLookupFailed { chain_head, peer_id, diff --git a/beacon_node/network/src/sync/mod.rs b/beacon_node/network/src/sync/mod.rs index 9377f655295..1755b13e372 100644 --- a/beacon_node/network/src/sync/mod.rs +++ b/beacon_node/network/src/sync/mod.rs @@ -1,6 +1,7 @@ //! Syncing for lighthouse. //! //! Stores the various syncing methods for the beacon chain. +mod backfill_sync; pub mod manager; mod network_context; mod peer_sync_info; diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 35a8046c3dc..0647fc68304 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -1,6 +1,7 @@ //! Provides network functionality for the Syncing thread. This fundamentally wraps a network //! channel and stores a global RPC ID to perform requests. +use super::manager::SyncRequestType; use super::range_sync::{BatchId, ChainId}; use super::RequestId as SyncRequestId; use crate::service::NetworkMessage; @@ -26,8 +27,8 @@ pub struct SyncNetworkContext { /// A sequential ID for all RPC requests. request_id: SyncRequestId, - /// BlocksByRange requests made by range syncing chains. 
- range_requests: FnvHashMap, + /// BlocksByRange requests made by syncing algorithms. + range_requests: FnvHashMap, /// Logger for the `SyncNetworkContext`. log: slog::Logger, @@ -81,6 +82,7 @@ impl SyncNetworkContext { } } + /// A blocks by range request for the range sync algorithm. pub fn blocks_by_range_request( &mut self, peer_id: PeerId, @@ -96,15 +98,37 @@ impl SyncNetworkContext { "peer" => %peer_id, ); let req_id = self.send_rpc_request(peer_id, Request::BlocksByRange(request))?; - self.range_requests.insert(req_id, (chain_id, batch_id)); + self.range_requests + .insert(req_id, SyncRequestType::RangeSync(batch_id, chain_id)); Ok(req_id) } + /// A blocks by range request sent by the backfill sync algorithm + pub fn backfill_blocks_by_range_request( + &mut self, + peer_id: PeerId, + request: BlocksByRangeRequest, + batch_id: BatchId, + ) -> Result { + trace!( + self.log, + "Sending backfill BlocksByRange Request"; + "method" => "BlocksByRange", + "count" => request.count, + "peer" => %peer_id, + ); + let req_id = self.send_rpc_request(peer_id, Request::BlocksByRange(request))?; + self.range_requests + .insert(req_id, SyncRequestType::BackFillSync(batch_id)); + Ok(req_id) + } + + /// Received a blocks by range response. pub fn blocks_by_range_response( &mut self, request_id: usize, remove: bool, - ) -> Option<(ChainId, BatchId)> { + ) -> Option { // NOTE: we can't guarantee that the request must be registered as it could receive more // than an error, and be removed after receiving the first one. // FIXME: https://github.com/sigp/lighthouse/issues/1634 @@ -115,6 +139,7 @@ impl SyncNetworkContext { } } + /// Sends a blocks by root request. pub fn blocks_by_root_request( &mut self, peer_id: PeerId, @@ -130,6 +155,7 @@ impl SyncNetworkContext { self.send_rpc_request(peer_id, Request::BlocksByRoot(request)) } + /// Terminates the connection with the peer and bans them. 
pub fn goodbye_peer(&mut self, peer_id: PeerId, reason: GoodbyeReason) { self.network_send .send(NetworkMessage::GoodbyePeer { @@ -142,6 +168,7 @@ impl SyncNetworkContext { }); } + /// Reports to the scoring algorithm the behaviour of a peer. pub fn report_peer(&mut self, peer_id: PeerId, action: PeerAction) { debug!(self.log, "Sync reporting peer"; "peer_id" => %peer_id, "action" => %action); self.network_send @@ -155,7 +182,8 @@ impl SyncNetworkContext { }); } - pub fn send_rpc_request( + /// Sends an RPC request. + fn send_rpc_request( &mut self, peer_id: PeerId, request: Request, @@ -170,6 +198,7 @@ impl SyncNetworkContext { Ok(request_id) } + /// Subscribes to core topics. pub fn subscribe_core_topics(&mut self) { self.network_send .send(NetworkMessage::SubscribeCoreTopics) @@ -178,6 +207,7 @@ impl SyncNetworkContext { }); } + /// Sends an arbitrary network message. fn send_network_msg(&mut self, msg: NetworkMessage) -> Result<(), &'static str> { self.network_send.send(msg).map_err(|_| { debug!(self.log, "Could not send message to the network service"); diff --git a/beacon_node/network/src/sync/range_sync/batch.rs b/beacon_node/network/src/sync/range_sync/batch.rs index 5f411260ffe..f8d1347507a 100644 --- a/beacon_node/network/src/sync/range_sync/batch.rs +++ b/beacon_node/network/src/sync/range_sync/batch.rs @@ -14,15 +14,34 @@ const MAX_BATCH_DOWNLOAD_ATTEMPTS: u8 = 5; /// after `MAX_BATCH_PROCESSING_ATTEMPTS` times, it is considered faulty. const MAX_BATCH_PROCESSING_ATTEMPTS: u8 = 3; +/// Allows customisation of the above constants used in other sync methods such as BackFillSync. +pub trait BatchConfig { + /// The maximum batch download attempts. + fn max_batch_download_attempts() -> u8; + /// The max batch processing attempts. 
+ fn max_batch_processing_attempts() -> u8; +} + +pub struct RangeSyncBatchConfig {} + +impl BatchConfig for RangeSyncBatchConfig { + fn max_batch_download_attempts() -> u8 { + MAX_BATCH_DOWNLOAD_ATTEMPTS + } + fn max_batch_processing_attempts() -> u8 { + MAX_BATCH_PROCESSING_ATTEMPTS + } +} + /// Error type of a batch in a wrong state. // Such errors should never be encountered. -pub struct WrongState(pub(super) String); +pub struct WrongState(pub(crate) String); /// Auxiliary type alias for readability. type IsFailed = bool; /// A segment of a chain. -pub struct BatchInfo { +pub struct BatchInfo { /// Start slot of the batch. start_slot: Slot, /// End slot of the batch. @@ -33,6 +52,8 @@ pub struct BatchInfo { failed_download_attempts: Vec, /// State of the batch. state: BatchState, + /// Pin the generic + marker: std::marker::PhantomData, } /// Current state of a batch @@ -73,7 +94,7 @@ impl BatchState { } } -impl BatchInfo { +impl BatchInfo { /// Batches are downloaded excluding the first block of the epoch assuming it has already been /// downloaded. /// @@ -91,6 +112,7 @@ impl BatchInfo { failed_processing_attempts: Vec::new(), failed_download_attempts: Vec::new(), state: BatchState::AwaitingDownload, + marker: std::marker::PhantomData, } } @@ -120,6 +142,7 @@ impl BatchInfo { false } + /// Returns the peer that is currently responsible for progressing the state of the batch. pub fn current_peer(&self) -> Option<&PeerId> { match &self.state { BatchState::AwaitingDownload | BatchState::Failed => None, @@ -131,6 +154,7 @@ impl BatchInfo { } } + /// Returns a BlocksByRange request associated with the batch. 
pub fn to_blocks_by_range_request(&self) -> BlocksByRangeRequest { BlocksByRangeRequest { start_slot: self.start_slot.into(), @@ -192,7 +216,7 @@ impl BatchInfo { // can be tried again self.failed_download_attempts.push(peer); self.state = if self.failed_download_attempts.len() - >= MAX_BATCH_DOWNLOAD_ATTEMPTS as usize + >= B::max_batch_download_attempts() as usize { BatchState::Failed } else { @@ -219,14 +243,21 @@ impl BatchInfo { } } + /// Mark the batch as failed and return whether we can attempt a re-download. + /// + /// This can happen if a peer disconnects or some error occurred that was not the peer's fault. + /// The `mark_failed` parameter, when set to false, does not increment the failed attempts of + /// this batch and register the peer, rather attempts a re-download. #[must_use = "Batch may have failed"] - pub fn download_failed(&mut self) -> Result { + pub fn download_failed(&mut self, mark_failed: bool) -> Result { match self.state.poison() { BatchState::Downloading(peer, _, _request_id) => { // register the attempt and check if the batch can be tried again - self.failed_download_attempts.push(peer); + if mark_failed { + self.failed_download_attempts.push(peer); + } self.state = if self.failed_download_attempts.len() - >= MAX_BATCH_DOWNLOAD_ATTEMPTS as usize + >= B::max_batch_download_attempts() as usize { BatchState::Failed } else { @@ -294,7 +325,7 @@ impl BatchInfo { // check if the batch can be downloaded again if self.failed_processing_attempts.len() - >= MAX_BATCH_PROCESSING_ATTEMPTS as usize + >= B::max_batch_processing_attempts() as usize { BatchState::Failed } else { @@ -324,7 +355,7 @@ impl BatchInfo { // check if the batch can be downloaded again self.state = if self.failed_processing_attempts.len() - >= MAX_BATCH_PROCESSING_ATTEMPTS as usize + >= B::max_batch_processing_attempts() as usize { BatchState::Failed } else { @@ -365,7 +396,7 @@ impl Attempt { } } -impl slog::KV for &mut BatchInfo { +impl slog::KV for &mut BatchInfo { fn
serialize( &self, record: &slog::Record, @@ -375,7 +406,7 @@ impl slog::KV for &mut BatchInfo { } } -impl slog::KV for BatchInfo { +impl slog::KV for BatchInfo { fn serialize( &self, record: &slog::Record, diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index 87c2f2762e9..3da865503d5 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -181,7 +181,7 @@ impl SyncingChain { // fail the batches for id in batch_ids { if let Some(batch) = self.batches.get_mut(&id) { - if batch.download_failed()? { + if batch.download_failed(true)? { return Err(RemoveChain::ChainFailed(id)); } self.retry_batch_download(network, id)?; @@ -273,7 +273,7 @@ impl SyncingChain { } } - /// Sends to process the batch with the given id. + /// Processes the batch with the given id. /// The batch must exist and be ready for processing fn process_batch( &mut self, @@ -794,7 +794,7 @@ impl SyncingChain { if let Some(active_requests) = self.peers.get_mut(peer_id) { active_requests.remove(&batch_id); } - if batch.download_failed()? { + if batch.download_failed(true)? { return Err(RemoveChain::ChainFailed(batch_id)); } self.retry_batch_download(network, batch_id) @@ -837,7 +837,7 @@ impl SyncingChain { } } - /// Requests the batch asigned to the given id from a given peer. + /// Requests the batch assigned to the given id from a given peer. pub fn send_batch( &mut self, network: &mut SyncNetworkContext, @@ -883,7 +883,7 @@ impl SyncingChain { self.peers .get_mut(&peer) .map(|request| request.remove(&batch_id)); - if batch.download_failed()? { + if batch.download_failed(true)? 
{ return Err(RemoveChain::ChainFailed(batch_id)); } else { return self.retry_batch_download(network, batch_id); @@ -990,7 +990,7 @@ impl SyncingChain { // this batch could have been included already being an optimistic batch match self.batches.entry(batch_id) { Entry::Occupied(_) => { - // this batch doesn't need downlading, let this same function decide the next batch + // this batch doesn't need downloading, let this same function decide the next batch self.to_be_downloaded += EPOCHS_PER_BATCH; self.include_next_batch() } diff --git a/beacon_node/network/src/sync/range_sync/mod.rs b/beacon_node/network/src/sync/range_sync/mod.rs index 7bebd417c5e..a8d18b8c8c2 100644 --- a/beacon_node/network/src/sync/range_sync/mod.rs +++ b/beacon_node/network/src/sync/range_sync/mod.rs @@ -7,7 +7,7 @@ mod chain_collection; mod range; mod sync_type; -pub use batch::BatchInfo; +pub use batch::{BatchConfig, BatchInfo, BatchState}; pub use chain::{BatchId, ChainId, EPOCHS_PER_BATCH}; pub use range::RangeSync; pub use sync_type::RangeSyncType; diff --git a/beacon_node/network/src/sync/range_sync/range.rs b/beacon_node/network/src/sync/range_sync/range.rs index e291233bd12..097f6158fb8 100644 --- a/beacon_node/network/src/sync/range_sync/range.rs +++ b/beacon_node/network/src/sync/range_sync/range.rs @@ -39,7 +39,7 @@ //! Each chain is downloaded in batches of blocks. The batched blocks are processed sequentially //! and further batches are requested as current blocks are being processed. 
-use super::chain::{ChainId, RemoveChain, SyncingChain}; +use super::chain::{BatchId, ChainId, RemoveChain, SyncingChain}; use super::chain_collection::ChainCollection; use super::sync_type::RangeSyncType; use crate::beacon_processor::WorkEvent as BeaconWorkEvent; @@ -194,34 +194,29 @@ impl RangeSync { &mut self, network: &mut SyncNetworkContext, peer_id: PeerId, + chain_id: ChainId, + batch_id: BatchId, request_id: RequestId, beacon_block: Option>, ) { - // get the chain and batch for which this response belongs - if let Some((chain_id, batch_id)) = - network.blocks_by_range_response(request_id, beacon_block.is_none()) - { - // check if this chunk removes the chain - match self.chains.call_by_id(chain_id, |chain| { - chain.on_block_response(network, batch_id, &peer_id, request_id, beacon_block) - }) { - Ok((removed_chain, sync_type)) => { - if let Some((removed_chain, remove_reason)) = removed_chain { - self.on_chain_removed( - removed_chain, - sync_type, - remove_reason, - network, - "block response", - ); - } - } - Err(_) => { - trace!(self.log, "BlocksByRange response for removed chain"; "chain" => chain_id) + // check if this chunk removes the chain + match self.chains.call_by_id(chain_id, |chain| { + chain.on_block_response(network, batch_id, &peer_id, request_id, beacon_block) + }) { + Ok((removed_chain, sync_type)) => { + if let Some((removed_chain, remove_reason)) = removed_chain { + self.on_chain_removed( + removed_chain, + sync_type, + remove_reason, + network, + "block response", + ); } } - } else { - trace!(self.log, "Response/Error for non registered request"; "request_id" => request_id) + Err(_) => { + trace!(self.log, "BlocksByRange response for removed chain"; "chain" => chain_id) + } } } @@ -298,31 +293,28 @@ impl RangeSync { &mut self, network: &mut SyncNetworkContext, peer_id: PeerId, + batch_id: BatchId, + chain_id: ChainId, request_id: RequestId, ) { - // get the chain and batch for which this response belongs - if let Some((chain_id, 
batch_id)) = network.blocks_by_range_response(request_id, true) { - // check that this request is pending - match self.chains.call_by_id(chain_id, |chain| { - chain.inject_error(network, batch_id, &peer_id, request_id) - }) { - Ok((removed_chain, sync_type)) => { - if let Some((removed_chain, remove_reason)) = removed_chain { - self.on_chain_removed( - removed_chain, - sync_type, - remove_reason, - network, - "RPC error", - ); - } - } - Err(_) => { - trace!(self.log, "BlocksByRange response for removed chain"; "chain" => chain_id) + // check that this request is pending + match self.chains.call_by_id(chain_id, |chain| { + chain.inject_error(network, batch_id, &peer_id, request_id) + }) { + Ok((removed_chain, sync_type)) => { + if let Some((removed_chain, remove_reason)) = removed_chain { + self.on_chain_removed( + removed_chain, + sync_type, + remove_reason, + network, + "RPC error", + ); } } - } else { - trace!(self.log, "Response/Error for non registered request"; "request_id" => request_id) + Err(_) => { + trace!(self.log, "BlocksByRange response for removed chain"; "chain" => chain_id) + } } } diff --git a/beacon_node/src/cli.rs b/beacon_node/src/cli.rs index b2c3b5bf2c4..f27d430b742 100644 --- a/beacon_node/src/cli.rs +++ b/beacon_node/src/cli.rs @@ -54,7 +54,8 @@ pub fn cli_app<'a, 'b>() -> App<'a, 'b> { .arg( Arg::with_name("shutdown-after-sync") .long("shutdown-after-sync") - .help("Shutdown beacon node as soon as sync is completed") + .help("Shutdown beacon node as soon as sync is completed. Backfill sync will \ + not be performed before shutdown.") .takes_value(false), ) .arg( @@ -479,12 +480,46 @@ pub fn cli_app<'a, 'b>() -> App<'a, 'b> { Arg::with_name("wss-checkpoint") .long("wss-checkpoint") .help( - "Used to input a Weak Subjectivity State Checkpoint in `block_root:epoch_number` format,\ - where block_root is an '0x' prefixed 32-byte hex string and epoch_number is an integer." 
+ "Specify a weak subjectivity checkpoint in `block_root:epoch` format to verify \ + the node's sync against. The block root should be 0x-prefixed. Note that this \ + flag is for verification only, to perform a checkpoint sync from a recent \ + state use --checkpoint-sync-url." ) .value_name("WSS_CHECKPOINT") .takes_value(true) ) + .arg( + Arg::with_name("checkpoint-state") + .long("checkpoint-state") + .help("Set a checkpoint state to start syncing from. Must be aligned and match \ + --checkpoint-block. Using --checkpoint-sync-url instead is recommended.") + .value_name("STATE_SSZ") + .takes_value(true) + .requires("checkpoint-block") + ) + .arg( + Arg::with_name("checkpoint-block") + .long("checkpoint-block") + .help("Set a checkpoint block to start syncing from. Must be aligned and match \ + --checkpoint-state. Using --checkpoint-sync-url instead is recommended.") + .value_name("BLOCK_SSZ") + .takes_value(true) + .requires("checkpoint-state") + ) + .arg( + Arg::with_name("checkpoint-sync-url") + .long("checkpoint-sync-url") + .help("Set the remote beacon node HTTP endpoint to use for checkpoint sync.") + .value_name("BEACON_NODE") + .takes_value(true) + .conflicts_with("checkpoint-state") + ) + .arg( + Arg::with_name("reconstruct-historic-states") + .long("reconstruct-historic-states") + .help("After a checkpoint sync, reconstruct historic states in the database.") + .takes_value(false) + ) .arg( Arg::with_name("validator-monitor-auto") .long("validator-monitor-auto") diff --git a/beacon_node/src/config.rs b/beacon_node/src/config.rs index 9eb5db39568..b0fd687f489 100644 --- a/beacon_node/src/config.rs +++ b/beacon_node/src/config.rs @@ -292,15 +292,62 @@ pub fn get_config( } } - if let Some(genesis_state_bytes) = eth2_network_config.genesis_state_bytes { - // Note: re-serializing the genesis state is not so efficient, however it avoids adding - // trait bounds to the `ClientGenesis` enum. This would have significant flow-on - // effects. 
- client_config.genesis = ClientGenesis::SszBytes { - genesis_state_bytes, - }; + client_config.genesis = if let Some(genesis_state_bytes) = + eth2_network_config.genesis_state_bytes + { + // Set up weak subjectivity sync, or start from the hardcoded genesis state. + if let (Some(initial_state_path), Some(initial_block_path)) = ( + cli_args.value_of("checkpoint-state"), + cli_args.value_of("checkpoint-block"), + ) { + let read = |path: &str| { + use std::fs::File; + use std::io::Read; + File::open(Path::new(path)) + .and_then(|mut f| { + let mut buffer = vec![]; + f.read_to_end(&mut buffer)?; + Ok(buffer) + }) + .map_err(|e| format!("Unable to open {}: {:?}", path, e)) + }; + + let anchor_state_bytes = read(initial_state_path)?; + let anchor_block_bytes = read(initial_block_path)?; + + ClientGenesis::WeakSubjSszBytes { + genesis_state_bytes, + anchor_state_bytes, + anchor_block_bytes, + } + } else if let Some(remote_bn_url) = cli_args.value_of("checkpoint-sync-url") { + let url = SensitiveUrl::parse(remote_bn_url) + .map_err(|e| format!("Invalid checkpoint sync URL: {:?}", e))?; + + ClientGenesis::CheckpointSyncUrl { + genesis_state_bytes, + url, + } + } else { + // Note: re-serializing the genesis state is not so efficient, however it avoids adding + // trait bounds to the `ClientGenesis` enum. This would have significant flow-on + // effects. 
+ ClientGenesis::SszBytes { + genesis_state_bytes, + } + } } else { - client_config.genesis = ClientGenesis::DepositContract; + if cli_args.is_present("checkpoint-state") || cli_args.is_present("checkpoint-sync-url") { + return Err( + "Checkpoint sync is not available for this network as no genesis state is known" + .to_string(), + ); + } + ClientGenesis::DepositContract + }; + + if cli_args.is_present("reconstruct-historic-states") { + client_config.chain.reconstruct_historic_states = true; } let raw_graffiti = if let Some(graffiti) = cli_args.value_of("graffiti") { diff --git a/beacon_node/store/src/chunk_writer.rs b/beacon_node/store/src/chunk_writer.rs new file mode 100644 index 00000000000..059b812e74c --- /dev/null +++ b/beacon_node/store/src/chunk_writer.rs @@ -0,0 +1,75 @@ +use crate::chunked_vector::{chunk_key, Chunk, ChunkError, Field}; +use crate::{Error, KeyValueStore, KeyValueStoreOp}; +use types::EthSpec; + +/// Buffered writer for chunked vectors (block roots mainly). +pub struct ChunkWriter<'a, F, E, S> +where + F: Field, + E: EthSpec, + S: KeyValueStore, +{ + /// Buffered chunk awaiting writing to disk (always dirty). + chunk: Chunk, + /// Chunk index of `chunk`. + index: usize, + store: &'a S, +} + +impl<'a, F, E, S> ChunkWriter<'a, F, E, S> +where + F: Field, + E: EthSpec, + S: KeyValueStore, +{ + pub fn new(store: &'a S, vindex: usize) -> Result { + let chunk_index = F::chunk_index(vindex); + let chunk = Chunk::load(store, F::column(), &chunk_key(chunk_index))? + .unwrap_or_else(|| Chunk::new(vec![F::Value::default(); F::chunk_size()])); + + Ok(Self { + chunk, + index: chunk_index, + store, + }) + } + + /// Set the value at a given vector index, writing the current chunk and moving on if necessary. + pub fn set( + &mut self, + vindex: usize, + value: F::Value, + batch: &mut Vec, + ) -> Result<(), Error> { + let chunk_index = F::chunk_index(vindex); + + // Advance to the next chunk. 
+ if chunk_index != self.index { + self.write(batch)?; + *self = Self::new(self.store, vindex)?; + } + + let i = vindex % F::chunk_size(); + let existing_value = &self.chunk.values[i]; + + if existing_value == &value || existing_value == &F::Value::default() { + self.chunk.values[i] = value; + Ok(()) + } else { + Err(ChunkError::Inconsistent { + field: F::column(), + chunk_index, + existing_value: format!("{:?}", existing_value), + new_value: format!("{:?}", value), + } + .into()) + } + } + + /// Write the current chunk to disk. + /// + /// Should be called before the writer is dropped, in order to write the final chunk to disk. + pub fn write(&self, batch: &mut Vec) -> Result<(), Error> { + self.chunk.store(F::column(), &chunk_key(self.index), batch) + } +} diff --git a/beacon_node/store/src/chunked_iter.rs b/beacon_node/store/src/chunked_iter.rs index 632b3e01796..7d47e8c99aa 100644 --- a/beacon_node/store/src/chunked_iter.rs +++ b/beacon_node/store/src/chunked_iter.rs @@ -97,7 +97,7 @@ where self.current_chunk = Chunk::load( &self.store.cold_db, F::column(), - &chunk_key(self.next_cindex as u64), + &chunk_key(self.next_cindex), ) .map_err(|e| { error!( diff --git a/beacon_node/store/src/chunked_vector.rs b/beacon_node/store/src/chunked_vector.rs index 5958d6b624f..64754cf52d6 100644 --- a/beacon_node/store/src/chunked_vector.rs +++ b/beacon_node/store/src/chunked_vector.rs @@ -34,8 +34,8 @@ pub enum UpdatePattern { /// Map a chunk index to bytes that can be used to key the NoSQL database. /// /// We shift chunks up by 1 to make room for a genesis chunk that is handled separately. -pub fn chunk_key(cindex: u64) -> [u8; 8] { - (cindex + 1).to_be_bytes() +pub fn chunk_key(cindex: usize) -> [u8; 8] { + (cindex as u64 + 1).to_be_bytes() } /// Return the database key for the genesis value. @@ -73,6 +73,11 @@ pub trait Field: Copy { 128 } + /// Convert a v-index (vector index) to a chunk index. 
+ fn chunk_index(vindex: usize) -> usize { + vindex / Self::chunk_size() + } + /// Get the value of this field at the given vector index, from the state. fn get_value( state: &BeaconState, @@ -399,7 +404,7 @@ where I: Iterator, { for chunk_index in range { - let chunk_key = &chunk_key(chunk_index as u64)[..]; + let chunk_key = &chunk_key(chunk_index)[..]; let existing_chunk = Chunk::::load(store, F::column(), chunk_key)?.unwrap_or_else(Chunk::default); @@ -440,7 +445,7 @@ fn range_query, E: EthSpec, T: Decode + Encode>( let mut result = Vec::with_capacity(len); for chunk_index in range { - let key = &chunk_key(chunk_index as u64)[..]; + let key = &chunk_key(chunk_index)[..]; let chunk = Chunk::load(store, column, key)?.ok_or(ChunkError::Missing { chunk_index })?; result.push(chunk); } diff --git a/beacon_node/store/src/config.rs b/beacon_node/store/src/config.rs index 2514e4cf35d..208776c1ef7 100644 --- a/beacon_node/store/src/config.rs +++ b/beacon_node/store/src/config.rs @@ -24,8 +24,6 @@ pub struct StoreConfig { #[derive(Debug, Clone, PartialEq, Eq, Encode, Decode)] pub struct OnDiskStoreConfig { pub slots_per_restore_point: u64, - // NOTE: redundant, see https://github.com/sigp/lighthouse/issues/1784 - pub _block_cache_size: usize, } #[derive(Debug, Clone)] @@ -49,7 +47,6 @@ impl StoreConfig { pub fn as_disk_config(&self) -> OnDiskStoreConfig { OnDiskStoreConfig { slots_per_restore_point: self.slots_per_restore_point, - _block_cache_size: DEFAULT_BLOCK_CACHE_SIZE, } } diff --git a/beacon_node/store/src/errors.rs b/beacon_node/store/src/errors.rs index f943a983ea8..0be8b43d6d9 100644 --- a/beacon_node/store/src/errors.rs +++ b/beacon_node/store/src/errors.rs @@ -13,13 +13,46 @@ pub enum Error { BeaconStateError(BeaconStateError), PartialBeaconStateError, HotColdDBError(HotColdDBError), - DBError { message: String }, + DBError { + message: String, + }, RlpError(String), BlockNotFound(Hash256), NoContinuationData, SplitPointModified(Slot, Slot), 
ConfigError(StoreConfigError), SchemaMigrationError(String), + /// The store's `anchor_info` was mutated concurrently, the latest modification wasn't applied. + AnchorInfoConcurrentMutation, + /// The block or state is unavailable due to weak subjectivity sync. + HistoryUnavailable, + /// State reconstruction cannot commence because not all historic blocks are known. + MissingHistoricBlocks { + oldest_block_slot: Slot, + }, + /// State reconstruction failed because it didn't reach the upper limit slot. + /// + /// This should never happen (it's a logic error). + StateReconstructionDidNotComplete, + StateReconstructionRootMismatch { + slot: Slot, + expected: Hash256, + computed: Hash256, + }, +} + +pub trait HandleUnavailable { + fn handle_unavailable(self) -> std::result::Result, Error>; +} + +impl HandleUnavailable for Result { + fn handle_unavailable(self) -> std::result::Result, Error> { + match self { + Ok(x) => Ok(Some(x)), + Err(Error::HistoryUnavailable) => Ok(None), + Err(e) => Err(e), + } + } } impl From for Error { diff --git a/beacon_node/store/src/hot_cold_store.rs b/beacon_node/store/src/hot_cold_store.rs index 43f1626e1e4..39c63507ea0 100644 --- a/beacon_node/store/src/hot_cold_store.rs +++ b/beacon_node/store/src/hot_cold_store.rs @@ -3,17 +3,15 @@ use crate::chunked_vector::{ }; use crate::config::{OnDiskStoreConfig, StoreConfig}; use crate::forwards_iter::{HybridForwardsBlockRootsIterator, HybridForwardsStateRootsIterator}; -use crate::impls::{ - beacon_block_as_kv_store_op, - beacon_state::{get_full_state, store_full_state}, -}; +use crate::impls::beacon_state::{get_full_state, store_full_state}; use crate::iter::{ParentRootBlockIterator, StateRootsIterator}; use crate::leveldb_store::BytesKey; use crate::leveldb_store::LevelDB; use crate::memory_store::MemoryStore; use crate::metadata::{ - CompactionTimestamp, PruningCheckpoint, SchemaVersion, COMPACTION_TIMESTAMP_KEY, CONFIG_KEY, - CURRENT_SCHEMA_VERSION, PRUNING_CHECKPOINT_KEY, 
SCHEMA_VERSION_KEY, SPLIT_KEY, + AnchorInfo, CompactionTimestamp, PruningCheckpoint, SchemaVersion, ANCHOR_INFO_KEY, + COMPACTION_TIMESTAMP_KEY, CONFIG_KEY, CURRENT_SCHEMA_VERSION, PRUNING_CHECKPOINT_KEY, + SCHEMA_VERSION_KEY, SPLIT_KEY, }; use crate::metrics; use crate::{ @@ -23,13 +21,15 @@ use crate::{ use leveldb::iterator::LevelDBIterator; use lru::LruCache; use parking_lot::{Mutex, RwLock}; -use slog::{debug, error, info, trace, warn, Logger}; +use serde_derive::{Deserialize, Serialize}; +use slog::{debug, error, info, trace, Logger}; use ssz::{Decode, Encode}; use ssz_derive::{Decode, Encode}; use state_processing::{ per_block_processing, per_slot_processing, BlockProcessingError, BlockSignatureStrategy, SlotProcessingError, }; +use std::cmp::min; use std::convert::TryInto; use std::marker::PhantomData; use std::path::Path; @@ -57,8 +57,10 @@ pub struct HotColdDB, Cold: ItemStore> { /// /// States with slots less than `split.slot` are in the cold DB, while states with slots /// greater than or equal are in the hot DB. - split: RwLock, - config: StoreConfig, + pub(crate) split: RwLock, + /// The starting slots for the range of blocks & states stored in the database. + anchor_info: RwLock>, + pub(crate) config: StoreConfig, /// Cold database containing compact historical data. pub cold_db: Cold, /// Hot database containing duplicated but quick-to-access recent data. @@ -68,7 +70,7 @@ pub struct HotColdDB, Cold: ItemStore> { /// LRU cache of deserialized blocks. Updated whenever a block is loaded. block_cache: Mutex>>, /// Chain spec. - spec: ChainSpec, + pub(crate) spec: ChainSpec, /// Logger. pub(crate) log: Logger, /// Mere vessel for E. 
@@ -95,11 +97,13 @@ pub enum HotColdDBError { MissingHotStateSummary(Hash256), MissingEpochBoundaryState(Hash256), MissingSplitState(Hash256, Slot), + MissingAnchorInfo, HotStateSummaryError(BeaconStateError), RestorePointDecodeError(ssz::DecodeError), BlockReplayBeaconError(BeaconStateError), BlockReplaySlotError(SlotProcessingError), BlockReplayBlockError(BlockProcessingError), + MissingLowerLimitState(Slot), InvalidSlotsPerRestorePoint { slots_per_restore_point: u64, slots_per_historical_root: u64, @@ -126,6 +130,7 @@ impl HotColdDB, MemoryStore> { let db = HotColdDB { split: RwLock::new(Split::default()), + anchor_info: RwLock::new(None), cold_db: MemoryStore::open(), hot_db: MemoryStore::open(), block_cache: Mutex::new(LruCache::new(config.block_cache_size)), @@ -158,6 +163,7 @@ impl HotColdDB, LevelDB> { let db = Arc::new(HotColdDB { split: RwLock::new(Split::default()), + anchor_info: RwLock::new(None), cold_db: LevelDB::open(cold_path)?, hot_db: LevelDB::open(hot_path)?, block_cache: Mutex::new(LruCache::new(config.block_cache_size)), @@ -190,6 +196,9 @@ impl HotColdDB, LevelDB> { // Load the previous split slot from the database (if any). This ensures we can // stop and restart correctly. if let Some(split) = db.load_split()? { + *db.split.write() = split; + *db.anchor_info.write() = db.load_anchor_info()?; + info!( db.log, "Hot-Cold DB initialized"; @@ -197,7 +206,6 @@ impl HotColdDB, LevelDB> { "split_slot" => split.slot, "split_state" => format!("{:?}", split.state_root) ); - *db.split.write() = split; } // Run a garbage collection pass. @@ -243,8 +251,8 @@ impl, Cold: ItemStore> HotColdDB block: SignedBeaconBlock, ) -> Result<(), Error> { // Store on disk. - self.hot_db - .do_atomically(vec![beacon_block_as_kv_store_op(block_root, &block)])?; + let op = self.block_as_kv_store_op(block_root, &block); + self.hot_db.do_atomically(vec![op])?; // Update cache. 
self.block_cache.lock().put(*block_root, block); @@ -252,6 +260,18 @@ impl, Cold: ItemStore> HotColdDB Ok(()) } + /// Prepare a signed beacon block for storage in the database. + #[must_use] + pub fn block_as_kv_store_op( + &self, + key: &Hash256, + block: &SignedBeaconBlock, + ) -> KeyValueStoreOp { + // FIXME(altair): re-add block write/overhead metrics, or remove them + let db_key = get_key_for_col(DBColumn::BeaconBlock.into(), key.as_bytes()); + KeyValueStoreOp::PutKeyValue(db_key, block.as_ssz_bytes()) + } + /// Fetch a block from the store. pub fn get_block(&self, block_root: &Hash256) -> Result>, Error> { metrics::inc_counter(&metrics::BEACON_BLOCK_GET_COUNT); @@ -467,7 +487,7 @@ impl, Cold: ItemStore> HotColdDB Some(state_slot) => { let epoch_boundary_slot = state_slot / E::slots_per_epoch() * E::slots_per_epoch(); - self.load_cold_state_by_slot(epoch_boundary_slot).map(Some) + self.load_cold_state_by_slot(epoch_boundary_slot) } None => Ok(None), } @@ -492,7 +512,7 @@ impl, Cold: ItemStore> HotColdDB for op in batch { match op { StoreOp::PutBlock(block_root, block) => { - key_value_batch.push(beacon_block_as_kv_store_op(block_root, block)); + key_value_batch.push(self.block_as_kv_store_op(block_root, block)); } StoreOp::PutState(state_root, state) => { @@ -563,6 +583,7 @@ impl, Cold: ItemStore> HotColdDB } Ok(()) } + /// Store a post-finalization state efficiently in the hot database. /// /// On an epoch boundary, store a full state. On an intermediate slot, store @@ -639,21 +660,16 @@ impl, Cold: ItemStore> HotColdDB /// Store a pre-finalization state in the freezer database. /// - /// Will log a warning and not store anything if the state does not lie on a restore point - /// boundary. + /// If the state doesn't lie on a restore point boundary then just its summary will be stored. 
pub fn store_cold_state( &self, state_root: &Hash256, state: &BeaconState, ops: &mut Vec, ) -> Result<(), Error> { + ops.push(ColdStateSummary { slot: state.slot() }.as_kv_store_op(*state_root)); + if state.slot() % self.config.slots_per_restore_point != 0 { - warn!( - self.log, - "Not storing non-restore point state in freezer"; - "slot" => state.slot().as_u64(), - "state_root" => format!("{:?}", state_root) - ); return Ok(()); } @@ -688,7 +704,7 @@ impl, Cold: ItemStore> HotColdDB /// Return `None` if no state with `state_root` lies in the freezer. pub fn load_cold_state(&self, state_root: &Hash256) -> Result>, Error> { match self.load_cold_state_slot(state_root)? { - Some(slot) => self.load_cold_state_by_slot(slot).map(Some), + Some(slot) => self.load_cold_state_by_slot(slot), None => Ok(None), } } @@ -696,12 +712,22 @@ impl, Cold: ItemStore> HotColdDB /// Load a pre-finalization state from the freezer database. /// /// Will reconstruct the state if it lies between restore points. - pub fn load_cold_state_by_slot(&self, slot: Slot) -> Result, Error> { - if slot % self.config.slots_per_restore_point == 0 { - let restore_point_idx = slot.as_u64() / self.config.slots_per_restore_point; - self.load_restore_point_by_index(restore_point_idx) + pub fn load_cold_state_by_slot(&self, slot: Slot) -> Result>, Error> { + // Guard against fetching states that do not exist due to gaps in the historic state + // database, which can occur due to checkpoint sync or re-indexing. + // See the comments in `get_historic_state_limits` for more information. 
+ let (lower_limit, upper_limit) = self.get_historic_state_limits(); + + if slot <= lower_limit || slot >= upper_limit { + if slot % self.config.slots_per_restore_point == 0 { + let restore_point_idx = slot.as_u64() / self.config.slots_per_restore_point; + self.load_restore_point_by_index(restore_point_idx) + } else { + self.load_cold_intermediate_state(slot) + } + .map(Some) } else { - self.load_cold_intermediate_state(slot) + Ok(None) } } @@ -742,17 +768,7 @@ impl, Cold: ItemStore> HotColdDB let split = self.split.read_recursive(); let low_restore_point = self.load_restore_point_by_index(low_restore_point_idx)?; - // If the slot of the high point lies outside the freezer, use the split state - // as the upper restore point. - let high_restore_point = if high_restore_point_idx * self.config.slots_per_restore_point - >= split.slot.as_u64() - { - self.get_state(&split.state_root, Some(split.slot))?.ok_or( - HotColdDBError::MissingSplitState(split.state_root, split.slot), - )? - } else { - self.load_restore_point_by_index(high_restore_point_idx)? - }; + let high_restore_point = self.get_restore_point(high_restore_point_idx, &split)?; // 2. Load the blocks from the high restore point back to the low restore point. let blocks = self.load_blocks_to_replay( @@ -765,6 +781,24 @@ impl, Cold: ItemStore> HotColdDB self.replay_blocks(low_restore_point, blocks, slot, BlockReplay::Accurate) } + /// Get the restore point with the given index, or if it is out of bounds, the split state. + pub(crate) fn get_restore_point( + &self, + restore_point_idx: u64, + split: &Split, + ) -> Result, Error> { + if restore_point_idx * self.config.slots_per_restore_point >= split.slot.as_u64() { + self.get_state(&split.state_root, Some(split.slot))? 
+ .ok_or(HotColdDBError::MissingSplitState( + split.state_root, + split.slot, + )) + .map_err(Into::into) + } else { + self.load_restore_point_by_index(restore_point_idx) + } + } + /// Get a suitable block root for backtracking from `high_restore_point` to the state at `slot`. /// /// Defaults to the block root for `slot`, which *should* be in range. @@ -800,12 +834,21 @@ impl, Cold: ItemStore> HotColdDB .as_ref() .map_or(true, |block| block.slot() <= end_slot) }) - // Include the block at the start slot (if any). Whilst it doesn't need to be applied - // to the state, it contains a potentially useful state root. - .take_while(|result| { - result - .as_ref() - .map_or(true, |block| block.slot() >= start_slot) + // Include the block at the start slot (if any). Whilst it doesn't need to be + // applied to the state, it contains a potentially useful state root. + // + // Return `true` on an `Err` so that the `collect` fails, unless the error is a + // `BlockNotFound` error and some blocks are intentionally missing from the DB. + // This complexity is unfortunately necessary to avoid loading the parent of the + // oldest known block -- we can't know that we have all the required blocks until we + // load a block with slot less than the start slot, which is impossible if there are + // no blocks with slot less than the start slot. + .take_while(|result| match result { + Ok(block) => block.slot() >= start_slot, + Err(Error::BlockNotFound(_)) => { + self.get_oldest_block_slot() == self.spec.genesis_slot + } + Err(_) => true, }) .collect::>()?; blocks.reverse(); @@ -904,6 +947,15 @@ impl, Cold: ItemStore> HotColdDB self.split.read_recursive().slot } + /// Fetch a copy of the current split (slot and state root) from memory. + pub fn get_split_info(&self) -> Split { + *self.split.read_recursive() + } + + pub fn set_split(&self, slot: Slot, state_root: Hash256) { + *self.split.write() = Split { slot, state_root }; + } + /// Fetch the slot of the most recently stored restore point.
pub fn get_latest_restore_point_slot(&self) -> Slot { (self.get_split_slot() - 1) / self.config.slots_per_restore_point @@ -920,6 +972,122 @@ impl, Cold: ItemStore> HotColdDB self.hot_db.put(&SCHEMA_VERSION_KEY, &schema_version) } + /// Initialise the anchor info for checkpoint sync starting from `block`. + pub fn init_anchor_info(&self, block: BeaconBlockRef<'_, E>) -> Result<(), Error> { + let anchor_slot = block.slot(); + let slots_per_restore_point = self.config.slots_per_restore_point; + + // Set the `state_upper_limit` to the slot of the *next* restore point. + // See `get_state_upper_limit` for rationale. + let next_restore_point_slot = if anchor_slot % slots_per_restore_point == 0 { + anchor_slot + } else { + (anchor_slot / slots_per_restore_point + 1) * slots_per_restore_point + }; + let anchor_info = AnchorInfo { + anchor_slot, + oldest_block_slot: anchor_slot, + oldest_block_parent: block.parent_root(), + state_upper_limit: next_restore_point_slot, + state_lower_limit: self.spec.genesis_slot, + }; + self.compare_and_set_anchor_info(None, Some(anchor_info)) + } + + /// Get a clone of the store's anchor info. + /// + /// To do mutations, use `compare_and_set_anchor_info`. + pub fn get_anchor_info(&self) -> Option { + self.anchor_info.read_recursive().clone() + } + + /// Atomically update the anchor info from `prev_value` to `new_value`. + /// + /// Return an `AnchorInfoConcurrentMutation` error if the `prev_value` provided + /// is not correct. + pub fn compare_and_set_anchor_info( + &self, + prev_value: Option, + new_value: Option, + ) -> Result<(), Error> { + let mut anchor_info = self.anchor_info.write(); + if *anchor_info == prev_value { + self.store_anchor_info(&new_value)?; + *anchor_info = new_value; + Ok(()) + } else { + Err(Error::AnchorInfoConcurrentMutation) + } + } + + /// Load the anchor info from disk, but do not set `self.anchor_info`. 
+ fn load_anchor_info(&self) -> Result, Error> { + self.hot_db.get(&ANCHOR_INFO_KEY) + } + + /// Store the given `anchor_info` to disk. + /// + /// The argument is intended to be `self.anchor_info`, but is passed manually to avoid issues + /// with recursive locking. + fn store_anchor_info(&self, anchor_info: &Option) -> Result<(), Error> { + if let Some(ref anchor_info) = anchor_info { + self.hot_db.put(&ANCHOR_INFO_KEY, anchor_info)?; + } else { + self.hot_db.delete::(&ANCHOR_INFO_KEY)?; + } + Ok(()) + } + + /// If an anchor exists, return its `anchor_slot` field. + pub fn get_anchor_slot(&self) -> Option { + self.anchor_info + .read_recursive() + .as_ref() + .map(|a| a.anchor_slot) + } + + /// Return the slot-window describing the available historic states. + /// + /// Returns `(lower_limit, upper_limit)`. + /// + /// The lower limit is the maximum slot such that frozen states are available for all + /// previous slots (<=). + /// + /// The upper limit is the minimum slot such that frozen states are available for all + /// subsequent slots (>=). + /// + /// If `lower_limit >= upper_limit` then all states are available. This will be true + /// if the database is completely filled in, as we'll return `(split_slot, 0)` in this + /// instance. + pub fn get_historic_state_limits(&self) -> (Slot, Slot) { + // If checkpoint sync is used then states in the hot DB will always be available, but may + // become unavailable as finalisation advances due to the lack of a restore point in the + // database. For this reason we take the minimum of the split slot and the + // restore-point-aligned `state_upper_limit`, which should be set _ahead_ of the checkpoint + // slot during initialisation. + // + // E.g. if we start from a checkpoint at slot 2048+1024=3072 with SPRP=2048, then states + // with slots 3072-4095 will be available only while they are in the hot database, and this + // function will return the current split slot as the upper limit. 
Once slot 4096 is reached + // a new restore point will be created at that slot, making all states from 4096 onwards + // permanently available. + let split_slot = self.get_split_slot(); + self.anchor_info + .read_recursive() + .as_ref() + .map_or((split_slot, self.spec.genesis_slot), |a| { + (a.state_lower_limit, min(a.state_upper_limit, split_slot)) + }) + } + + /// Return the minimum slot such that blocks are available for all subsequent slots. + pub fn get_oldest_block_slot(&self) -> Slot { + self.anchor_info + .read_recursive() + .as_ref() + .map_or(self.spec.genesis_slot, |anchor| anchor.oldest_block_slot) + } + /// Load previously-stored config from disk. fn load_config(&self) -> Result, Error> { self.hot_db.get(&CONFIG_KEY) @@ -935,6 +1103,12 @@ impl, Cold: ItemStore> HotColdDB self.hot_db.get(&SPLIT_KEY) } + /// Store the split point to disk. + pub fn store_split(&self) -> Result<(), Error> { + self.hot_db.put_sync(&SPLIT_KEY, &*self.split.read())?; + Ok(()) + } + /// Load the state root of a restore point. fn load_restore_point_hash(&self, restore_point_index: u64) -> Result { let key = Self::restore_point_key(restore_point_index); @@ -1037,6 +1211,12 @@ impl, Cold: ItemStore> HotColdDB .map(|pc: PruningCheckpoint| pc.checkpoint)) } + /// Store the checkpoint to begin pruning from (the "old finalized checkpoint"). + pub fn store_pruning_checkpoint(&self, checkpoint: Checkpoint) -> Result<(), Error> { + self.hot_db + .do_atomically(vec![self.pruning_checkpoint_store_op(checkpoint)]) + } + /// Create a staged store for the pruning checkpoint. pub fn pruning_checkpoint_store_op(&self, checkpoint: Checkpoint) -> KeyValueStoreOp { PruningCheckpoint { checkpoint }.as_kv_store_op(PRUNING_CHECKPOINT_KEY) @@ -1075,6 +1255,11 @@ pub fn migrate_database, Cold: ItemStore>( // The new frozen head must increase the current split slot, and lie on an epoch // boundary (in order for the hot state summary scheme to work). 
let current_split_slot = store.split.read_recursive().slot; + let anchor_slot = store + .anchor_info + .read_recursive() + .as_ref() + .map(|a| a.anchor_slot); if frozen_head.slot() < current_split_slot { return Err(HotColdDBError::FreezeSlotError { @@ -1094,7 +1279,10 @@ pub fn migrate_database, Cold: ItemStore>( // to the cold DB. let state_root_iter = StateRootsIterator::new(store.clone(), frozen_head); for maybe_pair in state_root_iter.take_while(|result| match result { - Ok((_, slot)) => slot >= ¤t_split_slot, + Ok((_, slot)) => { + slot >= ¤t_split_slot + && anchor_slot.map_or(true, |anchor_slot| slot >= &anchor_slot) + } Err(_) => true, }) { let (state_root, slot) = maybe_pair?; @@ -1183,10 +1371,10 @@ pub fn migrate_database, Cold: ItemStore>( } /// Struct for storing the split slot and state root in the database. -#[derive(Debug, Clone, Copy, Default, Encode, Decode)] +#[derive(Debug, Clone, Copy, PartialEq, Default, Encode, Decode, Deserialize, Serialize)] pub struct Split { - slot: Slot, - state_root: Hash256, + pub(crate) slot: Slot, + pub(crate) state_root: Hash256, } impl StoreItem for Split { @@ -1252,8 +1440,8 @@ impl HotStateSummary { /// Struct for summarising a state in the freezer database. #[derive(Debug, Clone, Copy, Default, Encode, Decode)] -struct ColdStateSummary { - slot: Slot, +pub(crate) struct ColdStateSummary { + pub slot: Slot, } impl StoreItem for ColdStateSummary { diff --git a/beacon_node/store/src/impls.rs b/beacon_node/store/src/impls.rs index 2321caf2b11..1b442cbc553 100644 --- a/beacon_node/store/src/impls.rs +++ b/beacon_node/store/src/impls.rs @@ -1,15 +1 @@ -use crate::*; -use ssz::Encode; - pub mod beacon_state; - -/// Prepare a signed beacon block for storage in the database. 
-#[must_use] -pub fn beacon_block_as_kv_store_op( - key: &Hash256, - block: &SignedBeaconBlock, -) -> KeyValueStoreOp { - // FIXME(altair): re-add block write/overhead metrics, or remove them - let db_key = get_key_for_col(DBColumn::BeaconBlock.into(), key.as_bytes()); - KeyValueStoreOp::PutKeyValue(db_key, block.as_ssz_bytes()) -} diff --git a/beacon_node/store/src/iter.rs b/beacon_node/store/src/iter.rs index f32b531ad03..6735b4ff7d7 100644 --- a/beacon_node/store/src/iter.rs +++ b/beacon_node/store/src/iter.rs @@ -1,3 +1,4 @@ +use crate::errors::HandleUnavailable; use crate::{Error, HotColdDB, ItemStore}; use std::borrow::Cow; use std::marker::PhantomData; @@ -201,15 +202,20 @@ impl<'a, T: EthSpec, Hot: ItemStore, Cold: ItemStore> RootsIterator<'a, T, (Ok(block_root), Ok(state_root)) => Ok(Some((*block_root, *state_root, self.slot))), (Err(BeaconStateError::SlotOutOfBounds), Err(BeaconStateError::SlotOutOfBounds)) => { // Read a `BeaconState` from the store that has access to prior historical roots. - let beacon_state = - next_historical_root_backtrack_state(&*self.store, &self.beacon_state)?; - - self.beacon_state = Cow::Owned(beacon_state); - - let block_root = *self.beacon_state.get_block_root(self.slot)?; - let state_root = *self.beacon_state.get_state_root(self.slot)?; - - Ok(Some((block_root, state_root, self.slot))) + if let Some(beacon_state) = + next_historical_root_backtrack_state(&*self.store, &self.beacon_state) + .handle_unavailable()? + { + self.beacon_state = Cow::Owned(beacon_state); + + let block_root = *self.beacon_state.get_block_root(self.slot)?; + let state_root = *self.beacon_state.get_state_root(self.slot)?; + + Ok(Some((block_root, state_root, self.slot))) + } else { + // No more states available due to weak subjectivity sync. 
+ Ok(None) + } } (Err(e), _) => Err(e.into()), (Ok(_), Err(e)) => Err(e.into()), @@ -329,6 +335,9 @@ impl<'a, T: EthSpec, Hot: ItemStore, Cold: ItemStore> Iterator } /// Fetch the next state to use whilst backtracking in `*RootsIterator`. +/// +/// Return `Err(HistoryUnavailable)` in the case where no more backtrack states are available +/// due to weak subjectivity sync. fn next_historical_root_backtrack_state, Cold: ItemStore>( store: &HotColdDB, current_state: &BeaconState, @@ -338,10 +347,17 @@ fn next_historical_root_backtrack_state, Cold: Ite // not frozen, this just means we might not jump back by the maximum amount on // our first jump (i.e. at most 1 extra state load). let new_state_slot = slot_of_prev_restore_point::(current_state.slot()); - let new_state_root = current_state.get_state_root(new_state_slot)?; - Ok(store - .get_state(new_state_root, Some(new_state_slot))? - .ok_or_else(|| BeaconStateError::MissingBeaconState((*new_state_root).into()))?) + + let (_, historic_state_upper_limit) = store.get_historic_state_limits(); + + if new_state_slot >= historic_state_upper_limit { + let new_state_root = current_state.get_state_root(new_state_slot)?; + Ok(store + .get_state(new_state_root, Some(new_state_slot))? + .ok_or_else(|| BeaconStateError::MissingBeaconState((*new_state_root).into()))?) + } else { + Err(Error::HistoryUnavailable) + } } /// Compute the slot of the last guaranteed restore point in the freezer database. 
diff --git a/beacon_node/store/src/lib.rs b/beacon_node/store/src/lib.rs index 2190806cd1d..ca9af67254a 100644 --- a/beacon_node/store/src/lib.rs +++ b/beacon_node/store/src/lib.rs @@ -10,6 +10,7 @@ #[macro_use] extern crate lazy_static; +mod chunk_writer; pub mod chunked_iter; pub mod chunked_vector; pub mod config; @@ -23,9 +24,11 @@ mod memory_store; pub mod metadata; pub mod metrics; mod partial_beacon_state; +pub mod reconstruct; pub mod iter; +pub use self::chunk_writer::ChunkWriter; pub use self::config::StoreConfig; pub use self::hot_cold_store::{BlockReplay, HotColdDB, HotStateSummary, Split}; pub use self::leveldb_store::LevelDB; @@ -33,6 +36,7 @@ pub use self::memory_store::MemoryStore; pub use self::partial_beacon_state::PartialBeaconState; pub use errors::Error; pub use impls::beacon_state::StorageContainer as BeaconStateStorageContainer; +pub use metadata::AnchorInfo; pub use metrics::scrape_for_metrics; use parking_lot::MutexGuard; pub use types::*; diff --git a/beacon_node/store/src/metadata.rs b/beacon_node/store/src/metadata.rs index b9066240462..fd20a588010 100644 --- a/beacon_node/store/src/metadata.rs +++ b/beacon_node/store/src/metadata.rs @@ -1,8 +1,10 @@ use crate::{DBColumn, Error, StoreItem}; +use serde_derive::{Deserialize, Serialize}; use ssz::{Decode, Encode}; -use types::{Checkpoint, Hash256}; +use ssz_derive::{Decode, Encode}; +use types::{Checkpoint, Hash256, Slot}; -pub const CURRENT_SCHEMA_VERSION: SchemaVersion = SchemaVersion(4); +pub const CURRENT_SCHEMA_VERSION: SchemaVersion = SchemaVersion(5); // All the keys that get stored under the `BeaconMeta` column. 
// @@ -12,6 +14,7 @@ pub const CONFIG_KEY: Hash256 = Hash256::repeat_byte(1); pub const SPLIT_KEY: Hash256 = Hash256::repeat_byte(2); pub const PRUNING_CHECKPOINT_KEY: Hash256 = Hash256::repeat_byte(3); pub const COMPACTION_TIMESTAMP_KEY: Hash256 = Hash256::repeat_byte(4); +pub const ANCHOR_INFO_KEY: Hash256 = Hash256::repeat_byte(5); #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] pub struct SchemaVersion(pub u64); @@ -76,3 +79,41 @@ impl StoreItem for CompactionTimestamp { Ok(CompactionTimestamp(u64::from_ssz_bytes(bytes)?)) } } + +/// Database parameters relevant to weak subjectivity sync. +#[derive(Debug, PartialEq, Eq, Clone, Encode, Decode, Serialize, Deserialize)] +pub struct AnchorInfo { + /// The slot at which the anchor state is present and which we cannot revert. + pub anchor_slot: Slot, + /// The slot from which historical blocks are available (>=). + pub oldest_block_slot: Slot, + /// The block root of the next block that needs to be added to fill in the history. + /// + /// Zero if we know all blocks back to genesis. + pub oldest_block_parent: Hash256, + /// The slot from which historical states are available (>=). + pub state_upper_limit: Slot, + /// The slot before which historical states are available (<=). + pub state_lower_limit: Slot, +} + +impl AnchorInfo { + /// Returns true if the block backfill has completed. + pub fn block_backfill_complete(&self) -> bool { + self.oldest_block_slot == 0 + } +} + +impl StoreItem for AnchorInfo { + fn db_column() -> DBColumn { + DBColumn::BeaconMeta + } + + fn as_store_bytes(&self) -> Vec { + self.as_ssz_bytes() + } + + fn from_store_bytes(bytes: &[u8]) -> Result { + Ok(Self::from_ssz_bytes(bytes)?) + } +} diff --git a/beacon_node/store/src/reconstruct.rs b/beacon_node/store/src/reconstruct.rs new file mode 100644 index 00000000000..ff10e642df3 --- /dev/null +++ b/beacon_node/store/src/reconstruct.rs @@ -0,0 +1,160 @@ +//! 
Implementation of historic state reconstruction (given complete block history). +use crate::hot_cold_store::{HotColdDB, HotColdDBError}; +use crate::{Error, ItemStore, KeyValueStore}; +use itertools::{process_results, Itertools}; +use slog::info; +use state_processing::{per_block_processing, per_slot_processing, BlockSignatureStrategy}; +use std::sync::Arc; +use types::{EthSpec, Hash256}; + +impl HotColdDB +where + E: EthSpec, + Hot: KeyValueStore + ItemStore, + Cold: KeyValueStore + ItemStore, +{ + pub fn reconstruct_historic_states(self: &Arc) -> Result<(), Error> { + let mut anchor = if let Some(anchor) = self.get_anchor_info() { + anchor + } else { + // Nothing to do, history is complete. + return Ok(()); + }; + + // Check that all historic blocks are known. + if anchor.oldest_block_slot != 0 { + return Err(Error::MissingHistoricBlocks { + oldest_block_slot: anchor.oldest_block_slot, + }); + } + + info!( + self.log, + "Beginning historic state reconstruction"; + "start_slot" => anchor.state_lower_limit, + ); + + let slots_per_restore_point = self.config.slots_per_restore_point; + + // Iterate blocks from the state lower limit to the upper limit. + let lower_limit_slot = anchor.state_lower_limit; + let split = self.get_split_info(); + let upper_limit_state = self.get_restore_point( + anchor.state_upper_limit.as_u64() / slots_per_restore_point, + &split, + )?; + let upper_limit_slot = upper_limit_state.slot(); + + // Use a dummy root, as we never read the block for the upper limit state. + let upper_limit_block_root = Hash256::repeat_byte(0xff); + + let block_root_iter = Self::forwards_block_roots_iterator( + self.clone(), + lower_limit_slot, + upper_limit_state, + upper_limit_block_root, + &self.spec, + )?; + + // The state to be advanced. + let mut state = self + .load_cold_state_by_slot(lower_limit_slot)? 
+ .ok_or(HotColdDBError::MissingLowerLimitState(lower_limit_slot))?; + + state.build_all_caches(&self.spec)?; + + process_results(block_root_iter, |iter| -> Result<(), Error> { + let mut io_batch = vec![]; + + let mut prev_state_root = None; + + for ((prev_block_root, _), (block_root, slot)) in iter.tuple_windows() { + let is_skipped_slot = prev_block_root == block_root; + + let block = if is_skipped_slot { + None + } else { + Some( + self.get_block(&block_root)? + .ok_or(Error::BlockNotFound(block_root))?, + ) + }; + + // Advance state to slot. + per_slot_processing(&mut state, prev_state_root.take(), &self.spec) + .map_err(HotColdDBError::BlockReplaySlotError)?; + + // Apply block. + if let Some(block) = block { + per_block_processing( + &mut state, + &block, + Some(block_root), + BlockSignatureStrategy::NoVerification, + &self.spec, + ) + .map_err(HotColdDBError::BlockReplayBlockError)?; + + prev_state_root = Some(block.state_root()); + } + + let state_root = prev_state_root + .ok_or(()) + .or_else(|_| state.update_tree_hash_cache())?; + + // Stage state for storage in freezer DB. + self.store_cold_state(&state_root, &state, &mut io_batch)?; + + // If the slot lies on an epoch boundary, commit the batch and update the anchor. + if slot % slots_per_restore_point == 0 || slot + 1 == upper_limit_slot { + info!( + self.log, + "State reconstruction in progress"; + "slot" => slot, + "remaining" => upper_limit_slot - 1 - slot + ); + + self.cold_db.do_atomically(std::mem::take(&mut io_batch))?; + + // Update anchor. + let old_anchor = Some(anchor.clone()); + + if slot + 1 == upper_limit_slot { + // The two limits have met in the middle! We're done! + // Perform one last integrity check on the state reached. 
+ let computed_state_root = state.update_tree_hash_cache()?; + if computed_state_root != state_root { + return Err(Error::StateReconstructionRootMismatch { + slot, + expected: state_root, + computed: computed_state_root, + }); + } + + self.compare_and_set_anchor_info(old_anchor, None)?; + + return Ok(()); + } else { + // The lower limit has been raised, store it. + anchor.state_lower_limit = slot; + + self.compare_and_set_anchor_info(old_anchor, Some(anchor.clone()))?; + } + } + } + + // Should always reach the `upper_limit_slot` and return early above. + Err(Error::StateReconstructionDidNotComplete) + })??; + + // Check that the split point wasn't mutated during the state reconstruction process. + // It shouldn't have been, due to the serialization of requests through the store migrator, + // so this is just a paranoid check. + let latest_split = self.get_split_info(); + if split != latest_split { + return Err(Error::SplitPointModified(latest_split.slot, split.slot)); + } + + Ok(()) + } +} diff --git a/book/src/SUMMARY.md b/book/src/SUMMARY.md index 2bd27e1ef8d..48a76e401fe 100644 --- a/book/src/SUMMARY.md +++ b/book/src/SUMMARY.md @@ -30,6 +30,7 @@ * [Signature Header](./api-vc-sig-header.md) * [Prometheus Metrics](./advanced_metrics.md) * [Advanced Usage](./advanced.md) + * [Checkpoint Sync](./checkpoint-sync.md) * [Custom Data Directories](./advanced-datadir.md) * [Validator Graffiti](./graffiti.md) * [Database Configuration](./advanced_database.md) diff --git a/book/src/advanced_database.md b/book/src/advanced_database.md index 076b66ba372..b562b31dd6d 100644 --- a/book/src/advanced_database.md +++ b/book/src/advanced_database.md @@ -25,16 +25,16 @@ some example values. 
| Use Case | SPRP | Yearly Disk Usage | Load Historical State | | ---------------------- | -------------- | ----------------- | --------------------- | -| Block explorer/analysis | 32 | 411 GB | 96 ms | -| Default | 2048 | 6.4 GB | 6 s | -| Validator only | 8192 | 1.6 GB | 25 s | +| Block explorer/analysis | 32 | 1.4 TB | 155 ms | +| Default | 2048 | 23.1 GB | 10.2 s | +| Validator only | 8192 | 5.7 GB | 41 s | As you can see, it's a high-stakes trade-off! The relationships to disk usage and historical state load time are both linear – doubling SPRP halves disk usage and doubles load time. The minimum SPRP is 32, and the maximum is 8192. The values shown in the table are approximate, calculated using a simple heuristic: each -`BeaconState` consumes around 5MB of disk space, and each block replayed takes around 3ms. The +`BeaconState` consumes around 18MB of disk space, and each block replayed takes around 5ms. The **Yearly Disk Usage** column shows the approx size of the freezer DB _alone_ (hot DB not included), and the **Load Historical State** time is the worst-case load time for a state in the last slot of an epoch. diff --git a/book/src/api-lighthouse.md b/book/src/api-lighthouse.md index bc19d18281e..c25d97ac409 100644 --- a/book/src/api-lighthouse.md +++ b/book/src/api-lighthouse.md @@ -353,4 +353,58 @@ curl -X POST "http://localhost:5052/lighthouse/liveness" -d '{"indices":["0","1" } ] } -``` \ No newline at end of file +``` + +### `/lighthouse/database/info` + +Information about the database's split point and anchor info. 
+ +```bash +curl "http://localhost:5052/lighthouse/database/info" | jq +``` + +```json +{ + "schema_version": 5, + "split": { + "slot": "2034912", + "state_root": "0x11c8516aa7d4d1613e84121e3a557ceca34618b4c1a38f05b66ad045ff82b33b" + }, + "anchor": { + "anchor_slot": "2034720", + "oldest_block_slot": "1958881", + "oldest_block_parent": "0x1fd3d855d03e9df28d8a41a0f9cb9d4c540832b3ca1c3e1d7e09cd75b874cc87", + "state_upper_limit": "2035712", + "state_lower_limit": "0" + } +} +``` + +For more information about the split point, see the [Database Configuration](./advanced_database.md) +docs. + +The `anchor` will be `null` unless the node has been synced with checkpoint sync and state +reconstruction has yet to be completed. For more information +on the specific meanings of these fields see the docs on [Checkpoint +Sync](./checkpoint-sync.md#reconstructing-states). + +### `/lighthouse/database/reconstruct` + +Instruct Lighthouse to begin reconstructing historic states, see +[Reconstructing States](./checkpoint-sync.md#reconstructing-states). This is an alternative +to the `--reconstruct-historic-states` flag. + +``` +curl -X POST "http://localhost:5052/lighthouse/database/reconstruct" | jq +``` + +```json +"success" +``` + +The endpoint will return immediately. See the beacon node logs for an indication of progress. + +### `/lighthouse/database/historical_blocks` + +Manually provide `SignedBeaconBlock`s to backfill the database. This is intended +for use by Lighthouse developers during testing only. \ No newline at end of file diff --git a/book/src/checkpoint-sync.md b/book/src/checkpoint-sync.md new file mode 100644 index 00000000000..4bdddbc7d66 --- /dev/null +++ b/book/src/checkpoint-sync.md @@ -0,0 +1,148 @@ +# Checkpoint Sync + +Lighthouse supports syncing from a recent finalized checkpoint. This is substantially faster +than syncing from genesis, while still providing all the same features. 
+ +If you would like to quickly get started with checkpoint sync, read the sections below on: + +1. [Automatic Checkpoint Sync](#automatic-checkpoint-sync) +2. [Backfilling Blocks](#backfilling-blocks) + +The remaining sections are for more advanced use-cases (archival nodes). + +## Automatic Checkpoint Sync + +To begin checkpoint sync you will need HTTP API access to another synced beacon node. Enable +checkpoint sync by providing the other beacon node's URL to `--checkpoint-sync-url`, alongside any +other flags: + +``` +lighthouse bn --checkpoint-sync-url "http://remote-bn:5052" ... +``` + +Lighthouse will print a message to indicate that checkpoint sync is being used: + +``` +INFO Starting checkpoint sync remote_url: http://remote-bn:5052/, service: beacon +``` + +After a short time (usually less than a minute), it will log the details of the checkpoint +loaded from the remote beacon node: + +``` +INFO Loaded checkpoint block and state state_root: 0xe8252c68784a8d5cc7e5429b0e95747032dd1dcee0d1dc9bdaf6380bf90bc8a6, block_root: 0x5508a20147299b1a7fe9dbea1a8b3bf979f74c52e7242039bd77cbff62c0695a, slot: 2034720, service: beacon +``` + +> **Security Note**: You should cross-reference the `block_root` and `slot` of the loaded checkpoint +> against a trusted source like a friend's node, or a block explorer. + +Once the checkpoint is loaded Lighthouse will sync forwards to the head of the chain. + +If a validator client is connected to the node then it will be able to start completing its duties +as soon as forwards sync completes. + +## Backfilling Blocks + +Once forwards sync completes, Lighthouse will commence a "backfill sync" to download the blocks +from the checkpoint back to genesis. 
+ +The beacon node will log messages similar to the following each minute while it completes backfill +sync: + +``` +INFO Downloading historical blocks est_time: 5 hrs 0 mins, speed: 111.96 slots/sec, distance: 2020451 slots (40 weeks 0 days), service: slot_notifier +``` + +Once backfill is complete, an `INFO Historical block download complete` log will be emitted. + +## FAQ + +1. What if I have an existing database? How can I use checkpoint sync? + +The existing beacon database needs to be deleted before Lighthouse will attempt checkpoint sync. +You can do this by providing the `--purge-db` flag, or by manually deleting `<DATADIR>/beacon`. + +2. Why is checkpoint sync faster? + +Checkpoint sync prioritises syncing to the head of the chain quickly so that the node can perform +its duties. Additionally, it only has to perform lightweight verification of historic blocks: +it checks the hash chain integrity & proposer signature rather than computing the full state +transition. + +3. Is checkpoint sync less secure? + +No, in fact it is more secure! Checkpoint sync guards against long-range attacks that +genesis sync does not. This is due to a property of Proof of Stake consensus known as [Weak +Subjectivity][weak-subj]. + +## Reconstructing States + +> This section is only relevant if you are interested in running an archival node for analysis +> purposes. + +After completing backfill sync the node's database will differ from a genesis-synced node in the +lack of historic states. _You do not need these states to run a staking node_, but they are required +for historical API calls (as used by block explorers and researchers). + +You can opt-in to reconstructing all of the historic states by providing the +`--reconstruct-historic-states` flag to the beacon node at any point (before, during or after sync). 
+ +The database keeps track of three markers to determine the availability of historic blocks and +states: + +* `oldest_block_slot`: All blocks with slots greater than or equal to this value are available in the + database. Additionally, the genesis block is always available. +* `state_lower_limit`: All states with slots _less than or equal to_ this value are available in + the database. The minimum value is 0, indicating that the genesis state is always available. +* `state_upper_limit`: All states with slots _greater than or equal to_ this value are available + in the database. + +Reconstruction runs from the state lower limit to the upper limit, narrowing the window of +unavailable states as it goes. It will log messages like the following to show its progress: + +``` +INFO State reconstruction in progress remaining: 747519, slot: 466944, service: freezer_db +``` + +Important information to be aware of: + +* Reconstructed states will consume several gigabytes or hundreds of gigabytes of disk space, + depending on the [database configuration used](./advanced_database.md). +* Reconstruction will only begin once backfill sync has completed and `oldest_block_slot` is + equal to 0. +* While reconstruction is running the node will temporarily pause migrating new data to the + freezer database. This will lead to the database increasing in size temporarily (by a few GB per + day) until state reconstruction completes. +* It is safe to interrupt state reconstruction by gracefully terminating the node – it will pick up + from where it left off when it restarts. +* You can start reconstruction from the HTTP API, and view its progress. See the + [`/lighthouse/database`](./api-lighthouse.md) APIs. + +For more information on historic state storage see the +[Database Configuration](./advanced_database.md) page. + +## Manual Checkpoint Sync + +> This section is only relevant if you want to manually provide the checkpoint state and +> block instead of fetching them from a URL. 
+ +To manually specify a checkpoint use the following two flags: + +* `--checkpoint-state`: accepts an SSZ-encoded `BeaconState` blob +* `--checkpoint-block`: accepts an SSZ-encoded `SignedBeaconBlock` blob + +_Both_ the state and block must be provided and **must** adhere to the [Alignment +Requirements](#alignment-requirements) described below. + +### Alignment Requirements + +* The block must be a finalized block from an epoch boundary, i.e. `block.slot() % 32 == 0`. +* The state must be the state corresponding to `block` with `state.slot() == block.slot()` + and `state.hash_tree_root() == block.state_root()`. + +These requirements are imposed to align with Lighthouse's database schema, and notably exclude +finalized blocks from skipped slots. You can avoid alignment issues by using +[Automatic Checkpoint Sync](#automatic-checkpoint-sync), which will search for a suitable block +and state pair. + +[weak-subj]: https://blog.ethereum.org/2014/11/25/proof-stake-learned-love-weak-subjectivity/ diff --git a/common/eth2/Cargo.toml b/common/eth2/Cargo.toml index dcd21359dbe..382a8b79b17 100644 --- a/common/eth2/Cargo.toml +++ b/common/eth2/Cargo.toml @@ -26,6 +26,7 @@ eth2_ssz = "0.3.0" eth2_ssz_derive = "0.2.1" futures-util = "0.3.8" futures = "0.3.8" +store = { path = "../../beacon_node/store", optional = true } [target.'cfg(target_os = "linux")'.dependencies] psutil = { version = "3.2.0", optional = true } @@ -33,4 +34,4 @@ procinfo = { version = "0.4.2", optional = true } [features] default = ["lighthouse"] -lighthouse = ["proto_array", "psutil", "procinfo"] +lighthouse = ["proto_array", "psutil", "procinfo", "store"] diff --git a/common/eth2/src/lib.rs b/common/eth2/src/lib.rs index 528360cf41c..1f38f965338 100644 --- a/common/eth2/src/lib.rs +++ b/common/eth2/src/lib.rs @@ -262,17 +262,23 @@ impl BeaconNodeHttpClient { /// Perform a HTTP POST request. 
async fn post(&self, url: U, body: &T) -> Result<(), Error> { - let response = self - .client - .post(url) - .json(body) - .send() - .await - .map_err(Error::Reqwest)?; - ok_or_error(response).await?; + self.post_generic(url, body, None).await?; Ok(()) } + /// Perform a HTTP POST request, returning a JSON response. + async fn post_with_response( + &self, + url: U, + body: &T, + ) -> Result { + self.post_generic(url, body, None) + .await? + .json() + .await + .map_err(Error::Reqwest) + } + /// Perform a HTTP POST request with a custom timeout. async fn post_with_timeout( &self, @@ -280,15 +286,7 @@ impl BeaconNodeHttpClient { body: &T, timeout: Duration, ) -> Result<(), Error> { - let response = self - .client - .post(url) - .timeout(timeout) - .json(body) - .send() - .await - .map_err(Error::Reqwest)?; - ok_or_error(response).await?; + self.post_generic(url, body, Some(timeout)).await?; Ok(()) } @@ -299,21 +297,28 @@ impl BeaconNodeHttpClient { body: &V, timeout: Duration, ) -> Result { - let response = self - .client - .post(url) - .timeout(timeout) - .json(body) - .send() - .await - .map_err(Error::Reqwest)?; - ok_or_error(response) + self.post_generic(url, body, Some(timeout)) .await? .json() .await .map_err(Error::Reqwest) } + /// Generic POST function supporting arbitrary responses and timeouts. 
+ async fn post_generic( + &self, + url: U, + body: &T, + timeout: Option, + ) -> Result { + let mut builder = self.client.post(url); + if let Some(timeout) = timeout { + builder = builder.timeout(timeout); + } + let response = builder.json(body).send().await.map_err(Error::Reqwest)?; + ok_or_error(response).await + } + /// `GET beacon/genesis` /// /// ## Errors diff --git a/common/eth2/src/lighthouse.rs b/common/eth2/src/lighthouse.rs index 70c5fa2b32e..68bd94592ad 100644 --- a/common/eth2/src/lighthouse.rs +++ b/common/eth2/src/lighthouse.rs @@ -9,6 +9,7 @@ use proto_array::core::ProtoArray; use reqwest::IntoUrl; use serde::{Deserialize, Serialize}; use ssz_derive::{Decode, Encode}; +use store::{AnchorInfo, Split}; pub use eth2_libp2p::{types::SyncState, PeerInfo}; @@ -311,6 +312,13 @@ impl Eth1Block { } } +#[derive(Debug, Serialize, Deserialize)] +pub struct DatabaseInfo { + pub schema_version: u64, + pub split: Split, + pub anchor: Option, +} + impl BeaconNodeHttpClient { /// Perform a HTTP GET request, returning `None` on a 404 error. async fn get_bytes_opt(&self, url: U) -> Result>, Error> { @@ -490,4 +498,30 @@ impl BeaconNodeHttpClient { self.get_opt::<(), _>(path).await.map(|opt| opt.is_some()) } + + /// `GET lighthouse/database/info` + pub async fn get_lighthouse_database_info(&self) -> Result { + let mut path = self.server.full.clone(); + + path.path_segments_mut() + .map_err(|()| Error::InvalidUrl(self.server.clone()))? + .push("lighthouse") + .push("database") + .push("info"); + + self.get(path).await + } + + /// `POST lighthouse/database/reconstruct` + pub async fn post_lighthouse_database_reconstruct(&self) -> Result { + let mut path = self.server.full.clone(); + + path.path_segments_mut() + .map_err(|()| Error::InvalidUrl(self.server.clone()))? 
+ .push("lighthouse") + .push("database") + .push("reconstruct"); + + self.post_with_response(path, &()).await + } } diff --git a/consensus/fork_choice/tests/tests.rs b/consensus/fork_choice/tests/tests.rs index 5e78bde286e..61e6d56ea80 100644 --- a/consensus/fork_choice/tests/tests.rs +++ b/consensus/fork_choice/tests/tests.rs @@ -983,7 +983,7 @@ fn weak_subjectivity_fail_on_startup() { let chain_config = ChainConfig { weak_subjectivity_checkpoint: Some(Checkpoint { epoch, root }), - import_max_skip_slots: None, + ..ChainConfig::default() }; ForkChoiceTest::new_with_chain_config(chain_config); @@ -996,7 +996,7 @@ fn weak_subjectivity_pass_on_startup() { let chain_config = ChainConfig { weak_subjectivity_checkpoint: Some(Checkpoint { epoch, root }), - import_max_skip_slots: None, + ..ChainConfig::default() }; ForkChoiceTest::new_with_chain_config(chain_config) @@ -1021,7 +1021,7 @@ fn weak_subjectivity_check_passes() { let chain_config = ChainConfig { weak_subjectivity_checkpoint: Some(checkpoint), - import_max_skip_slots: None, + ..ChainConfig::default() }; ForkChoiceTest::new_with_chain_config(chain_config.clone()) @@ -1051,7 +1051,7 @@ fn weak_subjectivity_check_fails_early_epoch() { let chain_config = ChainConfig { weak_subjectivity_checkpoint: Some(checkpoint), - import_max_skip_slots: None, + ..ChainConfig::default() }; ForkChoiceTest::new_with_chain_config(chain_config.clone()) @@ -1080,7 +1080,7 @@ fn weak_subjectivity_check_fails_late_epoch() { let chain_config = ChainConfig { weak_subjectivity_checkpoint: Some(checkpoint), - import_max_skip_slots: None, + ..ChainConfig::default() }; ForkChoiceTest::new_with_chain_config(chain_config.clone()) @@ -1109,7 +1109,7 @@ fn weak_subjectivity_check_fails_incorrect_root() { let chain_config = ChainConfig { weak_subjectivity_checkpoint: Some(checkpoint), - import_max_skip_slots: None, + ..ChainConfig::default() }; ForkChoiceTest::new_with_chain_config(chain_config.clone()) @@ -1145,7 +1145,7 @@ fn 
weak_subjectivity_check_epoch_boundary_is_skip_slot() { let chain_config = ChainConfig { weak_subjectivity_checkpoint: Some(checkpoint), - import_max_skip_slots: None, + ..ChainConfig::default() }; // recreate the chain exactly @@ -1186,7 +1186,7 @@ fn weak_subjectivity_check_epoch_boundary_is_skip_slot_failure() { let chain_config = ChainConfig { weak_subjectivity_checkpoint: Some(checkpoint), - import_max_skip_slots: None, + ..ChainConfig::default() }; // recreate the chain exactly diff --git a/consensus/state_processing/src/per_block_processing.rs b/consensus/state_processing/src/per_block_processing.rs index 937d0479401..1681f194f23 100644 --- a/consensus/state_processing/src/per_block_processing.rs +++ b/consensus/state_processing/src/per_block_processing.rs @@ -11,7 +11,7 @@ pub use self::verify_attester_slashing::{ }; pub use self::verify_proposer_slashing::verify_proposer_slashing; pub use altair::sync_committee::process_sync_aggregate; -pub use block_signature_verifier::BlockSignatureVerifier; +pub use block_signature_verifier::{BlockSignatureVerifier, ParallelSignatureSets}; pub use is_valid_indexed_attestation::is_valid_indexed_attestation; pub use process_operations::process_operations; pub use verify_attestation::{ diff --git a/consensus/state_processing/src/per_block_processing/block_signature_verifier.rs b/consensus/state_processing/src/per_block_processing/block_signature_verifier.rs index e2a019fcc51..3e7a799341f 100644 --- a/consensus/state_processing/src/per_block_processing/block_signature_verifier.rs +++ b/consensus/state_processing/src/per_block_processing/block_signature_verifier.rs @@ -73,9 +73,20 @@ where decompressor: D, state: &'a BeaconState, spec: &'a ChainSpec, + sets: ParallelSignatureSets<'a>, +} + +#[derive(Default)] +pub struct ParallelSignatureSets<'a> { sets: Vec>, } +impl<'a> From>> for ParallelSignatureSets<'a> { + fn from(sets: Vec>) -> Self { + Self { sets } + } +} + impl<'a, T, F, D> BlockSignatureVerifier<'a, T, F, D> where 
T: EthSpec, @@ -95,7 +106,7 @@ where decompressor, state, spec, - sets: vec![], + sets: ParallelSignatureSets::default(), } } @@ -119,36 +130,6 @@ where verifier.verify() } - /// Verify all* the signatures that have been included in `self`, returning `Ok(())` if the - /// signatures are all valid. - /// - /// ## Notes - /// - /// Signature validation will take place in accordance to the [Faster verification of multiple - /// BLS signatures](https://ethresear.ch/t/fast-verification-of-multiple-bls-signatures/5407) - /// optimization proposed by Vitalik Buterin. - /// - /// It is not possible to know exactly _which_ signature is invalid here, just that - /// _at least one_ was invalid. - /// - /// Uses `rayon` to do a map-reduce of Vitalik's method across multiple cores. - pub fn verify(self) -> Result<()> { - let num_sets = self.sets.len(); - let num_chunks = std::cmp::max(1, num_sets / rayon::current_num_threads()); - let result: bool = self - .sets - .into_par_iter() - .chunks(num_chunks) - .map(|chunk| verify_signature_sets(chunk.iter())) - .reduce(|| true, |current, this| current && this); - - if result { - Ok(()) - } else { - Err(Error::SignatureInvalid) - } - } - /// Includes all signatures on the block (except the deposit signatures) for verification. pub fn include_all_signatures( &mut self, @@ -210,6 +191,7 @@ where /// Includes all signatures in `self.block.body.proposer_slashings` for verification. pub fn include_proposer_slashings(&mut self, block: &'a SignedBeaconBlock) -> Result<()> { self.sets + .sets .reserve(block.message().body().proposer_slashings().len() * 2); block @@ -235,6 +217,7 @@ where /// Includes all signatures in `self.block.body.attester_slashings` for verification. 
pub fn include_attester_slashings(&mut self, block: &'a SignedBeaconBlock) -> Result<()> { self.sets + .sets .reserve(block.message().body().attester_slashings().len() * 2); block @@ -263,6 +246,7 @@ where block: &'a SignedBeaconBlock, ) -> Result>> { self.sets + .sets .reserve(block.message().body().attestations().len()); block @@ -298,6 +282,7 @@ where /// Includes all signatures in `self.block.body.voluntary_exits` for verification. pub fn include_exits(&mut self, block: &'a SignedBeaconBlock) -> Result<()> { self.sets + .sets .reserve(block.message().body().voluntary_exits().len()); block @@ -331,4 +316,46 @@ where } Ok(()) } + + /// Verify all the signatures that have been included in `self`, returning `Ok(())` if and only if + /// all the signatures are valid. + /// + /// See `ParallelSignatureSets::verify` for more info. + pub fn verify(self) -> Result<()> { + if self.sets.verify() { + Ok(()) + } else { + Err(Error::SignatureInvalid) + } + } +} + +impl<'a> ParallelSignatureSets<'a> { + pub fn push(&mut self, set: SignatureSet<'a>) { + self.sets.push(set); + } + + /// Verify all the signatures that have been included in `self`, returning `true` if and only if + /// all the signatures are valid. + /// + /// ## Notes + /// + /// Signature validation will take place in accordance to the [Faster verification of multiple + /// BLS signatures](https://ethresear.ch/t/fast-verification-of-multiple-bls-signatures/5407) + /// optimization proposed by Vitalik Buterin. + /// + /// It is not possible to know exactly _which_ signature is invalid here, just that + /// _at least one_ was invalid. + /// + /// Uses `rayon` to do a map-reduce of Vitalik's method across multiple cores. 
+ #[must_use] + pub fn verify(self) -> bool { + let num_sets = self.sets.len(); + let num_chunks = std::cmp::max(1, num_sets / rayon::current_num_threads()); + self.sets + .into_par_iter() + .chunks(num_chunks) + .map(|chunk| verify_signature_sets(chunk.iter())) + .reduce(|| true, |current, this| current && this) + } } diff --git a/consensus/state_processing/src/per_block_processing/signature_sets.rs b/consensus/state_processing/src/per_block_processing/signature_sets.rs index 7de7d7d99a0..a3b8dcaf214 100644 --- a/consensus/state_processing/src/per_block_processing/signature_sets.rs +++ b/consensus/state_processing/src/per_block_processing/signature_sets.rs @@ -81,26 +81,56 @@ where T: EthSpec, F: Fn(usize) -> Option>, { - // Verify that the `SignedBeaconBlock` instantiation matches the fork at `signed_block.slot()`. - signed_block - .fork_name(spec) - .map_err(Error::InconsistentBlockFork)?; - let block = signed_block.message(); - let proposer_index = state.get_beacon_proposer_index(block.slot(), spec)?; + let proposer_index = state.get_beacon_proposer_index(block.slot(), spec)? as u64; - if proposer_index as u64 != block.proposer_index() { + if proposer_index != block.proposer_index() { return Err(Error::IncorrectBlockProposer { block: block.proposer_index(), - local_shuffling: proposer_index as u64, + local_shuffling: proposer_index, }); } + block_proposal_signature_set_from_parts( + signed_block, + block_root, + proposer_index, + &state.fork(), + state.genesis_validators_root(), + get_pubkey, + spec, + ) +} + +/// A signature set that is valid if a block was signed by the expected block producer. +/// +/// Unlike `block_proposal_signature_set` this does **not** check that the proposer index is +/// correct according to the shuffling. It should only be used if no suitable `BeaconState` is +/// available. 
+pub fn block_proposal_signature_set_from_parts<'a, T, F>( + signed_block: &'a SignedBeaconBlock, + block_root: Option, + proposer_index: u64, + fork: &Fork, + genesis_validators_root: Hash256, + get_pubkey: F, + spec: &'a ChainSpec, +) -> Result> +where + T: EthSpec, + F: Fn(usize) -> Option>, +{ + // Verify that the `SignedBeaconBlock` instantiation matches the fork at `signed_block.slot()`. + signed_block + .fork_name(spec) + .map_err(Error::InconsistentBlockFork)?; + + let block = signed_block.message(); let domain = spec.get_domain( block.slot().epoch(T::slots_per_epoch()), Domain::BeaconProposer, - &state.fork(), - state.genesis_validators_root(), + fork, + genesis_validators_root, ); let message = if let Some(root) = block_root { @@ -115,7 +145,7 @@ where Ok(SignatureSet::single_pubkey( signed_block.signature(), - get_pubkey(proposer_index).ok_or_else(|| Error::ValidatorUnknown(proposer_index as u64))?, + get_pubkey(proposer_index as usize).ok_or(Error::ValidatorUnknown(proposer_index))?, message, )) } diff --git a/consensus/types/Cargo.toml b/consensus/types/Cargo.toml index f539c60c311..8832dcfc427 100644 --- a/consensus/types/Cargo.toml +++ b/consensus/types/Cargo.toml @@ -51,6 +51,7 @@ serde_json = "1.0.58" criterion = "0.3.3" beacon_chain = { path = "../../beacon_node/beacon_chain" } eth2_interop_keypairs = { path = "../../common/eth2_interop_keypairs" } +state_processing = { path = "../state_processing" } [features] default = ["sqlite", "legacy-arith"] diff --git a/consensus/types/src/beacon_block.rs b/consensus/types/src/beacon_block.rs index 74bdfe44df8..9575e0b599e 100644 --- a/consensus/types/src/beacon_block.rs +++ b/consensus/types/src/beacon_block.rs @@ -197,6 +197,11 @@ impl<'a, T: EthSpec> BeaconBlockRef<'a, T> { } } + /// Returns the epoch corresponding to `self.slot()`. + pub fn epoch(&self) -> Epoch { + self.slot().epoch(T::slots_per_epoch()) + } + /// Returns a full `BeaconBlockHeader` of this block. 
pub fn block_header(&self) -> BeaconBlockHeader { BeaconBlockHeader { diff --git a/consensus/types/src/beacon_state/tests.rs b/consensus/types/src/beacon_state/tests.rs index 9e069a8ca41..f8f2447aaf1 100644 --- a/consensus/types/src/beacon_state/tests.rs +++ b/consensus/types/src/beacon_state/tests.rs @@ -1,15 +1,21 @@ #![cfg(test)] use crate::test_utils::*; +use crate::test_utils::{SeedableRng, XorShiftRng}; use beacon_chain::store::config::StoreConfig; -use beacon_chain::test_utils::{BeaconChainHarness, EphemeralHarnessType}; +use beacon_chain::test_utils::{ + interop_genesis_state, test_spec, BeaconChainHarness, EphemeralHarnessType, +}; use beacon_chain::types::{ test_utils::TestRandom, BeaconState, BeaconStateAltair, BeaconStateBase, BeaconStateError, ChainSpec, CloneConfig, Domain, Epoch, EthSpec, FixedVector, Hash256, Keypair, MainnetEthSpec, MinimalEthSpec, RelativeEpoch, Slot, }; +use safe_arith::SafeArith; use ssz::{Decode, Encode}; +use state_processing::per_slot_processing; use std::ops::Mul; use swap_or_not_shuffle::compute_shuffled_index; +use tree_hash::TreeHash; pub const MAX_VALIDATOR_COUNT: usize = 129; pub const SLOT_OFFSET: Slot = Slot::new(1); @@ -489,9 +495,6 @@ fn decode_base_and_altair() { #[test] fn tree_hash_cache_linear_history() { - use crate::test_utils::{SeedableRng, XorShiftRng}; - use tree_hash::TreeHash; - let mut rng = XorShiftRng::from_seed([42; 16]); let mut state: BeaconState = @@ -545,3 +548,59 @@ fn tree_hash_cache_linear_history() { let root = state.update_tree_hash_cache().unwrap(); assert_eq!(root.as_bytes(), &state.tree_hash_root()[..]); } + +// Check how the cache behaves when there's a distance larger than `SLOTS_PER_HISTORICAL_ROOT` +// since its last update. +#[test] +fn tree_hash_cache_linear_history_long_skip() { + let validator_count = 128; + let keypairs = generate_deterministic_keypairs(validator_count); + + let spec = &test_spec::(); + + // This state has a cache that advances normally each slot. 
+ let mut state: BeaconState = interop_genesis_state(&keypairs, 0, spec).unwrap(); + + state.update_tree_hash_cache().unwrap(); + + // This state retains its original cache until it is updated after a long skip. + let mut original_cache_state = state.clone(); + assert!(original_cache_state.tree_hash_cache().is_initialized()); + + // Advance the states to a slot beyond the historical state root limit, using the state root + // from the first state to avoid touching the original state's cache. + let start_slot = state.slot(); + let target_slot = start_slot + .safe_add(MinimalEthSpec::slots_per_historical_root() as u64 + 1) + .unwrap(); + + let mut prev_state_root; + while state.slot() < target_slot { + prev_state_root = state.update_tree_hash_cache().unwrap(); + per_slot_processing(&mut state, None, spec).unwrap(); + per_slot_processing(&mut original_cache_state, Some(prev_state_root), spec).unwrap(); + } + + // The state with the original cache should still be initialized at the starting slot. + assert_eq!( + original_cache_state + .tree_hash_cache() + .initialized_slot() + .unwrap(), + start_slot + ); + + // Updating the tree hash cache should be successful despite the long skip. + assert_eq!( + original_cache_state.update_tree_hash_cache().unwrap(), + state.update_tree_hash_cache().unwrap() + ); + + assert_eq!( + original_cache_state + .tree_hash_cache() + .initialized_slot() + .unwrap(), + target_slot + ); +} diff --git a/consensus/types/src/beacon_state/tree_hash_cache.rs b/consensus/types/src/beacon_state/tree_hash_cache.rs index fc14e9b18f4..35c2ac1c3e9 100644 --- a/consensus/types/src/beacon_state/tree_hash_cache.rs +++ b/consensus/types/src/beacon_state/tree_hash_cache.rs @@ -118,6 +118,13 @@ impl BeaconTreeHashCache { pub fn uninitialize(&mut self) { self.inner = None; } + + /// Return the slot at which the cache was last updated. + /// + /// This should probably only be used during testing. 
+ pub fn initialized_slot(&self) -> Option { + Some(self.inner.as_ref()?.previous_state?.1) + } } #[derive(Debug, PartialEq, Clone)] @@ -206,7 +213,8 @@ impl BeaconTreeHashCacheInner { /// Updates the cache and returns the tree hash root for the given `state`. /// /// The provided `state` should be a descendant of the last `state` given to this function, or - /// the `Self::new` function. + /// the `Self::new` function. If the state is more than `SLOTS_PER_HISTORICAL_ROOT` slots + /// after `self.previous_state` then the whole cache will be re-initialized. pub fn recalculate_tree_hash_root(&mut self, state: &BeaconState) -> Result { // If this cache has previously produced a root, ensure that it is in the state root // history of this state. @@ -224,10 +232,15 @@ impl BeaconTreeHashCacheInner { } // If the state is newer, the previous root must be in the history of the given state. - if previous_slot < state.slot() - && *state.get_state_root(previous_slot)? != previous_root - { - return Err(Error::NonLinearTreeHashCacheHistory); + // If the previous slot is out of range of the `state_roots` array (indicating a long + // gap between the cache's last use and the current state) then we re-initialize. 
+ match state.get_state_root(previous_slot) { + Ok(state_previous_root) if *state_previous_root == previous_root => {} + Ok(_) => return Err(Error::NonLinearTreeHashCacheHistory), + Err(Error::SlotOutOfBounds) => { + *self = Self::new(state); + } + Err(e) => return Err(e), } } diff --git a/lcli/Cargo.toml b/lcli/Cargo.toml index 96c8a0953e2..c5a1db44403 100644 --- a/lcli/Cargo.toml +++ b/lcli/Cargo.toml @@ -15,7 +15,8 @@ hex = "0.4.2" log = "0.4.11" serde = "1.0.116" serde_yaml = "0.8.13" -simple_logger = "1.10.0" +serde_json = "1.0.66" +env_logger = "0.9.0" types = { path = "../consensus/types" } state_processing = { path = "../consensus/state_processing" } eth2_ssz = "0.3.0" diff --git a/lcli/src/main.rs b/lcli/src/main.rs index 7a7b04d77f1..b42a239aafa 100644 --- a/lcli/src/main.rs +++ b/lcli/src/main.rs @@ -16,7 +16,6 @@ mod transition_blocks; use clap::{App, Arg, ArgMatches, SubCommand}; use clap_utils::parse_path_with_default_in_home_dir; use environment::EnvironmentBuilder; -use log::LevelFilter; use parse_ssz::run_parse_ssz; use std::path::PathBuf; use std::process; @@ -25,10 +24,7 @@ use transition_blocks::run_transition_blocks; use types::{EthSpec, EthSpecId}; fn main() { - simple_logger::SimpleLogger::new() - .with_level(LevelFilter::Info) - .init() - .expect("Logger should be initialised"); + env_logger::init(); let matches = App::new("Lighthouse CLI Tool") .version(lighthouse_version::VERSION) @@ -110,6 +106,17 @@ fn main() { .subcommand( SubCommand::with_name("pretty-ssz") .about("Parses SSZ-encoded data from a file") + .arg( + Arg::with_name("format") + .short("f") + .long("format") + .value_name("FORMAT") + .takes_value(true) + .required(true) + .default_value("json") + .possible_values(&["json", "yaml"]) + .help("Output format to use") + ) .arg( Arg::with_name("type") .value_name("TYPE") @@ -123,7 +130,7 @@ fn main() { .takes_value(true) .required(true) .help("Path to SSZ bytes"), - ), + ) ) .subcommand( 
SubCommand::with_name("deploy-deposit-contract") diff --git a/lcli/src/parse_ssz.rs b/lcli/src/parse_ssz.rs index 34676616d52..5b3c5751661 100644 --- a/lcli/src/parse_ssz.rs +++ b/lcli/src/parse_ssz.rs @@ -1,13 +1,33 @@ use clap::ArgMatches; +use clap_utils::parse_required; use serde::Serialize; use ssz::Decode; use std::fs::File; use std::io::Read; +use std::str::FromStr; use types::*; +enum OutputFormat { + Json, + Yaml, +} + +impl FromStr for OutputFormat { + type Err = String; + + fn from_str(s: &str) -> Result { + match s { + "json" => Ok(Self::Json), + "yaml" => Ok(Self::Yaml), + _ => Err(format!("Invalid output format \"{}\"", s)), + } + } +} + pub fn run_parse_ssz(matches: &ArgMatches) -> Result<(), String> { let type_str = matches.value_of("type").ok_or("No type supplied")?; let filename = matches.value_of("ssz-file").ok_or("No file supplied")?; + let format = parse_required(matches, "format")?; let mut bytes = vec![]; let mut file = @@ -19,24 +39,40 @@ pub fn run_parse_ssz(matches: &ArgMatches) -> Result<(), String> { info!("Type: {:?}", type_str); match type_str { - "block_base" => decode_and_print::>(&bytes)?, - "block_altair" => decode_and_print::>(&bytes)?, - "state_base" => decode_and_print::>(&bytes)?, - "state_altair" => decode_and_print::>(&bytes)?, + "signed_block_base" => decode_and_print::>(&bytes, format)?, + "signed_block_altair" => decode_and_print::>(&bytes, format)?, + "block_base" => decode_and_print::>(&bytes, format)?, + "block_altair" => decode_and_print::>(&bytes, format)?, + "state_base" => decode_and_print::>(&bytes, format)?, + "state_altair" => decode_and_print::>(&bytes, format)?, other => return Err(format!("Unknown type: {}", other)), }; Ok(()) } -fn decode_and_print(bytes: &[u8]) -> Result<(), String> { +fn decode_and_print( + bytes: &[u8], + output_format: OutputFormat, +) -> Result<(), String> { let item = T::from_ssz_bytes(bytes).map_err(|e| format!("SSZ decode failed: {:?}", e))?; - println!( - "{}", - 
serde_yaml::to_string(&item) - .map_err(|e| format!("Unable to write object to YAML: {:?}", e))? - ); + match output_format { + OutputFormat::Json => { + println!( + "{}", + serde_json::to_string(&item) + .map_err(|e| format!("Unable to write object to JSON: {:?}", e))? + ); + } + OutputFormat::Yaml => { + println!( + "{}", + serde_yaml::to_string(&item) + .map_err(|e| format!("Unable to write object to YAML: {:?}", e))? + ); + } + } Ok(()) } diff --git a/lighthouse/tests/beacon_node.rs b/lighthouse/tests/beacon_node.rs index af7f847c896..269a4d75b6e 100644 --- a/lighthouse/tests/beacon_node.rs +++ b/lighthouse/tests/beacon_node.rs @@ -689,6 +689,19 @@ fn compact_db_flag() { .run() .with_config(|config| assert!(config.store.compact_on_init)); } +#[test] +fn reconstruct_historic_states_flag() { + CommandLineTest::new() + .flag("reconstruct-historic-states", None) + .run() + .with_config(|config| assert!(config.chain.reconstruct_historic_states)); +} +#[test] +fn no_reconstruct_historic_states_flag() { + CommandLineTest::new() + .run() + .with_config(|config| assert!(!config.chain.reconstruct_historic_states)); +} // Tests for Slasher flags. #[test]