diff --git a/core/lib/zksync_core/src/consistency_checker/mod.rs b/core/lib/zksync_core/src/consistency_checker/mod.rs index 768684e5fbaf..4632343982bb 100644 --- a/core/lib/zksync_core/src/consistency_checker/mod.rs +++ b/core/lib/zksync_core/src/consistency_checker/mod.rs @@ -304,6 +304,7 @@ impl ConsistencyChecker { } L1DataMismatchBehavior::Log => { tracing::warn!("L1 Batch #{batch_number} is inconsistent with L1"); + batch_number += 1; // We don't want to infinitely loop failing the check on the same batch } }, Err(CheckError::Web3(err)) => { diff --git a/core/lib/zksync_core/src/reorg_detector/mod.rs b/core/lib/zksync_core/src/reorg_detector/mod.rs index c399ed4c488b..94bdcb9deedc 100644 --- a/core/lib/zksync_core/src/reorg_detector/mod.rs +++ b/core/lib/zksync_core/src/reorg_detector/mod.rs @@ -32,6 +32,13 @@ enum HashMatchError { Using an earlier snapshot could help." )] EarliestHashMismatch(L1BatchNumber), + #[error( + "Unrecoverable error: the earliest L1 batch #{0} in the local DB \ + is truncated on the main node. Make sure you're connected to the right network; \ + if you've recovered from a snapshot, re-check snapshot authenticity. \ + Using an earlier snapshot could help." + )] + EarliestL1BatchTruncated(L1BatchNumber), #[error("Internal error")] Internal(#[from] anyhow::Error), } @@ -48,6 +55,10 @@ fn is_transient_err(err: &RpcError) -> bool { #[async_trait] trait MainNodeClient: fmt::Debug + Send + Sync { + async fn sealed_miniblock_number(&self) -> Result; + + async fn sealed_l1_batch_number(&self) -> Result; + async fn miniblock_hash(&self, number: MiniblockNumber) -> Result, RpcError>; async fn l1_batch_root_hash(&self, number: L1BatchNumber) -> Result, RpcError>; @@ -55,6 +66,18 @@ trait MainNodeClient: fmt::Debug + Send + Sync { #[async_trait] impl MainNodeClient for HttpClient { + async fn sealed_miniblock_number(&self) -> Result { + let number = self.get_block_number().await?; + let number = u32::try_from(number).map_err(|err| RpcError::Custom(err.to_owned()))?; + Ok(MiniblockNumber(number)) + } + + async fn sealed_l1_batch_number(&self) -> Result { + let number = self.get_l1_batch_number().await?; + let number = u32::try_from(number).map_err(|err| RpcError::Custom(err.to_owned()))?; + Ok(L1BatchNumber(number)) + } + async fn miniblock_hash(&self, number: MiniblockNumber) -> Result, RpcError> { Ok(self .get_block_by_number(number.0.into(), false) @@ -85,10 +108,38 @@ impl UpdateCorrectBlock for () { last_correct_miniblock: MiniblockNumber, last_correct_l1_batch: L1BatchNumber, ) { - EN_METRICS.last_correct_batch[&CheckerComponent::ReorgDetector] - .set(last_correct_miniblock.0.into()); - EN_METRICS.last_correct_miniblock[&CheckerComponent::ReorgDetector] - .set(last_correct_l1_batch.0.into()); + let last_correct_miniblock = last_correct_miniblock.0.into(); + let prev_checked_miniblock = EN_METRICS.last_correct_miniblock + [&CheckerComponent::ReorgDetector] + .set(last_correct_miniblock); + if prev_checked_miniblock != last_correct_miniblock { + tracing::debug!("No reorg at miniblock #{last_correct_miniblock}"); + } + + let last_correct_l1_batch = last_correct_l1_batch.0.into(); + let prev_checked_l1_batch = EN_METRICS.last_correct_batch[&CheckerComponent::ReorgDetector] + .set(last_correct_l1_batch); + if prev_checked_l1_batch != last_correct_l1_batch { + tracing::debug!("No reorg at L1 batch #{last_correct_l1_batch}"); + } + } +} + +/// Output of hash match methods in [`ReorgDetector`]. +#[derive(Debug)] +enum MatchOutput { + Match, + Mismatch, + NoRemoteReference, +} + +impl MatchOutput { + fn new(is_match: bool) -> Self { + if is_match { + Self::Match + } else { + Self::Mismatch + } } } @@ -135,7 +186,7 @@ impl ReorgDetector { async fn miniblock_hashes_match( &self, miniblock_number: MiniblockNumber, - ) -> Result { + ) -> Result { let mut storage = self.pool.access_storage().await?; let local_hash = storage .blocks_dal() @@ -151,7 +202,7 @@ impl ReorgDetector { // Due to reorg, locally we may be ahead of the main node. // Lack of the hash on the main node is treated as a hash match, // We need to wait for our knowledge of main node to catch up. - return Ok(true); + return Ok(MatchOutput::NoRemoteReference); }; if remote_hash != local_hash { @@ -160,14 +211,39 @@ impl ReorgDetector { main node {remote_hash:?} (miniblock #{miniblock_number})" ); } - Ok(remote_hash == local_hash) + Ok(MatchOutput::new(remote_hash == local_hash)) + } + + /// Checks hash correspondence for the latest miniblock sealed both locally and on the main node. + async fn check_sealed_miniblock_hash( + &self, + sealed_miniblock_number: MiniblockNumber, + ) -> Result<(MiniblockNumber, bool), HashMatchError> { + let mut main_node_sealed_miniblock_number = sealed_miniblock_number; + loop { + let checked_number = sealed_miniblock_number.min(main_node_sealed_miniblock_number); + match self.miniblock_hashes_match(checked_number).await? { + MatchOutput::Match => break Ok((checked_number, true)), + MatchOutput::Mismatch => break Ok((checked_number, false)), + MatchOutput::NoRemoteReference => { + tracing::info!( + "Main node has no miniblock #{checked_number}; will check last miniblock on the main node" + ); + main_node_sealed_miniblock_number = + self.client.sealed_miniblock_number().await?; + tracing::debug!( + "Fetched last miniblock on the main node: #{main_node_sealed_miniblock_number}" + ); + } + } + } } /// Compares root hashes of the latest local batch and of the same batch from the main node. async fn root_hashes_match( &self, l1_batch_number: L1BatchNumber, - ) -> Result { + ) -> Result { let mut storage = self.pool.access_storage().await?; let local_hash = storage .blocks_dal() @@ -182,7 +258,7 @@ impl ReorgDetector { // Due to reorg, locally we may be ahead of the main node. // Lack of the root hash on the main node is treated as a hash match, // We need to wait for our knowledge of main node to catch up. - return Ok(true); + return Ok(MatchOutput::NoRemoteReference); }; if remote_hash != local_hash { @@ -191,7 +267,37 @@ impl ReorgDetector { main node {remote_hash:?} (L1 batch #{l1_batch_number})" ); } - Ok(remote_hash == local_hash) + Ok(MatchOutput::new(remote_hash == local_hash)) + } + + /// Checks hash correspondence for the latest L1 batch sealed and having metadata both locally and on the main node. + async fn check_sealed_l1_batch_root_hash( + &self, + sealed_l1_batch_number: L1BatchNumber, + ) -> Result<(L1BatchNumber, bool), HashMatchError> { + let mut main_node_sealed_l1_batch_number = sealed_l1_batch_number; + loop { + let checked_number = sealed_l1_batch_number.min(main_node_sealed_l1_batch_number); + match self.root_hashes_match(checked_number).await? { + MatchOutput::Match => break Ok((checked_number, true)), + MatchOutput::Mismatch => break Ok((checked_number, false)), + MatchOutput::NoRemoteReference => { + tracing::info!( + "Main node has no L1 batch #{checked_number}; will check last L1 batch on the main node" + ); + let fetched_number = self.client.sealed_l1_batch_number().await?; + tracing::debug!("Fetched last L1 batch on the main node: #{fetched_number}"); + let number_changed = fetched_number != main_node_sealed_l1_batch_number; + main_node_sealed_l1_batch_number = fetched_number; + + if !number_changed { + // May happen if the main node has an L1 batch, but its state root hash is not computed yet. + tracing::debug!("Last L1 batch number on the main node has not changed; waiting until its state hash is computed"); + tokio::time::sleep(self.sleep_interval / 10).await; + } + } + } + } } /// Localizes a re-org: performs binary search to determine the last non-diverged block. @@ -202,9 +308,16 @@ impl ReorgDetector { ) -> Result { // TODO (BFT-176, BFT-181): We have to look through the whole history, since batch status updater may mark // a block as executed even if the state diverges for it. - binary_search_with(known_valid_l1_batch.0, diverged_l1_batch.0, |number| { - self.root_hashes_match(L1BatchNumber(number)) - }) + binary_search_with( + known_valid_l1_batch.0, + diverged_l1_batch.0, + |number| async move { + Ok(match self.root_hashes_match(L1BatchNumber(number)).await? { + MatchOutput::Match | MatchOutput::NoRemoteReference => true, + MatchOutput::Mismatch => false, + }) + }, + ) .await .map(L1BatchNumber) } @@ -238,10 +351,18 @@ impl ReorgDetector { tracing::debug!( "Checking root hash match for earliest L1 batch #{earliest_l1_batch_number}" ); - if !self.root_hashes_match(earliest_l1_batch_number).await? { - return Err(HashMatchError::EarliestHashMismatch( - earliest_l1_batch_number, - )); + match self.root_hashes_match(earliest_l1_batch_number).await? { + MatchOutput::Match => { /* we're good */ } + MatchOutput::Mismatch => { + return Err(HashMatchError::EarliestHashMismatch( + earliest_l1_batch_number, + )) + } + MatchOutput::NoRemoteReference => { + return Err(HashMatchError::EarliestL1BatchTruncated( + earliest_l1_batch_number, + )) + } } loop { @@ -266,9 +387,12 @@ impl ReorgDetector { miniblock number #{sealed_miniblock_number}" ); - let root_hashes_match = self.root_hashes_match(sealed_l1_batch_number).await?; - let miniblock_hashes_match = - self.miniblock_hashes_match(sealed_miniblock_number).await?; + let (checked_l1_batch_number, root_hashes_match) = self + .check_sealed_l1_batch_root_hash(sealed_l1_batch_number) + .await?; + let (checked_miniblock_number, miniblock_hashes_match) = self + .check_sealed_miniblock_hash(sealed_miniblock_number) + .await?; // The only event that triggers re-org detection and node rollback is if the // hash mismatch at the same block height is detected, be it miniblocks or batches. @@ -278,12 +402,12 @@ impl ReorgDetector { // a re-org taking place. if root_hashes_match && miniblock_hashes_match { self.block_updater - .update_correct_block(sealed_miniblock_number, sealed_l1_batch_number); + .update_correct_block(checked_miniblock_number, checked_l1_batch_number); } else { let diverged_l1_batch_number = if root_hashes_match { - sealed_l1_batch_number + 1 // Non-sealed L1 batch has diverged + checked_l1_batch_number + 1 // Non-sealed L1 batch has diverged } else { - sealed_l1_batch_number + checked_l1_batch_number }; tracing::info!("Searching for the first diverged L1 batch"); diff --git a/core/lib/zksync_core/src/reorg_detector/tests.rs b/core/lib/zksync_core/src/reorg_detector/tests.rs index 81ca31e06990..6de3a74d687b 100644 --- a/core/lib/zksync_core/src/reorg_detector/tests.rs +++ b/core/lib/zksync_core/src/reorg_detector/tests.rs @@ -58,8 +58,6 @@ async fn binary_search_with_simple_predicate() { } } -type ResponsesMap = HashMap; - #[derive(Debug, Clone, Copy)] enum RpcErrorKind { Transient, @@ -77,13 +75,33 @@ impl From for RpcError { #[derive(Debug, Default)] struct MockMainNodeClient { - miniblock_hash_responses: ResponsesMap, - l1_batch_root_hash_responses: ResponsesMap, + latest_miniblock_response: Option, + latest_l1_batch_response: Option, + miniblock_hash_responses: HashMap, + l1_batch_root_hash_responses: HashMap, error_kind: Arc>>, } #[async_trait] impl MainNodeClient for MockMainNodeClient { + async fn sealed_miniblock_number(&self) -> Result { + if let &Some(error_kind) = &*self.error_kind.lock().unwrap() { + return Err(error_kind.into()); + } + Ok(self + .latest_miniblock_response + .expect("unexpected `sealed_miniblock_number` request")) + } + + async fn sealed_l1_batch_number(&self) -> Result { + if let &Some(error_kind) = &*self.error_kind.lock().unwrap() { + return Err(error_kind.into()); + } + Ok(self + .latest_l1_batch_response + .expect("unexpected `sealed_l1_batch_number` request")) + } + async fn miniblock_hash(&self, number: MiniblockNumber) -> Result, RpcError> { if let &Some(error_kind) = &*self.error_kind.lock().unwrap() { return Err(error_kind.into()); @@ -125,18 +143,22 @@ impl UpdateCorrectBlock for mpsc::UnboundedSender<(MiniblockNumber, L1BatchNumbe async fn normal_reorg_function(snapshot_recovery: bool, with_transient_errors: bool) { let pool = ConnectionPool::test_pool().await; let mut storage = pool.access_storage().await.unwrap(); + let mut client = MockMainNodeClient::default(); if snapshot_recovery { storage .protocol_versions_dal() .save_protocol_version_with_tx(ProtocolVersion::default()) .await; } else { - ensure_genesis_state(&mut storage, L2ChainId::default(), &GenesisParams::mock()) - .await - .unwrap(); + let genesis_root_hash = + ensure_genesis_state(&mut storage, L2ChainId::default(), &GenesisParams::mock()) + .await + .unwrap(); + client + .l1_batch_root_hash_responses + .insert(L1BatchNumber(0), genesis_root_hash); } - let mut client = MockMainNodeClient::default(); let l1_batch_numbers = if snapshot_recovery { 11_u32..=20 } else { @@ -227,12 +249,16 @@ async fn detector_stops_on_fatal_rpc_error() { async fn reorg_is_detected_on_batch_hash_mismatch() { let pool = ConnectionPool::test_pool().await; let mut storage = pool.access_storage().await.unwrap(); - ensure_genesis_state(&mut storage, L2ChainId::default(), &GenesisParams::mock()) - .await - .unwrap(); + let genesis_root_hash = + ensure_genesis_state(&mut storage, L2ChainId::default(), &GenesisParams::mock()) + .await + .unwrap(); + let mut client = MockMainNodeClient::default(); + client + .l1_batch_root_hash_responses + .insert(L1BatchNumber(0), genesis_root_hash); let (_stop_sender, stop_receiver) = watch::channel(false); - let mut client = MockMainNodeClient::default(); let miniblock_hash = H256::from_low_u64_be(23); client .miniblock_hash_responses @@ -271,12 +297,16 @@ async fn reorg_is_detected_on_batch_hash_mismatch() { async fn reorg_is_detected_on_miniblock_hash_mismatch() { let pool = ConnectionPool::test_pool().await; let mut storage = pool.access_storage().await.unwrap(); - ensure_genesis_state(&mut storage, L2ChainId::default(), &GenesisParams::mock()) - .await - .unwrap(); + let mut client = MockMainNodeClient::default(); + let genesis_root_hash = + ensure_genesis_state(&mut storage, L2ChainId::default(), &GenesisParams::mock()) + .await + .unwrap(); + client + .l1_batch_root_hash_responses + .insert(L1BatchNumber(0), genesis_root_hash); let (_stop_sender, stop_receiver) = watch::channel(false); - let mut client = MockMainNodeClient::default(); let miniblock_hash = H256::from_low_u64_be(23); client .miniblock_hash_responses @@ -349,6 +379,13 @@ async fn reorg_is_detected_on_historic_batch_hash_mismatch( seal_l1_batch(&mut storage, earliest_l1_batch_number, H256::zero()).await; let mut client = MockMainNodeClient::default(); + client + .miniblock_hash_responses + .insert(MiniblockNumber(earliest_l1_batch_number), H256::zero()); + client + .l1_batch_root_hash_responses + .insert(L1BatchNumber(earliest_l1_batch_number), H256::zero()); + let miniblock_and_l1_batch_hashes = l1_batch_numbers.clone().map(|number| { let mut miniblock_hash = H256::from_low_u64_be(number.into()); client @@ -491,3 +528,54 @@ async fn detector_errors_on_earliest_batch_hash_mismatch_with_snapshot_recovery( let err = detector.run_inner().await.unwrap_err(); assert_matches!(err, HashMatchError::EarliestHashMismatch(L1BatchNumber(3))); } + +#[tokio::test] +async fn reorg_is_detected_without_waiting_for_main_node_to_catch_up() { + let pool = ConnectionPool::test_pool().await; + let mut storage = pool.access_storage().await.unwrap(); + let genesis_root_hash = + ensure_genesis_state(&mut storage, L2ChainId::default(), &GenesisParams::mock()) + .await + .unwrap(); + // Fill in local storage with some data, so that it's ahead of the main node. + for number in 1..5 { + store_miniblock(&mut storage, number, H256::zero()).await; + seal_l1_batch(&mut storage, number, H256::zero()).await; + } + drop(storage); + + let mut client = MockMainNodeClient::default(); + client + .l1_batch_root_hash_responses + .insert(L1BatchNumber(0), genesis_root_hash); + for number in 1..3 { + client + .miniblock_hash_responses + .insert(MiniblockNumber(number), H256::zero()); + client + .l1_batch_root_hash_responses + .insert(L1BatchNumber(number), H256::zero()); + } + client + .miniblock_hash_responses + .insert(MiniblockNumber(3), H256::zero()); + client + .l1_batch_root_hash_responses + .insert(L1BatchNumber(3), H256::repeat_byte(0xff)); + client.latest_l1_batch_response = Some(L1BatchNumber(3)); + client.latest_miniblock_response = Some(MiniblockNumber(3)); + + let (_stop_sender, stop_receiver) = watch::channel(false); + let detector = ReorgDetector { + client: Box::new(client), + block_updater: Box::new(()), + pool, + stop_receiver, + sleep_interval: Duration::from_millis(10), + }; + let detector_task = tokio::spawn(detector.run()); + + let task_result = detector_task.await.unwrap(); + let last_correct_l1_batch = task_result.unwrap(); + assert_eq!(last_correct_l1_batch, Some(L1BatchNumber(2))); +}