Skip to content

Commit

Permalink
fix: better timeout for lagging (tari-project#5705)
Browse files Browse the repository at this point in the history
Description
---
Removes some very specific off by one check, and instead use a
configured timeout before considering lagging.

This should also prevent a bug where a node knows of a stronger chain,
but it is at a lower height and this would cause it to think it was up
to date

<!-- Checklist -->
<!-- 1. Is the title of your PR in the form that would make nice release
notes? The title, excluding the conventional commit
tag, will be included exactly as is in the CHANGELOG, so please think
about it carefully. -->


Breaking Changes
---

- [x] None
- [ ] Requires data directory on base node to be deleted
- [ ] Requires hard fork
- [ ] Other - Please specify

<!-- Does this include a breaking change? If so, include this line as a
footer -->
<!-- BREAKING CHANGE: Description what the user should do, e.g. delete a
database, resync the chain -->
  • Loading branch information
stringhandler authored Aug 31, 2023
1 parent b0337cf commit 5e8a3ec
Show file tree
Hide file tree
Showing 5 changed files with 70 additions and 39 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,13 @@
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
// USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
use std::{future::Future, sync::Arc};
use std::{future::Future, sync::Arc, time::Duration};

use futures::{future, future::Either};
use log::*;
use randomx_rs::RandomXFlag;
use serde::{Deserialize, Serialize};
use tari_common::configuration::serializers;
use tari_comms::{connectivity::ConnectivityRequester, PeerManager};
use tari_shutdown::ShutdownSignal;
use tokio::sync::{broadcast, watch};
Expand Down Expand Up @@ -55,14 +56,21 @@ pub struct BaseNodeStateMachineConfig {
/// The amount of blocks this node can be behind a peer before considered to be lagging (to test the block
/// propagation by delaying lagging)
pub blocks_behind_before_considered_lagging: u64,
/// The amount of time this node can know about a stronger chain before considered to be lagging.
/// This is to give a node time to receive the block via propagation, which is usually less network
/// intensive. Be careful of setting this higher than the block time, which would potentially cause it
/// to always be behind the network
#[serde(with = "serializers::seconds")]
pub time_before_considered_lagging: Duration,
}

#[allow(clippy::derivable_impls)]
impl Default for BaseNodeStateMachineConfig {
fn default() -> Self {
Self {
blockchain_sync_config: Default::default(),
blocks_behind_before_considered_lagging: 0,
blocks_behind_before_considered_lagging: 1,
time_before_considered_lagging: Duration::from_secs(10),
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,12 +90,18 @@ pub enum SyncStatus {
network: ChainMetadata,
sync_peers: Vec<SyncPeer>,
},
// There is a stronger chain but we are less the `blocks_before_considered_lagging` blocks behind it.
BehindButNotYetLagging {
local: ChainMetadata,
network: ChainMetadata,
sync_peers: Vec<SyncPeer>,
},
UpToDate,
}

impl SyncStatus {
pub fn is_lagging(&self) -> bool {
!self.is_up_to_date()
matches!(self, SyncStatus::Lagging { .. })
}

pub fn is_up_to_date(&self) -> bool {
Expand All @@ -117,6 +123,7 @@ impl Display for SyncStatus {
network.accumulated_difficulty(),
),
UpToDate => f.write_str("UpToDate"),
SyncStatus::BehindButNotYetLagging { .. } => f.write_str("Behind but not yet lagging"),
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ use std::{
convert::TryFrom,
fmt::{Display, Formatter},
ops::Deref,
time::{Duration, Instant},
time::Instant,
};

use log::*;
Expand Down Expand Up @@ -56,9 +56,6 @@ use crate::{

const LOG_TARGET: &str = "c::bn::state_machine_service::states::listening";

/// The length of time to wait for a propagated block when one block behind before proceeding to sync
const ONE_BLOCK_BEHIND_WAIT_PERIOD: Duration = Duration::from_secs(20);

/// This struct contains the info of the peer, and is used to serialised and deserialised.
#[derive(Serialize, Deserialize)]
pub struct PeerMetadata {
Expand Down Expand Up @@ -170,35 +167,8 @@ impl Listening {
}
};

let local = match shared.db.get_chain_metadata().await {
Ok(m) => m,
Err(e) => {
return FatalError(format!("Could not get local blockchain metadata. {}", e));
},
};
log_mdc::extend(mdc.clone());

// If this node is just one block behind, wait for block propagation before
// rushing to sync mode
if self.is_synced &&
peer_metadata.claimed_chain_metadata().height_of_longest_chain() ==
local.height_of_longest_chain() + 1 &&
time_since_better_block
.map(|ts: Instant| ts.elapsed() < ONE_BLOCK_BEHIND_WAIT_PERIOD)
.unwrap_or(true)
{
if time_since_better_block.is_none() {
time_since_better_block = Some(Instant::now());
}
debug!(
target: LOG_TARGET,
"This node is one block behind. Best network metadata is at height {}.",
peer_metadata.claimed_chain_metadata().height_of_longest_chain()
);
continue;
}
time_since_better_block = None;

let local_metadata = match shared.db.get_chain_metadata().await {
Ok(m) => m,
Err(e) => {
Expand All @@ -213,6 +183,39 @@ impl Listening {
peer_metadata,
);

// Generally we will receive a block via incoming blocks, but something might have
// happened that we have not synced to them, e.g. our network could have been down.
// If we know about a stronger chain, but haven't synced to it, because we didn't get
// the blocks propagated to us, or we have a high `blocks_before_considered_lagging`
// then we will wait at least `time_before_considered_lagging` before we try to sync
// to that new chain. If you want to sync to a new chain immediately, then you can
// set this value to 1 second or lower.
if let SyncStatus::BehindButNotYetLagging {
local,
network,
sync_peers,
} = &sync_mode
{
if time_since_better_block.is_none() {
time_since_better_block = Some(Instant::now());
}
if time_since_better_block
.map(|t| t.elapsed() > shared.config.time_before_considered_lagging)
.unwrap()
{
return StateEvent::FallenBehind(SyncStatus::Lagging {
local: local.clone(),
network: network.clone(),
sync_peers: sync_peers.clone(),
});
}
} else {
// We might have gotten up to date via propagation outside of this state, so reset the timer
if sync_mode == SyncStatus::UpToDate {
time_since_better_block = None;
}
}

if sync_mode.is_lagging() {
return StateEvent::FallenBehind(sync_mode);
}
Expand Down Expand Up @@ -300,7 +303,11 @@ fn determine_sync_mode(
waiting for the propagated blocks",
blocks_behind_before_considered_lagging
);
return UpToDate;
return SyncStatus::BehindButNotYetLagging {
local: local.clone(),
network: network.claimed_chain_metadata().clone(),
sync_peers: vec![network.clone().into()],
};
};

debug!(
Expand Down Expand Up @@ -388,6 +395,6 @@ mod test {
assert!(sync_mode.is_lagging());

let sync_mode = determine_sync_mode(2, behind_node.claimed_chain_metadata(), &archival_node);
assert!(sync_mode.is_up_to_date());
assert!(matches!(sync_mode, SyncStatus::BehindButNotYetLagging { .. }));
}
}
2 changes: 1 addition & 1 deletion base_layer/p2p/src/services/liveness/service.rs
Original file line number Diff line number Diff line change
Expand Up @@ -575,7 +575,7 @@ mod test {
let msg = create_dummy_message(PingPongMessage::pong_with_metadata(123, metadata.clone()));

state.add_inflight_ping(
msg.inner.as_ref().map(|i| i.nonce.clone()).unwrap(),
msg.inner.as_ref().map(|i| i.nonce).unwrap(),
msg.source_peer.node_id.clone(),
);

Expand Down
13 changes: 11 additions & 2 deletions common/config/presets/c_base_node.toml
Original file line number Diff line number Diff line change
Expand Up @@ -140,8 +140,17 @@ track_reorgs = true
# The maximum amount of VMs that RandomX will be use (default = 0)
#max_randomx_vms = 0
# The amount of blocks this node can be behind a peer before considered to be lagging (to test the block
# propagation by delaying lagging) (default = 0)
#blocks_behind_before_considered_lagging = 0
# propagation by delaying lagging, but also to give it time to receive the block via propagation, which is more network
# efficient)
# Note that time_before_considered_lagging will override this setting if the node sees a stronger chain for longer than
# that configured time.
# (default = 1)
#blocks_behind_before_considered_lagging = 1
# The amount of time this node can know about a stronger chain before considered to be lagging.
# This is to give a node time to receive the block via propagation, which is usually less network
# intensive. Be careful of setting this higher than the block time, which would potentially cause it
# to always be behind the network (default = 10) (in seconds)
#time_before_considered_lagging = 10

[base_node.p2p]
# The node's publicly-accessible hostname. This is the host name that is advertised on the network so that
Expand Down

0 comments on commit 5e8a3ec

Please sign in to comment.