Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Change getHealth to compare optimistically confirmed slots #33651

Merged
merged 11 commits into from
Oct 16, 2023
1 change: 0 additions & 1 deletion core/src/validator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -980,7 +980,6 @@ impl Validator {
ledger_path,
config.validator_exit.clone(),
exit.clone(),
config.known_validators.clone(),
rpc_override_health_check.clone(),
startup_verification_complete,
optimistically_confirmed_bank.clone(),
Expand Down
8 changes: 3 additions & 5 deletions docs/src/api/http.md
Original file line number Diff line number Diff line change
Expand Up @@ -156,11 +156,9 @@ health-check mechanism for use by load balancers or other network
infrastructure. This request will always return an HTTP 200 OK response with a body of
"ok", "behind" or "unknown" based on the following conditions:

1. If one or more `--known-validator` arguments are provided to `solana-validator` - "ok" is returned
when the node has within `HEALTH_CHECK_SLOT_DISTANCE` slots of the highest
known validator, otherwise "behind". "unknown" is returned when no slot
information from known validators is not yet available.
2. "ok" is always returned if no known validators are provided.
1. If the node is within `HEALTH_CHECK_SLOT_DISTANCE` slots from the latest cluster confirmed slot, "ok" is returned.
steviez marked this conversation as resolved.
Show resolved Hide resolved
2. If the node is more than `HEALTH_CHECK_SLOT_DISTANCE` slots behind the latest cluster confirmed slot, an error is returned that will contain more information about how far behind the node is.
3. If the node is unable to determine its health, an error is returned.

## JSON RPC API Reference

Expand Down
9 changes: 2 additions & 7 deletions docs/src/api/methods/_getHealth.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,8 @@ import {

## getHealth

Returns the current health of the node.

:::caution
If one or more `--known-validator` arguments are provided to `solana-validator` - "ok" is returned
when the node has within `HEALTH_CHECK_SLOT_DISTANCE` slots of the highest known validator,
otherwise an error is returned. "ok" is always returned if no known validators are provided.
:::
Returns the current health of the node. A healthy node is one that is within
`HEALTH_CHECK_SLOT_DISTANCE` slots of the latest cluster confirmed slot.

<DocSideBySide>
<CodeParams>
Expand Down
44 changes: 27 additions & 17 deletions rpc/src/rpc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -374,31 +374,33 @@ impl JsonRpcRequestProcessor {
);

let leader_schedule_cache = Arc::new(LeaderScheduleCache::new_from_bank(&bank));
let startup_verification_complete = Arc::clone(bank.get_startup_verification_complete());
let slot = bank.slot();
let optimistically_confirmed_bank =
Arc::new(RwLock::new(OptimisticallyConfirmedBank { bank }));
Self {
config: JsonRpcConfig::default(),
snapshot_config: None,
bank_forks,
block_commitment_cache: Arc::new(RwLock::new(BlockCommitmentCache::new(
HashMap::new(),
0,
CommitmentSlots::new_from_slot(bank.slot()),
CommitmentSlots::new_from_slot(slot),
))),
blockstore,
blockstore: Arc::clone(&blockstore),
validator_exit: create_validator_exit(exit.clone()),
health: Arc::new(RpcHealth::new(
cluster_info.clone(),
None,
Arc::clone(&optimistically_confirmed_bank),
blockstore,
0,
exit,
Arc::clone(bank.get_startup_verification_complete()),
startup_verification_complete,
)),
cluster_info,
genesis_hash,
transaction_sender: Arc::new(Mutex::new(sender)),
bigtable_ledger_storage: None,
optimistically_confirmed_bank: Arc::new(RwLock::new(OptimisticallyConfirmedBank {
bank,
})),
optimistically_confirmed_bank,
largest_accounts_cache: Arc::new(RwLock::new(LargestAccountsCache::new(30))),
max_slots: Arc::new(MaxSlots::default()),
leader_schedule_cache,
Expand Down Expand Up @@ -4787,6 +4789,8 @@ pub mod tests {
// note that this means that slot 0 will always be considered complete
let max_complete_transaction_status_slot = Arc::new(AtomicU64::new(0));
let max_complete_rewards_slot = Arc::new(AtomicU64::new(0));
let optimistically_confirmed_bank =
OptimisticallyConfirmedBank::locked_from_bank_forks_root(&bank_forks);

let meta = JsonRpcRequestProcessor::new(
config,
Expand All @@ -4795,11 +4799,11 @@ pub mod tests {
block_commitment_cache.clone(),
blockstore.clone(),
validator_exit,
RpcHealth::stub(),
RpcHealth::stub(optimistically_confirmed_bank.clone(), blockstore.clone()),
cluster_info,
Hash::default(),
None,
OptimisticallyConfirmedBank::locked_from_bank_forks_root(&bank_forks),
optimistically_confirmed_bank,
Arc::new(RwLock::new(LargestAccountsCache::new(30))),
max_slots.clone(),
Arc::new(LeaderScheduleCache::new_from_bank(&bank)),
Expand Down Expand Up @@ -6398,7 +6402,11 @@ pub mod tests {
let blockstore = Arc::new(Blockstore::open(&ledger_path).unwrap());
let block_commitment_cache = Arc::new(RwLock::new(BlockCommitmentCache::default()));
let (bank_forks, mint_keypair, ..) = new_bank_forks();
let health = RpcHealth::stub();
let optimistically_confirmed_bank =
OptimisticallyConfirmedBank::locked_from_bank_forks_root(&bank_forks);
let health = RpcHealth::stub(optimistically_confirmed_bank.clone(), blockstore.clone());
// Mark the node as healthy to start
health.stub_set_health_status(Some(RpcHealthStatus::Ok));

// Freeze bank 0 to prevent a panic in `run_transaction_simulation()`
bank_forks.write().unwrap().get(0).unwrap().freeze();
Expand Down Expand Up @@ -6429,7 +6437,7 @@ pub mod tests {
cluster_info,
Hash::default(),
None,
OptimisticallyConfirmedBank::locked_from_bank_forks_root(&bank_forks),
optimistically_confirmed_bank,
Arc::new(RwLock::new(LargestAccountsCache::new(30))),
Arc::new(MaxSlots::default()),
Arc::new(LeaderScheduleCache::default()),
Expand Down Expand Up @@ -6690,18 +6698,20 @@ pub mod tests {
.my_contact_info()
.tpu(connection_cache.protocol())
.unwrap();
let optimistically_confirmed_bank =
OptimisticallyConfirmedBank::locked_from_bank_forks_root(&bank_forks);
let (request_processor, receiver) = JsonRpcRequestProcessor::new(
JsonRpcConfig::default(),
None,
bank_forks.clone(),
block_commitment_cache,
blockstore,
blockstore.clone(),
validator_exit,
RpcHealth::stub(),
RpcHealth::stub(optimistically_confirmed_bank.clone(), blockstore),
cluster_info,
Hash::default(),
None,
OptimisticallyConfirmedBank::locked_from_bank_forks_root(&bank_forks),
optimistically_confirmed_bank,
Arc::new(RwLock::new(LargestAccountsCache::new(30))),
Arc::new(MaxSlots::default()),
Arc::new(LeaderScheduleCache::default()),
Expand Down Expand Up @@ -8327,9 +8337,9 @@ pub mod tests {
None,
bank_forks.clone(),
block_commitment_cache,
blockstore,
blockstore.clone(),
validator_exit,
RpcHealth::stub(),
RpcHealth::stub(optimistically_confirmed_bank.clone(), blockstore.clone()),
cluster_info,
Hash::default(),
None,
Expand Down
150 changes: 69 additions & 81 deletions rpc/src/rpc_health.rs
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
use {
solana_gossip::cluster_info::ClusterInfo,
solana_sdk::{clock::Slot, pubkey::Pubkey},
std::{
collections::HashSet,
sync::{
atomic::{AtomicBool, Ordering},
Arc,
},
crate::optimistically_confirmed_bank_tracker::OptimisticallyConfirmedBank,
solana_ledger::blockstore::Blockstore,
solana_sdk::clock::Slot,
std::sync::{
atomic::{AtomicBool, Ordering},
Arc, RwLock,
},
};

Expand All @@ -18,8 +16,8 @@ pub enum RpcHealthStatus {
}

pub struct RpcHealth {
cluster_info: Arc<ClusterInfo>,
known_validators: Option<HashSet<Pubkey>>,
optimistically_confirmed_bank: Arc<RwLock<OptimisticallyConfirmedBank>>,
blockstore: Arc<Blockstore>,
health_check_slot_distance: u64,
override_health_check: Arc<AtomicBool>,
startup_verification_complete: Arc<AtomicBool>,
Expand All @@ -29,15 +27,15 @@ pub struct RpcHealth {

impl RpcHealth {
pub fn new(
cluster_info: Arc<ClusterInfo>,
known_validators: Option<HashSet<Pubkey>>,
optimistically_confirmed_bank: Arc<RwLock<OptimisticallyConfirmedBank>>,
blockstore: Arc<Blockstore>,
health_check_slot_distance: u64,
override_health_check: Arc<AtomicBool>,
startup_verification_complete: Arc<AtomicBool>,
) -> Self {
Self {
cluster_info,
known_validators,
optimistically_confirmed_bank,
blockstore,
health_check_slot_distance,
override_health_check,
startup_verification_complete,
Expand All @@ -57,81 +55,71 @@ impl RpcHealth {
if !self.startup_verification_complete.load(Ordering::Acquire) {
return RpcHealthStatus::Unknown;
}

if self.override_health_check.load(Ordering::Relaxed) {
RpcHealthStatus::Ok
} else if let Some(known_validators) = &self.known_validators {
match (
self.cluster_info
.get_accounts_hash_for_node(&self.cluster_info.id(), |hashes| {
hashes
.iter()
.max_by(|a, b| a.0.cmp(&b.0))
.map(|slot_hash| slot_hash.0)
})
.flatten(),
known_validators
.iter()
.filter_map(|known_validator| {
self.cluster_info
.get_accounts_hash_for_node(known_validator, |hashes| {
hashes
.iter()
.max_by(|a, b| a.0.cmp(&b.0))
.map(|slot_hash| slot_hash.0)
})
.flatten()
})
.max(),
) {
(
Some(latest_account_hash_slot),
Some(latest_known_validator_account_hash_slot),
) => {
// The validator is considered healthy if its latest account hash slot is within
// `health_check_slot_distance` of the latest known validator's account hash slot
if latest_account_hash_slot
> latest_known_validator_account_hash_slot
.saturating_sub(self.health_check_slot_distance)
{
RpcHealthStatus::Ok
} else {
let num_slots = latest_known_validator_account_hash_slot
.saturating_sub(latest_account_hash_slot);
warn!(
"health check: behind by {} slots: me={}, latest known_validator={}",
num_slots,
latest_account_hash_slot,
latest_known_validator_account_hash_slot
);
RpcHealthStatus::Behind { num_slots }
}
}
(latest_account_hash_slot, latest_known_validator_account_hash_slot) => {
if latest_account_hash_slot.is_none() {
warn!("health check: latest_account_hash_slot not available");
}
if latest_known_validator_account_hash_slot.is_none() {
warn!(
"health check: latest_known_validator_account_hash_slot not available"
);
}
RpcHealthStatus::Unknown
}
return RpcHealthStatus::Ok;
}

// A node can observe votes by both replaying blocks and observing gossip.
//
// ClusterInfoVoteListener receives votes from both of these sources and then records
// optimistically confirmed slots in the Blockstore via OptimisticConfirmationVerifier.
// Thus, it is possible for a node to record an optimistically confirmed slot before the
// node has replayed and validated the slot for itself.
//
// OptimisticallyConfirmedBank holds a bank for the latest optimistically confirmed slot
// that the node has replayed. It is true that the node will have replayed that slot by
// virtue of having a bank available. Observing that the cluster has optimistically
// confirmed a slot through gossip is not enough to reconstruct the bank.
//
// So, comparing the latest optimistic slot from the Blockstore vs. the slot from the
// OptimisticallyConfirmedBank bank allows a node to see where it stands in relation to the
// tip of the cluster.
let my_latest_optimistically_confirmed_slot = self
.optimistically_confirmed_bank
.read()
.unwrap()
.bank
.slot();

let mut optimistic_slot_infos = match self.blockstore.get_latest_optimistic_slots(1) {
steviez marked this conversation as resolved.
Show resolved Hide resolved
Ok(infos) => infos,
Err(err) => {
warn!("health check: blockstore error: {err}");
return RpcHealthStatus::Unknown;
}
} else {
// No known validator point of reference available, so this validator is healthy
// because it's running
};
let Some((cluster_latest_optimistically_confirmed_slot, _, _)) =
optimistic_slot_infos.pop()
else {
warn!("health check: blockstore does not contain any optimistically confirmed slots");
return RpcHealthStatus::Unknown;
};

if my_latest_optimistically_confirmed_slot
>= cluster_latest_optimistically_confirmed_slot
.saturating_sub(self.health_check_slot_distance)
{
RpcHealthStatus::Ok
} else {
let num_slots = cluster_latest_optimistically_confirmed_slot
.saturating_sub(my_latest_optimistically_confirmed_slot);
warn!(
"health check: behind by {num_slots} \
slots: me={my_latest_optimistically_confirmed_slot}, \
latest cluster={cluster_latest_optimistically_confirmed_slot}",
);
RpcHealthStatus::Behind { num_slots }
}
}

#[cfg(test)]
pub(crate) fn stub() -> Arc<Self> {
use crate::rpc::tests::new_test_cluster_info;
pub(crate) fn stub(
optimistically_confirmed_bank: Arc<RwLock<OptimisticallyConfirmedBank>>,
blockstore: Arc<Blockstore>,
) -> Arc<Self> {
Arc::new(Self::new(
Arc::new(new_test_cluster_info()),
None,
optimistically_confirmed_bank,
blockstore,
42,
Arc::new(AtomicBool::new(false)),
Arc::new(AtomicBool::new(true)),
Expand Down
Loading