Skip to content

Commit

Permalink
Change getHealth to compare optimistically confirmed slots (#33651)
Browse files Browse the repository at this point in the history
The current getHealth mechanism checks a local accounts hash slot vs.
those of other nodes as specified by --known-validator. This is a
very coarse comparison given that the default for this value is 100
slots. More so, any nodes using a value larger than the default
(ie --incremental-snapshot-interval 500) will likely see getHealth
return status behind at some point.

Change the underlying mechanism of how health is computed. Instead of
using the accounts hash slots published in gossip, use the latest
optimistically confirmed slot from the cluster. Even when a node is
behind, it is able to observe cluster optimistically confirmed by slots
by viewing votes published in gossip.

Thus, the latest cluster optimistically confirmed slot can be compared
against the latest optimistically confirmed bank from replay to
determine health. This new comparison is much more granular, and not
needing to depend on individual known validators is also a plus.

(cherry picked from commit 8bd0e4c)
  • Loading branch information
steviez authored and mergify[bot] committed Oct 16, 2023
1 parent 02f9c97 commit 76ac0ac
Show file tree
Hide file tree
Showing 8 changed files with 213 additions and 238 deletions.
4 changes: 2 additions & 2 deletions core/src/validator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -936,7 +936,8 @@ impl Validator {
// (by both replay stage and banking stage)
let prioritization_fee_cache = Arc::new(PrioritizationFeeCache::default());

let rpc_override_health_check = Arc::new(AtomicBool::new(false));
let rpc_override_health_check =
Arc::new(AtomicBool::new(config.rpc_config.disable_health_check));
let (
json_rpc_service,
pubsub_service,
Expand Down Expand Up @@ -973,7 +974,6 @@ impl Validator {
ledger_path,
config.validator_exit.clone(),
exit.clone(),
config.known_validators.clone(),
rpc_override_health_check.clone(),
startup_verification_complete,
optimistically_confirmed_bank.clone(),
Expand Down
10 changes: 4 additions & 6 deletions docs/src/api/http.md
Original file line number Diff line number Diff line change
Expand Up @@ -154,13 +154,11 @@ Some methods support providing a `filters` object to enable pre-filtering the da
Although not a JSON RPC API, a `GET /health` at the RPC HTTP Endpoint provides a
health-check mechanism for use by load balancers or other network
infrastructure. This request will always return a HTTP 200 OK response with a body of
"ok", "behind" or "unknown" based on the following conditions:
"ok", "behind" or "unknown":

1. If one or more `--known-validator` arguments are provided to `solana-validator` - "ok" is returned
when the node has within `HEALTH_CHECK_SLOT_DISTANCE` slots of the highest
known validator, otherwise "behind". "unknown" is returned when no slot
information from known validators is not yet available.
2. "ok" is always returned if no known validators are provided.
- `ok`: The node is within `HEALTH_CHECK_SLOT_DISTANCE` slots from the latest cluster confirmed slot
- `behind { distance }`: The node is behind `distance` slots from the latest cluster confirmed slot where `distance > HEALTH_CHECK_SLOT_DISTANCE`
- `unknown`: The node is unable to determine where it stands in relation to the cluster

## JSON RPC API Reference

Expand Down
9 changes: 2 additions & 7 deletions docs/src/api/methods/_getHealth.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,8 @@ import {

## getHealth

Returns the current health of the node.

:::caution
If one or more `--known-validator` arguments are provided to `solana-validator` - "ok" is returned
when the node has within `HEALTH_CHECK_SLOT_DISTANCE` slots of the highest known validator,
otherwise an error is returned. "ok" is always returned if no known validators are provided.
:::
Returns the current health of the node. A healthy node is one that is within
`HEALTH_CHECK_SLOT_DISTANCE` slots of the latest cluster confirmed slot.

<DocSideBySide>
<CodeParams>
Expand Down
47 changes: 30 additions & 17 deletions rpc/src/rpc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -149,12 +149,15 @@ pub struct JsonRpcConfig {
pub obsolete_v1_7_api: bool,
pub rpc_scan_and_fix_roots: bool,
pub max_request_body_size: Option<usize>,
/// Disable the health check, used for tests and TestValidator
pub disable_health_check: bool,
}

impl JsonRpcConfig {
pub fn default_for_test() -> Self {
Self {
full_api: true,
disable_health_check: true,
..Self::default()
}
}
Expand Down Expand Up @@ -374,31 +377,33 @@ impl JsonRpcRequestProcessor {
);

let leader_schedule_cache = Arc::new(LeaderScheduleCache::new_from_bank(&bank));
let startup_verification_complete = Arc::clone(bank.get_startup_verification_complete());
let slot = bank.slot();
let optimistically_confirmed_bank =
Arc::new(RwLock::new(OptimisticallyConfirmedBank { bank }));
Self {
config: JsonRpcConfig::default(),
snapshot_config: None,
bank_forks,
block_commitment_cache: Arc::new(RwLock::new(BlockCommitmentCache::new(
HashMap::new(),
0,
CommitmentSlots::new_from_slot(bank.slot()),
CommitmentSlots::new_from_slot(slot),
))),
blockstore,
blockstore: Arc::clone(&blockstore),
validator_exit: create_validator_exit(exit.clone()),
health: Arc::new(RpcHealth::new(
cluster_info.clone(),
None,
Arc::clone(&optimistically_confirmed_bank),
blockstore,
0,
exit,
Arc::clone(bank.get_startup_verification_complete()),
startup_verification_complete,
)),
cluster_info,
genesis_hash,
transaction_sender: Arc::new(Mutex::new(sender)),
bigtable_ledger_storage: None,
optimistically_confirmed_bank: Arc::new(RwLock::new(OptimisticallyConfirmedBank {
bank,
})),
optimistically_confirmed_bank,
largest_accounts_cache: Arc::new(RwLock::new(LargestAccountsCache::new(30))),
max_slots: Arc::new(MaxSlots::default()),
leader_schedule_cache,
Expand Down Expand Up @@ -4787,6 +4792,8 @@ pub mod tests {
// note that this means that slot 0 will always be considered complete
let max_complete_transaction_status_slot = Arc::new(AtomicU64::new(0));
let max_complete_rewards_slot = Arc::new(AtomicU64::new(0));
let optimistically_confirmed_bank =
OptimisticallyConfirmedBank::locked_from_bank_forks_root(&bank_forks);

let meta = JsonRpcRequestProcessor::new(
config,
Expand All @@ -4795,11 +4802,11 @@ pub mod tests {
block_commitment_cache.clone(),
blockstore.clone(),
validator_exit,
RpcHealth::stub(),
RpcHealth::stub(optimistically_confirmed_bank.clone(), blockstore.clone()),
cluster_info,
Hash::default(),
None,
OptimisticallyConfirmedBank::locked_from_bank_forks_root(&bank_forks),
optimistically_confirmed_bank,
Arc::new(RwLock::new(LargestAccountsCache::new(30))),
max_slots.clone(),
Arc::new(LeaderScheduleCache::new_from_bank(&bank)),
Expand Down Expand Up @@ -6398,7 +6405,11 @@ pub mod tests {
let blockstore = Arc::new(Blockstore::open(&ledger_path).unwrap());
let block_commitment_cache = Arc::new(RwLock::new(BlockCommitmentCache::default()));
let (bank_forks, mint_keypair, ..) = new_bank_forks();
let health = RpcHealth::stub();
let optimistically_confirmed_bank =
OptimisticallyConfirmedBank::locked_from_bank_forks_root(&bank_forks);
let health = RpcHealth::stub(optimistically_confirmed_bank.clone(), blockstore.clone());
// Mark the node as healthy to start
health.stub_set_health_status(Some(RpcHealthStatus::Ok));

// Freeze bank 0 to prevent a panic in `run_transaction_simulation()`
bank_forks.write().unwrap().get(0).unwrap().freeze();
Expand Down Expand Up @@ -6429,7 +6440,7 @@ pub mod tests {
cluster_info,
Hash::default(),
None,
OptimisticallyConfirmedBank::locked_from_bank_forks_root(&bank_forks),
optimistically_confirmed_bank,
Arc::new(RwLock::new(LargestAccountsCache::new(30))),
Arc::new(MaxSlots::default()),
Arc::new(LeaderScheduleCache::default()),
Expand Down Expand Up @@ -6690,18 +6701,20 @@ pub mod tests {
.my_contact_info()
.tpu(connection_cache.protocol())
.unwrap();
let optimistically_confirmed_bank =
OptimisticallyConfirmedBank::locked_from_bank_forks_root(&bank_forks);
let (request_processor, receiver) = JsonRpcRequestProcessor::new(
JsonRpcConfig::default(),
None,
bank_forks.clone(),
block_commitment_cache,
blockstore,
blockstore.clone(),
validator_exit,
RpcHealth::stub(),
RpcHealth::stub(optimistically_confirmed_bank.clone(), blockstore),
cluster_info,
Hash::default(),
None,
OptimisticallyConfirmedBank::locked_from_bank_forks_root(&bank_forks),
optimistically_confirmed_bank,
Arc::new(RwLock::new(LargestAccountsCache::new(30))),
Arc::new(MaxSlots::default()),
Arc::new(LeaderScheduleCache::default()),
Expand Down Expand Up @@ -8327,9 +8340,9 @@ pub mod tests {
None,
bank_forks.clone(),
block_commitment_cache,
blockstore,
blockstore.clone(),
validator_exit,
RpcHealth::stub(),
RpcHealth::stub(optimistically_confirmed_bank.clone(), blockstore.clone()),
cluster_info,
Hash::default(),
None,
Expand Down
Loading

0 comments on commit 76ac0ac

Please sign in to comment.