Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add wen_restart module #33344

Merged
merged 25 commits into from
Oct 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
228cf2b
Add wen_restart module:
wen-coding Sep 21, 2023
d575ff8
Update lock file.
wen-coding Sep 21, 2023
a1fba4e
Merge branch 'master' into wen_restart_add_module
wen-coding Sep 21, 2023
652f724
Fix linter errors.
wen-coding Sep 21, 2023
d9e2f93
Fix depencies order.
wen-coding Sep 21, 2023
baebed8
Update wen_restart explanation and small fixes.
wen-coding Sep 21, 2023
9c0c78c
Merge branch 'master' into wen_restart_add_module
wen-coding Sep 21, 2023
57c9228
Generate tower outside tvu.
wen-coding Sep 21, 2023
3ac0a02
Update validator/src/cli.rs
wen-coding Sep 21, 2023
fd7e157
Update wen-restart/protos/wen_restart.proto
wen-coding Sep 21, 2023
84ee985
Update wen-restart/build.rs
wen-coding Sep 21, 2023
690cef7
Update wen-restart/src/wen_restart.rs
wen-coding Sep 21, 2023
8c0c04a
Rename proto directory.
wen-coding Sep 21, 2023
bcf9942
Rename InitRecord to MyLastVotedForkSlots, add imports.
wen-coding Sep 21, 2023
d5375bd
Update wen-restart/Cargo.toml
wen-coding Sep 22, 2023
ab88c0a
Update wen-restart/src/wen_restart.rs
wen-coding Sep 22, 2023
6c5c8ec
Move prost-build dependency to project toml.
wen-coding Sep 22, 2023
fbb920e
Merge branch 'master' into wen_restart_add_module
wen-coding Sep 22, 2023
72ada79
No need to continue if the distance between slot and last_vote is
wen-coding Oct 2, 2023
d9c76f2
Merge branch 'master' into wen_restart_add_module
wen-coding Oct 2, 2023
caea7d7
Use 16k slots instead of 81k slots, a few more wording changes.
wen-coding Oct 6, 2023
e1fb692
Use AncestorIterator which does the same thing.
wen-coding Oct 6, 2023
e3ea405
Merge branch 'master' into wen_restart_add_module
wen-coding Oct 6, 2023
433a18c
Update Cargo.lock
wen-coding Oct 6, 2023
2a61944
Update Cargo.lock
wen-coding Oct 6, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ members = [
"version",
"vote",
"watchtower",
"wen-restart",
"zk-keygen",
"zk-token-sdk",
]
Expand Down Expand Up @@ -261,6 +262,7 @@ pretty-hex = "0.3.0"
proc-macro2 = "1.0.67"
proptest = "1.2"
prost = "0.11.9"
prost-build = "0.11.9"
prost-types = "0.11.9"
protobuf-src = "1.1.0"
qstring = "0.7.2"
Expand Down Expand Up @@ -371,6 +373,7 @@ solana-udp-client = { path = "udp-client", version = "=1.18.0" }
solana-version = { path = "version", version = "=1.18.0" }
solana-vote = { path = "vote", version = "=1.18.0" }
solana-vote-program = { path = "programs/vote", version = "=1.18.0" }
solana-wen-restart = { path = "wen-restart", version = "=1.18.0" }
solana-zk-keygen = { path = "zk-keygen", version = "=1.18.0" }
solana-zk-token-proof-program = { path = "programs/zk-token-proof", version = "=1.18.0" }
solana-zk-token-sdk = { path = "zk-token-sdk", version = "=1.18.0" }
Expand Down
1 change: 1 addition & 0 deletions core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ solana-turbine = { workspace = true }
solana-version = { workspace = true }
solana-vote = { workspace = true }
solana-vote-program = { workspace = true }
solana-wen-restart = { workspace = true }
strum = { workspace = true, features = ["derive"] }
strum_macros = { workspace = true }
sys-info = { workspace = true }
Expand Down
12 changes: 1 addition & 11 deletions core/src/replay_stage.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ use {
},
rewards_recorder_service::{RewardsMessage, RewardsRecorderSender},
unfrozen_gossip_verified_vote_hashes::UnfrozenGossipVerifiedVoteHashes,
validator::ProcessBlockStore,
voting_service::VoteOp,
window_service::DuplicateSlotReceiver,
},
Expand Down Expand Up @@ -483,7 +482,7 @@ impl ReplayStage {
ledger_signal_receiver: Receiver<bool>,
duplicate_slots_receiver: DuplicateSlotReceiver,
poh_recorder: Arc<RwLock<PohRecorder>>,
maybe_process_blockstore: Option<ProcessBlockStore>,
mut tower: Tower,
vote_tracker: Arc<VoteTracker>,
cluster_slots: Arc<ClusterSlots>,
retransmit_slots_sender: Sender<Slot>,
Expand All @@ -502,15 +501,6 @@ impl ReplayStage {
banking_tracer: Arc<BankingTracer>,
popular_pruned_forks_receiver: PopularPrunedForksReceiver,
) -> Result<Self, String> {
let mut tower = if let Some(process_blockstore) = maybe_process_blockstore {
let tower = process_blockstore.process_to_create_tower()?;
info!("Tower state: {:?}", tower);
tower
} else {
warn!("creating default tower....");
Tower::default()
};

let ReplayStageConfig {
vote_account,
authorized_voter_keypairs,
Expand Down
9 changes: 4 additions & 5 deletions core/src/tvu.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,14 @@ use {
},
cluster_slots_service::{cluster_slots::ClusterSlots, ClusterSlotsService},
completed_data_sets_service::CompletedDataSetsSender,
consensus::tower_storage::TowerStorage,
consensus::{tower_storage::TowerStorage, Tower},
cost_update_service::CostUpdateService,
drop_bank_service::DropBankService,
ledger_cleanup_service::LedgerCleanupService,
repair::{quic_endpoint::LocalRequest, repair_service::RepairInfo},
replay_stage::{ReplayStage, ReplayStageConfig},
rewards_recorder_service::RewardsRecorderSender,
shred_fetch_stage::ShredFetchStage,
validator::ProcessBlockStore,
voting_service::VotingService,
warm_quic_cache_service::WarmQuicCacheService,
window_service::WindowService,
Expand Down Expand Up @@ -109,7 +108,7 @@ impl Tvu {
ledger_signal_receiver: Receiver<bool>,
rpc_subscriptions: &Arc<RpcSubscriptions>,
poh_recorder: &Arc<RwLock<PohRecorder>>,
maybe_process_block_store: Option<ProcessBlockStore>,
tower: Tower,
tower_storage: Arc<dyn TowerStorage>,
leader_schedule_cache: &Arc<LeaderScheduleCache>,
exit: Arc<AtomicBool>,
Expand Down Expand Up @@ -292,7 +291,7 @@ impl Tvu {
ledger_signal_receiver,
duplicate_slots_receiver,
poh_recorder.clone(),
maybe_process_block_store,
tower,
vote_tracker,
cluster_slots,
retransmit_slots_sender,
Expand Down Expand Up @@ -463,7 +462,7 @@ pub mod tests {
OptimisticallyConfirmedBank::locked_from_bank_forks_root(&bank_forks),
)),
&poh_recorder,
None,
Tower::default(),
Arc::new(FileTowerStorage::default()),
&leader_schedule_cache,
exit.clone(),
Expand Down
36 changes: 35 additions & 1 deletion core/src/validator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ use {
solana_streamer::{socket::SocketAddrSpace, streamer::StakedNodes},
solana_turbine::{self, broadcast_stage::BroadcastStageType},
solana_vote_program::vote_state,
solana_wen_restart::wen_restart::wait_for_wen_restart,
std::{
collections::{HashMap, HashSet},
net::SocketAddr,
Expand Down Expand Up @@ -259,6 +260,7 @@ pub struct ValidatorConfig {
pub block_production_method: BlockProductionMethod,
pub generator_config: Option<GeneratorConfig>,
pub use_snapshot_archives_at_startup: UseSnapshotArchivesAtStartup,
pub wen_restart_proto_path: Option<PathBuf>,
}

impl Default for ValidatorConfig {
Expand Down Expand Up @@ -326,6 +328,7 @@ impl Default for ValidatorConfig {
block_production_method: BlockProductionMethod::default(),
generator_config: None,
use_snapshot_archives_at_startup: UseSnapshotArchivesAtStartup::default(),
wen_restart_proto_path: None,
}
}
}
Expand Down Expand Up @@ -1202,6 +1205,22 @@ impl Validator {
)
.unwrap();

let in_wen_restart = config.wen_restart_proto_path.is_some() && !waited_for_supermajority;
let tower = match process_blockstore.process_to_create_tower() {
Ok(tower) => {
info!("Tower state: {:?}", tower);
tower
}
Err(e) => {
warn!(
"Unable to retrieve tower: {:?} creating default tower....",
e
);
Tower::default()
}
};
let last_vote = tower.last_vote();
AshwinSekar marked this conversation as resolved.
Show resolved Hide resolved

let (replay_vote_sender, replay_vote_receiver) = unbounded();
let tvu = Tvu::new(
vote_account,
Expand All @@ -1218,7 +1237,7 @@ impl Validator {
ledger_signal_receiver,
&rpc_subscriptions,
&poh_recorder,
Some(process_blockstore),
tower,
config.tower_storage.clone(),
&leader_schedule_cache,
exit.clone(),
Expand Down Expand Up @@ -1257,6 +1276,21 @@ impl Validator {
repair_quic_endpoint_sender,
)?;

if in_wen_restart {
info!("Waiting for wen_restart phase one to finish");
match wait_for_wen_restart(
&config.wen_restart_proto_path.clone().unwrap(),
last_vote,
blockstore.clone(),
cluster_info.clone(),
) {
Ok(()) => {
return Err("wen_restart phase one completedy".to_string());
}
Err(e) => return Err(format!("wait_for_wen_restart failed: {e:?}")),
};
}

let tpu = Tpu::new(
&cluster_info,
&poh_recorder,
Expand Down
2 changes: 1 addition & 1 deletion gossip/src/epoch_slots.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ use {
},
};

const MAX_SLOTS_PER_ENTRY: usize = 2048 * 8;
pub const MAX_SLOTS_PER_ENTRY: usize = 2048 * 8;
#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq, AbiExample)]
pub struct Uncompressed {
pub first_slot: Slot,
Expand Down
1 change: 1 addition & 0 deletions local-cluster/src/validator_configs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ pub fn safe_clone_config(config: &ValidatorConfig) -> ValidatorConfig {
block_production_method: config.block_production_method.clone(),
generator_config: config.generator_config.clone(),
use_snapshot_archives_at_startup: config.use_snapshot_archives_at_startup,
wen_restart_proto_path: config.wen_restart_proto_path.clone(),
}
}

Expand Down
20 changes: 20 additions & 0 deletions programs/sbf/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

32 changes: 32 additions & 0 deletions validator/src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1381,6 +1381,35 @@ pub fn app<'a>(version: &'a str, default_args: &'a DefaultArgs) -> App<'a, 'a> {
.possible_values(BlockProductionMethod::cli_names())
.help(BlockProductionMethod::cli_message())
)
.arg(
Arg::with_name("wen_restart")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Took a quick look at this because of #33633. I believe (but could be mistaken) that this feature is not yet production ready. If that is correct, i think we should add this

.hidden(hidden_unless_forced())

with the idea being to keep the in-development features hidden from solana-validator --help until they're ready

.long("wen-restart")
.value_name("DIR")
.takes_value(true)
.required(false)
.default_value(&default_args.wen_restart_path)
.conflicts_with("wait_for_supermajority")
.help(
AshwinSekar marked this conversation as resolved.
Show resolved Hide resolved
"When specified, the validator will enter Wen Restart mode which
pauses normal activity. Validators in this mode will gossip their last
vote to reach consensus on a safe restart slot and repair all blocks
on the selected fork. The safe slot will be a descendant of the latest
optimistically confirmed slot to ensure we do not roll back any
optimistically confirmed slots.

The progress in this mode will be saved in the file location provided.
If consensus is reached, the validator will automatically exit and then
execute wait_for_supermajority logic so the cluster will resume execution.
The progress file will be kept around for future debugging.

After the cluster resumes normal operation, the validator arguments can
be adjusted to remove --wen_restart and update expected_shred_version to
the new shred_version agreed on in the consensus.

If wen_restart fails, refer to the progress file (in proto3 format) for
further debugging.
")
)
.args(&get_deprecated_arguments())
.after_help("The default subcommand is run")
.subcommand(
Expand Down Expand Up @@ -1930,6 +1959,8 @@ pub struct DefaultArgs {
pub wait_for_restart_window_max_delinquent_stake: String,

pub banking_trace_dir_byte_limit: String,

pub wen_restart_path: String,
}

impl DefaultArgs {
Expand Down Expand Up @@ -2008,6 +2039,7 @@ impl DefaultArgs {
wait_for_restart_window_min_idle_time: "10".to_string(),
wait_for_restart_window_max_delinquent_stake: "5".to_string(),
banking_trace_dir_byte_limit: BANKING_TRACE_DIR_DEFAULT_BYTE_LIMIT.to_string(),
wen_restart_path: "wen_restart_progress.proto".to_string(),
}
}
}
Expand Down
43 changes: 43 additions & 0 deletions wen-restart/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
[package]
name = "solana-wen-restart"
description = "Automatic repair and restart protocol"
documentation = "https://github.com/solana-foundation/solana-improvement-documents/pull/46"
version = { workspace = true }
authors = { workspace = true }
repository = { workspace = true }
homepage = { workspace = true }
license = { workspace = true }
edition = { workspace = true }
publish = false

[dependencies]
log = { workspace = true }
prost = { workspace = true }
prost-types = { workspace = true }
solana-gossip = { workspace = true }
solana-ledger = { workspace = true }
solana-logger = { workspace = true }
solana-program = { workspace = true }
solana-runtime = { workspace = true }
solana-sdk = { workspace = true }
solana-vote-program = { workspace = true }

[dev-dependencies]
serial_test = { workspace = true }
solana-entry = { workspace = true }
solana-streamer = { workspace = true }

[build-dependencies]
prost-build = { workspace = true }
rustc_version = { workspace = true }

# windows users should install the protobuf compiler manually and set the PROTOC
# envar to point to the installed binary
[target."cfg(not(windows))".build-dependencies]
protobuf-src = { workspace = true }

[lib]
name = "solana_wen_restart"

[package.metadata.docs.rs]
targets = ["x86_64-unknown-linux-gnu"]
Loading