Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature - Epoch boundary recompilation phase #33477

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion ledger-tool/src/program.rs
Original file line number Diff line number Diff line change
Expand Up @@ -552,7 +552,7 @@ pub fn program(ledger_path: &Path, matches: &ArgMatches<'_>) {
.clone(),
);
for key in cached_account_keys {
loaded_programs.replenish(key, bank.load_program(&key, false));
loaded_programs.replenish(key, bank.load_program(&key, false, None));
debug!("Loaded program {}", key);
}
invoke_context.programs_loaded_for_tx_batch = &loaded_programs;
Expand Down
54 changes: 34 additions & 20 deletions program-runtime/src/loaded_programs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -459,6 +459,14 @@ pub struct LoadedPrograms<FG: ForkGraph> {
pub latest_root_epoch: Epoch,
/// Environments of the current epoch
pub environments: ProgramRuntimeEnvironments,
/// Anticipated replacement for `environments` at the next epoch
///
/// This is `None` during most of an epoch, and only `Some` around the boundaries (at the end and beginning of an epoch).
/// More precisely, it starts with the recompilation phase a few hundred slots before the epoch boundary,
/// and it ends with the first rerooting after the epoch boundary.
pub upcoming_environments: Option<ProgramRuntimeEnvironments>,
/// List of loaded programs which should be recompiled before the next epoch (but don't have to).
pub programs_to_recompile: Vec<(Pubkey, Arc<LoadedProgram>)>,
pub stats: Stats,
pub fork_graph: Option<Arc<RwLock<FG>>>,
}
Expand All @@ -481,6 +489,8 @@ impl<FG: ForkGraph> Default for LoadedPrograms<FG> {
latest_root_slot: 0,
latest_root_epoch: 0,
environments: ProgramRuntimeEnvironments::default(),
upcoming_environments: None,
programs_to_recompile: Vec::default(),
stats: Stats::default(),
fork_graph: None,
}
Expand Down Expand Up @@ -567,7 +577,12 @@ impl<FG: ForkGraph> LoadedPrograms<FG> {
}

/// Returns the current environments depending on the given epoch
pub fn get_environments_for_epoch(&self, _epoch: Epoch) -> &ProgramRuntimeEnvironments {
pub fn get_environments_for_epoch(&self, epoch: Epoch) -> &ProgramRuntimeEnvironments {
if epoch != self.latest_root_epoch {
if let Some(upcoming_environments) = self.upcoming_environments.as_ref() {
return upcoming_environments;
}
}
&self.environments
}

Expand Down Expand Up @@ -630,22 +645,6 @@ impl<FG: ForkGraph> LoadedPrograms<FG> {
entry
}

/// On the epoch boundary this removes all programs of the outdated feature set
pub fn prune_feature_set_transition(&mut self) {
for second_level in self.entries.values_mut() {
second_level.retain(|entry| {
if Self::matches_environment(entry, &self.environments) {
return true;
}
self.stats
.prunes_environment
.fetch_add(1, Ordering::Relaxed);
false
});
}
self.remove_programs_with_no_entries();
}

pub fn prune_by_deployment_slot(&mut self, slot: Slot) {
self.entries.retain(|_key, second_level| {
*second_level = second_level
Expand All @@ -668,6 +667,15 @@ impl<FG: ForkGraph> LoadedPrograms<FG> {
error!("Failed to lock fork graph for reading.");
return;
};
let mut recompilation_phase_ends = false;
if self.latest_root_epoch != new_root_epoch {
self.latest_root_epoch = new_root_epoch;
if let Some(upcoming_environments) = self.upcoming_environments.take() {
recompilation_phase_ends = true;
self.environments = upcoming_environments;
self.programs_to_recompile.clear();
}
}
for second_level in self.entries.values_mut() {
// Remove entries un/re/deployed on orphan forks
let mut first_ancestor_found = false;
Expand Down Expand Up @@ -697,6 +705,15 @@ impl<FG: ForkGraph> LoadedPrograms<FG> {
return false;
}
}
// Remove outdated environment of previous feature set
if recompilation_phase_ends
&& !Self::matches_environment(entry, &self.environments)
{
self.stats
.prunes_environment
.fetch_add(1, Ordering::Relaxed);
return false;
}
true
})
.cloned()
Expand All @@ -706,9 +723,6 @@ impl<FG: ForkGraph> LoadedPrograms<FG> {
self.remove_programs_with_no_entries();
debug_assert!(self.latest_root_slot <= new_root_slot);
self.latest_root_slot = new_root_slot;
if self.latest_root_epoch < new_root_epoch {
self.latest_root_epoch = new_root_epoch;
}
}

fn matches_environment(
Expand Down
160 changes: 109 additions & 51 deletions runtime/src/bank.rs
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ use {
},
solana_bpf_loader_program::syscalls::create_program_runtime_environment_v1,
solana_cost_model::cost_tracker::CostTracker,
solana_loader_v4_program::create_program_runtime_environment_v2,
solana_measure::{measure, measure::Measure, measure_us},
solana_perf::perf_libs,
solana_program_runtime::{
Expand Down Expand Up @@ -1442,11 +1443,10 @@ impl Bank {
});

// Following code may touch AccountsDb, requiring proper ancestors
let parent_epoch = parent.epoch();
let (_, update_epoch_time_us) = measure_us!({
if parent_epoch < new.epoch() {
if parent.epoch() < new.epoch() {
new.process_new_epoch(
parent_epoch,
parent.epoch(),
parent.slot(),
parent.block_height(),
reward_calc_tracer,
Expand All @@ -1461,11 +1461,71 @@ impl Bank {
}
});

let (_, recompilation_time_us) = measure_us!({
// Recompile loaded programs one at a time before the next epoch hits
let (_epoch, slot_index) = new.get_epoch_and_slot_index(new.slot());
let slots_in_epoch = new.get_slots_in_epoch(new.epoch());
let slots_in_recompilation_phase =
(solana_program_runtime::loaded_programs::MAX_LOADED_ENTRY_COUNT as u64)
.min(slots_in_epoch)
.checked_div(2)
.unwrap();
let mut loaded_programs_cache = new.loaded_programs_cache.write().unwrap();
if loaded_programs_cache.upcoming_environments.is_some() {
if let Some((key, program_to_recompile)) =
loaded_programs_cache.programs_to_recompile.pop()
{
drop(loaded_programs_cache);
let recompiled = new.load_program(&key, false, Some(program_to_recompile));
let mut loaded_programs_cache = new.loaded_programs_cache.write().unwrap();
loaded_programs_cache.replenish(key, recompiled);
}
} else if new.epoch() != loaded_programs_cache.latest_root_epoch
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could a malicious leader trigger this check by adding a bank in a future epoch? Does it make sense to make this check more specific (e.g. new.epoch == loaded_programs_cache.latest_root_epoch + 1, and slot is within N slots in the new epoch)?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Aren't leaders bound by the leader schedule in that they can't just submit a block for any slot, but only the one they were assigned to?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I remember seeing this in one of the outages where the leader submitted a slot way in future. Maybe its not an issue anymore due to other checks.

Would making this check more strict cause any problems? It can be done in a separate PR if that's preferred.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

https://github.com/solana-labs/solana/blob/1814b2bc81aca29b322f8e1f280494c1b9ee6b32/turbine/src/sigverify_shreds.rs#L141C6-L141C6

Would making this check more strict cause any problems?

Probably not, but haven't thought it through.

|| slot_index.saturating_add(slots_in_recompilation_phase) >= slots_in_epoch
{
// Anticipate the upcoming program runtime environment for the next epoch,
// so we can try to recompile loaded programs before the feature transition hits.
drop(loaded_programs_cache);
let (feature_set, _new_feature_activations) = new.compute_active_feature_set(true);
let mut loaded_programs_cache = new.loaded_programs_cache.write().unwrap();
let program_runtime_environment_v1 = create_program_runtime_environment_v1(
&feature_set,
&new.runtime_config.compute_budget.unwrap_or_default(),
false, /* deployment */
false, /* debugging_features */
)
.unwrap();
let program_runtime_environment_v2 = create_program_runtime_environment_v2(
&new.runtime_config.compute_budget.unwrap_or_default(),
false, /* debugging_features */
);
let mut upcoming_environments = loaded_programs_cache.environments.clone();
let changed_program_runtime_v1 =
*upcoming_environments.program_runtime_v1 != program_runtime_environment_v1;
let changed_program_runtime_v2 =
*upcoming_environments.program_runtime_v2 != program_runtime_environment_v2;
if changed_program_runtime_v1 {
upcoming_environments.program_runtime_v1 =
Arc::new(program_runtime_environment_v1);
}
if changed_program_runtime_v2 {
upcoming_environments.program_runtime_v2 =
Arc::new(program_runtime_environment_v2);
}
loaded_programs_cache.upcoming_environments = Some(upcoming_environments);
loaded_programs_cache.programs_to_recompile = loaded_programs_cache
.get_entries_sorted_by_tx_usage(
changed_program_runtime_v1,
changed_program_runtime_v2,
);
}
});

// Update sysvars before processing transactions
let (_, update_sysvars_time_us) = measure_us!({
new.update_slot_hashes();
new.update_stake_history(Some(parent_epoch));
new.update_clock(Some(parent_epoch));
new.update_stake_history(Some(parent.epoch()));
new.update_clock(Some(parent.epoch()));
new.update_fees();
new.update_last_restart_slot()
});
Expand Down Expand Up @@ -1493,6 +1553,7 @@ impl Bank {
feature_set_time_us,
ancestors_time_us,
update_epoch_time_us,
recompilation_time_us,
update_sysvars_time_us,
fill_sysvar_cache_time_us,
},
Expand Down Expand Up @@ -4690,16 +4751,25 @@ impl Bank {
ProgramAccountLoadResult::InvalidAccountData
}

pub fn load_program(&self, pubkey: &Pubkey, reload: bool) -> Arc<LoadedProgram> {
pub fn load_program(
&self,
pubkey: &Pubkey,
reload: bool,
recompile: Option<Arc<LoadedProgram>>,
) -> Arc<LoadedProgram> {
let loaded_programs_cache = self.loaded_programs_cache.read().unwrap();
let environments = loaded_programs_cache.get_environments_for_epoch(self.epoch);

let effective_epoch = if recompile.is_some() {
loaded_programs_cache.latest_root_epoch.saturating_add(1)
pgarg66 marked this conversation as resolved.
Show resolved Hide resolved
} else {
self.epoch
};
let environments = loaded_programs_cache.get_environments_for_epoch(effective_epoch);
let mut load_program_metrics = LoadProgramMetrics {
program_id: pubkey.to_string(),
..LoadProgramMetrics::default()
};

let loaded_program = match self.load_program_accounts(pubkey) {
let mut loaded_program = match self.load_program_accounts(pubkey) {
ProgramAccountLoadResult::AccountNotFound => Ok(LoadedProgram::new_tombstone(
self.slot,
LoadedProgramType::Closed,
Expand Down Expand Up @@ -4806,6 +4876,16 @@ impl Bank {

let mut timings = ExecuteDetailsTimings::default();
load_program_metrics.submit_datapoint(&mut timings);
if let Some(recompile) = recompile {
loaded_program.effective_slot = loaded_program.effective_slot.max(
self.epoch_schedule()
.get_first_slot_in_epoch(effective_epoch),
);
loaded_program.tx_usage_counter =
AtomicU64::new(recompile.tx_usage_counter.load(Ordering::Relaxed));
loaded_program.ix_usage_counter =
AtomicU64::new(recompile.ix_usage_counter.load(Ordering::Relaxed));
}
Arc::new(loaded_program)
}

Expand Down Expand Up @@ -5052,7 +5132,7 @@ impl Bank {
let missing_programs: Vec<(Pubkey, Arc<LoadedProgram>)> = missing
.iter()
.map(|(key, count)| {
let program = self.load_program(key, false);
let program = self.load_program(key, false, None);
program.tx_usage_counter.store(*count, Ordering::Relaxed);
(*key, program)
})
Expand All @@ -5062,7 +5142,7 @@ impl Bank {
let unloaded_programs: Vec<(Pubkey, Arc<LoadedProgram>)> = unloaded
.iter()
.map(|(key, count)| {
let program = self.load_program(key, true);
let program = self.load_program(key, true, None);
program.tx_usage_counter.store(*count, Ordering::Relaxed);
(*key, program)
})
Expand Down Expand Up @@ -6797,6 +6877,24 @@ impl Bank {
}
}

Lichtso marked this conversation as resolved.
Show resolved Hide resolved
let mut loaded_programs_cache = self.loaded_programs_cache.write().unwrap();
loaded_programs_cache.latest_root_slot = self.slot();
loaded_programs_cache.latest_root_epoch = self.epoch();
loaded_programs_cache.environments.program_runtime_v1 = Arc::new(
create_program_runtime_environment_v1(
&self.feature_set,
&self.runtime_config.compute_budget.unwrap_or_default(),
false, /* deployment */
false, /* debugging_features */
)
.unwrap(),
);
loaded_programs_cache.environments.program_runtime_v2 =
Arc::new(create_program_runtime_environment_v2(
&self.runtime_config.compute_budget.unwrap_or_default(),
false, /* debugging_features */
));

if self
.feature_set
.is_active(&feature_set::cap_accounts_data_len::id())
Expand Down Expand Up @@ -8153,46 +8251,6 @@ impl Bank {
only_apply_transitions_for_new_features: bool,
new_feature_activations: &HashSet<Pubkey>,
) {
const FEATURES_AFFECTING_RBPF: &[Pubkey] = &[
feature_set::error_on_syscall_bpf_function_hash_collisions::id(),
feature_set::reject_callx_r10::id(),
feature_set::switch_to_new_elf_parser::id(),
feature_set::bpf_account_data_direct_mapping::id(),
feature_set::enable_alt_bn128_syscall::id(),
feature_set::enable_alt_bn128_compression_syscall::id(),
feature_set::enable_big_mod_exp_syscall::id(),
feature_set::blake3_syscall_enabled::id(),
feature_set::curve25519_syscall_enabled::id(),
feature_set::disable_fees_sysvar::id(),
feature_set::enable_partitioned_epoch_reward::id(),
feature_set::disable_deploy_of_alloc_free_syscall::id(),
feature_set::last_restart_slot_sysvar::id(),
feature_set::remaining_compute_units_syscall_enabled::id(),
];
if !only_apply_transitions_for_new_features
|| FEATURES_AFFECTING_RBPF
.iter()
.any(|key| new_feature_activations.contains(key))
{
let program_runtime_environment_v1 = create_program_runtime_environment_v1(
&self.feature_set,
&self.runtime_config.compute_budget.unwrap_or_default(),
false, /* deployment */
false, /* debugging_features */
)
.unwrap();
let mut loaded_programs_cache = self.loaded_programs_cache.write().unwrap();
loaded_programs_cache.environments.program_runtime_v1 =
Arc::new(program_runtime_environment_v1);
let program_runtime_environment_v2 =
solana_loader_v4_program::create_program_runtime_environment_v2(
&self.runtime_config.compute_budget.unwrap_or_default(),
false, /* debugging_features */
);
loaded_programs_cache.environments.program_runtime_v2 =
Arc::new(program_runtime_environment_v2);
loaded_programs_cache.prune_feature_set_transition();
}
for builtin in BUILTINS.iter() {
if let Some(feature_id) = builtin.feature_id {
let should_apply_action_for_feature_transition =
Expand Down
2 changes: 2 additions & 0 deletions runtime/src/bank/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ pub(crate) struct NewBankTimings {
pub(crate) feature_set_time_us: u64,
pub(crate) ancestors_time_us: u64,
pub(crate) update_epoch_time_us: u64,
pub(crate) recompilation_time_us: u64,
pub(crate) update_sysvars_time_us: u64,
pub(crate) fill_sysvar_cache_time_us: u64,
}
Expand Down Expand Up @@ -144,6 +145,7 @@ pub(crate) fn report_new_bank_metrics(
("feature_set_us", timings.feature_set_time_us, i64),
("ancestors_us", timings.ancestors_time_us, i64),
("update_epoch_us", timings.update_epoch_time_us, i64),
("recompilation_time_us", timings.recompilation_time_us, i64),
("update_sysvars_us", timings.update_sysvars_time_us, i64),
(
"fill_sysvar_cache_us",
Expand Down
Loading