Skip to content

Commit

Permalink
Fix validator lockfiles (#1586)
Browse files Browse the repository at this point in the history
## Issue Addressed

- Resolves #1313 

## Proposed Changes

Changes the way we start the validator client and beacon node to ensure that we cleanly drop the validator keystores (which therefore ensures we cleanup their lockfiles).

Previously we were holding the validator keystores in a tokio task that was being forcefully killed (i.e., without `Drop`). Now, we hold them in a task that can gracefully handle a shutdown.

Also, switches the `--strict-lockfiles` flag to `--delete-lockfiles`. This means two things:

1. We are now strict on lockfiles by default (before we weren't).
1. There's a simple way for people to delete the lockfiles if they experience a crash.

## Additional Info

I've only given the option to ignore *and* delete lockfiles, not just ignore them. I can't see a strong need for ignore-only but could easily add it, if the need arises.

I've flagged this as `api-breaking` since users that have lockfiles lingering around will be required to supply `--delete-lockfiles` next time they run.
  • Loading branch information
paulhauner committed Sep 26, 2020
1 parent 45fe596 commit c580d40
Show file tree
Hide file tree
Showing 6 changed files with 169 additions and 131 deletions.
5 changes: 2 additions & 3 deletions beacon_node/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ mod config;
pub use beacon_chain;
pub use cli::cli_app;
pub use client::{Client, ClientBuilder, ClientConfig, ClientGenesis};
pub use config::{get_data_dir, get_eth2_testnet_config, set_network_config};
pub use config::{get_config, get_data_dir, get_eth2_testnet_config, set_network_config};
pub use eth2_config::Eth2Config;

use beacon_chain::events::TeeEventHandler;
Expand All @@ -17,7 +17,6 @@ use beacon_chain::{
builder::Witness, eth1_chain::CachingEth1Backend, slot_clock::SystemTimeSlotClock,
};
use clap::ArgMatches;
use config::get_config;
use environment::RuntimeContext;
use slog::{info, warn};
use std::ops::{Deref, DerefMut};
Expand Down Expand Up @@ -54,7 +53,7 @@ impl<E: EthSpec> ProductionBeaconNode<E> {
/// configurations hosted remotely.
pub async fn new_from_cli(
context: RuntimeContext<E>,
matches: &ArgMatches<'_>,
matches: ArgMatches<'static>,
) -> Result<Self, String> {
let client_config = get_config::<E>(
&matches,
Expand Down
98 changes: 50 additions & 48 deletions lighthouse/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -255,61 +255,63 @@ fn run<E: EthSpec>(
"name" => testnet_name
);

let beacon_node = if let Some(sub_matches) = matches.subcommand_matches("beacon_node") {
let runtime_context = environment.core_context();

let beacon = environment
.runtime()
.block_on(ProductionBeaconNode::new_from_cli(
runtime_context,
sub_matches,
))
.map_err(|e| format!("Failed to start beacon node: {}", e))?;

Some(beacon)
} else {
None
};

let validator_client = if let Some(sub_matches) = matches.subcommand_matches("validator_client")
{
let runtime_context = environment.core_context();

let mut validator = environment
.runtime()
.block_on(ProductionValidatorClient::new_from_cli(
runtime_context,
sub_matches,
))
.map_err(|e| format!("Failed to init validator client: {}", e))?;

environment
.core_context()
.executor
.runtime_handle()
.enter(|| {
validator
.start_service()
.map_err(|e| format!("Failed to start validator client service: {}", e))
})?;

Some(validator)
} else {
None
match matches.subcommand() {
("beacon_node", Some(matches)) => {
let context = environment.core_context();
let log = context.log().clone();
let executor = context.executor.clone();
let config = beacon_node::get_config::<E>(
matches,
&context.eth2_config.spec_constants,
&context.eth2_config().spec,
context.log().clone(),
)?;
environment.runtime().spawn(async move {
if let Err(e) = ProductionBeaconNode::new(context.clone(), config).await {
crit!(log, "Failed to start beacon node"; "reason" => e);
// Ignore the error since it always occurs during normal operation when
// shutting down.
let _ = executor
.shutdown_sender()
.try_send("Failed to start beacon node");
}
})
}
("validator_client", Some(matches)) => {
let context = environment.core_context();
let log = context.log().clone();
let executor = context.executor.clone();
let config = validator_client::Config::from_cli(&matches)
.map_err(|e| format!("Unable to initialize validator config: {}", e))?;
environment.runtime().spawn(async move {
let run = async {
ProductionValidatorClient::new(context, config)
.await?
.start_service()?;

Ok::<(), String>(())
};
if let Err(e) = run.await {
crit!(log, "Failed to start validator client"; "reason" => e);
// Ignore the error since it always occurs during normal operation when
// shutting down.
let _ = executor
.shutdown_sender()
.try_send("Failed to start validator client");
}
})
}
_ => {
crit!(log, "No subcommand supplied. See --help .");
return Err("No subcommand supplied.".into());
}
};

if beacon_node.is_none() && validator_client.is_none() {
crit!(log, "No subcommand supplied. See --help .");
return Err("No subcommand supplied.".into());
}

// Block this thread until we get a ctrl-c or a task sends a shutdown signal.
environment.block_until_shutdown_requested()?;
info!(log, "Shutting down..");

environment.fire_signal();
drop(beacon_node);
drop(validator_client);

// Shutdown the environment once all tasks have completed.
environment.shutdown_on_idle();
Expand Down
12 changes: 8 additions & 4 deletions validator_client/src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,15 @@ pub fn cli_app<'a, 'b>() -> App<'a, 'b> {
nodes using the same key. Automatically enabled unless `--strict` is specified",
))
.arg(
Arg::with_name("strict-lockfiles")
.long("strict-lockfiles")
Arg::with_name("delete-lockfiles")
.long("delete-lockfiles")
.help(
"If present, do not load validators that are guarded by a lockfile. Note: for \
Eth2 mainnet, this flag will likely be removed and its behaviour will become default."
"If present, ignore and delete any keystore lockfiles encountered during start up. \
This is useful if the validator client did not exit gracefully on the last run. \
WARNING: lockfiles help prevent users from accidentally running the same validator \
using two different validator clients, an action that likely leads to slashing. \
Ensure you are certain that there are no other validator client instances running \
that might also be using the same keystores."
)
)
.arg(
Expand Down
8 changes: 4 additions & 4 deletions validator_client/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@ pub struct Config {
/// If true, the validator client will still poll for duties and produce blocks even if the
/// beacon node is not synced at startup.
pub allow_unsynced_beacon_node: bool,
/// If true, refuse to unlock a keypair that is guarded by a lockfile.
pub strict_lockfiles: bool,
/// If true, delete any validator keystore lockfiles that would prevent starting.
pub delete_lockfiles: bool,
/// If true, don't scan the validators dir for new keystores.
pub disable_auto_discover: bool,
/// Graffiti to be inserted everytime we create a block.
Expand All @@ -46,7 +46,7 @@ impl Default for Config {
secrets_dir,
http_server: DEFAULT_HTTP_SERVER.to_string(),
allow_unsynced_beacon_node: false,
strict_lockfiles: false,
delete_lockfiles: false,
disable_auto_discover: false,
graffiti: None,
}
Expand Down Expand Up @@ -77,7 +77,7 @@ impl Config {
}

config.allow_unsynced_beacon_node = cli_args.is_present("allow-unsynced");
config.strict_lockfiles = cli_args.is_present("strict-lockfiles");
config.delete_lockfiles = cli_args.is_present("delete-lockfiles");
config.disable_auto_discover = cli_args.is_present("disable-auto-discover");

if let Some(secrets_dir) = parse_optional(cli_args, "secrets-dir")? {
Expand Down
51 changes: 34 additions & 17 deletions validator_client/src/initialized_validators.rs
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,10 @@ pub enum Error {
PasswordUnknown(PathBuf),
/// There was an error reading from stdin.
UnableToReadPasswordFromUser(String),
/// There was an error running a tokio async task.
TokioJoin(tokio::task::JoinError),
/// There was a filesystem error when deleting a lockfile.
UnableToDeleteLockfile(io::Error),
}

/// A method used by a validator to sign messages.
Expand Down Expand Up @@ -86,7 +90,7 @@ impl InitializedValidator {
/// If the validator is unable to be initialized for whatever reason.
pub fn from_definition(
def: ValidatorDefinition,
strict_lockfiles: bool,
delete_lockfiles: bool,
log: &Logger,
) -> Result<Self, Error> {
if !def.enabled {
Expand Down Expand Up @@ -150,16 +154,17 @@ impl InitializedValidator {
})?;

if voting_keystore_lockfile_path.exists() {
if strict_lockfiles {
return Err(Error::LockfileExists(voting_keystore_lockfile_path));
} else {
// If **not** respecting lockfiles, just raise a warning if the voting
// keypair cannot be unlocked.
if delete_lockfiles {
warn!(
log,
"Ignoring validator lockfile";
"Deleting validator lockfile";
"file" => format!("{:?}", voting_keystore_lockfile_path)
);

fs::remove_file(&voting_keystore_lockfile_path)
.map_err(Error::UnableToDeleteLockfile)?;
} else {
return Err(Error::LockfileExists(voting_keystore_lockfile_path));
}
} else {
// Create a new lockfile.
Expand Down Expand Up @@ -279,7 +284,7 @@ pub struct InitializedValidators {

impl InitializedValidators {
/// Instantiates `Self`, initializing all validators in `definitions`.
pub fn from_definitions(
pub async fn from_definitions(
definitions: ValidatorDefinitions,
validators_dir: PathBuf,
strict_lockfiles: bool,
Expand All @@ -292,7 +297,7 @@ impl InitializedValidators {
validators: HashMap::default(),
log,
};
this.update_validators()?;
this.update_validators().await?;
Ok(this)
}

Expand Down Expand Up @@ -328,7 +333,7 @@ impl InitializedValidators {
/// validator will be removed from `self.validators`.
///
/// Saves the `ValidatorDefinitions` to file, even if no definitions were changed.
pub fn set_validator_status(
pub async fn set_validator_status(
&mut self,
voting_public_key: &PublicKey,
enabled: bool,
Expand All @@ -342,7 +347,7 @@ impl InitializedValidators {
def.enabled = enabled;
}

self.update_validators()?;
self.update_validators().await?;

self.definitions
.save(&self.validators_dir)
Expand All @@ -362,7 +367,7 @@ impl InitializedValidators {
/// A validator is considered "already known" and skipped if the public key is already known.
/// I.e., if there are two different definitions with the same public key then the second will
/// be ignored.
fn update_validators(&mut self) -> Result<(), Error> {
async fn update_validators(&mut self) -> Result<(), Error> {
for def in self.definitions.as_slice() {
if def.enabled {
match &def.signing_definition {
Expand All @@ -371,11 +376,23 @@ impl InitializedValidators {
continue;
}

match InitializedValidator::from_definition(
def.clone(),
self.strict_lockfiles,
&self.log,
) {
// Decoding a local keystore can take several seconds, therefore it's best
// to keep if off the core executor. This also has the fortunate effect of
// interrupting the potentially long-running task during shut down.
let inner_def = def.clone();
let strict_lockfiles = self.strict_lockfiles;
let inner_log = self.log.clone();
let result = tokio::task::spawn_blocking(move || {
InitializedValidator::from_definition(
inner_def,
strict_lockfiles,
&inner_log,
)
})
.await
.map_err(Error::TokioJoin)?;

match result {
Ok(init) => {
self.validators
.insert(init.voting_public_key().clone(), init);
Expand Down
Loading

0 comments on commit c580d40

Please sign in to comment.