diff --git a/polkadot/node/core/candidate-validation/src/lib.rs b/polkadot/node/core/candidate-validation/src/lib.rs index bf6e09fd1b69..8237137fdca0 100644 --- a/polkadot/node/core/candidate-validation/src/lib.rs +++ b/polkadot/node/core/candidate-validation/src/lib.rs @@ -695,6 +695,8 @@ async fn validate_candidate_exhaustive( ))), Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::JobError(err))) => Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError(err))), + Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::RuntimeConstruction(err))) => + Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError(err))), Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::AmbiguousJobDeath(err))) => Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError(format!( @@ -780,40 +782,50 @@ trait ValidationBackend { return validation_result } + macro_rules! break_if_no_retries_left { + ($counter:ident) => { + if $counter > 0 { + $counter -= 1; + } else { + break + } + }; + } + // Allow limited retries for each kind of error. let mut num_death_retries_left = 1; let mut num_job_error_retries_left = 1; let mut num_internal_retries_left = 1; + let mut num_runtime_construction_retries_left = 1; loop { // Stop retrying if we exceeded the timeout. if total_time_start.elapsed() + retry_delay > exec_timeout { break } - + let mut retry_immediately = false; match validation_result { Err(ValidationError::PossiblyInvalid( PossiblyInvalidError::AmbiguousWorkerDeath | PossiblyInvalidError::AmbiguousJobDeath(_), - )) => - if num_death_retries_left > 0 { - num_death_retries_left -= 1; - } else { - break - }, + )) => break_if_no_retries_left!(num_death_retries_left), Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::JobError(_))) => - if num_job_error_retries_left > 0 { - num_job_error_retries_left -= 1; - } else { - break - }, + break_if_no_retries_left!(num_job_error_retries_left), Err(ValidationError::Internal(_)) => - if num_internal_retries_left > 0 { - num_internal_retries_left -= 1; - } else { - break - }, + break_if_no_retries_left!(num_internal_retries_left), + + Err(ValidationError::PossiblyInvalid( + PossiblyInvalidError::RuntimeConstruction(_), + )) => { + break_if_no_retries_left!(num_runtime_construction_retries_left); + self.precheck_pvf(pvf.clone()).await?; + // In this case the error is deterministic + // And a retry forces the ValidationBackend + // to re-prepare the artifact so + // there is no need to wait before the retry + retry_immediately = true; + }, Ok(_) | Err(ValidationError::Invalid(_) | ValidationError::Preparation(_)) => break, } @@ -821,8 +833,11 @@ trait ValidationBackend { // If we got a possibly transient error, retry once after a brief delay, on the // assumption that the conditions that caused this error may have resolved on their own. { - // Wait a brief delay before retrying. - futures_timer::Delay::new(retry_delay).await; + // In case of many transient errors it is necessary to wait a little bit + // for the error to be probably resolved + if !retry_immediately { + futures_timer::Delay::new(retry_delay).await; + } let new_timeout = exec_timeout.saturating_sub(total_time_start.elapsed()); diff --git a/polkadot/node/core/pvf/common/src/error.rs b/polkadot/node/core/pvf/common/src/error.rs index f8faefc24e65..cf274044456f 100644 --- a/polkadot/node/core/pvf/common/src/error.rs +++ b/polkadot/node/core/pvf/common/src/error.rs @@ -16,6 +16,7 @@ use crate::prepare::{PrepareSuccess, PrepareWorkerSuccess}; use parity_scale_codec::{Decode, Encode}; +pub use sc_executor_common::error::Error as ExecuteError; /// Result of PVF preparation from a worker, with checksum of the compiled PVF and stats of the /// preparation if successful. diff --git a/polkadot/node/core/pvf/common/src/execute.rs b/polkadot/node/core/pvf/common/src/execute.rs index 6b3becf524d7..18c97b03cbcd 100644 --- a/polkadot/node/core/pvf/common/src/execute.rs +++ b/polkadot/node/core/pvf/common/src/execute.rs @@ -40,6 +40,9 @@ pub enum WorkerResponse { }, /// The candidate is invalid. InvalidCandidate(String), + /// Instantiation of the WASM module instance failed during an execution. + /// Possibly related to local issues or dirty node update. May be retried with re-preparation. + RuntimeConstruction(String), /// The job timed out. JobTimedOut, /// The job process has died. We must kill the worker just in case. @@ -68,6 +71,9 @@ pub enum JobResponse { /// The result of parachain validation. result_descriptor: ValidationResult, }, + /// A possibly transient runtime instantiation error happened during the execution; may be + /// retried with re-preparation + RuntimeConstruction(String), /// The candidate is invalid. InvalidCandidate(String), } @@ -81,6 +87,15 @@ impl JobResponse { Self::InvalidCandidate(format!("{}: {}", ctx, msg)) } } + + /// Creates a may retry response from a context `ctx` and a message `msg` (which can be empty). + pub fn runtime_construction(ctx: &'static str, msg: &str) -> Self { + if msg.is_empty() { + Self::RuntimeConstruction(ctx.to_string()) + } else { + Self::RuntimeConstruction(format!("{}: {}", ctx, msg)) + } + } } /// An unexpected error occurred in the execution job process. Because this comes from the job, diff --git a/polkadot/node/core/pvf/common/src/executor_interface.rs b/polkadot/node/core/pvf/common/src/executor_interface.rs index 4cd2f5c85eec..b69a87890f60 100644 --- a/polkadot/node/core/pvf/common/src/executor_interface.rs +++ b/polkadot/node/core/pvf/common/src/executor_interface.rs @@ -16,6 +16,7 @@ //! Interface to the Substrate Executor +use crate::error::ExecuteError; use polkadot_primitives::{ executor_params::{DEFAULT_LOGICAL_STACK_MAX, DEFAULT_NATIVE_STACK_MAX}, ExecutorParam, ExecutorParams, @@ -109,7 +110,7 @@ pub unsafe fn execute_artifact( compiled_artifact_blob: &[u8], executor_params: &ExecutorParams, params: &[u8], -) -> Result, String> { +) -> Result, ExecuteError> { let mut extensions = sp_externalities::Extensions::new(); extensions.register(sp_core::traits::ReadRuntimeVersionExt::new(ReadRuntimeVersion)); @@ -123,7 +124,6 @@ pub unsafe fn execute_artifact( Ok(Ok(ok)) => Ok(ok), Ok(Err(err)) | Err(err) => Err(err), } - .map_err(|err| format!("execute error: {:?}", err)) } /// Constructs the runtime for the given PVF, given the artifact bytes. diff --git a/polkadot/node/core/pvf/execute-worker/src/lib.rs b/polkadot/node/core/pvf/execute-worker/src/lib.rs index 0cfa5a786946..bd7e76010a6d 100644 --- a/polkadot/node/core/pvf/execute-worker/src/lib.rs +++ b/polkadot/node/core/pvf/execute-worker/src/lib.rs @@ -16,7 +16,9 @@ //! Contains the logic for executing PVFs. Used by the polkadot-execute-worker binary. -pub use polkadot_node_core_pvf_common::executor_interface::execute_artifact; +pub use polkadot_node_core_pvf_common::{ + error::ExecuteError, executor_interface::execute_artifact, +}; // NOTE: Initializing logging in e.g. tests will not have an effect in the workers, as they are // separate spawned processes. Run with e.g. `RUST_LOG=parachain::pvf-execute-worker=trace`. @@ -237,7 +239,9 @@ fn validate_using_artifact( // [`executor_interface::prepare`]. execute_artifact(compiled_artifact_blob, executor_params, params) } { - Err(err) => return JobResponse::format_invalid("execute", &err), + Err(ExecuteError::RuntimeConstruction(wasmerr)) => + return JobResponse::runtime_construction("execute", &wasmerr.to_string()), + Err(err) => return JobResponse::format_invalid("execute", &err.to_string()), Ok(d) => d, }; @@ -550,6 +554,8 @@ fn handle_parent_process( Ok(WorkerResponse::Ok { result_descriptor, duration: cpu_tv }) }, Ok(JobResponse::InvalidCandidate(err)) => Ok(WorkerResponse::InvalidCandidate(err)), + Ok(JobResponse::RuntimeConstruction(err)) => + Ok(WorkerResponse::RuntimeConstruction(err)), Err(job_error) => { gum::warn!( target: LOG_TARGET, diff --git a/polkadot/node/core/pvf/src/artifacts.rs b/polkadot/node/core/pvf/src/artifacts.rs index 78dfe71adadd..6288755526d4 100644 --- a/polkadot/node/core/pvf/src/artifacts.rs +++ b/polkadot/node/core/pvf/src/artifacts.rs @@ -238,6 +238,14 @@ impl Artifacts { .is_none()); } + /// Remove artifact by its id. + pub fn remove(&mut self, artifact_id: ArtifactId) -> Option<(ArtifactId, PathBuf)> { + self.inner.remove(&artifact_id).and_then(|state| match state { + ArtifactState::Prepared { path, .. } => Some((artifact_id, path)), + _ => None, + }) + } + /// Remove artifacts older than the given TTL and return id and path of the removed ones. pub fn prune(&mut self, artifact_ttl: Duration) -> Vec<(ArtifactId, PathBuf)> { let now = SystemTime::now(); diff --git a/polkadot/node/core/pvf/src/error.rs b/polkadot/node/core/pvf/src/error.rs index 80d41d5c64be..8dc96305eadb 100644 --- a/polkadot/node/core/pvf/src/error.rs +++ b/polkadot/node/core/pvf/src/error.rs @@ -86,6 +86,10 @@ pub enum PossiblyInvalidError { /// vote invalid. #[error("possibly invalid: job error: {0}")] JobError(String), + /// Instantiation of the WASM module instance failed during an execution. + /// Possibly related to local issues or dirty node update. May be retried with re-preparation. + #[error("possibly invalid: runtime construction: {0}")] + RuntimeConstruction(String), } impl From for ValidationError { diff --git a/polkadot/node/core/pvf/src/execute/mod.rs b/polkadot/node/core/pvf/src/execute/mod.rs index c6d9cf90fa28..365e98196cae 100644 --- a/polkadot/node/core/pvf/src/execute/mod.rs +++ b/polkadot/node/core/pvf/src/execute/mod.rs @@ -23,4 +23,4 @@ mod queue; mod worker_interface; -pub use queue::{start, PendingExecutionRequest, ToQueue}; +pub use queue::{start, FromQueue, PendingExecutionRequest, ToQueue}; diff --git a/polkadot/node/core/pvf/src/execute/queue.rs b/polkadot/node/core/pvf/src/execute/queue.rs index aa91d11781fc..bdc3c7327b06 100644 --- a/polkadot/node/core/pvf/src/execute/queue.rs +++ b/polkadot/node/core/pvf/src/execute/queue.rs @@ -25,7 +25,7 @@ use crate::{ InvalidCandidate, PossiblyInvalidError, ValidationError, LOG_TARGET, }; use futures::{ - channel::mpsc, + channel::{mpsc, oneshot}, future::BoxFuture, stream::{FuturesUnordered, StreamExt as _}, Future, FutureExt, @@ -54,6 +54,12 @@ pub enum ToQueue { Enqueue { artifact: ArtifactPathId, pending_execution_request: PendingExecutionRequest }, } +/// A response from queue. +#[derive(Debug)] +pub enum FromQueue { + RemoveArtifact { artifact: ArtifactId, reply_to: oneshot::Sender<()> }, +} + /// An execution request that should execute the PVF (known in the context) and send the results /// to the given result sender. #[derive(Debug)] @@ -137,6 +143,8 @@ struct Queue { /// The receiver that receives messages to the pool. to_queue_rx: mpsc::Receiver, + /// The sender to send messages back to validation host. + from_queue_tx: mpsc::UnboundedSender, // Some variables related to the current session. program_path: PathBuf, @@ -161,6 +169,7 @@ impl Queue { node_version: Option, security_status: SecurityStatus, to_queue_rx: mpsc::Receiver, + from_queue_tx: mpsc::UnboundedSender, ) -> Self { Self { metrics, @@ -170,6 +179,7 @@ impl Queue { node_version, security_status, to_queue_rx, + from_queue_tx, queue: VecDeque::new(), mux: Mux::new(), workers: Workers { @@ -301,7 +311,7 @@ async fn handle_mux(queue: &mut Queue, event: QueueEvent) { handle_worker_spawned(queue, idle, handle, job); }, QueueEvent::StartWork(worker, outcome, artifact_id, result_tx) => { - handle_job_finish(queue, worker, outcome, artifact_id, result_tx); + handle_job_finish(queue, worker, outcome, artifact_id, result_tx).await; }, } } @@ -327,42 +337,69 @@ fn handle_worker_spawned( /// If there are pending jobs in the queue, schedules the next of them onto the just freed up /// worker. Otherwise, puts back into the available workers list. -fn handle_job_finish( +async fn handle_job_finish( queue: &mut Queue, worker: Worker, outcome: Outcome, artifact_id: ArtifactId, result_tx: ResultSender, ) { - let (idle_worker, result, duration) = match outcome { + let (idle_worker, result, duration, sync_channel) = match outcome { Outcome::Ok { result_descriptor, duration, idle_worker } => { // TODO: propagate the soft timeout - (Some(idle_worker), Ok(result_descriptor), Some(duration)) + (Some(idle_worker), Ok(result_descriptor), Some(duration), None) }, Outcome::InvalidCandidate { err, idle_worker } => ( Some(idle_worker), Err(ValidationError::Invalid(InvalidCandidate::WorkerReportedInvalid(err))), None, + None, ), - Outcome::InternalError { err } => (None, Err(ValidationError::Internal(err)), None), + Outcome::RuntimeConstruction { err, idle_worker } => { + // The task for artifact removal is executed concurrently with + // the message to the host on the execution result. + let (result_tx, result_rx) = oneshot::channel(); + queue + .from_queue_tx + .unbounded_send(FromQueue::RemoveArtifact { + artifact: artifact_id.clone(), + reply_to: result_tx, + }) + .expect("from execute queue receiver is listened by the host; qed"); + ( + Some(idle_worker), + Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::RuntimeConstruction( + err, + ))), + None, + Some(result_rx), + ) + }, + Outcome::InternalError { err } => (None, Err(ValidationError::Internal(err)), None, None), // Either the worker or the job timed out. Kill the worker in either case. Treated as // definitely-invalid, because if we timed out, there's no time left for a retry. Outcome::HardTimeout => - (None, Err(ValidationError::Invalid(InvalidCandidate::HardTimeout)), None), + (None, Err(ValidationError::Invalid(InvalidCandidate::HardTimeout)), None, None), // "Maybe invalid" errors (will retry). Outcome::WorkerIntfErr => ( None, Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::AmbiguousWorkerDeath)), None, + None, ), Outcome::JobDied { err } => ( None, Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::AmbiguousJobDeath(err))), None, + None, + ), + Outcome::JobError { err } => ( + None, + Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::JobError(err))), + None, + None, ), - Outcome::JobError { err } => - (None, Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::JobError(err))), None), }; queue.metrics.execute_finished(); @@ -386,6 +423,12 @@ fn handle_job_finish( ); } + if let Some(sync_channel) = sync_channel { + // err means the sender is dropped (the artifact is already removed from the cache) + // so that's legitimate to ignore the result + let _ = sync_channel.await; + } + // First we send the result. It may fail due to the other end of the channel being dropped, // that's legitimate and we don't treat that as an error. let _ = result_tx.send(result); @@ -521,8 +564,10 @@ pub fn start( spawn_timeout: Duration, node_version: Option, security_status: SecurityStatus, -) -> (mpsc::Sender, impl Future) { +) -> (mpsc::Sender, mpsc::UnboundedReceiver, impl Future) { let (to_queue_tx, to_queue_rx) = mpsc::channel(20); + let (from_queue_tx, from_queue_rx) = mpsc::unbounded(); + let run = Queue::new( metrics, program_path, @@ -532,7 +577,8 @@ pub fn start( node_version, security_status, to_queue_rx, + from_queue_tx, ) .run(); - (to_queue_tx, run) + (to_queue_tx, from_queue_rx, run) } diff --git a/polkadot/node/core/pvf/src/execute/worker_interface.rs b/polkadot/node/core/pvf/src/execute/worker_interface.rs index 9f7738f00e69..db81da118d7b 100644 --- a/polkadot/node/core/pvf/src/execute/worker_interface.rs +++ b/polkadot/node/core/pvf/src/execute/worker_interface.rs @@ -87,6 +87,10 @@ pub enum Outcome { /// a trap. Errors related to the preparation process are not expected to be encountered by the /// execution workers. InvalidCandidate { err: String, idle_worker: IdleWorker }, + /// The error is probably transient. It may be for example + /// because the artifact was prepared with a Wasmtime version different from the version + /// in the current execution environment. + RuntimeConstruction { err: String, idle_worker: IdleWorker }, /// The execution time exceeded the hard limit. The worker is terminated. HardTimeout, /// An I/O error happened during communication with the worker. This may mean that the worker @@ -193,6 +197,10 @@ pub async fn start_work( err, idle_worker: IdleWorker { stream, pid, worker_dir }, }, + WorkerResponse::RuntimeConstruction(err) => Outcome::RuntimeConstruction { + err, + idle_worker: IdleWorker { stream, pid, worker_dir }, + }, WorkerResponse::JobTimedOut => Outcome::HardTimeout, WorkerResponse::JobDied { err, job_pid: _ } => Outcome::JobDied { err }, WorkerResponse::JobError(err) => Outcome::JobError { err }, diff --git a/polkadot/node/core/pvf/src/host.rs b/polkadot/node/core/pvf/src/host.rs index ae9fdc7d2dea..8ec46f4b08f1 100644 --- a/polkadot/node/core/pvf/src/host.rs +++ b/polkadot/node/core/pvf/src/host.rs @@ -274,7 +274,7 @@ pub async fn start( from_prepare_pool, ); - let (to_execute_queue_tx, run_execute_queue) = execute::start( + let (to_execute_queue_tx, from_execute_queue_rx, run_execute_queue) = execute::start( metrics, config.execute_worker_program_path.to_owned(), config.cache_path.clone(), @@ -296,6 +296,7 @@ pub async fn start( to_prepare_queue_tx, from_prepare_queue_rx, to_execute_queue_tx, + from_execute_queue_rx, to_sweeper_tx, awaiting_prepare: AwaitingPrepare::default(), }) @@ -342,6 +343,8 @@ struct Inner { from_prepare_queue_rx: mpsc::UnboundedReceiver, to_execute_queue_tx: mpsc::Sender, + from_execute_queue_rx: mpsc::UnboundedReceiver, + to_sweeper_tx: mpsc::Sender, awaiting_prepare: AwaitingPrepare, @@ -358,6 +361,7 @@ async fn run( to_host_rx, from_prepare_queue_rx, mut to_prepare_queue_tx, + from_execute_queue_rx, mut to_execute_queue_tx, mut to_sweeper_tx, mut awaiting_prepare, @@ -384,10 +388,21 @@ async fn run( let mut to_host_rx = to_host_rx.fuse(); let mut from_prepare_queue_rx = from_prepare_queue_rx.fuse(); + let mut from_execute_queue_rx = from_execute_queue_rx.fuse(); loop { // biased to make it behave deterministically for tests. futures::select_biased! { + from_execute_queue_rx = from_execute_queue_rx.next() => { + let from_queue = break_if_fatal!(from_execute_queue_rx.ok_or(Fatal)); + let execute::FromQueue::RemoveArtifact { artifact, reply_to } = from_queue; + break_if_fatal!(handle_artifact_removal( + &mut to_sweeper_tx, + &mut artifacts, + artifact, + reply_to, + ).await); + }, () = cleanup_pulse.select_next_some() => { // `select_next_some` because we don't expect this to fail, but if it does, we // still don't fail. The trade-off is that the compiled cache will start growing @@ -861,6 +876,37 @@ async fn handle_cleanup_pulse( Ok(()) } +async fn handle_artifact_removal( + sweeper_tx: &mut mpsc::Sender, + artifacts: &mut Artifacts, + artifact_id: ArtifactId, + reply_to: oneshot::Sender<()>, +) -> Result<(), Fatal> { + let (artifact_id, path) = if let Some(artifact) = artifacts.remove(artifact_id) { + artifact + } else { + // if we haven't found the artifact by its id, + // it has been probably removed + // anyway with the randomness of the artifact name + // it is safe to ignore + return Ok(()); + }; + reply_to + .send(()) + .expect("the execute queue waits for the artifact remove confirmation; qed"); + // Thanks to the randomness of the artifact name (see + // `artifacts::generate_artifact_path`) there is no issue with any name conflict on + // future repreparation. + // So we can confirm the artifact removal already + gum::debug!( + target: LOG_TARGET, + validation_code_hash = ?artifact_id.code_hash, + "PVF pruning: pruning artifact by request from the execute queue", + ); + sweeper_tx.send(path).await.map_err(|_| Fatal)?; + Ok(()) +} + /// A simple task which sole purpose is to delete files thrown at it. async fn sweeper_task(mut sweeper_rx: mpsc::Receiver) { loop { @@ -968,6 +1014,8 @@ pub(crate) mod tests { to_prepare_queue_rx: mpsc::Receiver, from_prepare_queue_tx: mpsc::UnboundedSender, to_execute_queue_rx: mpsc::Receiver, + #[allow(unused)] + from_execute_queue_tx: mpsc::UnboundedSender, to_sweeper_rx: mpsc::Receiver, run: BoxFuture<'static, ()>, @@ -979,6 +1027,7 @@ pub(crate) mod tests { let (to_prepare_queue_tx, to_prepare_queue_rx) = mpsc::channel(10); let (from_prepare_queue_tx, from_prepare_queue_rx) = mpsc::unbounded(); let (to_execute_queue_tx, to_execute_queue_rx) = mpsc::channel(10); + let (from_execute_queue_tx, from_execute_queue_rx) = mpsc::unbounded(); let (to_sweeper_tx, to_sweeper_rx) = mpsc::channel(10); let run = run(Inner { @@ -989,6 +1038,7 @@ pub(crate) mod tests { to_prepare_queue_tx, from_prepare_queue_rx, to_execute_queue_tx, + from_execute_queue_rx, to_sweeper_tx, awaiting_prepare: AwaitingPrepare::default(), }) @@ -999,6 +1049,7 @@ pub(crate) mod tests { to_prepare_queue_rx, from_prepare_queue_tx, to_execute_queue_rx, + from_execute_queue_tx, to_sweeper_rx, run, } diff --git a/polkadot/node/core/pvf/tests/it/main.rs b/polkadot/node/core/pvf/tests/it/main.rs index bcc10749e746..cdfbcd8e5785 100644 --- a/polkadot/node/core/pvf/tests/it/main.rs +++ b/polkadot/node/core/pvf/tests/it/main.rs @@ -21,13 +21,14 @@ use parity_scale_codec::Encode as _; #[cfg(all(feature = "ci-only-tests", target_os = "linux"))] use polkadot_node_core_pvf::SecurityStatus; use polkadot_node_core_pvf::{ - start, testing::build_workers_and_get_paths, Config, InvalidCandidate, Metrics, PrepareError, - PrepareJobKind, PvfPrepData, ValidationError, ValidationHost, JOB_TIMEOUT_WALL_CLOCK_FACTOR, + start, testing::build_workers_and_get_paths, Config, InvalidCandidate, Metrics, + PossiblyInvalidError, PrepareError, PrepareJobKind, PvfPrepData, ValidationError, + ValidationHost, JOB_TIMEOUT_WALL_CLOCK_FACTOR, }; use polkadot_parachain_primitives::primitives::{BlockData, ValidationParams, ValidationResult}; use polkadot_primitives::{ExecutorParam, ExecutorParams}; -use std::time::Duration; +use std::{io::Write, time::Duration}; use tokio::sync::Mutex; mod adder; @@ -352,10 +353,80 @@ async fn deleting_prepared_artifact_does_not_dispute() { ) .await; - match result { - Err(ValidationError::Invalid(InvalidCandidate::HardTimeout)) => {}, - r => panic!("{:?}", r), + assert_matches!(result, Err(ValidationError::Invalid(InvalidCandidate::HardTimeout))); +} + +// Test that corruption of a prepared artifact does not lead to a dispute when we try to execute it. +#[tokio::test] +async fn corrupted_prepared_artifact_does_not_dispute() { + let host = TestHost::new().await; + let cache_dir = host.cache_dir.path(); + + let _stats = host.precheck_pvf(halt::wasm_binary_unwrap(), Default::default()).await.unwrap(); + + // Manually corrupting the prepared artifact from disk. The in-memory artifacts table won't + // change. + let artifact_path = { + // Get the artifact path (asserting it exists). + let mut cache_dir: Vec<_> = std::fs::read_dir(cache_dir).unwrap().collect(); + // Should contain the artifact and the worker dir. + assert_eq!(cache_dir.len(), 2); + let mut artifact_path = cache_dir.pop().unwrap().unwrap(); + if artifact_path.path().is_dir() { + artifact_path = cache_dir.pop().unwrap().unwrap(); + } + + // Corrupt the artifact. + let mut f = std::fs::OpenOptions::new() + .write(true) + .truncate(true) + .open(artifact_path.path()) + .unwrap(); + f.write_all(b"corrupted wasm").unwrap(); + f.flush().unwrap(); + artifact_path + }; + + assert!(artifact_path.path().exists()); + + // Try to validate, artifact should get removed because of the corruption. + let result = host + .validate_candidate( + halt::wasm_binary_unwrap(), + ValidationParams { + block_data: BlockData(Vec::new()), + parent_head: Default::default(), + relay_parent_number: 1, + relay_parent_storage_root: Default::default(), + }, + Default::default(), + ) + .await; + + assert_matches!( + result, + Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::RuntimeConstruction(_))) + ); + + // because of RuntimeConstruction we may retry + host.precheck_pvf(halt::wasm_binary_unwrap(), Default::default()).await.unwrap(); + + // The actual artifact removal is done concurrently + // with sending of the result of the execution + // it is not a problem for further re-preparation as + // artifact filenames are random + for _ in 1..5 { + if !artifact_path.path().exists() { + break; + } + tokio::time::sleep(Duration::from_secs(1)).await; } + + assert!( + !artifact_path.path().exists(), + "the corrupted artifact ({}) should be deleted by the host", + artifact_path.path().display() + ); } #[tokio::test] diff --git a/polkadot/primitives/src/v6/executor_params.rs b/polkadot/primitives/src/v6/executor_params.rs index 112a529f62b0..1e19f3b23fec 100644 --- a/polkadot/primitives/src/v6/executor_params.rs +++ b/polkadot/primitives/src/v6/executor_params.rs @@ -159,7 +159,9 @@ impl sp_std::fmt::LowerHex for ExecutorParamsHash { // into individual fields of the structure. Thus, complex migrations shall be avoided when adding // new entries and removing old ones. At the moment, there's no mandatory parameters defined. If // they show up, they must be clearly documented as mandatory ones. -#[derive(Clone, Debug, Encode, Decode, PartialEq, Eq, TypeInfo, Serialize, Deserialize)] +#[derive( + Clone, Debug, Default, Encode, Decode, PartialEq, Eq, TypeInfo, Serialize, Deserialize, +)] pub struct ExecutorParams(Vec); impl ExecutorParams { @@ -334,9 +336,3 @@ impl From<&[ExecutorParam]> for ExecutorParams { ExecutorParams(arr.to_vec()) } } - -impl Default for ExecutorParams { - fn default() -> Self { - ExecutorParams(vec![]) - } -} diff --git a/polkadot/roadmap/implementers-guide/src/node/utility/pvf-host-and-workers.md b/polkadot/roadmap/implementers-guide/src/node/utility/pvf-host-and-workers.md index e0984bd58d1d..39a317a07c84 100644 --- a/polkadot/roadmap/implementers-guide/src/node/utility/pvf-host-and-workers.md +++ b/polkadot/roadmap/implementers-guide/src/node/utility/pvf-host-and-workers.md @@ -125,6 +125,14 @@ execution request: reason, which may or may not be independent of the candidate or PVF. 5. **Internal errors:** See "Internal Errors" section. In this case, after the retry we abstain from voting. +6. **RuntimeConstruction** error. The precheck handles a general case of a wrong + artifact but doesn't guarantee its consistency between the preparation and + the execution. If something happened with the artifact between + the preparation of the artifact and its execution (e.g. the artifact was + corrupted on disk or a dirty node upgrade happened when the prepare worker + has a wasmtime version different from the execute worker's wasmtime version). + We treat such an error as possibly transient due to local issues and retry + one time. ### Preparation timeouts diff --git a/prdoc/pr_3187.prdoc b/prdoc/pr_3187.prdoc new file mode 100644 index 000000000000..bda41142d86c --- /dev/null +++ b/prdoc/pr_3187.prdoc @@ -0,0 +1,16 @@ +# Schema: Polkadot SDK PRDoc Schema (prdoc) v1.0.0 +# See doc at https://raw.githubusercontent.com/paritytech/polkadot-sdk/master/prdoc/schema_user.json + +title: Retrying an execution on failed runtime construction + +doc: + - audience: Node Dev + description: | + If a runtime construction error happened during the execution request, then the artifact is re-prepared + and the execution request is retried at most once. See also the related issue. + +crates: + - name: polkadot-node-core-candidate-validation + - name: polkadot-node-core-pvf + - name: polkadot-node-core-pvf-execute-worker + - name: polkadot-node-core-pvf-common