paritytech · mrcnski · Dec 20, 2022 · Dec 2, 2022 · Dec 3, 2022 · Dec 5, 2022
diff --git a/node/core/candidate-validation/src/lib.rs b/node/core/candidate-validation/src/lib.rs
@@ -320,12 +320,12 @@ where
 
 	match validation_backend.precheck_pvf(validation_code).await {
 		Ok(_) => PreCheckOutcome::Valid,
-		Err(prepare_err) => match prepare_err {
-			PrepareError::Prevalidation(_) |
-			PrepareError::Preparation(_) |
-			PrepareError::Panic(_) => PreCheckOutcome::Invalid,
-			PrepareError::TimedOut | PrepareError::DidNotMakeIt => PreCheckOutcome::Failed,
-		},
+		Err(prepare_err) =>
+			if prepare_err.is_deterministic() {
+				PreCheckOutcome::Invalid
+			} else {
+				PreCheckOutcome::Failed
+			},
 	}
 }
 
@@ -666,11 +666,14 @@ impl ValidationBackend for ValidationHost {
 
 	async fn precheck_pvf(&mut self, pvf: Pvf) -> Result<Duration, PrepareError> {
 		let (tx, rx) = oneshot::channel();
+		// TODO: Why are we always returning a non-deterministic error? This causes the pre-check
+		// outcome to always be `Failed`, never `Invalid`, thus never triggering disputes according
+		// to comments in pvf/src/error.rs. Can we change this to return the actual error?
 		if let Err(_) = self.precheck_pvf(pvf, tx).await {
-			return Err(PrepareError::DidNotMakeIt)
+			return Err(PrepareError::IoErr)
 		}
 
-		let precheck_result = rx.await.or(Err(PrepareError::DidNotMakeIt))?;
+		let precheck_result = rx.await.or(Err(PrepareError::IoErr))?;
 
 		precheck_result
 	}

diff --git a/node/core/candidate-validation/src/tests.rs b/node/core/candidate-validation/src/tests.rs
@@ -1053,5 +1053,5 @@ fn precheck_properly_classifies_outcomes() {
 	inner(Err(PrepareError::Panic("baz".to_owned())), PreCheckOutcome::Invalid);
 
 	inner(Err(PrepareError::TimedOut), PreCheckOutcome::Failed);
-	inner(Err(PrepareError::DidNotMakeIt), PreCheckOutcome::Failed);
+	inner(Err(PrepareError::IoErr), PreCheckOutcome::Failed);
 }
diff --git a/node/core/pvf/src/error.rs b/node/core/pvf/src/error.rs
@@ -15,7 +15,7 @@
 // along with Polkadot.  If not, see <http://www.gnu.org/licenses/>.
 
 use parity_scale_codec::{Decode, Encode};
-use std::{any::Any, time::Duration};
+use std::{any::Any, fmt, time::Duration};
 
 /// Result of PVF preparation performed by the validation host. Contains the elapsed CPU time if
 /// successful
@@ -32,9 +32,46 @@ pub enum PrepareError {
 	Panic(String),
 	/// Failed to prepare the PVF due to the time limit.
 	TimedOut,
-	/// This state indicates that the process assigned to prepare the artifact wasn't responsible
-	/// or were killed. This state is reported by the validation host (not by the worker).
-	DidNotMakeIt,
+	/// An IO error occurred while receiving the result from the worker process. This state is reported by the
+	/// validation host (not by the worker).
+	IoErr,
+	/// The temporary file for the artifact could not be created at the given cache path. This state is reported by the
+	/// validation host (not by the worker).
+	CreateTmpFileErr(String),
+	/// The response from the worker is received, but the file cannot be renamed (moved) to the final destination
+	/// location. This state is reported by the validation host (not by the worker).
+	RenameTmpFileErr(String),
+}
+
+impl PrepareError {
+	/// Returns whether this is a deterministic error, i.e. one that should trigger reliably. Those
+	/// errors depend on the PVF itself and the sc-executor/wasmtime logic.
+	///
+	/// Non-deterministic errors can happen spuriously. Typically, they occur due to resource
+	/// starvation, e.g. under heavy load or memory pressure. Those errors are typically transient
+	/// but may persist e.g. if the node is run by overwhelmingly underpowered machine.
+	pub fn is_deterministic(&self) -> bool {
+		use PrepareError::*;
+		match self {
+			Prevalidation(_) | Preparation(_) | Panic(_) => true,
+			TimedOut | IoErr | CreateTmpFileErr(_) | RenameTmpFileErr(_) => false,
+		}
+	}
+}
+
+impl fmt::Display for PrepareError {
+	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+		use PrepareError::*;
+		match self {
+			Prevalidation(err) => write!(f, "prevalidation: {}", err),
+			Preparation(err) => write!(f, "preparation: {}", err),
+			Panic(err) => write!(f, "panic: {}", err),
+			TimedOut => write!(f, "prepare: timeout"),
+			IoErr => write!(f, "prepare: io error while receiving response"),
+			CreateTmpFileErr(err) => write!(f, "prepare: error creating tmp file: {}", err),
+			RenameTmpFileErr(err) => write!(f, "prepare: error renaming tmp file: {}", err),
+		}
+	}
 }
 
 /// A error raised during validation of the candidate.
@@ -81,32 +118,18 @@ pub enum InvalidCandidate {
 impl From<PrepareError> for ValidationError {
 	fn from(error: PrepareError) -> Self {
 		// Here we need to classify the errors into two errors: deterministic and non-deterministic.
+		// See [`PrepareError::is_deterministic`].
 		//
-		// Non-deterministic errors can happen spuriously. Typically, they occur due to resource
-		// starvation, e.g. under heavy load or memory pressure. Those errors are typically transient
-		// but may persist e.g. if the node is run by overwhelmingly underpowered machine.
-		//
-		// Deterministic errors should trigger reliably. Those errors depend on the PVF itself and
-		// the sc-executor/wasmtime logic.
-		//
-		// For now, at least until the PVF pre-checking lands, the deterministic errors will be
-		// treated as `InvalidCandidate`. Should those occur they could potentially trigger disputes.
+		// We treat the deterministic errors as `InvalidCandidate`. Should those occur they could
+		// potentially trigger disputes.
 		//
+		// TODO: Is this up-to-date?
 		// All non-deterministic errors are qualified as `InternalError`s and will not trigger
 		// disputes.
-		match error {
-			PrepareError::Prevalidation(err) => ValidationError::InvalidCandidate(
-				InvalidCandidate::PrepareError(format!("prevalidation: {}", err)),
-			),
-			PrepareError::Preparation(err) => ValidationError::InvalidCandidate(
-				InvalidCandidate::PrepareError(format!("preparation: {}", err)),
-			),
-			PrepareError::Panic(err) => ValidationError::InvalidCandidate(
-				InvalidCandidate::PrepareError(format!("panic: {}", err)),
-			),
-			PrepareError::TimedOut => ValidationError::InternalError("prepare: timeout".to_owned()),
-			PrepareError::DidNotMakeIt =>
-				ValidationError::InternalError("prepare: did not make it".to_owned()),
+		if error.is_deterministic() {
+			ValidationError::InvalidCandidate(InvalidCandidate::PrepareError(error.to_string()))
+		} else {
+			ValidationError::InternalError(error.to_string())
 		}
 	}
 }

diff --git a/node/core/pvf/src/host.rs b/node/core/pvf/src/host.rs
@@ -776,16 +776,15 @@ fn can_retry_prepare_after_failure(
 	num_failures: u32,
 	error: &PrepareError,
 ) -> bool {
-	use PrepareError::*;
-	match error {
-		// Gracefully returned an error, so it will probably be reproducible. Don't retry.
-		Prevalidation(_) | Preparation(_) => false,
-		// Retry if the retry cooldown has elapsed and if we have already retried less than
-		// `NUM_PREPARE_RETRIES` times. IO errors may resolve themselves.
-		Panic(_) | TimedOut | DidNotMakeIt =>
-			SystemTime::now() >= last_time_failed + PREPARE_FAILURE_COOLDOWN &&
-				num_failures <= NUM_PREPARE_RETRIES,
+	if error.is_deterministic() {
+		// This error is considered deterministic, so it will probably be reproducible. Don't retry.
+		return false
 	}
+
+	// Retry if the retry cooldown has elapsed and if we have already retried less than `NUM_PREPARE_RETRIES` times. IO
+	// errors may resolve themselves.
+	SystemTime::now() >= last_time_failed + PREPARE_FAILURE_COOLDOWN &&
+		num_failures <= NUM_PREPARE_RETRIES
 }
 
 /// A stream that yields a pulse continuously at a given interval.

diff --git a/node/core/pvf/src/prepare/pool.rs b/node/core/pvf/src/prepare/pool.rs
@@ -232,7 +232,7 @@ fn handle_to_pool(
 					// items concluded;
 					// thus idle token is Some;
 					// qed.
-					never!("unexpected abscence of the idle token in prepare pool");
+					never!("unexpected absence of the idle token in prepare pool");
 				}
 			} else {
 				// That's a relatively normal situation since the queue may send `start_work` and
@@ -295,40 +295,40 @@ fn handle_mux(
 		},
 		PoolEvent::StartWork(worker, outcome) => {
 			match outcome {
-				Outcome::Concluded { worker: idle, result } => {
-					let data = match spawned.get_mut(worker) {
-						None => {
-							// Perhaps the worker was killed meanwhile and the result is no longer
-							// relevant.
-							return Ok(())
-						},
-						Some(data) => data,
-					};
-
-					// We just replace the idle worker that was loaned from this option during
-					// the work starting.
-					let old = data.idle.replace(idle);
-					assert_matches!(old, None, "attempt to overwrite an idle worker");
-
-					reply(from_pool, FromPool::Concluded { worker, rip: false, result })?;
-
-					Ok(())
-				},
+				Outcome::Concluded { worker: idle, result } =>
+					handle_concluded_no_rip(from_pool, spawned, worker, idle, result),
+				// Return `Concluded`, but do not kill the worker since the error was on the host side.
+				Outcome::CreateTmpFileErr { worker: idle, err } => handle_concluded_no_rip(
+					from_pool,
+					spawned,
+					worker,
+					idle,
+					Err(PrepareError::CreateTmpFileErr(err)),
+				),
+				// Return `Concluded`, but do not kill the worker since the error was on the host side.
+				Outcome::RenameTmpFileErr { worker: idle, result: _, err } =>
+					handle_concluded_no_rip(
+						from_pool,
+						spawned,
+						worker,
+						idle,
+						Err(PrepareError::RenameTmpFileErr(err)),
+					),
 				Outcome::Unreachable => {
 					if attempt_retire(metrics, spawned, worker) {
 						reply(from_pool, FromPool::Rip(worker))?;
 					}
 
 					Ok(())
 				},
-				Outcome::DidNotMakeIt => {
+				Outcome::IoErr => {
 					if attempt_retire(metrics, spawned, worker) {
 						reply(
 							from_pool,
 							FromPool::Concluded {
 								worker,
 								rip: true,
-								result: Err(PrepareError::DidNotMakeIt),
+								result: Err(PrepareError::IoErr),
 							},
 						)?;
 					}
@@ -377,6 +377,37 @@ fn attempt_retire(
 	}
 }
 
+/// Handles the case where we received a response. There potentially was an error, but not the fault
+/// of the worker as far as we know, so the worker should not be killed.
+///
+/// This function tries to put the idle worker back into the pool and then replies with
+/// `FromPool::Concluded` with `rip: false`.
+fn handle_concluded_no_rip(
+	from_pool: &mut mpsc::UnboundedSender<FromPool>,
+	spawned: &mut HopSlotMap<Worker, WorkerData>,
+	worker: Worker,
+	idle: IdleWorker,
+	result: PrepareResult,
+) -> Result<(), Fatal> {
+	let data = match spawned.get_mut(worker) {
+		None => {
+			// Perhaps the worker was killed meanwhile and the result is no longer
+			// relevant.
+			return Ok(())
+		},
+		Some(data) => data,
+	};
+
+	// We just replace the idle worker that was loaned from this option during
+	// the work starting.
+	let old = data.idle.replace(idle);
+	assert_matches!(old, None, "attempt to overwrite an idle worker");
+
+	reply(from_pool, FromPool::Concluded { worker, rip: false, result })?;
+
+	Ok(())
+}
+
 /// Spins up the pool and returns the future that should be polled to make the pool functional.
 pub fn start(
 	metrics: Metrics,