Skip to content

Commit

Permalink
Refactor ValidationError (#2406)
Browse files Browse the repository at this point in the history
  • Loading branch information
eagr authored Nov 21, 2023
1 parent 2fd8c51 commit 0619049
Show file tree
Hide file tree
Showing 10 changed files with 122 additions and 110 deletions.
60 changes: 35 additions & 25 deletions polkadot/node/core/candidate-validation/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@
#![warn(missing_docs)]

use polkadot_node_core_pvf::{
InternalValidationError, InvalidCandidate as WasmInvalidCandidate, PrepareError,
PrepareJobKind, PvfPrepData, ValidationError, ValidationHost,
InternalValidationError, InvalidCandidate as WasmInvalidCandidate, PossiblyInvalidError,
PrepareError, PrepareJobKind, PvfPrepData, ValidationError, ValidationHost,
};
use polkadot_node_primitives::{
BlockData, InvalidCandidate, PoV, ValidationResult, POV_BOMB_LIMIT, VALIDATION_CODE_BOMB_LIMIT,
Expand Down Expand Up @@ -642,7 +642,7 @@ async fn validate_candidate_exhaustive(
}

match result {
Err(ValidationError::InternalError(e)) => {
Err(ValidationError::Internal(e)) => {
gum::warn!(
target: LOG_TARGET,
?para_id,
Expand All @@ -651,34 +651,29 @@ async fn validate_candidate_exhaustive(
);
Err(ValidationFailed(e.to_string()))
},
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::HardTimeout)) =>
Err(ValidationError::Invalid(WasmInvalidCandidate::HardTimeout)) =>
Ok(ValidationResult::Invalid(InvalidCandidate::Timeout)),
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::WorkerReportedInvalid(e))) =>
Err(ValidationError::Invalid(WasmInvalidCandidate::WorkerReportedInvalid(e))) =>
Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError(e))),
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::AmbiguousWorkerDeath)) =>
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::AmbiguousWorkerDeath)) =>
Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError(
"ambiguous worker death".to_string(),
))),
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::JobError(err))) =>
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::JobError(err))) =>
Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError(err))),

Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::AmbiguousJobDeath(err))) =>
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::AmbiguousJobDeath(err))) =>
Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError(format!(
"ambiguous job death: {err}"
)))),
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::PrepareError(e))) => {
// In principle if preparation of the `WASM` fails, the current candidate can not be the
// reason for that. So we can't say whether it is invalid or not. In addition, with
// pre-checking enabled only valid runtimes should ever get enacted, so we can be
// reasonably sure that this is some local problem on the current node. However, as this
// particular error *seems* to indicate a deterministic error, we raise a warning.
Err(ValidationError::Preparation(e)) => {
gum::warn!(
target: LOG_TARGET,
?para_id,
?e,
"Deterministic error occurred during preparation (should have been ruled out by pre-checking phase)",
);
Err(ValidationFailed(e))
Err(ValidationFailed(e.to_string()))
},
Ok(res) =>
if res.head_data.hash() != candidate_receipt.descriptor.para_head {
Expand Down Expand Up @@ -762,16 +757,31 @@ trait ValidationBackend {
}

match validation_result {
Err(ValidationError::InvalidCandidate(
WasmInvalidCandidate::AmbiguousWorkerDeath |
WasmInvalidCandidate::AmbiguousJobDeath(_),
)) if num_death_retries_left > 0 => num_death_retries_left -= 1,
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::JobError(_)))
if num_job_error_retries_left > 0 =>
num_job_error_retries_left -= 1,
Err(ValidationError::InternalError(_)) if num_internal_retries_left > 0 =>
num_internal_retries_left -= 1,
_ => break,
Err(ValidationError::PossiblyInvalid(
PossiblyInvalidError::AmbiguousWorkerDeath |
PossiblyInvalidError::AmbiguousJobDeath(_),
)) =>
if num_death_retries_left > 0 {
num_death_retries_left -= 1;
} else {
break;
},

Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::JobError(_))) =>
if num_job_error_retries_left > 0 {
num_job_error_retries_left -= 1;
} else {
break;
},

Err(ValidationError::Internal(_)) =>
if num_internal_retries_left > 0 {
num_internal_retries_left -= 1;
} else {
break;
},

Ok(_) | Err(ValidationError::Invalid(_) | ValidationError::Preparation(_)) => break,
}

// If we got a possibly transient error, retry once after a brief delay, on the
Expand Down
40 changes: 20 additions & 20 deletions polkadot/node/core/candidate-validation/src/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -480,9 +480,9 @@ fn candidate_validation_bad_return_is_invalid() {
let candidate_receipt = CandidateReceipt { descriptor, commitments_hash: Hash::zero() };

let v = executor::block_on(validate_candidate_exhaustive(
MockValidateCandidateBackend::with_hardcoded_result(Err(
ValidationError::InvalidCandidate(WasmInvalidCandidate::HardTimeout),
)),
MockValidateCandidateBackend::with_hardcoded_result(Err(ValidationError::Invalid(
WasmInvalidCandidate::HardTimeout,
))),
validation_data,
validation_code,
candidate_receipt,
Expand Down Expand Up @@ -561,7 +561,7 @@ fn candidate_validation_one_ambiguous_error_is_valid() {

let v = executor::block_on(validate_candidate_exhaustive(
MockValidateCandidateBackend::with_hardcoded_result_list(vec![
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::AmbiguousWorkerDeath)),
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::AmbiguousWorkerDeath)),
Ok(validation_result),
]),
validation_data.clone(),
Expand Down Expand Up @@ -602,8 +602,8 @@ fn candidate_validation_multiple_ambiguous_errors_is_invalid() {

let v = executor::block_on(validate_candidate_exhaustive(
MockValidateCandidateBackend::with_hardcoded_result_list(vec![
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::AmbiguousWorkerDeath)),
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::AmbiguousWorkerDeath)),
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::AmbiguousWorkerDeath)),
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::AmbiguousWorkerDeath)),
]),
validation_data,
validation_code,
Expand All @@ -626,7 +626,7 @@ fn candidate_validation_retry_internal_errors() {
vec![
Err(InternalValidationError::HostCommunication("foo".into()).into()),
// Throw an AJD error, we should still retry again.
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::AmbiguousJobDeath(
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::AmbiguousJobDeath(
"baz".into(),
))),
// Throw another internal error.
Expand All @@ -644,7 +644,7 @@ fn candidate_validation_dont_retry_internal_errors() {
vec![
Err(InternalValidationError::HostCommunication("foo".into()).into()),
// Throw an AWD error, we should still retry again.
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::AmbiguousWorkerDeath)),
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::AmbiguousWorkerDeath)),
// Throw another internal error.
Err(InternalValidationError::HostCommunication("bar".into()).into()),
],
Expand All @@ -659,11 +659,11 @@ fn candidate_validation_retry_panic_errors() {
let v = candidate_validation_retry_on_error_helper(
PvfExecKind::Approval,
vec![
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::JobError("foo".into()))),
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::JobError("foo".into()))),
// Throw an AWD error, we should still retry again.
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::AmbiguousWorkerDeath)),
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::AmbiguousWorkerDeath)),
// Throw another panic error.
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::JobError("bar".into()))),
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::JobError("bar".into()))),
],
);

Expand All @@ -676,11 +676,11 @@ fn candidate_validation_dont_retry_panic_errors() {
let v = candidate_validation_retry_on_error_helper(
PvfExecKind::Backing,
vec![
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::JobError("foo".into()))),
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::JobError("foo".into()))),
// Throw an AWD error, we should still retry again.
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::AmbiguousWorkerDeath)),
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::AmbiguousWorkerDeath)),
// Throw another panic error.
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::JobError("bar".into()))),
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::JobError("bar".into()))),
],
);

Expand Down Expand Up @@ -758,9 +758,9 @@ fn candidate_validation_timeout_is_internal_error() {
let candidate_receipt = CandidateReceipt { descriptor, commitments_hash: Hash::zero() };

let v = executor::block_on(validate_candidate_exhaustive(
MockValidateCandidateBackend::with_hardcoded_result(Err(
ValidationError::InvalidCandidate(WasmInvalidCandidate::HardTimeout),
)),
MockValidateCandidateBackend::with_hardcoded_result(Err(ValidationError::Invalid(
WasmInvalidCandidate::HardTimeout,
))),
validation_data,
validation_code,
candidate_receipt,
Expand Down Expand Up @@ -852,9 +852,9 @@ fn candidate_validation_code_mismatch_is_invalid() {
let (_ctx, _ctx_handle) = test_helpers::make_subsystem_context::<AllMessages, _>(pool.clone());

let v = executor::block_on(validate_candidate_exhaustive(
MockValidateCandidateBackend::with_hardcoded_result(Err(
ValidationError::InvalidCandidate(WasmInvalidCandidate::HardTimeout),
)),
MockValidateCandidateBackend::with_hardcoded_result(Err(ValidationError::Invalid(
WasmInvalidCandidate::HardTimeout,
))),
validation_data,
validation_code,
candidate_receipt,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,7 @@
use criterion::{criterion_group, criterion_main, BatchSize, Criterion, SamplingMode};
use polkadot_node_core_pvf::{
start, testing, Config, Metrics, PrepareError, PrepareJobKind, PrepareStats, PvfPrepData,
ValidationHost,
start, testing, Config, Metrics, PrepareError, PrepareJobKind, PvfPrepData, ValidationHost,
};
use polkadot_primitives::ExecutorParams;
use rococo_runtime::WASM_BINARY;
Expand Down
19 changes: 8 additions & 11 deletions polkadot/node/core/pvf/common/src/executor_intf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -140,22 +140,20 @@ pub unsafe fn create_runtime_from_artifact_bytes(
executor_params: &ExecutorParams,
) -> Result<WasmtimeRuntime, WasmError> {
let mut config = DEFAULT_CONFIG.clone();
config.semantics =
params_to_wasmtime_semantics(executor_params).map_err(|err| WasmError::Other(err))?;
config.semantics = params_to_wasmtime_semantics(executor_params);

sc_executor_wasmtime::create_runtime_from_artifact_bytes::<HostFunctions>(
compiled_artifact_blob,
config,
)
}

pub fn params_to_wasmtime_semantics(par: &ExecutorParams) -> Result<Semantics, String> {
pub fn params_to_wasmtime_semantics(par: &ExecutorParams) -> Semantics {
let mut sem = DEFAULT_CONFIG.semantics.clone();
let mut stack_limit = if let Some(stack_limit) = sem.deterministic_stack_limit.clone() {
stack_limit
} else {
return Err("No default stack limit set".to_owned())
};
let mut stack_limit = sem
.deterministic_stack_limit
.expect("There is a comment to not change the default stack limit; it should always be available; qed")
.clone();

for p in par.iter() {
match p {
Expand All @@ -172,7 +170,7 @@ pub fn params_to_wasmtime_semantics(par: &ExecutorParams) -> Result<Semantics, S
}
}
sem.deterministic_stack_limit = Some(stack_limit);
Ok(sem)
sem
}

/// Runs the prevalidation on the given code. Returns a [`RuntimeBlob`] if it succeeds.
Expand All @@ -191,8 +189,7 @@ pub fn prepare(
blob: RuntimeBlob,
executor_params: &ExecutorParams,
) -> Result<Vec<u8>, sc_executor_common::error::WasmError> {
let semantics = params_to_wasmtime_semantics(executor_params)
.map_err(|e| sc_executor_common::error::WasmError::Other(e))?;
let semantics = params_to_wasmtime_semantics(executor_params);
sc_executor_wasmtime::prepare_runtime_artifact(blob, &semantics)
}

Expand Down
37 changes: 25 additions & 12 deletions polkadot/node/core/pvf/src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,31 +19,44 @@ use polkadot_node_core_pvf_common::error::{InternalValidationError, PrepareError
/// A error raised during validation of the candidate.
#[derive(Debug, Clone)]
pub enum ValidationError {
/// The error was raised because the candidate is invalid.
/// Deterministic preparation issue. In practice, most of the problems should be caught by
/// prechecking, so this may be a sign of internal conditions.
///
/// Whenever we are unsure if the error was due to the candidate or not, we must vote invalid.
InvalidCandidate(InvalidCandidate),
/// Some internal error occurred.
InternalError(InternalValidationError),
/// In principle if preparation of the `WASM` fails, the current candidate cannot be the
/// reason for that. So we can't say whether it is invalid or not. In addition, with
/// pre-checking enabled only valid runtimes should ever get enacted, so we can be
/// reasonably sure that this is some local problem on the current node. However, as this
/// particular error *seems* to indicate a deterministic error, we raise a warning.
Preparation(PrepareError),
/// The error was raised because the candidate is invalid. Should vote against.
Invalid(InvalidCandidate),
/// Possibly transient issue that may resolve after retries. Should vote against when retries
/// fail.
PossiblyInvalid(PossiblyInvalidError),
/// Preparation or execution issue caused by an internal condition. Should not vote against.
Internal(InternalValidationError),
}

/// A description of an error raised during executing a PVF and can be attributed to the combination
/// of the candidate [`polkadot_parachain_primitives::primitives::ValidationParams`] and the PVF.
#[derive(Debug, Clone)]
pub enum InvalidCandidate {
/// PVF preparation ended up with a deterministic error.
PrepareError(String),
/// The candidate is reported to be invalid by the execution worker. The string contains the
/// error message.
WorkerReportedInvalid(String),
/// PVF execution (compilation is not included) took more time than was allotted.
HardTimeout,
}

/// Possibly transient issue that may resolve after retries.
#[derive(Debug, Clone)]
pub enum PossiblyInvalidError {
/// The worker process (not the job) has died during validation of a candidate.
///
/// It's unlikely that this is caused by malicious code since workers spawn separate job
/// processes, and those job processes are sandboxed. But, it is possible. We retry in this
/// case, and if the error persists, we assume it's caused by the candidate and vote against.
AmbiguousWorkerDeath,
/// PVF execution (compilation is not included) took more time than was allotted.
HardTimeout,
/// The job process (not the worker) has died for one of the following reasons:
///
/// (a) A seccomp violation occurred, most likely due to an attempt by malicious code to
Expand All @@ -68,7 +81,7 @@ pub enum InvalidCandidate {

impl From<InternalValidationError> for ValidationError {
fn from(error: InternalValidationError) -> Self {
Self::InternalError(error)
Self::Internal(error)
}
}

Expand All @@ -77,9 +90,9 @@ impl From<PrepareError> for ValidationError {
// Here we need to classify the errors into two errors: deterministic and non-deterministic.
// See [`PrepareError::is_deterministic`].
if error.is_deterministic() {
Self::InvalidCandidate(InvalidCandidate::PrepareError(error.to_string()))
Self::Preparation(error)
} else {
Self::InternalError(InternalValidationError::NonDeterministicPrepareError(error))
Self::Internal(InternalValidationError::NonDeterministicPrepareError(error))
}
}
}
14 changes: 7 additions & 7 deletions polkadot/node/core/pvf/src/execute/queue.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ use crate::{
host::ResultSender,
metrics::Metrics,
worker_intf::{IdleWorker, WorkerHandle},
InvalidCandidate, ValidationError, LOG_TARGET,
InvalidCandidate, PossiblyInvalidError, ValidationError, LOG_TARGET,
};
use futures::{
channel::mpsc,
Expand Down Expand Up @@ -342,27 +342,27 @@ fn handle_job_finish(
},
Outcome::InvalidCandidate { err, idle_worker } => (
Some(idle_worker),
Err(ValidationError::InvalidCandidate(InvalidCandidate::WorkerReportedInvalid(err))),
Err(ValidationError::Invalid(InvalidCandidate::WorkerReportedInvalid(err))),
None,
),
Outcome::InternalError { err } => (None, Err(ValidationError::InternalError(err)), None),
Outcome::InternalError { err } => (None, Err(ValidationError::Internal(err)), None),
// Either the worker or the job timed out. Kill the worker in either case. Treated as
// definitely-invalid, because if we timed out, there's no time left for a retry.
Outcome::HardTimeout =>
(None, Err(ValidationError::InvalidCandidate(InvalidCandidate::HardTimeout)), None),
(None, Err(ValidationError::Invalid(InvalidCandidate::HardTimeout)), None),
// "Maybe invalid" errors (will retry).
Outcome::WorkerIntfErr => (
None,
Err(ValidationError::InvalidCandidate(InvalidCandidate::AmbiguousWorkerDeath)),
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::AmbiguousWorkerDeath)),
None,
),
Outcome::JobDied { err } => (
None,
Err(ValidationError::InvalidCandidate(InvalidCandidate::AmbiguousJobDeath(err))),
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::AmbiguousJobDeath(err))),
None,
),
Outcome::JobError { err } =>
(None, Err(ValidationError::InvalidCandidate(InvalidCandidate::JobError(err))), None),
(None, Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::JobError(err))), None),
};

queue.metrics.execute_finished();
Expand Down
Loading

0 comments on commit 0619049

Please sign in to comment.