Skip to content

Commit

Permalink
fix(proof_data_handler): TEE blob fetching error handling
Browse files Browse the repository at this point in the history
We ran into a problem in the staging environment where TEE blob fetching
failed because of a 30-day retention policy on blobs in Google Cloud
Storage. The TEE prover was failing for all old batches
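(`l1_batch_number < 58300`). This commit fixes the issue by adding
better error handling when the blob for a given batch number isn't
available: the proof generation job for that batch is marked as
`skipped`, and the next batch is tried instead.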
  • Loading branch information
pbeza committed Aug 16, 2024
1 parent 47a082b commit f84702b
Show file tree
Hide file tree
Showing 5 changed files with 118 additions and 21 deletions.
2 changes: 1 addition & 1 deletion core/bin/zksync_tee_prover/src/tee_prover.rs
Original file line number Diff line number Diff line change
Expand Up @@ -201,8 +201,8 @@ impl Task for TeeProver {
if !err.is_retriable() || retries > self.config.max_retries {
return Err(err.into());
}
retries += 1;
tracing::warn!(%err, "Failed TEE prover step function {retries}/{}, retrying in {} milliseconds.", self.config.max_retries, backoff.as_millis());
retries += 1;
backoff = std::cmp::min(
backoff.mul_f32(self.config.retry_backoff_multiplier),
self.config.max_backoff,
Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions core/lib/dal/doc/TeeProofGenerationDal.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,7 @@ ready_to_be_proven --> picked_by_prover : get_next_batch_to_be_proven
picked_by_prover --> generated : save_proof_artifacts_metadata
generated --> [*]
[*] --> skipped : mark_proof_generation_job_as_skipped
skipped --> [*]
```
38 changes: 38 additions & 0 deletions core/lib/dal/src/tee_proof_generation_dal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,44 @@ impl TeeProofGenerationDal<'_, '_> {
Ok(())
}

/// Sets the status of the TEE proof generation job for the given batch and TEE type
/// to `skipped`.
///
/// Issues an `UPDATE` against `tee_proof_generation_details` that also refreshes
/// `updated_at` for the matching row.
///
/// # Errors
///
/// Propagates any error raised while executing the query, and returns a constraint
/// error when the update did not affect any row.
pub async fn mark_proof_generation_job_as_skipped(
    &mut self,
    batch_number: L1BatchNumber,
    tee_type: TeeType,
) -> DalResult<()> {
    let batch_id = i64::from(batch_number.0);
    let update_query = sqlx::query!(
        r#"
UPDATE tee_proof_generation_details
SET
status = 'skipped',
updated_at = NOW()
WHERE
l1_batch_number = $1
AND tee_type = $2
"#,
        batch_id,
        tee_type.to_string()
    );
    let instrumented = Instrumented::new("mark_proof_generation_job_as_skipped")
        .with_arg("l1_batch_number", &batch_id)
        .with_arg("tee_type", &tee_type);

    // The instrumentation is cloned because it is needed again below to build the
    // constraint error when nothing was updated.
    let outcome = instrumented
        .clone()
        .with(update_query)
        .execute(self.storage)
        .await?;
    if outcome.rows_affected() > 0 {
        return Ok(());
    }

    // Zero affected rows means the WHERE clause matched nothing.
    Err(instrumented.constraint_error(anyhow::anyhow!(
        "Cannot mark proof as skipped because batch number {} does not exist",
        batch_id
    )))
}

pub async fn save_attestation(&mut self, pubkey: &[u8], attestation: &[u8]) -> DalResult<()> {
let query = sqlx::query!(
r#"
Expand Down
81 changes: 61 additions & 20 deletions core/node/proof_data_handler/src/tee_request_processor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,15 @@ use std::sync::Arc;
use axum::{extract::Path, Json};
use zksync_config::configs::ProofDataHandlerConfig;
use zksync_dal::{ConnectionPool, Core, CoreDal};
use zksync_object_store::ObjectStore;
use zksync_object_store::{ObjectStore, ObjectStoreError};
use zksync_prover_interface::{
api::{
RegisterTeeAttestationRequest, RegisterTeeAttestationResponse, SubmitProofResponse,
SubmitTeeProofRequest, TeeProofGenerationDataRequest, TeeProofGenerationDataResponse,
},
inputs::TeeVerifierInput,
};
use zksync_types::L1BatchNumber;
use zksync_types::{tee_types::TeeType, L1BatchNumber};

use crate::errors::RequestProcessorError;

Expand Down Expand Up @@ -47,26 +47,67 @@ impl TeeRequestProcessor {
.await
.map_err(RequestProcessorError::Dal)?;

let l1_batch_number_result = connection
.tee_proof_generation_dal()
.get_next_batch_to_be_proven(request.tee_type, self.config.proof_generation_timeout())
.await
.map_err(RequestProcessorError::Dal)?;

let l1_batch_number = match l1_batch_number_result {
Some(number) => number,
None => return Ok(Json(TeeProofGenerationDataResponse(None))),
};

let tee_verifier_input: TeeVerifierInput = self
.blob_store
.get(l1_batch_number)
.await
.map_err(RequestProcessorError::ObjectStore)?;
loop {
let l1_batch_number = match connection
.tee_proof_generation_dal()
.get_next_batch_to_be_proven(
request.tee_type,
self.config.proof_generation_timeout(),
)
.await
.map_err(RequestProcessorError::Dal)?
{
Some(number) => number,
None => return Ok(Json(TeeProofGenerationDataResponse(None))),
};

match self
.get_blob(l1_batch_number, request.tee_type, &mut connection)
.await
{
Ok(input) => {
return Ok(Json(TeeProofGenerationDataResponse(Some(Box::new(input)))));
}
Err(ObjectStoreError::KeyNotFound(_)) => {
tracing::warn!(
"Blob for batch number {} has not been found in the object store. Marking the job as skipped.",
l1_batch_number
);
connection
.tee_proof_generation_dal()
.mark_proof_generation_job_as_skipped(l1_batch_number, request.tee_type)
.await
.map_err(RequestProcessorError::Dal)?;
continue;
}
Err(err) => return Err(RequestProcessorError::ObjectStore(err)),
}
}
}

let response = TeeProofGenerationDataResponse(Some(Box::new(tee_verifier_input)));
/// Fetches the `TeeVerifierInput` for `l1_batch_number` from the blob store, making at
/// most `max_blob_store_retries` attempts.
///
/// Another attempt is made only when the store returns `ObjectStoreError::Other` with
/// `is_retriable` set; any other error is returned to the caller right away. When every
/// attempt fails with a retriable error, a non-retriable `ObjectStoreError::Other`
/// describing the exhausted retries is returned.
///
/// NOTE(review): `tee_type` and `connection` are not used anywhere in this body.
/// NOTE(review): attempts are issued back-to-back with no delay between them — confirm
/// that this is intended.
async fn get_blob(
&self,
l1_batch_number: L1BatchNumber,
tee_type: TeeType,
connection: &mut zksync_dal::Connection<'_, Core>,
) -> Result<TeeVerifierInput, ObjectStoreError> {
// Upper bound on the number of fetch attempts made by the loop below.
let max_blob_store_retries = 3;

for _ in 0..max_blob_store_retries {
match self.blob_store.get(l1_batch_number).await {
Ok(input) => return Ok(input),
// Only errors flagged as retriable trigger another attempt.
Err(ObjectStoreError::Other { is_retriable, .. }) if is_retriable => continue,
// Everything else (including a missing key) is passed to the caller unchanged.
Err(err) => return Err(err),
}
}

// NOTE(review): the next line looks like a removed-side line left over from the
// rendered diff (it pairs with the old `let response = ...` statement) — TODO confirm.
Ok(Json(response))
// Reached only when every attempt failed with a retriable error; the final error is
// marked non-retriable.
Err(ObjectStoreError::Other {
is_retriable: false,
message: format!(
"Max retries ({}) exceeded while fetching blob for batch number {} from the object store",
max_blob_store_retries, l1_batch_number
),
})
}

pub(crate) async fn submit_proof(
Expand Down

0 comments on commit f84702b

Please sign in to comment.