From 3e309e06b24649c74bfe120e8ca45247cb2b5628 Mon Sep 17 00:00:00 2001 From: Patrick Date: Thu, 21 Nov 2024 08:57:58 +0100 Subject: [PATCH] feat(proof-data-handler): exclude batches without object file in GCS (#2980) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## What ❔ `/tee/proof_inputs` endpoint no longer returns batches whose corresponding object file has been missing from Google Cloud Storage for an extended period. ## Why ❔ TEE's `proof-data-handler` on `mainnet` was flooded with warnings. Since the recent `24.25.0` redeployment on `mainnet`, we've been [flooded with warnings][warnings] for the `proof-data-handler` on `mainnet` (the warnings are actually _not_ fatal in this context): ``` Failed request with a fatal error (...) Blobs for batch numbers 490520 to 490555 not found in the object store. Marked as unpicked. ``` The issue is caused [by the code][code] behind the `/tee/proof_inputs` [endpoint][endpoint_proof_inputs] (which is equivalent to the `/proof_generation_data` [endpoint][endpoint_proof_generation_data]) – it finds the next batch to send to the [requesting][requesting] `tee-prover` by looking for the first batch that has a corresponding object in the Google object store. As it skips over batches that don’t have the objects, [it logs][logging] `Failed request with a fatal error` for each one (unless the skipped batch was successfully proven, in which case it doesn’t log the error). This happens with every request the `tee-prover` sends, which is why we're getting so much noise in the logs. One possible solution is to flag the problematic batches as `permanently_ignored`, like [Thomas did before][Thomas] on `mainnet`. 
[warnings]: https://grafana.matterlabs.dev/goto/TjlaXQgHg?orgId=1 [code]: https://github.com/matter-labs/zksync-era/blob/3f406c7d0c0e76d798c2d838abde57ca692822c0/core/node/proof_data_handler/src/tee_request_processor.rs#L35-L79 [endpoint_proof_inputs]: https://github.com/matter-labs/zksync-era/blob/3f406c7d0c0e76d798c2d838abde57ca692822c0/core/node/proof_data_handler/src/lib.rs#L96 [endpoint_proof_generation_data]: https://github.com/matter-labs/zksync-era/blob/3f406c7d0c0e76d798c2d838abde57ca692822c0/core/node/proof_data_handler/src/lib.rs#L67 [requesting]: https://github.com/matter-labs/zksync-era/blob/3f406c7d0c0e76d798c2d838abde57ca692822c0/core/bin/zksync_tee_prover/src/tee_prover.rs#L93 [logging]: https://github.com/matter-labs/zksync-era/blob/3f406c7d0c0e76d798c2d838abde57ca692822c0/core/lib/object_store/src/retries.rs#L56 [Thomas]: https://matter-labs-workspace.slack.com/archives/C05ANUCGCKV/p1725284962312929 ## Checklist - [x] PR title corresponds to the body of PR (we generate changelog entries from PRs). - [ ] Tests for the changes have been added / updated. - [ ] Documentation comments have been added / updated. - [x] Code has been formatted via `zk fmt` and `zk lint`. 
--- Cargo.lock | 1 - .../config/src/configs/proof_data_handler.rs | 19 ++++- core/lib/config/src/testonly.rs | 1 + ...203a62629904bc4956249e690a8ad7a48983.json} | 10 ++- core/lib/dal/doc/TeeProofGenerationDal.md | 8 ++- ...tee_add_permanently_ignored_state.down.sql | 0 ...0_tee_add_permanently_ignored_state.up.sql | 8 +++ core/lib/dal/src/models/storage_tee_proof.rs | 20 +++++- core/lib/dal/src/tee_proof_generation_dal.rs | 69 ++++++++++++------ core/lib/env_config/src/proof_data_handler.rs | 2 + core/lib/object_store/src/retries.rs | 1 - .../protobuf_config/src/proof_data_handler.rs | 11 +++ .../src/proto/config/prover.proto | 1 + core/lib/types/src/api/mod.rs | 2 +- core/node/proof_data_handler/Cargo.toml | 2 - .../src/tee_request_processor.rs | 71 ++++++++++++------- core/node/proof_data_handler/src/tests.rs | 13 ++-- etc/env/base/proof_data_handler.toml | 3 +- etc/env/file_based/general.yaml | 3 +- 19 files changed, 177 insertions(+), 68 deletions(-) rename core/lib/dal/.sqlx/{query-cee7a608bd77815e9582531383481b01395cfd2a3e95fb4593229bd878163320.json => query-e46c99b23db91800b27c717100f8203a62629904bc4956249e690a8ad7a48983.json} (50%) create mode 100644 core/lib/dal/migrations/20240930110000_tee_add_permanently_ignored_state.down.sql create mode 100644 core/lib/dal/migrations/20240930110000_tee_add_permanently_ignored_state.up.sql diff --git a/Cargo.lock b/Cargo.lock index c92a4690e221..dedf555255a3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -12339,7 +12339,6 @@ dependencies = [ "tower-http", "tracing", "vise", - "zksync_basic_types", "zksync_config", "zksync_contracts", "zksync_dal", diff --git a/core/lib/config/src/configs/proof_data_handler.rs b/core/lib/config/src/configs/proof_data_handler.rs index 1d8703df51aa..443d602b8126 100644 --- a/core/lib/config/src/configs/proof_data_handler.rs +++ b/core/lib/config/src/configs/proof_data_handler.rs @@ -9,9 +9,12 @@ pub struct TeeConfig { pub tee_support: bool, /// All batches before this one are considered to 
be processed. pub first_tee_processed_batch: L1BatchNumber, - /// Timeout in seconds for retrying TEE proof generation if it fails. Retries continue - /// indefinitely until successful. + /// Timeout in seconds for retrying the preparation of input for TEE proof generation if it + /// previously failed (e.g., due to a transient network issue) or if it was picked by a TEE + /// prover but the TEE proof was not submitted within that time. pub tee_proof_generation_timeout_in_secs: u16, + /// Timeout in hours after which a batch will be permanently ignored if repeated retries failed. + pub tee_batch_permanently_ignored_timeout_in_hours: u16, } impl Default for TeeConfig { @@ -21,6 +24,8 @@ impl Default for TeeConfig { first_tee_processed_batch: Self::default_first_tee_processed_batch(), tee_proof_generation_timeout_in_secs: Self::default_tee_proof_generation_timeout_in_secs(), + tee_batch_permanently_ignored_timeout_in_hours: + Self::default_tee_batch_permanently_ignored_timeout_in_hours(), } } } @@ -35,12 +40,20 @@ impl TeeConfig { } pub fn default_tee_proof_generation_timeout_in_secs() -> u16 { - 600 + 60 + } + + pub fn default_tee_batch_permanently_ignored_timeout_in_hours() -> u16 { + 10 * 24 } pub fn tee_proof_generation_timeout(&self) -> Duration { Duration::from_secs(self.tee_proof_generation_timeout_in_secs.into()) } + + pub fn tee_batch_permanently_ignored_timeout(&self) -> Duration { + Duration::from_secs(3600 * u64::from(self.tee_batch_permanently_ignored_timeout_in_hours)) + } } #[derive(Debug, Deserialize, Clone, PartialEq)] diff --git a/core/lib/config/src/testonly.rs b/core/lib/config/src/testonly.rs index c24d47f27b33..8a9e96c96a5e 100644 --- a/core/lib/config/src/testonly.rs +++ b/core/lib/config/src/testonly.rs @@ -681,6 +681,7 @@ impl Distribution for EncodeDist { tee_support: self.sample(rng), first_tee_processed_batch: L1BatchNumber(rng.gen()), tee_proof_generation_timeout_in_secs: self.sample(rng), + tee_batch_permanently_ignored_timeout_in_hours: 
self.sample(rng), }, } } diff --git a/core/lib/dal/.sqlx/query-cee7a608bd77815e9582531383481b01395cfd2a3e95fb4593229bd878163320.json b/core/lib/dal/.sqlx/query-e46c99b23db91800b27c717100f8203a62629904bc4956249e690a8ad7a48983.json similarity index 50% rename from core/lib/dal/.sqlx/query-cee7a608bd77815e9582531383481b01395cfd2a3e95fb4593229bd878163320.json rename to core/lib/dal/.sqlx/query-e46c99b23db91800b27c717100f8203a62629904bc4956249e690a8ad7a48983.json index 4b219bfee0a5..7ca2c9e7e9fa 100644 --- a/core/lib/dal/.sqlx/query-cee7a608bd77815e9582531383481b01395cfd2a3e95fb4593229bd878163320.json +++ b/core/lib/dal/.sqlx/query-e46c99b23db91800b27c717100f8203a62629904bc4956249e690a8ad7a48983.json @@ -1,12 +1,17 @@ { "db_name": "PostgreSQL", - "query": "\n WITH upsert AS (\n SELECT\n p.l1_batch_number\n FROM\n proof_generation_details p\n LEFT JOIN\n tee_proof_generation_details tee\n ON\n p.l1_batch_number = tee.l1_batch_number\n AND tee.tee_type = $1\n WHERE\n (\n p.l1_batch_number >= $5\n AND p.vm_run_data_blob_url IS NOT NULL\n AND p.proof_gen_data_blob_url IS NOT NULL\n )\n AND (\n tee.l1_batch_number IS NULL\n OR (\n tee.status = $3\n OR (\n tee.status = $2\n AND tee.prover_taken_at < NOW() - $4::INTERVAL\n )\n )\n )\n FETCH FIRST ROW ONLY\n )\n \n INSERT INTO\n tee_proof_generation_details (\n l1_batch_number, tee_type, status, created_at, updated_at, prover_taken_at\n )\n SELECT\n l1_batch_number,\n $1,\n $2,\n NOW(),\n NOW(),\n NOW()\n FROM\n upsert\n ON CONFLICT (l1_batch_number, tee_type) DO\n UPDATE\n SET\n status = $2,\n updated_at = NOW(),\n prover_taken_at = NOW()\n RETURNING\n l1_batch_number\n ", + "query": "\n WITH upsert AS (\n SELECT\n p.l1_batch_number\n FROM\n proof_generation_details p\n LEFT JOIN\n tee_proof_generation_details tee\n ON\n p.l1_batch_number = tee.l1_batch_number\n AND tee.tee_type = $1\n WHERE\n (\n p.l1_batch_number >= $5\n AND p.vm_run_data_blob_url IS NOT NULL\n AND p.proof_gen_data_blob_url IS NOT NULL\n )\n AND (\n 
tee.l1_batch_number IS NULL\n OR (\n (tee.status = $2 OR tee.status = $3)\n AND tee.prover_taken_at < NOW() - $4::INTERVAL\n )\n )\n FETCH FIRST ROW ONLY\n )\n \n INSERT INTO\n tee_proof_generation_details (\n l1_batch_number, tee_type, status, created_at, updated_at, prover_taken_at\n )\n SELECT\n l1_batch_number,\n $1,\n $2,\n NOW(),\n NOW(),\n NOW()\n FROM\n upsert\n ON CONFLICT (l1_batch_number, tee_type) DO\n UPDATE\n SET\n status = $2,\n updated_at = NOW(),\n prover_taken_at = NOW()\n RETURNING\n l1_batch_number,\n created_at\n ", "describe": { "columns": [ { "ordinal": 0, "name": "l1_batch_number", "type_info": "Int8" + }, + { + "ordinal": 1, + "name": "created_at", + "type_info": "Timestamp" } ], "parameters": { @@ -19,8 +24,9 @@ ] }, "nullable": [ + false, false ] }, - "hash": "cee7a608bd77815e9582531383481b01395cfd2a3e95fb4593229bd878163320" + "hash": "e46c99b23db91800b27c717100f8203a62629904bc4956249e690a8ad7a48983" } diff --git a/core/lib/dal/doc/TeeProofGenerationDal.md b/core/lib/dal/doc/TeeProofGenerationDal.md index fcfa379816c7..d9ae70aeb2fd 100644 --- a/core/lib/dal/doc/TeeProofGenerationDal.md +++ b/core/lib/dal/doc/TeeProofGenerationDal.md @@ -11,9 +11,11 @@ title: Status Diagram --- stateDiagram-v2 -[*] --> unpicked : insert_tee_proof_generation_job -unpicked --> picked_by_prover : lock_batch_for_proving +[*] --> picked_by_prover : lock picked_by_prover --> generated : save_proof_artifacts_metadata -picked_by_prover --> unpicked : unlock_batch +picked_by_prover --> permanently_ignored : unlock_batch +picked_by_prover --> failed : unlock_batch +failed --> picked_by_prover : lock +permanently_ignored --> [*] generated --> [*] ``` diff --git a/core/lib/dal/migrations/20240930110000_tee_add_permanently_ignored_state.down.sql b/core/lib/dal/migrations/20240930110000_tee_add_permanently_ignored_state.down.sql new file mode 100644 index 000000000000..e69de29bb2d1 diff --git 
a/core/lib/dal/migrations/20240930110000_tee_add_permanently_ignored_state.up.sql b/core/lib/dal/migrations/20240930110000_tee_add_permanently_ignored_state.up.sql new file mode 100644 index 000000000000..12a21d1728c8 --- /dev/null +++ b/core/lib/dal/migrations/20240930110000_tee_add_permanently_ignored_state.up.sql @@ -0,0 +1,8 @@ +-- There were manually added tee_proof_generation_details entries with status 'permanently_ignore'. + +UPDATE tee_proof_generation_details SET status = 'permanently_ignored' WHERE status = 'permanently_ignore'; + +-- Entries with the status 'unpicked' were not used at all after the migration to the logic +-- introduced in https://github.com/matter-labs/zksync-era/pull/3017. This was overlooked. + +DELETE FROM tee_proof_generation_details WHERE status = 'unpicked'; diff --git a/core/lib/dal/src/models/storage_tee_proof.rs b/core/lib/dal/src/models/storage_tee_proof.rs index 5c93361e7df1..6f80c59511f9 100644 --- a/core/lib/dal/src/models/storage_tee_proof.rs +++ b/core/lib/dal/src/models/storage_tee_proof.rs @@ -1,4 +1,7 @@ -use chrono::NaiveDateTime; +use chrono::{DateTime, NaiveDateTime, Utc}; +use zksync_types::L1BatchNumber; + +use crate::tee_proof_generation_dal::LockedBatch; #[derive(Debug, Clone, sqlx::FromRow)] pub struct StorageTeeProof { @@ -8,3 +11,18 @@ pub struct StorageTeeProof { pub updated_at: NaiveDateTime, pub attestation: Option>, } + +#[derive(Debug, Clone, sqlx::FromRow)] +pub struct StorageLockedBatch { + pub l1_batch_number: i64, + pub created_at: NaiveDateTime, +} + +impl From for LockedBatch { + fn from(tx: StorageLockedBatch) -> LockedBatch { + LockedBatch { + l1_batch_number: L1BatchNumber::from(tx.l1_batch_number as u32), + created_at: DateTime::::from_naive_utc_and_offset(tx.created_at, Utc), + } + } +} diff --git a/core/lib/dal/src/tee_proof_generation_dal.rs b/core/lib/dal/src/tee_proof_generation_dal.rs index 755d02769101..4d19c3ff0c8b 100644 --- a/core/lib/dal/src/tee_proof_generation_dal.rs +++ 
b/core/lib/dal/src/tee_proof_generation_dal.rs @@ -1,6 +1,7 @@ #![doc = include_str!("../doc/TeeProofGenerationDal.md")] use std::time::Duration; +use chrono::{DateTime, Utc}; use strum::{Display, EnumString}; use zksync_db_connection::{ connection::Connection, @@ -10,21 +11,47 @@ use zksync_db_connection::{ }; use zksync_types::{tee_types::TeeType, L1BatchNumber}; -use crate::{models::storage_tee_proof::StorageTeeProof, Core}; +use crate::{ + models::storage_tee_proof::{StorageLockedBatch, StorageTeeProof}, + Core, +}; #[derive(Debug)] pub struct TeeProofGenerationDal<'a, 'c> { pub(crate) storage: &'a mut Connection<'c, Core>, } -#[derive(Debug, EnumString, Display)] -enum TeeProofGenerationJobStatus { - #[strum(serialize = "unpicked")] - Unpicked, +#[derive(Debug, Clone, Copy, EnumString, Display)] +pub enum TeeProofGenerationJobStatus { + /// The batch has been picked by a TEE prover and is currently being processed. #[strum(serialize = "picked_by_prover")] PickedByProver, + /// The proof has been successfully generated and submitted for the batch. #[strum(serialize = "generated")] Generated, + /// The proof generation for the batch has failed, which can happen if its inputs (GCS blob + /// files) are incomplete or the API is unavailable. Failed batches are retried for a specified + /// period, as defined in the configuration. + #[strum(serialize = "failed")] + Failed, + /// The batch will not be processed again because the proof generation has been failing for an + /// extended period, as specified in the configuration. + #[strum(serialize = "permanently_ignored")] + PermanentlyIgnored, +} + +/// Represents a locked batch picked by a TEE prover. A batch is locked when taken by a TEE prover +/// ([TeeProofGenerationJobStatus::PickedByProver]). It can transition to one of three states: +/// 1. [TeeProofGenerationJobStatus::Generated]. +/// 2. [TeeProofGenerationJobStatus::Failed]. +/// 3. [TeeProofGenerationJobStatus::PermanentlyIgnored]. 
+#[derive(Clone, Debug)] +pub struct LockedBatch { + /// Locked batch number. + pub l1_batch_number: L1BatchNumber, + /// The creation time of the job for this batch. It is used to determine if the batch should + /// transition to [TeeProofGenerationJobStatus::PermanentlyIgnored] or [TeeProofGenerationJobStatus::Failed]. + pub created_at: DateTime, } impl TeeProofGenerationDal<'_, '_> { @@ -33,10 +60,11 @@ impl TeeProofGenerationDal<'_, '_> { tee_type: TeeType, processing_timeout: Duration, min_batch_number: L1BatchNumber, - ) -> DalResult> { + ) -> DalResult> { let processing_timeout = pg_interval_from_duration(processing_timeout); let min_batch_number = i64::from(min_batch_number.0); - sqlx::query!( + let locked_batch = sqlx::query_as!( + StorageLockedBatch, r#" WITH upsert AS ( SELECT @@ -57,11 +85,8 @@ impl TeeProofGenerationDal<'_, '_> { AND ( tee.l1_batch_number IS NULL OR ( - tee.status = $3 - OR ( - tee.status = $2 - AND tee.prover_taken_at < NOW() - $4::INTERVAL - ) + (tee.status = $2 OR tee.status = $3) + AND tee.prover_taken_at < NOW() - $4::INTERVAL ) ) FETCH FIRST ROW ONLY @@ -87,11 +112,12 @@ impl TeeProofGenerationDal<'_, '_> { updated_at = NOW(), prover_taken_at = NOW() RETURNING - l1_batch_number + l1_batch_number, + created_at "#, tee_type.to_string(), TeeProofGenerationJobStatus::PickedByProver.to_string(), - TeeProofGenerationJobStatus::Unpicked.to_string(), + TeeProofGenerationJobStatus::Failed.to_string(), processing_timeout, min_batch_number ) @@ -100,14 +126,17 @@ impl TeeProofGenerationDal<'_, '_> { .with_arg("processing_timeout", &processing_timeout) .with_arg("l1_batch_number", &min_batch_number) .fetch_optional(self.storage) - .await - .map(|record| record.map(|record| L1BatchNumber(record.l1_batch_number as u32))) + .await? 
+ .map(Into::into); + + Ok(locked_batch) } pub async fn unlock_batch( &mut self, l1_batch_number: L1BatchNumber, tee_type: TeeType, + status: TeeProofGenerationJobStatus, ) -> DalResult<()> { let batch_number = i64::from(l1_batch_number.0); sqlx::query!( @@ -120,7 +149,7 @@ impl TeeProofGenerationDal<'_, '_> { l1_batch_number = $2 AND tee_type = $3 "#, - TeeProofGenerationJobStatus::Unpicked.to_string(), + status.to_string(), batch_number, tee_type.to_string() ) @@ -266,7 +295,7 @@ impl TeeProofGenerationDal<'_, '_> { "#, batch_number, tee_type.to_string(), - TeeProofGenerationJobStatus::Unpicked.to_string(), + TeeProofGenerationJobStatus::PickedByProver.to_string(), ); let instrumentation = Instrumented::new("insert_tee_proof_generation_job") .with_arg("l1_batch_number", &batch_number) @@ -281,7 +310,7 @@ impl TeeProofGenerationDal<'_, '_> { } /// For testing purposes only. - pub async fn get_oldest_unpicked_batch(&mut self) -> DalResult> { + pub async fn get_oldest_picked_by_prover_batch(&mut self) -> DalResult> { let query = sqlx::query!( r#" SELECT @@ -295,7 +324,7 @@ impl TeeProofGenerationDal<'_, '_> { LIMIT 1 "#, - TeeProofGenerationJobStatus::Unpicked.to_string(), + TeeProofGenerationJobStatus::PickedByProver.to_string(), ); let batch_number = Instrumented::new("get_oldest_unpicked_batch") .with(query) diff --git a/core/lib/env_config/src/proof_data_handler.rs b/core/lib/env_config/src/proof_data_handler.rs index 47848585e769..65fd1d516de3 100644 --- a/core/lib/env_config/src/proof_data_handler.rs +++ b/core/lib/env_config/src/proof_data_handler.rs @@ -29,6 +29,7 @@ mod tests { tee_support: true, first_tee_processed_batch: L1BatchNumber(1337), tee_proof_generation_timeout_in_secs: 600, + tee_batch_permanently_ignored_timeout_in_hours: 240, }, } } @@ -41,6 +42,7 @@ mod tests { PROOF_DATA_HANDLER_TEE_SUPPORT="true" PROOF_DATA_HANDLER_FIRST_TEE_PROCESSED_BATCH="1337" PROOF_DATA_HANDLER_TEE_PROOF_GENERATION_TIMEOUT_IN_SECS="600" + 
PROOF_DATA_HANDLER_TEE_BATCH_PERMANENTLY_IGNORED_TIMEOUT_IN_HOURS="240" "#; let mut lock = MUTEX.lock(); lock.set_env(config); diff --git a/core/lib/object_store/src/retries.rs b/core/lib/object_store/src/retries.rs index 2cccbb17c2bb..16d2c1cd55f1 100644 --- a/core/lib/object_store/src/retries.rs +++ b/core/lib/object_store/src/retries.rs @@ -53,7 +53,6 @@ impl Request<'_> { backoff_secs *= 2; } Err(err) => { - tracing::warn!(%err, "Failed request with a fatal error"); break Err(err); } } diff --git a/core/lib/protobuf_config/src/proof_data_handler.rs b/core/lib/protobuf_config/src/proof_data_handler.rs index c01e163bd771..92a9c90bbb64 100644 --- a/core/lib/protobuf_config/src/proof_data_handler.rs +++ b/core/lib/protobuf_config/src/proof_data_handler.rs @@ -29,6 +29,12 @@ impl ProtoRepr for proto::ProofDataHandler { .unwrap_or_else( configs::TeeConfig::default_tee_proof_generation_timeout_in_secs, ), + tee_batch_permanently_ignored_timeout_in_hours: self + .tee_batch_permanently_ignored_timeout_in_hours + .map(|x| x as u16) + .unwrap_or_else( + configs::TeeConfig::default_tee_batch_permanently_ignored_timeout_in_hours, + ), }, }) } @@ -42,6 +48,11 @@ impl ProtoRepr for proto::ProofDataHandler { tee_proof_generation_timeout_in_secs: Some( this.tee_config.tee_proof_generation_timeout_in_secs.into(), ), + tee_batch_permanently_ignored_timeout_in_hours: Some( + this.tee_config + .tee_batch_permanently_ignored_timeout_in_hours + .into(), + ), } } } diff --git a/core/lib/protobuf_config/src/proto/config/prover.proto b/core/lib/protobuf_config/src/proto/config/prover.proto index 392834d25f3d..64735713fcab 100644 --- a/core/lib/protobuf_config/src/proto/config/prover.proto +++ b/core/lib/protobuf_config/src/proto/config/prover.proto @@ -110,4 +110,5 @@ message ProofDataHandler { optional bool tee_support = 3; // optional optional uint64 first_tee_processed_batch = 4; // optional optional uint32 tee_proof_generation_timeout_in_secs = 5; // optional + optional uint32 
tee_batch_permanently_ignored_timeout_in_hours = 6; // optional } diff --git a/core/lib/types/src/api/mod.rs b/core/lib/types/src/api/mod.rs index 5f81e889b537..b5d2b3276527 100644 --- a/core/lib/types/src/api/mod.rs +++ b/core/lib/types/src/api/mod.rs @@ -4,7 +4,6 @@ use serde_json::Value; use serde_with::{hex::Hex, serde_as}; use strum::Display; use zksync_basic_types::{ - tee_types::TeeType, web3::{AccessList, Bytes, Index}, Bloom, L1BatchNumber, H160, H256, H64, U256, U64, }; @@ -16,6 +15,7 @@ pub use crate::transaction_request::{ use crate::{ debug_flat_call::{DebugCallFlat, ResultDebugCallFlat}, protocol_version::L1VerifierConfig, + tee_types::TeeType, Address, L2BlockNumber, ProtocolVersionId, }; diff --git a/core/node/proof_data_handler/Cargo.toml b/core/node/proof_data_handler/Cargo.toml index 639266a2be96..0bd1501277b7 100644 --- a/core/node/proof_data_handler/Cargo.toml +++ b/core/node/proof_data_handler/Cargo.toml @@ -27,9 +27,7 @@ tracing.workspace = true [dev-dependencies] hyper.workspace = true -chrono.workspace = true zksync_multivm.workspace = true serde_json.workspace = true tower.workspace = true -zksync_basic_types.workspace = true zksync_contracts.workspace = true diff --git a/core/node/proof_data_handler/src/tee_request_processor.rs b/core/node/proof_data_handler/src/tee_request_processor.rs index ee5be844b981..971b94fe315f 100644 --- a/core/node/proof_data_handler/src/tee_request_processor.rs +++ b/core/node/proof_data_handler/src/tee_request_processor.rs @@ -1,9 +1,12 @@ use std::sync::Arc; use axum::{extract::Path, Json}; -use chrono::Utc; +use chrono::{Duration as ChronoDuration, Utc}; use zksync_config::configs::ProofDataHandlerConfig; -use zksync_dal::{ConnectionPool, Core, CoreDal}; +use zksync_dal::{ + tee_proof_generation_dal::{LockedBatch, TeeProofGenerationJobStatus}, + ConnectionPool, Core, CoreDal, +}; use zksync_object_store::{ObjectStore, ObjectStoreError}; use zksync_prover_interface::{ api::{ @@ -48,49 +51,62 @@ impl 
TeeRequestProcessor { ) -> Result>, RequestProcessorError> { tracing::info!("Received request for proof generation data: {:?}", request); - let mut min_batch_number = self.config.tee_config.first_tee_processed_batch; - let mut missing_range: Option<(L1BatchNumber, L1BatchNumber)> = None; + let batch_ignored_timeout = ChronoDuration::from_std( + self.config + .tee_config + .tee_batch_permanently_ignored_timeout(), + ) + .map_err(|err| { + RequestProcessorError::GeneralError(format!( + "Failed to convert batch_ignored_timeout: {}", + err + )) + })?; + let min_batch_number = self.config.tee_config.first_tee_processed_batch; - let result = loop { - let Some(l1_batch_number) = self + loop { + let Some(locked_batch) = self .lock_batch_for_proving(request.tee_type, min_batch_number) .await? else { - // No job available - return Ok(None); + break Ok(None); // no job available }; + let batch_number = locked_batch.l1_batch_number; match self - .tee_verifier_input_for_existing_batch(l1_batch_number) + .tee_verifier_input_for_existing_batch(batch_number) .await { Ok(input) => { break Ok(Some(Json(TeeProofGenerationDataResponse(Box::new(input))))); } Err(RequestProcessorError::ObjectStore(ObjectStoreError::KeyNotFound(_))) => { - missing_range = match missing_range { - Some((start, _)) => Some((start, l1_batch_number)), - None => Some((l1_batch_number, l1_batch_number)), + let duration = Utc::now().signed_duration_since(locked_batch.created_at); + let status = if duration > batch_ignored_timeout { + TeeProofGenerationJobStatus::PermanentlyIgnored + } else { + TeeProofGenerationJobStatus::Failed }; - self.unlock_batch(l1_batch_number, request.tee_type).await?; - min_batch_number = l1_batch_number + 1; + self.unlock_batch(batch_number, request.tee_type, status) + .await?; + tracing::warn!( + "Assigned status {} to batch {} created at {}", + status, + batch_number, + locked_batch.created_at + ); } Err(err) => { - self.unlock_batch(l1_batch_number, request.tee_type).await?; + 
self.unlock_batch( + batch_number, + request.tee_type, + TeeProofGenerationJobStatus::Failed, + ) + .await?; break Err(err); } } - }; - - if let Some((start, end)) = missing_range { - tracing::warn!( - "Blobs for batch numbers {} to {} not found in the object store. Marked as unpicked.", - start, - end - ); } - - result } #[tracing::instrument(skip(self))] @@ -158,7 +174,7 @@ impl TeeRequestProcessor { &self, tee_type: TeeType, min_batch_number: L1BatchNumber, - ) -> Result, RequestProcessorError> { + ) -> Result, RequestProcessorError> { self.pool .connection_tagged("tee_request_processor") .await? @@ -176,12 +192,13 @@ impl TeeRequestProcessor { &self, l1_batch_number: L1BatchNumber, tee_type: TeeType, + status: TeeProofGenerationJobStatus, ) -> Result<(), RequestProcessorError> { self.pool .connection_tagged("tee_request_processor") .await? .tee_proof_generation_dal() - .unlock_batch(l1_batch_number, tee_type) + .unlock_batch(l1_batch_number, tee_type, status) .await?; Ok(()) } diff --git a/core/node/proof_data_handler/src/tests.rs b/core/node/proof_data_handler/src/tests.rs index 87c6bff8a1f4..dae2ef8cd0c0 100644 --- a/core/node/proof_data_handler/src/tests.rs +++ b/core/node/proof_data_handler/src/tests.rs @@ -6,12 +6,13 @@ use axum::{ }; use serde_json::json; use tower::ServiceExt; -use zksync_basic_types::L2ChainId; use zksync_config::configs::{ProofDataHandlerConfig, TeeConfig}; use zksync_dal::{ConnectionPool, CoreDal}; use zksync_object_store::MockObjectStore; use zksync_prover_interface::api::SubmitTeeProofRequest; -use zksync_types::{commitment::L1BatchCommitmentMode, tee_types::TeeType, L1BatchNumber}; +use zksync_types::{ + commitment::L1BatchCommitmentMode, tee_types::TeeType, L1BatchNumber, L2ChainId, +}; use crate::create_proof_processing_router; @@ -29,6 +30,7 @@ async fn request_tee_proof_inputs() { tee_support: true, first_tee_processed_batch: L1BatchNumber(0), tee_proof_generation_timeout_in_secs: 600, + 
tee_batch_permanently_ignored_timeout_in_hours: 10 * 24, }, }, L1BatchCommitmentMode::Rollup, @@ -88,6 +90,7 @@ async fn submit_tee_proof() { tee_support: true, first_tee_processed_batch: L1BatchNumber(0), tee_proof_generation_timeout_in_secs: 600, + tee_batch_permanently_ignored_timeout_in_hours: 10 * 24, }, }, L1BatchCommitmentMode::Rollup, @@ -119,7 +122,7 @@ async fn submit_tee_proof() { let mut proof_db_conn = db_conn_pool.connection().await.unwrap(); let oldest_batch_number = proof_db_conn .tee_proof_generation_dal() - .get_oldest_unpicked_batch() + .get_oldest_picked_by_prover_batch() .await .unwrap(); @@ -156,7 +159,7 @@ async fn mock_tee_batch_status( // there should not be any batches awaiting proof in the db yet - let oldest_batch_number = proof_dal.get_oldest_unpicked_batch().await.unwrap(); + let oldest_batch_number = proof_dal.get_oldest_picked_by_prover_batch().await.unwrap(); assert!(oldest_batch_number.is_none()); // mock SQL table with relevant information about the status of TEE proof generation @@ -169,7 +172,7 @@ async fn mock_tee_batch_status( // now, there should be one batch in the db awaiting proof let oldest_batch_number = proof_dal - .get_oldest_unpicked_batch() + .get_oldest_picked_by_prover_batch() .await .unwrap() .unwrap(); diff --git a/etc/env/base/proof_data_handler.toml b/etc/env/base/proof_data_handler.toml index b56ac26fb177..767d1d16da2e 100644 --- a/etc/env/base/proof_data_handler.toml +++ b/etc/env/base/proof_data_handler.toml @@ -1,5 +1,6 @@ [proof_data_handler] http_port = 3320 proof_generation_timeout_in_secs = 18000 -tee_proof_generation_timeout_in_secs = 600 +tee_proof_generation_timeout_in_secs = 60 +tee_batch_permanently_ignored_timeout_in_hours = 240 tee_support = true diff --git a/etc/env/file_based/general.yaml b/etc/env/file_based/general.yaml index a4005e9477a8..23e8b3ee420c 100644 --- a/etc/env/file_based/general.yaml +++ b/etc/env/file_based/general.yaml @@ -166,7 +166,8 @@ witness_vector_generator: data_handler: 
http_port: 3320 proof_generation_timeout_in_secs: 18000 - tee_proof_generation_timeout_in_secs: 600 + tee_proof_generation_timeout_in_secs: 60 + tee_batch_permanently_ignored_timeout_in_hours: 240 tee_support: true prover_gateway: api_url: http://127.0.0.1:3320