diff --git a/Cargo.lock b/Cargo.lock index 5cc13f2897ce..6387576e914f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9412,6 +9412,7 @@ dependencies = [ "tokio", "tracing", "url", + "vise", "zksync_basic_types", "zksync_config", "zksync_env_config", diff --git a/core/bin/zksync_tee_prover/Cargo.toml b/core/bin/zksync_tee_prover/Cargo.toml index f225c8a785e0..d0565eee35a8 100644 --- a/core/bin/zksync_tee_prover/Cargo.toml +++ b/core/bin/zksync_tee_prover/Cargo.toml @@ -19,6 +19,7 @@ thiserror.workspace = true tokio = { workspace = true, features = ["full"] } tracing.workspace = true url.workspace = true +vise.workspace = true zksync_basic_types.workspace = true zksync_config.workspace = true zksync_env_config.workspace = true diff --git a/core/bin/zksync_tee_prover/src/api_client.rs b/core/bin/zksync_tee_prover/src/api_client.rs index 2507d9b54fb1..1530da971157 100644 --- a/core/bin/zksync_tee_prover/src/api_client.rs +++ b/core/bin/zksync_tee_prover/src/api_client.rs @@ -13,7 +13,7 @@ use zksync_prover_interface::{ }; use zksync_types::{tee_types::TeeType, L1BatchNumber}; -use crate::error::TeeProverError; +use crate::{error::TeeProverError, metrics::METRICS}; /// Implementation of the API client for the proof data handler, run by /// [`zksync_proof_data_handler::run_server`]. 
@@ -97,11 +97,13 @@ impl TeeApiClient { proof: root_hash.as_bytes().into(), tee_type, })); + let observer = METRICS.proof_submitting_time.start(); self.post::<_, SubmitTeeProofResponse, _>( format!("/tee/submit_proofs/{batch_number}").as_str(), request, ) .await?; + observer.observe(); tracing::info!( "Proof submitted successfully for batch number {}", batch_number diff --git a/core/bin/zksync_tee_prover/src/main.rs b/core/bin/zksync_tee_prover/src/main.rs index 30d4b9a98002..8de6bacef6fd 100644 --- a/core/bin/zksync_tee_prover/src/main.rs +++ b/core/bin/zksync_tee_prover/src/main.rs @@ -1,15 +1,20 @@ use anyhow::Context as _; use config::TeeProverConfig; use tee_prover::TeeProverLayer; -use zksync_config::configs::ObservabilityConfig; +use zksync_config::configs::{ObservabilityConfig, PrometheusConfig}; use zksync_env_config::FromEnv; use zksync_node_framework::{ - implementations::layers::sigint::SigintHandlerLayer, service::ZkStackServiceBuilder, + implementations::layers::{ + prometheus_exporter::PrometheusExporterLayer, sigint::SigintHandlerLayer, + }, + service::ZkStackServiceBuilder, }; +use zksync_vlog::prometheus::PrometheusExporterConfig; mod api_client; mod config; mod error; +mod metrics; mod tee_prover; /// This application serves as a TEE verifier, a.k.a. a TEE prover. 
@@ -41,8 +46,15 @@ fn main() -> anyhow::Result<()> { let tee_prover_config = TeeProverConfig::from_env()?; let attestation_quote_bytes = std::fs::read(tee_prover_config.attestation_quote_file_path)?; + let prometheus_config = PrometheusConfig::from_env()?; + let exporter_config = PrometheusExporterConfig::push( + prometheus_config.gateway_endpoint(), + prometheus_config.push_interval(), + ); + ZkStackServiceBuilder::new() .add_layer(SigintHandlerLayer) + .add_layer(PrometheusExporterLayer(exporter_config)) .add_layer(TeeProverLayer::new( tee_prover_config.api_url, tee_prover_config.signing_key, diff --git a/core/bin/zksync_tee_prover/src/metrics.rs b/core/bin/zksync_tee_prover/src/metrics.rs new file mode 100644 index 000000000000..9f535967f79f --- /dev/null +++ b/core/bin/zksync_tee_prover/src/metrics.rs @@ -0,0 +1,29 @@ +//! Metrics for the TEE Prover. + +use std::time::Duration; + +use vise::{Buckets, Counter, Gauge, Histogram, Metrics, Unit}; + +/// Metrics reported by the TEE prover binary. +#[derive(Debug, Metrics)] +#[metrics(prefix = "tee_prover")] +pub(crate) struct TeeProverMetrics { + /// Time between two consecutive successfully processed batches (or from task start to the + /// first one); the observer is restarted only after a step returns a processed batch number. + #[metrics(buckets = Buckets::LATENCIES, unit = Unit::Seconds)] + pub job_waiting_time: Histogram<Duration>, + /// Time spent verifying a batch and signing its root hash; observed after the signature is produced. + #[metrics(buckets = Buckets::LATENCIES, unit = Unit::Seconds)] + pub proof_generation_time: Histogram<Duration>, + /// Duration of the proof submission request; observed once the request has succeeded. + #[metrics(buckets = Buckets::LATENCIES, unit = Unit::Seconds)] + pub proof_submitting_time: Histogram<Duration>, + /// Number of failed prover steps; incremented on every step error, not only on network errors. + pub network_errors_counter: Counter, + /// Number of the most recent L1 batch for which a proof was submitted. + pub last_batch_number_processed: Gauge<u64>, +} + +/// Global [`TeeProverMetrics`] instance registered with `vise`. +#[vise::register] +pub(super) static METRICS: vise::Global<TeeProverMetrics> = vise::Global::new(); diff --git a/core/bin/zksync_tee_prover/src/tee_prover.rs b/core/bin/zksync_tee_prover/src/tee_prover.rs index b7a7f6f743e0..9d692e84f10e 100644 --- a/core/bin/zksync_tee_prover/src/tee_prover.rs +++ b/core/bin/zksync_tee_prover/src/tee_prover.rs @@ -12,7 +12,7 @@ use zksync_prover_interface::inputs::TeeVerifierInput; use zksync_tee_verifier::Verify; use zksync_types::{tee_types::TeeType, L1BatchNumber};
-use crate::{api_client::TeeApiClient, error::TeeProverError}; +use crate::{api_client::TeeApiClient, error::TeeProverError, metrics::METRICS}; /// Wiring layer for `TeeProver` /// @@ -83,12 +83,14 @@ impl TeeProver { ) -> Result<(Signature, L1BatchNumber, H256), TeeProverError> { match tvi { TeeVerifierInput::V1(tvi) => { + let observer = METRICS.proof_generation_time.start(); let verification_result = tvi.verify().map_err(TeeProverError::Verification)?; let root_hash_bytes = verification_result.value_hash.as_bytes(); let batch_number = verification_result.batch_number; let msg_to_sign = Message::from_slice(root_hash_bytes) .map_err(|e| TeeProverError::Verification(e.into()))?; let signature = self.signing_key.sign_ecdsa(msg_to_sign); + observer.observe(); Ok((signature, batch_number, verification_result.value_hash)) } _ => Err(TeeProverError::Verification(anyhow::anyhow!( @@ -97,7 +99,7 @@ impl TeeProver { } } - async fn step(&self) -> Result<(), TeeProverError> { + async fn step(&self) -> Result<Option<L1BatchNumber>, TeeProverError> { match self.api_client.get_job().await? 
{ Some(job) => { let (signature, batch_number, root_hash) = self.verify(*job)?; @@ -110,10 +112,13 @@ impl TeeProver { self.tee_type, ) .await?; + Ok(Some(batch_number)) + } + None => { + tracing::trace!("There are currently no pending batches to be proven"); + Ok(None) } - None => tracing::trace!("There are currently no pending batches to be proven"), } - Ok(()) } } @@ -156,6 +161,7 @@ impl Task for TeeProver { let mut retries = 1; let mut backoff = self.config.initial_retry_backoff; + let mut observer = METRICS.job_waiting_time.start(); loop { if *stop_receiver.0.borrow() { @@ -164,11 +170,19 @@ impl Task for TeeProver { } let result = self.step().await; match result { - Ok(()) => { + Ok(batch_number) => { retries = 1; backoff = self.config.initial_retry_backoff; + if let Some(batch_number) = batch_number { + observer.observe(); + observer = METRICS.job_waiting_time.start(); + METRICS + .last_batch_number_processed + .set(batch_number.0 as u64); + } } Err(err) => { + METRICS.network_errors_counter.inc_by(1); if !err.is_transient() || retries > self.config.max_retries { return Err(err.into()); } diff --git a/core/node/tee_verifier_input_producer/src/lib.rs b/core/node/tee_verifier_input_producer/src/lib.rs index c45af4cf31b0..52cdf3d5d36f 100644 --- a/core/node/tee_verifier_input_producer/src/lib.rs +++ b/core/node/tee_verifier_input_producer/src/lib.rs @@ -216,15 +216,13 @@ impl JobProcessor for TeeVerifierInputProducer { started_at: Instant, artifacts: Self::JobArtifacts, ) -> anyhow::Result<()> { - let upload_started_at = Instant::now(); + let observer: vise::LatencyObserver = METRICS.upload_input_time.start(); let object_path = self .object_store .put(job_id, &artifacts) .await .context("failed to upload artifacts for TeeVerifierInputProducer")?; - METRICS - .upload_input_time - .observe(upload_started_at.elapsed()); + observer.observe(); let mut connection = self .connection_pool .connection() @@ -247,7 +245,7 @@ impl JobProcessor for TeeVerifierInputProducer 
{ .commit() .await .context("failed to commit DB transaction for TeeVerifierInputProducer")?; - METRICS.block_number_processed.set(job_id.0 as i64); + METRICS.block_number_processed.set(job_id.0 as u64); Ok(()) } diff --git a/core/node/tee_verifier_input_producer/src/metrics.rs b/core/node/tee_verifier_input_producer/src/metrics.rs index 51daa20baadb..362804d338e9 100644 --- a/core/node/tee_verifier_input_producer/src/metrics.rs +++ b/core/node/tee_verifier_input_producer/src/metrics.rs @@ -11,7 +11,7 @@ pub(crate) struct TeeVerifierInputProducerMetrics { pub process_batch_time: Histogram<Duration>, #[metrics(buckets = Buckets::LATENCIES, unit = Unit::Seconds)] pub upload_input_time: Histogram<Duration>, - pub block_number_processed: Gauge, + pub block_number_processed: Gauge<u64>, } #[vise::register]