Skip to content

Commit

Permalink
feat(tee): add Prometheus metrics to the TEE Prover (#2386)
Browse files Browse the repository at this point in the history
## What ❔

This commit adds Prometheus metrics to the TEE Prover. Specifically, the
following metrics were added:
- Waiting time for a new batch to be proven
- Proof generation time
- Proof submitting time
- Network error counter
- Last block number processed

## Why ❔

Setting up Prometheus metrics is a prerequisite for rolling the TEE Prover
out to staging and testnet environments. The metrics are needed for
monitoring and provide valuable insight into the running system.

## Checklist

- [x] PR title corresponds to the body of PR (we generate changelog
entries from PRs).
- [ ] Tests for the changes have been added / updated.
- [ ] Documentation comments have been added / updated.
- [x] Code has been formatted via `zk fmt` and `zk lint`.
  • Loading branch information
pbeza authored Jul 5, 2024
1 parent 217a4ba commit 6153e99
Show file tree
Hide file tree
Showing 8 changed files with 63 additions and 14 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions core/bin/zksync_tee_prover/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ thiserror.workspace = true
tokio = { workspace = true, features = ["full"] }
tracing.workspace = true
url.workspace = true
vise.workspace = true
zksync_basic_types.workspace = true
zksync_config.workspace = true
zksync_env_config.workspace = true
Expand Down
4 changes: 3 additions & 1 deletion core/bin/zksync_tee_prover/src/api_client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ use zksync_prover_interface::{
};
use zksync_types::{tee_types::TeeType, L1BatchNumber};

use crate::error::TeeProverError;
use crate::{error::TeeProverError, metrics::METRICS};

/// Implementation of the API client for the proof data handler, run by
/// [`zksync_proof_data_handler::run_server`].
Expand Down Expand Up @@ -97,11 +97,13 @@ impl TeeApiClient {
proof: root_hash.as_bytes().into(),
tee_type,
}));
let observer = METRICS.proof_submitting_time.start();
self.post::<_, SubmitTeeProofResponse, _>(
format!("/tee/submit_proofs/{batch_number}").as_str(),
request,
)
.await?;
observer.observe();
tracing::info!(
"Proof submitted successfully for batch number {}",
batch_number
Expand Down
16 changes: 14 additions & 2 deletions core/bin/zksync_tee_prover/src/main.rs
Original file line number Diff line number Diff line change
@@ -1,15 +1,20 @@
use anyhow::Context as _;
use config::TeeProverConfig;
use tee_prover::TeeProverLayer;
use zksync_config::configs::ObservabilityConfig;
use zksync_config::configs::{ObservabilityConfig, PrometheusConfig};
use zksync_env_config::FromEnv;
use zksync_node_framework::{
implementations::layers::sigint::SigintHandlerLayer, service::ZkStackServiceBuilder,
implementations::layers::{
prometheus_exporter::PrometheusExporterLayer, sigint::SigintHandlerLayer,
},
service::ZkStackServiceBuilder,
};
use zksync_vlog::prometheus::PrometheusExporterConfig;

mod api_client;
mod config;
mod error;
mod metrics;
mod tee_prover;

/// This application serves as a TEE verifier, a.k.a. a TEE prover.
Expand Down Expand Up @@ -41,8 +46,15 @@ fn main() -> anyhow::Result<()> {
let tee_prover_config = TeeProverConfig::from_env()?;
let attestation_quote_bytes = std::fs::read(tee_prover_config.attestation_quote_file_path)?;

let prometheus_config = PrometheusConfig::from_env()?;
let exporter_config = PrometheusExporterConfig::push(
prometheus_config.gateway_endpoint(),
prometheus_config.push_interval(),
);

ZkStackServiceBuilder::new()
.add_layer(SigintHandlerLayer)
.add_layer(PrometheusExporterLayer(exporter_config))
.add_layer(TeeProverLayer::new(
tee_prover_config.api_url,
tee_prover_config.signing_key,
Expand Down
21 changes: 21 additions & 0 deletions core/bin/zksync_tee_prover/src/metrics.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
//! Metrics for the TEE Prover.
use std::time::Duration;

use vise::{Buckets, Gauge, Histogram, Metrics, Unit};

/// Metrics reported by the TEE Prover.
///
/// Every metric declared here is registered under the `tee_prover` prefix.
#[derive(Debug, Metrics)]
#[metrics(prefix = "tee_prover")]
pub(crate) struct TeeProverMetrics {
/// Time elapsed from the task start (or from the previously processed batch) until the next
/// batch has been processed.
// NOTE(review): the observer in the prover loop is only stopped after `step()` returns a batch
// number, so this interval appears to include proof generation and submission time, not just
// idle waiting; confirm this is intended.
#[metrics(buckets = Buckets::LATENCIES, unit = Unit::Seconds)]
pub job_waiting_time: Histogram<Duration>,
/// Time spent verifying a batch and signing the resulting root hash.
#[metrics(buckets = Buckets::LATENCIES, unit = Unit::Seconds)]
pub proof_generation_time: Histogram<Duration>,
/// Duration of the request that submits a proof to the `/tee/submit_proofs/{batch_number}`
/// endpoint.
// NOTE(review): the observation happens after the request's `.await?`, so presumably only
// successful submissions are recorded; confirm.
#[metrics(buckets = Buckets::LATENCIES, unit = Unit::Seconds)]
pub proof_submitting_time: Histogram<Duration>,
/// Number of errors encountered by the prover loop.
// NOTE(review): the visible call site increments this for every `Err` returned by `step()`,
// before the transient check, so it may also count non-network errors. It is only ever
// incremented there, so a counter type may be a better fit than a gauge; confirm.
pub network_errors_counter: Gauge<u64>,
/// Number of the last L1 batch for which a proof was successfully submitted.
pub last_batch_number_processed: Gauge<u64>,
}

/// Global [`TeeProverMetrics`] instance shared by the modules of this crate; registered via
/// `#[vise::register]`.
#[vise::register]
pub(super) static METRICS: vise::Global<TeeProverMetrics> = vise::Global::new();
24 changes: 19 additions & 5 deletions core/bin/zksync_tee_prover/src/tee_prover.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ use zksync_prover_interface::inputs::TeeVerifierInput;
use zksync_tee_verifier::Verify;
use zksync_types::{tee_types::TeeType, L1BatchNumber};

use crate::{api_client::TeeApiClient, error::TeeProverError};
use crate::{api_client::TeeApiClient, error::TeeProverError, metrics::METRICS};

/// Wiring layer for `TeeProver`
///
Expand Down Expand Up @@ -83,12 +83,14 @@ impl TeeProver {
) -> Result<(Signature, L1BatchNumber, H256), TeeProverError> {
match tvi {
TeeVerifierInput::V1(tvi) => {
let observer = METRICS.proof_generation_time.start();
let verification_result = tvi.verify().map_err(TeeProverError::Verification)?;
let root_hash_bytes = verification_result.value_hash.as_bytes();
let batch_number = verification_result.batch_number;
let msg_to_sign = Message::from_slice(root_hash_bytes)
.map_err(|e| TeeProverError::Verification(e.into()))?;
let signature = self.signing_key.sign_ecdsa(msg_to_sign);
observer.observe();
Ok((signature, batch_number, verification_result.value_hash))
}
_ => Err(TeeProverError::Verification(anyhow::anyhow!(
Expand All @@ -97,7 +99,7 @@ impl TeeProver {
}
}

async fn step(&self) -> Result<(), TeeProverError> {
async fn step(&self) -> Result<Option<L1BatchNumber>, TeeProverError> {
match self.api_client.get_job().await? {
Some(job) => {
let (signature, batch_number, root_hash) = self.verify(*job)?;
Expand All @@ -110,10 +112,13 @@ impl TeeProver {
self.tee_type,
)
.await?;
Ok(Some(batch_number))
}
None => {
tracing::trace!("There are currently no pending batches to be proven");
Ok(None)
}
None => tracing::trace!("There are currently no pending batches to be proven"),
}
Ok(())
}
}

Expand Down Expand Up @@ -156,6 +161,7 @@ impl Task for TeeProver {

let mut retries = 1;
let mut backoff = self.config.initial_retry_backoff;
let mut observer = METRICS.job_waiting_time.start();

loop {
if *stop_receiver.0.borrow() {
Expand All @@ -164,11 +170,19 @@ impl Task for TeeProver {
}
let result = self.step().await;
match result {
Ok(()) => {
Ok(batch_number) => {
retries = 1;
backoff = self.config.initial_retry_backoff;
if let Some(batch_number) = batch_number {
observer.observe();
observer = METRICS.job_waiting_time.start();
METRICS
.last_batch_number_processed
.set(batch_number.0 as u64);
}
}
Err(err) => {
METRICS.network_errors_counter.inc_by(1);
if !err.is_transient() || retries > self.config.max_retries {
return Err(err.into());
}
Expand Down
8 changes: 3 additions & 5 deletions core/node/tee_verifier_input_producer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -216,15 +216,13 @@ impl JobProcessor for TeeVerifierInputProducer {
started_at: Instant,
artifacts: Self::JobArtifacts,
) -> anyhow::Result<()> {
let upload_started_at = Instant::now();
let observer: vise::LatencyObserver = METRICS.upload_input_time.start();
let object_path = self
.object_store
.put(job_id, &artifacts)
.await
.context("failed to upload artifacts for TeeVerifierInputProducer")?;
METRICS
.upload_input_time
.observe(upload_started_at.elapsed());
observer.observe();
let mut connection = self
.connection_pool
.connection()
Expand All @@ -247,7 +245,7 @@ impl JobProcessor for TeeVerifierInputProducer {
.commit()
.await
.context("failed to commit DB transaction for TeeVerifierInputProducer")?;
METRICS.block_number_processed.set(job_id.0 as i64);
METRICS.block_number_processed.set(job_id.0 as u64);
Ok(())
}

Expand Down
2 changes: 1 addition & 1 deletion core/node/tee_verifier_input_producer/src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ pub(crate) struct TeeVerifierInputProducerMetrics {
pub process_batch_time: Histogram<Duration>,
#[metrics(buckets = Buckets::LATENCIES, unit = Unit::Seconds)]
pub upload_input_time: Histogram<Duration>,
pub block_number_processed: Gauge,
pub block_number_processed: Gauge<u64>,
}

#[vise::register]
Expand Down

0 comments on commit 6153e99

Please sign in to comment.