From dd154f388c23ff67068a1053fec878e80ba9bd17 Mon Sep 17 00:00:00 2001 From: Daniyar Itegulov Date: Wed, 12 Jun 2024 02:15:35 +1000 Subject: [PATCH] feat(vm-runner): add basic metrics (#2203) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## What ❔ This PR just adds some basic VM runner-specific metrics. Note that VM runner already inherits VM-related metrics from state keeper. I decided to split `storage_load`/`vm_run`/`output_handle` into 3 different metrics as opposed to a single metric with 3 stages as the stages happen asynchronously and don't represent a continuous execution. Lmk if you disagree though. ## Why ❔ ## Checklist - [x] PR title corresponds to the body of PR (we generate changelog entries from PRs). - [ ] Tests for the changes have been added / updated. - [x] Documentation comments have been added / updated. - [x] Code has been formatted via `zk fmt` and `zk lint`. - [x] Spellcheck has been run via `zk spellcheck`. --- Cargo.lock | 1 + core/node/vm_runner/Cargo.toml | 1 + core/node/vm_runner/src/lib.rs | 1 + core/node/vm_runner/src/metrics.rs | 28 +++++++++++++++++++++++ core/node/vm_runner/src/output_handler.rs | 10 ++++++-- core/node/vm_runner/src/process.rs | 8 ++++++- core/node/vm_runner/src/storage.rs | 4 +++- 7 files changed, 49 insertions(+), 4 deletions(-) create mode 100644 core/node/vm_runner/src/metrics.rs diff --git a/Cargo.lock b/Cargo.lock index 6feb9b0e472f..00638f6973a9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9442,6 +9442,7 @@ dependencies = [ "tempfile", "tokio", "tracing", + "vise", "vm_utils", "zksync_contracts", "zksync_dal", diff --git a/core/node/vm_runner/Cargo.toml b/core/node/vm_runner/Cargo.toml index b3ede5a796be..5571bb7f3fde 100644 --- a/core/node/vm_runner/Cargo.toml +++ b/core/node/vm_runner/Cargo.toml @@ -26,6 +26,7 @@ async-trait.workspace = true once_cell.workspace = true tracing.workspace = true dashmap.workspace = true +vise.workspace = true [dev-dependencies] zksync_node_test_utils.workspace = true diff --git a/core/node/vm_runner/src/lib.rs b/core/node/vm_runner/src/lib.rs index ca9f8bdc0eb4..50cf2a4433c1 100644 --- a/core/node/vm_runner/src/lib.rs +++ b/core/node/vm_runner/src/lib.rs @@ -9,6 +9,7 @@ mod output_handler; mod process; mod storage; +mod metrics; #[cfg(test)] mod tests; diff --git a/core/node/vm_runner/src/metrics.rs b/core/node/vm_runner/src/metrics.rs new file mode 100644 index 000000000000..4252ad5f0d4f --- /dev/null +++ b/core/node/vm_runner/src/metrics.rs @@ -0,0 +1,28 @@ +//! Metrics for `VmRunner`. + +use std::time::Duration; + +use vise::{Buckets, Gauge, Histogram, Metrics}; + +#[derive(Debug, Metrics)] +#[metrics(prefix = "vm_runner")] +pub(super) struct VmRunnerMetrics { + /// Last batch that has been marked as processed. + pub last_processed_batch: Gauge, + /// Last batch that is ready to be processed. + pub last_ready_batch: Gauge, + /// Current amount of batches that are being processed. + pub in_progress_l1_batches: Gauge, + /// Total latency of loading an L1 batch (RocksDB mode only). + #[metrics(buckets = Buckets::LATENCIES)] + pub storage_load_time: Histogram, + /// Total latency of running VM on an L1 batch. + #[metrics(buckets = Buckets::LATENCIES)] + pub run_vm_time: Histogram, + /// Total latency of handling output of an L1 batch. + #[metrics(buckets = Buckets::LATENCIES)] + pub output_handle_time: Histogram, +} + +#[vise::register] +pub(super) static METRICS: vise::Global = vise::Global::new(); diff --git a/core/node/vm_runner/src/output_handler.rs b/core/node/vm_runner/src/output_handler.rs index 49bed83cd96e..4052c245a44f 100644 --- a/core/node/vm_runner/src/output_handler.rs +++ b/core/node/vm_runner/src/output_handler.rs @@ -16,7 +16,7 @@ use zksync_dal::{ConnectionPool, Core}; use zksync_state_keeper::{StateKeeperOutputHandler, UpdatesManager}; use zksync_types::L1BatchNumber; -use crate::VmRunnerIo; +use crate::{metrics::METRICS, VmRunnerIo}; type BatchReceiver = oneshot::Receiver>>; @@ -173,7 +173,10 @@ impl StateKeeperOutputHandler for AsyncOutputHandler { } => { sender .send(tokio::task::spawn(async move { - handler.handle_l1_batch(updates_manager).await + let latency = METRICS.output_handle_time.start(); + let result = handler.handle_l1_batch(updates_manager).await; + latency.observe(); + result })) .ok(); Ok(()) @@ -248,6 +251,9 @@ impl ConcurrentOutputHandlerFactoryTask { self.io .mark_l1_batch_as_completed(&mut conn, latest_processed_batch) .await?; + METRICS + .last_processed_batch + .set(latest_processed_batch.0.into()); } } } diff --git a/core/node/vm_runner/src/process.rs b/core/node/vm_runner/src/process.rs index 2d992fdd31dc..945d35477ce6 100644 --- a/core/node/vm_runner/src/process.rs +++ b/core/node/vm_runner/src/process.rs @@ -10,7 +10,7 @@ use zksync_state_keeper::{ }; use zksync_types::{block::L2BlockExecutionData, L1BatchNumber}; -use crate::{storage::StorageLoader, OutputHandlerFactory, VmRunnerIo}; +use crate::{metrics::METRICS, storage::StorageLoader, OutputHandlerFactory, VmRunnerIo}; /// VM runner represents a logic layer of L1 batch / L2 block processing flow akin to that of state /// keeper. The difference is that VM runner is designed to be run on batches/blocks that have @@ -61,6 +61,7 @@ impl VmRunner { mut updates_manager: UpdatesManager, mut output_handler: Box, ) -> anyhow::Result<()> { + let latency = METRICS.run_vm_time.start(); for (i, l2_block) in l2_blocks.into_iter().enumerate() { if i > 0 { // First L2 block in every batch is already preloaded @@ -114,6 +115,7 @@ impl VmRunner { .await .context("failed finishing L1 batch in executor")?; updates_manager.finish_batch(finished_batch); + latency.observe(); output_handler .handle_l1_batch(Arc::new(updates_manager)) .await @@ -153,11 +155,15 @@ impl VmRunner { } } task_handles = retained_handles; + METRICS + .in_progress_l1_batches + .set(task_handles.len() as u64); let last_ready_batch = self .io .last_ready_to_be_loaded_batch(&mut self.pool.connection().await?) .await?; + METRICS.last_ready_batch.set(last_ready_batch.0.into()); if next_batch > last_ready_batch { // Next batch is not ready to be processed yet tokio::time::sleep(SLEEP_INTERVAL).await; diff --git a/core/node/vm_runner/src/storage.rs b/core/node/vm_runner/src/storage.rs index e7a8b147c76f..7a53f6034a75 100644 --- a/core/node/vm_runner/src/storage.rs +++ b/core/node/vm_runner/src/storage.rs @@ -19,7 +19,7 @@ use zksync_state::{ use zksync_storage::RocksDB; use zksync_types::{block::L2BlockExecutionData, L1BatchNumber, L2ChainId}; -use crate::VmRunnerIo; +use crate::{metrics::METRICS, VmRunnerIo}; #[async_trait] pub trait StorageLoader: ReadStorageFactory { @@ -338,6 +338,7 @@ impl StorageSyncTask { drop(state); let max_desired = self.io.last_ready_to_be_loaded_batch(&mut conn).await?; for l1_batch_number in max_present.0 + 1..=max_desired.0 { + let latency = METRICS.storage_load_time.start(); let l1_batch_number = L1BatchNumber(l1_batch_number); let Some(execute_data) = Self::load_batch_execute_data( &mut conn, @@ -374,6 +375,7 @@ impl StorageSyncTask { .storage .insert(l1_batch_number, BatchData { execute_data, diff }); drop(state); + latency.observe(); } drop(conn); }