From dd154f388c23ff67068a1053fec878e80ba9bd17 Mon Sep 17 00:00:00 2001
From: Daniyar Itegulov <di@matterlabs.dev>
Date: Wed, 12 Jun 2024 02:15:35 +1000
Subject: [PATCH] feat(vm-runner): add basic metrics (#2203)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## What ❔

This PR just adds some basic VM runner-specific metrics. Note that VM
runner already inherits VM-related metrics from state keeper.

I decided to split `storage_load`/`vm_run`/`output_handle` into 3
different metrics as opposed to a single metric with 3 stages as the
stages happen asynchronously and don't represent a continuous execution.
Lmk if you disagree though.

## Why ❔

<!-- Why are these changes done? What goal do they contribute to? What
are the principles behind them? -->
<!-- Example: PR templates ensure PR reviewers, observers, and future
iterators are in context about the evolution of repos. -->

## Checklist

<!-- Check your PR fulfills the following items. -->
<!-- For draft PRs check the boxes as you complete them. -->

- [x] PR title corresponds to the body of PR (we generate changelog
entries from PRs).
- [ ] Tests for the changes have been added / updated.
- [x] Documentation comments have been added / updated.
- [x] Code has been formatted via `zk fmt` and `zk lint`.
- [x] Spellcheck has been run via `zk spellcheck`.
---
 Cargo.lock                                |  1 +
 core/node/vm_runner/Cargo.toml            |  1 +
 core/node/vm_runner/src/lib.rs            |  1 +
 core/node/vm_runner/src/metrics.rs        | 28 +++++++++++++++++++++++
 core/node/vm_runner/src/output_handler.rs | 10 ++++++--
 core/node/vm_runner/src/process.rs        |  8 ++++++-
 core/node/vm_runner/src/storage.rs        |  4 +++-
 7 files changed, 49 insertions(+), 4 deletions(-)
 create mode 100644 core/node/vm_runner/src/metrics.rs
diff --git a/Cargo.lock b/Cargo.lock
index 6feb9b0e472f..00638f6973a9 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -9442,6 +9442,7 @@ dependencies = [
  "tempfile",
  "tokio",
  "tracing",
+ "vise",
  "vm_utils",
  "zksync_contracts",
  "zksync_dal",
diff --git a/core/node/vm_runner/Cargo.toml b/core/node/vm_runner/Cargo.toml
index b3ede5a796be..5571bb7f3fde 100644
--- a/core/node/vm_runner/Cargo.toml
+++ b/core/node/vm_runner/Cargo.toml
@@ -26,6 +26,7 @@ async-trait.workspace = true
 once_cell.workspace = true
 tracing.workspace = true
 dashmap.workspace = true
+vise.workspace = true
 
 [dev-dependencies]
 zksync_node_test_utils.workspace = true
diff --git a/core/node/vm_runner/src/lib.rs b/core/node/vm_runner/src/lib.rs
index ca9f8bdc0eb4..50cf2a4433c1 100644
--- a/core/node/vm_runner/src/lib.rs
+++ b/core/node/vm_runner/src/lib.rs
@@ -9,6 +9,7 @@ mod output_handler;
 mod process;
 mod storage;
 
+mod metrics;
 #[cfg(test)]
 mod tests;
 
diff --git a/core/node/vm_runner/src/metrics.rs b/core/node/vm_runner/src/metrics.rs
new file mode 100644
index 000000000000..4252ad5f0d4f
--- /dev/null
+++ b/core/node/vm_runner/src/metrics.rs
@@ -0,0 +1,28 @@
+//! Metrics for `VmRunner`.
+
+use std::time::Duration;
+
+use vise::{Buckets, Gauge, Histogram, Metrics};
+
+#[derive(Debug, Metrics)]
+#[metrics(prefix = "vm_runner")]
+pub(super) struct VmRunnerMetrics {
+    /// Last batch that has been marked as processed.
+    pub last_processed_batch: Gauge<u64>,
+    /// Last batch that is ready to be processed.
+    pub last_ready_batch: Gauge<u64>,
+    /// Current amount of batches that are being processed.
+    pub in_progress_l1_batches: Gauge<u64>,
+    /// Total latency of loading an L1 batch (RocksDB mode only).
+    #[metrics(buckets = Buckets::LATENCIES)]
+    pub storage_load_time: Histogram<Duration>,
+    /// Total latency of running VM on an L1 batch.
+    #[metrics(buckets = Buckets::LATENCIES)]
+    pub run_vm_time: Histogram<Duration>,
+    /// Total latency of handling output of an L1 batch.
+    #[metrics(buckets = Buckets::LATENCIES)]
+    pub output_handle_time: Histogram<Duration>,
+}
+
+#[vise::register]
+pub(super) static METRICS: vise::Global<VmRunnerMetrics> = vise::Global::new();
diff --git a/core/node/vm_runner/src/output_handler.rs b/core/node/vm_runner/src/output_handler.rs
index 49bed83cd96e..4052c245a44f 100644
--- a/core/node/vm_runner/src/output_handler.rs
+++ b/core/node/vm_runner/src/output_handler.rs
@@ -16,7 +16,7 @@ use zksync_dal::{ConnectionPool, Core};
 use zksync_state_keeper::{StateKeeperOutputHandler, UpdatesManager};
 use zksync_types::L1BatchNumber;
 
-use crate::VmRunnerIo;
+use crate::{metrics::METRICS, VmRunnerIo};
 
 type BatchReceiver = oneshot::Receiver<JoinHandle<anyhow::Result<()>>>;
 
@@ -173,7 +173,10 @@ impl StateKeeperOutputHandler for AsyncOutputHandler {
             } => {
                 sender
                     .send(tokio::task::spawn(async move {
-                        handler.handle_l1_batch(updates_manager).await
+                        let latency = METRICS.output_handle_time.start();
+                        let result = handler.handle_l1_batch(updates_manager).await;
+                        latency.observe();
+                        result
                     }))
                     .ok();
                 Ok(())
@@ -248,6 +251,9 @@ impl<Io: VmRunnerIo> ConcurrentOutputHandlerFactoryTask<Io> {
                     self.io
                         .mark_l1_batch_as_completed(&mut conn, latest_processed_batch)
                         .await?;
+                    METRICS
+                        .last_processed_batch
+                        .set(latest_processed_batch.0.into());
                 }
             }
         }
diff --git a/core/node/vm_runner/src/process.rs b/core/node/vm_runner/src/process.rs
index 2d992fdd31dc..945d35477ce6 100644
--- a/core/node/vm_runner/src/process.rs
+++ b/core/node/vm_runner/src/process.rs
@@ -10,7 +10,7 @@ use zksync_state_keeper::{
 };
 use zksync_types::{block::L2BlockExecutionData, L1BatchNumber};
 
-use crate::{storage::StorageLoader, OutputHandlerFactory, VmRunnerIo};
+use crate::{metrics::METRICS, storage::StorageLoader, OutputHandlerFactory, VmRunnerIo};
 
 /// VM runner represents a logic layer of L1 batch / L2 block processing flow akin to that of state
 /// keeper. The difference is that VM runner is designed to be run on batches/blocks that have
@@ -61,6 +61,7 @@ impl VmRunner {
         mut updates_manager: UpdatesManager,
         mut output_handler: Box<dyn StateKeeperOutputHandler>,
     ) -> anyhow::Result<()> {
+        let latency = METRICS.run_vm_time.start();
         for (i, l2_block) in l2_blocks.into_iter().enumerate() {
             if i > 0 {
                 // First L2 block in every batch is already preloaded
@@ -114,6 +115,7 @@ impl VmRunner {
             .await
             .context("failed finishing L1 batch in executor")?;
         updates_manager.finish_batch(finished_batch);
+        latency.observe();
         output_handler
             .handle_l1_batch(Arc::new(updates_manager))
             .await
@@ -153,11 +155,15 @@ impl VmRunner {
                 }
             }
             task_handles = retained_handles;
+            METRICS
+                .in_progress_l1_batches
+                .set(task_handles.len() as u64);
 
             let last_ready_batch = self
                 .io
                 .last_ready_to_be_loaded_batch(&mut self.pool.connection().await?)
                 .await?;
+            METRICS.last_ready_batch.set(last_ready_batch.0.into());
             if next_batch > last_ready_batch {
                 // Next batch is not ready to be processed yet
                 tokio::time::sleep(SLEEP_INTERVAL).await;
diff --git a/core/node/vm_runner/src/storage.rs b/core/node/vm_runner/src/storage.rs
index e7a8b147c76f..7a53f6034a75 100644
--- a/core/node/vm_runner/src/storage.rs
+++ b/core/node/vm_runner/src/storage.rs
@@ -19,7 +19,7 @@ use zksync_state::{
 use zksync_storage::RocksDB;
 use zksync_types::{block::L2BlockExecutionData, L1BatchNumber, L2ChainId};
 
-use crate::VmRunnerIo;
+use crate::{metrics::METRICS, VmRunnerIo};
 
 #[async_trait]
 pub trait StorageLoader: ReadStorageFactory {
@@ -338,6 +338,7 @@ impl<Io: VmRunnerIo> StorageSyncTask<Io> {
             drop(state);
             let max_desired = self.io.last_ready_to_be_loaded_batch(&mut conn).await?;
             for l1_batch_number in max_present.0 + 1..=max_desired.0 {
+                let latency = METRICS.storage_load_time.start();
                 let l1_batch_number = L1BatchNumber(l1_batch_number);
                 let Some(execute_data) = Self::load_batch_execute_data(
                     &mut conn,
@@ -374,6 +375,7 @@ impl<Io: VmRunnerIo> StorageSyncTask<Io> {
                     .storage
                     .insert(l1_batch_number, BatchData { execute_data, diff });
                 drop(state);
+                latency.observe();
             }
             drop(conn);
         }