feat: Add metrics to measure compiled-contract cache hits/misses for chunk production vs. chunk validation. #11178

Merged · 24 commits · May 6, 2024

Changes from 12 commits

Commits
1f8cff8  Initial commit. (tayfunelmas, Apr 26, 2024)
11e66e1  Connect pieces second commit. (tayfunelmas, Apr 27, 2024)
deccb30  Move reason to primitives and not use string. (tayfunelmas, Apr 27, 2024)
162e583  Merge branch 'master' into compile-metric (tayfunelmas, Apr 29, 2024)
005cde2  Change string to ApplyChunkReason in ApplyState (tayfunelmas, Apr 29, 2024)
eec1b28  Address clippy findings. (tayfunelmas, Apr 29, 2024)
7436146  Merge branch 'master' into compile-metric (tayfunelmas, Apr 29, 2024)
c3e8ccd  Merge branch 'master' into compile-metric (tayfunelmas, Apr 30, 2024)
4347216  Remove apply_reason from VContext and use exported Metrics structure … (tayfunelmas, Apr 30, 2024)
9857410  fix clippy errors and format (tayfunelmas, Apr 30, 2024)
15d66a9  Add one more cache hit record. (tayfunelmas, Apr 30, 2024)
b4dfbf1  Merge hit and miss functions and also move metrics.record to record o… (tayfunelmas, Apr 30, 2024)
7beb96d  Address comments v1 (tayfunelmas, May 1, 2024)
d975a96  Make reset and report operations static (tayfunelmas, May 1, 2024)
292b613  Set the metric correctly to consider memory and disk cache (tayfunelmas, May 1, 2024)
35f9ec9  Merge branch 'master' into compile-metric (tayfunelmas, May 1, 2024)
565dd0c  Remove extra call to record_compiled_contract_cache_lookup (tayfunelmas, May 1, 2024)
a68c34e  Merge branch 'compile-metric' of https://github.com/tayfunelmas/nearc… (tayfunelmas, May 1, 2024)
e720ebd  Merge branch 'master' into compile-metric (tayfunelmas, May 3, 2024)
60617ad  Merge branch 'master' into compile-metric (tayfunelmas, May 6, 2024)
17f0a9b  Merge branch 'master' into compile-metric (tayfunelmas, May 6, 2024)
21886bf  Merge branch 'master' into compile-metric (tayfunelmas, May 6, 2024)
7d60706  Add instrumentation to enable record_compiled_contract_cache_lookup f… (tayfunelmas, May 6, 2024)
b129a73  Also add x86_64 annotation to skip MacOS tests (the codepath is not e… (tayfunelmas, May 6, 2024)
3 changes: 3 additions & 0 deletions chain/chain/src/chain_update.rs
@@ -14,6 +14,7 @@ use near_chain_primitives::error::Error;
use near_epoch_manager::shard_tracker::ShardTracker;
use near_epoch_manager::types::BlockHeaderInfo;
use near_epoch_manager::EpochManagerAdapter;
use near_primitives::apply::ApplyChunkReason;
use near_primitives::block::{Block, Tip};
use near_primitives::block_header::BlockHeader;
#[cfg(feature = "new_epoch_sync")]
@@ -761,6 +762,7 @@ impl<'a> ChainUpdate<'a> {

let apply_result = self.runtime_adapter.apply_chunk(
RuntimeStorageConfig::new(chunk_header.prev_state_root(), true),
ApplyChunkReason::UpdateTrackedShard,
ApplyChunkShardContext {
shard_id,
gas_limit,
@@ -869,6 +871,7 @@ impl<'a> ChainUpdate<'a> {

let apply_result = self.runtime_adapter.apply_chunk(
RuntimeStorageConfig::new(*chunk_extra.state_root(), true),
ApplyChunkReason::UpdateTrackedShard,
ApplyChunkShardContext {
shard_id,
last_validator_proposals: chunk_extra.validator_proposals(),
5 changes: 5 additions & 0 deletions chain/chain/src/runtime/mod.rs
@@ -15,6 +15,7 @@ use near_epoch_manager::{EpochManagerAdapter, EpochManagerHandle};
use near_parameters::{ActionCosts, ExtCosts, RuntimeConfigStore};
use near_pool::types::TransactionGroupIterator;
use near_primitives::account::{AccessKey, Account};
use near_primitives::apply::ApplyChunkReason;
use near_primitives::checked_feature;
use near_primitives::errors::{InvalidTxError, RuntimeError, StorageError};
use near_primitives::hash::{hash, CryptoHash};
@@ -246,6 +247,7 @@ impl NightshadeRuntime {
fn process_state_update(
&self,
trie: Trie,
apply_reason: ApplyChunkReason,
chunk: ApplyChunkShardContext,
block: ApplyChunkBlockContext,
receipts: &[Receipt],
@@ -361,6 +363,7 @@ impl NightshadeRuntime {
);

let apply_state = ApplyState {
apply_reason: Some(apply_reason),
block_height,
prev_block_hash: *prev_block_hash,
block_hash,
@@ -868,6 +871,7 @@ impl RuntimeAdapter for NightshadeRuntime {
fn apply_chunk(
&self,
storage_config: RuntimeStorageConfig,
apply_reason: ApplyChunkReason,
chunk: ApplyChunkShardContext,
block: ApplyChunkBlockContext,
receipts: &[Receipt],
@@ -907,6 +911,7 @@

match self.process_state_update(
trie,
apply_reason,
chunk,
block,
receipts,
2 changes: 2 additions & 0 deletions chain/chain/src/runtime/tests.rs
@@ -9,6 +9,7 @@ use near_epoch_manager::{EpochManager, RngSeed};
use near_pool::{
InsertTransactionResult, PoolIteratorWrapper, TransactionGroupIteratorWrapper, TransactionPool,
};
use near_primitives::apply::ApplyChunkReason;
use near_primitives::checked_feature;
use near_primitives::test_utils::create_test_signer;
use near_primitives::types::validator_stake::{ValidatorStake, ValidatorStakeIter};
@@ -83,6 +84,7 @@ impl NightshadeRuntime {
let mut result = self
.apply_chunk(
RuntimeStorageConfig::new(*state_root, true),
ApplyChunkReason::UpdateTrackedShard,
ApplyChunkShardContext {
shard_id,
last_validator_proposals,
2 changes: 2 additions & 0 deletions chain/chain/src/test_utils/kv_runtime.rs
@@ -14,6 +14,7 @@ use near_epoch_manager::types::BlockHeaderInfo;
use near_epoch_manager::{EpochManagerAdapter, RngSeed};
use near_pool::types::TransactionGroupIterator;
use near_primitives::account::{AccessKey, Account};
use near_primitives::apply::ApplyChunkReason;
use near_primitives::block::Tip;
use near_primitives::block_header::{Approval, ApprovalInner};
use near_primitives::congestion_info::CongestionInfo;
@@ -1110,6 +1111,7 @@ impl RuntimeAdapter for KeyValueRuntime {
fn apply_chunk(
&self,
storage_config: RuntimeStorageConfig,
_apply_reason: ApplyChunkReason,
chunk: ApplyChunkShardContext,
block: ApplyChunkBlockContext,
receipts: &[Receipt],
2 changes: 2 additions & 0 deletions chain/chain/src/types.rs
@@ -7,6 +7,7 @@ use near_chain_configs::ReshardingConfig;
use near_chain_primitives::Error;
pub use near_epoch_manager::EpochManagerAdapter;
use near_pool::types::TransactionGroupIterator;
use near_primitives::apply::ApplyChunkReason;
pub use near_primitives::block::{Block, BlockHeader, Tip};
use near_primitives::challenge::{ChallengesResult, PartialState};
use near_primitives::checked_feature;
@@ -450,6 +451,7 @@ pub trait RuntimeAdapter: Send + Sync {
fn apply_chunk(
&self,
storage: RuntimeStorageConfig,
apply_reason: ApplyChunkReason,
chunk: ApplyChunkShardContext,
block: ApplyChunkBlockContext,
receipts: &[Receipt],
7 changes: 7 additions & 0 deletions chain/chain/src/update_shard.rs
@@ -6,6 +6,7 @@ use crate::types::{
use near_async::time::Clock;
use near_chain_primitives::Error;
use near_epoch_manager::EpochManagerAdapter;
use near_primitives::apply::ApplyChunkReason;
use near_primitives::hash::CryptoHash;
use near_primitives::receipt::Receipt;
use near_primitives::sandbox::state_patch::SandboxStatePatch;
@@ -128,13 +129,15 @@ pub(crate) fn process_shard_update(
) -> Result<ShardUpdateResult, Error> {
Ok(match shard_update_reason {
ShardUpdateReason::NewChunk(data) => ShardUpdateResult::NewChunk(apply_new_chunk(
ApplyChunkReason::UpdateTrackedShard,
parent_span,
data,
shard_context,
runtime,
epoch_manager,
)?),
ShardUpdateReason::OldChunk(data) => ShardUpdateResult::OldChunk(apply_old_chunk(
ApplyChunkReason::UpdateTrackedShard,
parent_span,
data,
shard_context,
@@ -154,6 +157,7 @@ pub(crate) fn process_shard_update(
/// Applies new chunk, which includes applying transactions from chunk and
/// receipts filtered from outgoing receipts from previous chunks.
pub fn apply_new_chunk(
apply_reason: ApplyChunkReason,
parent_span: &tracing::Span,
data: NewChunkData,
shard_context: ShardContext,
@@ -187,6 +191,7 @@ pub fn apply_new_chunk(
};
match runtime.apply_chunk(
storage_config,
apply_reason,
ApplyChunkShardContext {
shard_id,
last_validator_proposals: chunk_header.prev_validator_proposals(),
@@ -225,6 +230,7 @@ pub fn apply_new_chunk(
/// (logunov) From what I know, the state update may include only validator
/// accounts update on epoch start.
pub fn apply_old_chunk(
apply_reason: ApplyChunkReason,
parent_span: &tracing::Span,
data: OldChunkData,
shard_context: ShardContext,
@@ -248,6 +254,7 @@ pub fn apply_old_chunk(
};
match runtime.apply_chunk(
storage_config,
apply_reason,
ApplyChunkShardContext {
shard_id,
last_validator_proposals: prev_chunk_extra.validator_proposals(),
@@ -22,6 +22,7 @@ use near_chain_primitives::Error;
use near_epoch_manager::EpochManagerAdapter;
use near_network::types::{NetworkRequests, PeerManagerMessageRequest};
use near_pool::TransactionGroupIteratorWrapper;
use near_primitives::apply::ApplyChunkReason;
use near_primitives::hash::{hash, CryptoHash};
use near_primitives::merkle::merklize;
use near_primitives::receipt::Receipt;
@@ -482,6 +483,7 @@ pub(crate) fn validate_chunk_state_witness(
MainTransition::NewChunk(new_chunk_data) => {
let chunk_header = new_chunk_data.chunk_header.clone();
let NewChunkResult { apply_result: mut main_apply_result, .. } = apply_new_chunk(
ApplyChunkReason::ValidateChunkStateWitness,
&span,
new_chunk_data,
ShardContext {
@@ -529,6 +531,7 @@
},
};
let OldChunkResult { apply_result, .. } = apply_old_chunk(
ApplyChunkReason::ValidateChunkStateWitness,
&span,
old_chunk_data,
ShardContext {
30 changes: 30 additions & 0 deletions core/primitives-core/src/apply.rs
@@ -0,0 +1,30 @@
use std::fmt::{Display, Formatter};

/// Indicates the phase of block production in which the apply-chunk operation is invoked.
/// This is currently used to analyze and compare the metrics collected while applying the receipts
/// for different purposes, e.g. while updating the state of a tracked shard vs. validating a chunk.
/// TODO: Consider combining ApplyChunkReason, ApplyChunkShardContext, and ApplyChunkBlockContext
/// under a common wrapper struct such as ApplyChunkContext.
#[derive(Clone, Debug)]
pub enum ApplyChunkReason {
/// Apply-chunk is invoked to update the state of a shard being tracked.
UpdateTrackedShard,
/// Apply-chunk is invoked to validate the state witness for a shard in the context of stateless validation.
ValidateChunkStateWitness,
}

impl ApplyChunkReason {
/// Returns a static, short string representation of the reason, to be used for metrics.
pub fn as_str(&self) -> &'static str {
match self {
ApplyChunkReason::UpdateTrackedShard => "update_shard",
ApplyChunkReason::ValidateChunkStateWitness => "validate_chunk",
}
}
}

impl Display for ApplyChunkReason {
fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
f.write_str(self.as_str())
}
}
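A quick usage sketch for the new type (not part of the diff; the import path assumes the `pub mod apply;` addition to core/primitives-core/src/lib.rs shown just below):

```rust
use near_primitives_core::apply::ApplyChunkReason;

fn main() {
    // The short strings are what end up as the `context` label on the new
    // compiled-contract cache hit/miss counters.
    let reason = ApplyChunkReason::ValidateChunkStateWitness;
    assert_eq!(reason.as_str(), "validate_chunk");
    // Display delegates to as_str, so formatting yields the same label.
    assert_eq!(reason.to_string(), "validate_chunk");
}
```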
1 change: 1 addition & 0 deletions core/primitives-core/src/lib.rs
@@ -2,6 +2,7 @@ pub use borsh;
pub use num_rational;

pub mod account;
pub mod apply;
pub mod chains;
pub mod config;
pub mod hash;
1 change: 1 addition & 0 deletions core/primitives/src/lib.rs
@@ -1,4 +1,5 @@
pub use near_primitives_core::account;
pub use near_primitives_core::apply;
pub use near_primitives_core::borsh;
pub use near_primitives_core::config;
pub use near_primitives_core::hash;
1 change: 1 addition & 0 deletions integration-tests/src/user/runtime_user.rs
@@ -155,6 +155,7 @@ impl RuntimeUser {
let shard_id = 0;

ApplyState {
apply_reason: None,
block_height: 1,
prev_block_hash: Default::default(),
block_hash: Default::default(),
49 changes: 47 additions & 2 deletions runtime/near-vm-runner/src/metrics.rs
@@ -1,11 +1,14 @@
use near_o11y::metrics::{try_create_histogram_vec, HistogramVec};
use near_o11y::metrics::{
try_create_histogram_vec, try_create_int_counter_vec, HistogramVec, IntCounterVec,
};
use once_cell::sync::Lazy;
use std::{cell::RefCell, time::Duration};

thread_local! {
static METRICS: RefCell<Metrics> = const { RefCell::new(Metrics {
near_vm_compilation_time: Duration::new(0, 0),
wasmtime_compilation_time: Duration::new(0, 0),
compiled_contract_cache_hit: None,
}) };
}

@@ -19,10 +22,30 @@ pub static COMPILATION_TIME: Lazy<HistogramVec> = Lazy::new(|| {
.unwrap()
});

pub static COMPILED_CONTRACT_CACHE_HIT: Lazy<IntCounterVec> = Lazy::new(|| {
try_create_int_counter_vec(
"near_vm_compiled_contract_cache_hits_total",
"The number of times the runtime finds compiled code in cache for the given caller context and shard_id",
&["context", "shard_id"],
)
.unwrap()
});

pub static COMPILED_CONTRACT_CACHE_MISS: Lazy<IntCounterVec> = Lazy::new(|| {
try_create_int_counter_vec(
"near_vm_compiled_contract_cache_misses_total",
"The number of times the runtime cannot find compiled code in cache for the given caller context and shard_id",
&["context", "shard_id"],
)
.unwrap()
});

#[derive(Default, Copy, Clone)]
pub struct Metrics {
near_vm_compilation_time: Duration,
wasmtime_compilation_time: Duration,
/// True iff the runtime checked the compiled contract cache and found already-compiled code.
compiled_contract_cache_hit: Option<bool>,
}

nagisa (Collaborator) commented on May 1, 2024:

I'd use a usize that counts the hits. It is less error-prone this way (e.g. what happens if the code is adjusted to access the cache multiple times?)

It might also solve your concerns with double counting if we make fn report operate on the value inside the TLS... So we would change the return value of run back to only return a VMResult, and then change fn report to be a static method or something that operates on the TLS value, much like the increment functions do.


tayfunelmas (Contributor, Author) replied:

Yeah, I like that. I changed report to a static function (renamed to report_metrics) and also renamed reset to reset_metrics, and I now call reset_metrics and report_metrics once in the VM runner (outside of VM.run). PTAL.
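For illustration, a rough standalone sketch of the counter-based TLS approach suggested above (this is not the merged code; the names and the println are made up here, and the real implementation reports to Prometheus counters instead):

```rust
use std::cell::RefCell;

thread_local! {
    // Hypothetical per-thread counters instead of an Option<bool> flag: (hits, misses).
    static CACHE_LOOKUPS: RefCell<(u64, u64)> = const { RefCell::new((0, 0)) };
}

// Increment helpers mutate the TLS value directly, much like the existing
// compilation_duration helper in metrics.rs does.
fn record_cache_hit() {
    CACHE_LOOKUPS.with_borrow_mut(|(hits, _)| *hits += 1);
}

fn record_cache_miss() {
    CACHE_LOOKUPS.with_borrow_mut(|(_, misses)| *misses += 1);
}

// A "static" report that drains the TLS counters once per VM invocation,
// so VM::run does not need to return metrics alongside the VMResult.
fn report_metrics(shard_id: &str, caller_context: &str) {
    CACHE_LOOKUPS.with_borrow_mut(|(hits, misses)| {
        // The real code would increment the Prometheus counters labeled by
        // caller_context and shard_id here.
        println!("context={caller_context} shard={shard_id} hits={hits} misses={misses}");
        *hits = 0;
        *misses = 0;
    });
}

fn main() {
    record_cache_miss();
    record_cache_hit();
    report_metrics("0", "validate_chunk");
}
```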


impl Metrics {
@@ -37,7 +60,8 @@ impl Metrics {
METRICS.with_borrow(|m| *m)
}

pub fn report(&mut self, shard_id: &str) {
/// Report the current metrics at the end of a single VM invocation (e.g. to run a function call).
pub fn report(&mut self, shard_id: &str, caller_context: &str) {
if !self.near_vm_compilation_time.is_zero() {
COMPILATION_TIME
.with_label_values(&["near_vm", shard_id])
@@ -50,6 +74,15 @@
.observe(self.wasmtime_compilation_time.as_secs_f64());
self.wasmtime_compilation_time = Duration::default();
}
match self.compiled_contract_cache_hit.take() {
Some(true) => {
COMPILED_CONTRACT_CACHE_HIT.with_label_values(&[caller_context, shard_id]).inc();
}
Some(false) => {
COMPILED_CONTRACT_CACHE_MISS.with_label_values(&[caller_context, shard_id]).inc();
}
None => {}
};
}
}

@@ -63,3 +96,15 @@ pub(crate) fn compilation_duration(kind: near_parameters::vm::VMKind, duration:
VMKind::NearVm => m.near_vm_compilation_time += duration,
});
}

/// Updates metrics to record a compiled-contract cache lookup that finds
/// an entry in the cache (is_hit=true) or not (is_hit=false).
pub(crate) fn record_compiled_contract_cache_lookup(is_hit: bool) {
METRICS.with_borrow_mut(|m| {
debug_assert!(
m.compiled_contract_cache_hit.is_none(),
"Compiled contract cache lookup should be reported once."
);
m.compiled_contract_cache_hit = Some(is_hit);
});
}
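As a reading aid, the hit/miss accounting that report() performs boils down to this small pattern (a standalone sketch, not code from the PR):

```rust
fn main() {
    // Simulate: the runtime found compiled code in the cache during this run.
    let mut cache_hit: Option<bool> = Some(true);

    let (mut hits, mut misses) = (0u64, 0u64);
    // Same shape as Metrics::report(): take() turns at most one recorded
    // lookup into exactly one counter increment and clears the flag.
    match cache_hit.take() {
        Some(true) => hits += 1,
        Some(false) => misses += 1,
        None => {} // no cache lookup happened during this invocation
    }
    assert_eq!((hits, misses), (1, 0));
    assert!(cache_hit.is_none());
}
```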
3 changes: 3 additions & 0 deletions runtime/near-vm-runner/src/near_vm_runner/runner.rs
@@ -239,10 +239,12 @@ impl NearVM {
let key = get_contract_cache_key(code_hash, &self.config);
let cache_record = cache.get(&key).map_err(CacheError::ReadError)?;
let Some(code) = cache_record else {
crate::metrics::record_compiled_contract_cache_lookup(false);
return Err(VMRunnerError::CacheError(CacheError::ReadError(
std::io::Error::from(std::io::ErrorKind::NotFound),
)));
};
crate::metrics::record_compiled_contract_cache_lookup(true);

match &code.compiled {
CompiledContract::CompileModuleError(err) => {
@@ -295,6 +297,7 @@ impl NearVM {
move |value| {
let _span =
tracing::debug_span!(target: "vm", "NearVM::load_from_mem_cache").entered();
crate::metrics::record_compiled_contract_cache_lookup(true);
let &(wasm_bytes, ref downcast) = value
.downcast_ref::<MemoryCacheType>()
.expect("downcast should always succeed");
@@ -147,6 +147,7 @@ impl<'c> EstimatorContext<'c> {

let shard_id = ShardUId::single_shard().shard_id();
ApplyState {
apply_reason: None,
// Put each runtime into a separate shard.
block_height: 1,
// Epoch length is long enough to avoid corner cases.