Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix av-distribution Jaeger spans mem leak #5321

Merged
merged 3 commits into from
Aug 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions polkadot/node/network/availability-distribution/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ use polkadot_node_subsystem::{
jaeger, messages::AvailabilityDistributionMessage, overseer, FromOrchestra, OverseerSignal,
SpawnedSubsystem, SubsystemError,
};
use polkadot_primitives::Hash;
use polkadot_primitives::{BlockNumber, Hash};
use std::collections::HashMap;

/// Error and [`Result`] type for this subsystem.
Expand Down Expand Up @@ -104,7 +104,7 @@ impl AvailabilityDistributionSubsystem {
/// Start processing work as passed on from the Overseer.
async fn run<Context>(self, mut ctx: Context) -> std::result::Result<(), FatalError> {
let Self { mut runtime, recvs, metrics, req_protocol_names } = self;
let mut spans: HashMap<Hash, jaeger::PerLeafSpan> = HashMap::new();
let mut spans: HashMap<Hash, (BlockNumber, jaeger::PerLeafSpan)> = HashMap::new();

let IncomingRequestReceivers {
pov_req_receiver,
Expand Down Expand Up @@ -162,7 +162,7 @@ impl AvailabilityDistributionSubsystem {
};
let span =
jaeger::PerLeafSpan::new(cloned_leaf.span, "availability-distribution");
spans.insert(cloned_leaf.hash, span);
spans.insert(cloned_leaf.hash, (cloned_leaf.number, span));
log_error(
requester
.get_mut()
Expand All @@ -172,8 +172,8 @@ impl AvailabilityDistributionSubsystem {
&mut warn_freq,
)?;
},
FromOrchestra::Signal(OverseerSignal::BlockFinalized(hash, _)) => {
spans.remove(&hash);
FromOrchestra::Signal(OverseerSignal::BlockFinalized(_hash, finalized_number)) => {
spans.retain(|_hash, (block_number, _span)| *block_number > finalized_number);
},
FromOrchestra::Signal(OverseerSignal::Conclude) => return Ok(()),
FromOrchestra::Communication {
Expand All @@ -189,7 +189,7 @@ impl AvailabilityDistributionSubsystem {
} => {
let span = spans
.get(&relay_parent)
.map(|span| span.child("fetch-pov"))
.map(|(_, span)| span.child("fetch-pov"))
.unwrap_or_else(|| jaeger::Span::new(&relay_parent, "fetch-pov"))
.with_trace_id(candidate_hash)
.with_candidate(candidate_hash)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,9 @@ use polkadot_node_subsystem_util::{
availability_chunks::availability_chunk_index,
runtime::{get_occupied_cores, RuntimeInfo},
};
use polkadot_primitives::{CandidateHash, CoreIndex, Hash, OccupiedCore, SessionIndex};
use polkadot_primitives::{
BlockNumber, CandidateHash, CoreIndex, Hash, OccupiedCore, SessionIndex,
};

use super::{FatalError, Metrics, Result, LOG_TARGET};

Expand Down Expand Up @@ -112,14 +114,14 @@ impl Requester {
ctx: &mut Context,
runtime: &mut RuntimeInfo,
update: ActiveLeavesUpdate,
spans: &HashMap<Hash, jaeger::PerLeafSpan>,
spans: &HashMap<Hash, (BlockNumber, jaeger::PerLeafSpan)>,
) -> Result<()> {
gum::trace!(target: LOG_TARGET, ?update, "Update fetching heads");
let ActiveLeavesUpdate { activated, deactivated } = update;
if let Some(leaf) = activated {
let span = spans
.get(&leaf.hash)
.map(|span| span.child("update-fetching-heads"))
.map(|(_, span)| span.child("update-fetching-heads"))
.unwrap_or_else(|| jaeger::Span::new(&leaf.hash, "update-fetching-heads"))
.with_string_tag("leaf", format!("{:?}", leaf.hash))
.with_stage(jaeger::Stage::AvailabilityDistribution);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ fn check_ancestry_lookup_in_same_session() {

test_harness(test_state.clone(), |mut ctx| async move {
let chain = &test_state.relay_chain;
let spans: HashMap<Hash, jaeger::PerLeafSpan> = HashMap::new();
let spans: HashMap<Hash, (u32, jaeger::PerLeafSpan)> = HashMap::new();
let block_number = 1;
let update = ActiveLeavesUpdate {
activated: Some(new_leaf(chain[block_number], block_number as u32)),
Expand Down Expand Up @@ -281,7 +281,7 @@ fn check_ancestry_lookup_in_different_sessions() {

test_harness(test_state.clone(), |mut ctx| async move {
let chain = &test_state.relay_chain;
let spans: HashMap<Hash, jaeger::PerLeafSpan> = HashMap::new();
let spans: HashMap<Hash, (u32, jaeger::PerLeafSpan)> = HashMap::new();
let block_number = 3;
let update = ActiveLeavesUpdate {
activated: Some(new_leaf(chain[block_number], block_number as u32)),
Expand Down
11 changes: 11 additions & 0 deletions prdoc/pr_5321.prdoc
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
title: fix availability-distribution Jaeger spans memory leak

doc:
- audience: Node Dev
description: |
Fixes a memory leak which caused the Jaeger span storage in availability-distribution to never be pruned and therefore increasing indefinitely.
This was caused by improper handling of finalized heads. More info in https://github.com/paritytech/polkadot-sdk/issues/5258

crates:
- name: polkadot-availability-distribution
bump: patch
Loading