From d93b4675857e3bbf3148464b76d635297bbc6981 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 18 Nov 2019 15:36:49 -0800 Subject: [PATCH] subscriber: fix spans never being removed from the registry (#435) ## Motivation This branch fixes several bugs in the `Registry` span close logic, and adds new tests for these issues. ## Solutions * Spans not being removed at the end of `on_close` There is currently a bug in the `Registry` that prevents spans from being removed, even when all `on_close` callbacks are complete. The test for this behavior ( `span_is_removed_from_registry`) fails to catch the bug, since there is a bug in the test as well. The test asserts that a span extension was dropped after the span closes. Current, the test uses `subscriber::with_default`, and then made the assertion _outside_ of the `with_default` block. However, this was incorrect, as even if the `Registry` fails to drop the span, the span extension will be dropped when the whole `Registry` itself is dropped, at the end of the `with_default` block. This branch fixes the test, and the bug in the close logic. I've changed the test to use an explicit `Dispatch` to keep the registry alive until the end of the test. This reveals that the span is _not_ being removed as it should be. The reason the span fails to be removed is that the `Drop` implementation for `CloseGuard` drops the span if the value of the `CLOSE_COUNT` cell is 0. However, the `if` statement that tests for this compares the value returned by `Cell::get` with 0. The value returned by `Cell::get` is the _previous_ value, not the current one after dropping this guard; if the guard being dropped is the final guard, then the value returned by `Cell::get` will be 1, rather than 0. I've fixed this logic, and refactored it slightly to hopefully make it easier to understand in the future. Thanks to @jtescher for catching this bug! * Only the first span being removed at the end of `on_close` In addition, I've fixed a bug where the remove after close logic would only work for the _first_ span to close on each thread. This is because dropping a `CloseGuard` does not currently subtract from CLOSE_COUNT when it is the final guard. This means that when the next span is closed, the count will start at 1, rather than 0. I've fixed this by ensuring that the close count is always decremented, and changed the tests to close multiple spans. * Spans removed on the first `try_close` Finally, there is also an issue where the removal logic is run on _every_ call to `try_close`, regardless of whether or not the subscriber actually indicates that the span closes. This means that a span will be removed from the registry even when there are one or more span handles that reference it. This is due to the `start_close` method being called _before_ `Subscriber::try_close` is called. When a close guard is dropped, the span is currently _always_ removed. However, since we call `start_close` at the beginning of the `Layered::try_close` method, we may be decrementing a ref count _without_ closing the span, but the close guard is unaware of this. I've fixed this bug by updating the `CloseGuard` struct to track whether the span is closed. It now has a bool that is set only when the `Subscriber::try_close` call returns true. Only creating the `CloseGuard` if `try_close` returns true may seem like a simpler solution, but that won't work, since the call to `try_close` may call into another `Layered` subscriber. In order to handle situations where many layers are nested, we need to construct the `CloseGuard` for each stack frame before calling into the next one, so it's necessary to set whether the span is closing only after the call to `try_close` returns. Signed-off-by: Eliza Weisman --- tracing-subscriber/src/layer.rs | 11 +- tracing-subscriber/src/registry/sharded.rs | 162 ++++++++++++++++----- 2 files changed, 136 insertions(+), 37 deletions(-) diff --git a/tracing-subscriber/src/layer.rs b/tracing-subscriber/src/layer.rs index 8b189c9ab6..68c7a73db4 100644 --- a/tracing-subscriber/src/layer.rs +++ b/tracing-subscriber/src/layer.rs @@ -497,13 +497,16 @@ where #[cfg(feature = "registry")] let subscriber = &self.inner as &dyn Subscriber; #[cfg(feature = "registry")] - let _guard = subscriber + let mut guard = subscriber .downcast_ref::() .and_then(|registry| Some(registry.start_close(id.clone()))); + if self.inner.try_close(id.clone()) { + // If we have a registry's close guard, indicate that the span is + // closing. + #[cfg(feature = "registry")] + guard.as_mut().map(|g| g.is_closing()); - let id2 = id.clone(); - if self.inner.try_close(id) { - self.layer.on_close(id2, self.ctx()); + self.layer.on_close(id, self.ctx()); true } else { false diff --git a/tracing-subscriber/src/registry/sharded.rs b/tracing-subscriber/src/registry/sharded.rs index e5ae5f51f4..2f011a10d4 100644 --- a/tracing-subscriber/src/registry/sharded.rs +++ b/tracing-subscriber/src/registry/sharded.rs @@ -114,6 +114,7 @@ fn id_to_idx(id: &Id) -> usize { pub(crate) struct CloseGuard<'a> { id: Id, registry: &'a Registry, + is_closing: bool, } impl Registry { @@ -138,6 +139,7 @@ impl Registry { CloseGuard { id, registry: &self, + is_closing: false, } } } @@ -295,6 +297,12 @@ impl Drop for DataInner { } } +impl<'a> CloseGuard<'a> { + pub(crate) fn is_closing(&mut self) { + self.is_closing = true; + } +} + impl<'a> Drop for CloseGuard<'a> { fn drop(&mut self) { // If this returns with an error, we are already panicking. At @@ -302,11 +310,16 @@ impl<'a> Drop for CloseGuard<'a> { // except by avoiding a double-panic. let _ = CLOSE_COUNT.try_with(|count| { let c = count.get(); - if c > 0 { - count.set(c - 1); - } else { + // If the current close count is 1, this stack frame is the last + // `on_close` call. If the span is closing, it's okay to remove the + // span. + if c == 1 && self.is_closing { self.registry.spans.remove(id_to_idx(&self.id)); } + + // Decrement the count to indicate that _this_ guard's + // `on_close` callback has completed. + count.set(c - 1); }); } } @@ -339,9 +352,13 @@ impl<'a> SpanData<'a> for Data<'a> { pub(crate) mod tests { use super::Registry; use crate::{layer::Context, registry::LookupSpan, Layer}; - use std::sync::atomic::{AtomicBool, Ordering}; + use std::sync::{ + atomic::{AtomicBool, Ordering}, + Arc, + }; use tracing::{self, subscriber::with_default}; use tracing_core::{ + dispatcher, span::{Attributes, Id}, Subscriber, }; @@ -352,6 +369,7 @@ pub(crate) mod tests { S: Subscriber + for<'a> LookupSpan<'a>, { fn on_close(&self, id: Id, ctx: Context<'_, S>) { + dbg!(format_args!("closing {:?}", id)); assert!(&ctx.span(&id).is_some()); } } @@ -378,45 +396,123 @@ pub(crate) mod tests { }); } - #[test] - fn span_is_removed_from_registry() { - static IS_REMOVED: AtomicBool = AtomicBool::new(false); - - struct ClosingLayer; - impl Layer for ClosingLayer - where - S: Subscriber + for<'a> LookupSpan<'a>, - { - fn new_span(&self, _: &Attributes<'_>, id: &Id, ctx: Context<'_, S>) { - let span = ctx.span(id).expect("Missing span; this is a bug"); - let mut extensions = span.extensions_mut(); - extensions.insert(ClosingSpan); - } + struct ClosingLayer { + span1_removed: Arc, + span2_removed: Arc, + } - fn on_close(&self, id: Id, ctx: Context<'_, S>) { - assert!(&ctx.span(&id).is_some()); - let span = &ctx.span(&id).unwrap(); - let extensions = span.extensions(); - assert!(extensions.get::().is_some()); - } + impl Layer for ClosingLayer + where + S: Subscriber + for<'a> LookupSpan<'a>, + { + fn new_span(&self, _: &Attributes<'_>, id: &Id, ctx: Context<'_, S>) { + let span = ctx.span(id).expect("Missing span; this is a bug"); + let is_removed = match dbg!(span.name()) { + "span1" => self.span1_removed.clone(), + "span2" => self.span2_removed.clone(), + _ => return, + }; + let mut extensions = span.extensions_mut(); + extensions.insert(ClosingSpan { is_removed }); } - struct ClosingSpan; - impl Drop for ClosingSpan { - fn drop(&mut self) { - IS_REMOVED.store(true, Ordering::Release) - } + fn on_close(&self, id: Id, ctx: Context<'_, S>) { + dbg!(&id); + assert!(&ctx.span(&id).is_some()); + let span = &ctx.span(&id).unwrap(); + match dbg!(span.name()) { + "span1" | "span2" => {} + _ => return, + }; + let extensions = span.extensions(); + assert!(extensions.get::().is_some()); } + } + + struct ClosingSpan { + is_removed: Arc, + } + + impl Drop for ClosingSpan { + fn drop(&mut self) { + self.is_removed.store(true, Ordering::Release) + } + } + + #[test] + fn spans_are_removed_from_registry() { + let span1_removed = Arc::new(AtomicBool::new(false)); + let span2_removed = Arc::new(AtomicBool::new(false)); + + let span1_removed2 = span1_removed.clone(); + let span2_removed2 = span2_removed.clone(); let subscriber = AssertionLayer - .and_then(ClosingLayer) + .and_then(ClosingLayer { + span1_removed, + span2_removed, + }) .with_subscriber(Registry::default()); - with_default(subscriber, || { - let span = tracing::debug_span!("span"); + // Create a `Dispatch` (which is internally reference counted) so that + // the subscriber lives to the end of the test. Otherwise, if we just + // passed the subscriber itself to `with_default`, we could see the span + // be dropped when the subscriber itself is dropped, destroying the + // registry. + let dispatch = dispatcher::Dispatch::new(subscriber); + + dispatcher::with_default(&dispatch, || { + let span = tracing::debug_span!("span1"); + drop(span); + let span = tracing::info_span!("span2"); + drop(span); + }); + + assert!(span1_removed2.load(Ordering::Acquire) == true); + assert!(span2_removed2.load(Ordering::Acquire) == true); + + // Ensure the registry itself outlives the span. + drop(dispatch); + } + + #[test] + fn spans_are_only_closed_when_the_last_ref_drops() { + let span1_removed = Arc::new(AtomicBool::new(false)); + let span2_removed = Arc::new(AtomicBool::new(false)); + + let span1_removed2 = span1_removed.clone(); + let span2_removed2 = span2_removed.clone(); + + let subscriber = AssertionLayer + .and_then(ClosingLayer { + span1_removed, + span2_removed, + }) + .with_subscriber(Registry::default()); + + // Create a `Dispatch` (which is internally reference counted) so that + // the subscriber lives to the end of the test. Otherwise, if we just + // passed the subscriber itself to `with_default`, we could see the span + // be dropped when the subscriber itself is dropped, destroying the + // registry. + let dispatch = dispatcher::Dispatch::new(subscriber); + + let span2 = dispatcher::with_default(&dispatch, || { + let span = tracing::debug_span!("span1"); drop(span); + let span2 = tracing::info_span!("span2"); + let span2_clone = span2.clone(); + drop(span2); + span2_clone }); - assert!(IS_REMOVED.load(Ordering::Acquire) == true); + assert!(span1_removed2.load(Ordering::Acquire) == true); + assert!(span2_removed2.load(Ordering::Acquire) == false); + + drop(span2); + assert!(span2_removed2.load(Ordering::Acquire) == true); + + // Ensure the registry itself outlives the span. + drop(dispatch); } }