-
Notifications
You must be signed in to change notification settings - Fork 78
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Tail sampling #263
Tail sampling #263
Changes from 14 commits
cc7211d
fa714d2
6cf0ba1
f01cdfd
486d35e
a251545
8be336b
f5c4de6
5d1491b
3ef6d5e
9bf994e
4996d74
789124b
b39de8a
80e8f4e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -67,9 +67,10 @@ | |
from .exporters.fallback import FallbackSpanExporter | ||
from .exporters.file import FileSpanExporter | ||
from .exporters.otlp import OTLPExporterHttpSession, RetryFewerSpansSpanExporter | ||
from .exporters.processor_wrapper import SpanProcessorWrapper | ||
from .exporters.processor_wrapper import MainSpanProcessorWrapper | ||
from .exporters.quiet_metrics import QuietMetricExporter | ||
from .exporters.remove_pending import RemovePendingSpansExporter | ||
from .exporters.tail_sampling import TailSamplingOptions, TailSamplingProcessor | ||
from .integrations.executors import instrument_executors | ||
from .metrics import ProxyMeterProvider, configure_metrics | ||
from .scrubbing import Scrubber, ScrubCallback | ||
|
@@ -159,6 +160,7 @@ def configure( | |
scrubbing_patterns: Sequence[str] | None = None, | ||
scrubbing_callback: ScrubCallback | None = None, | ||
inspect_arguments: bool | None = None, | ||
tail_sampling: TailSamplingOptions | None = None, | ||
) -> None: | ||
"""Configure the logfire SDK. | ||
|
||
|
@@ -216,6 +218,7 @@ def configure( | |
[f-string magic](https://docs.pydantic.dev/logfire/guides/onboarding_checklist/add_manual_tracing/#f-strings). | ||
If `None` uses the `LOGFIRE_INSPECT_ARGUMENTS` environment variable. | ||
Defaults to `True` if and only if the Python version is at least 3.11. | ||
tail_sampling: Tail sampling options. Not ready for general use. | ||
""" | ||
if processors is not None: # pragma: no cover | ||
raise ValueError( | ||
|
@@ -253,6 +256,7 @@ def configure( | |
scrubbing_patterns=scrubbing_patterns, | ||
scrubbing_callback=scrubbing_callback, | ||
inspect_arguments=inspect_arguments, | ||
tail_sampling=tail_sampling, | ||
) | ||
|
||
|
||
|
@@ -332,6 +336,12 @@ class _LogfireConfigData: | |
scrubbing_callback: ScrubCallback | None | ||
"""A function that is called for each match found by the scrubber.""" | ||
|
||
inspect_arguments: bool | ||
"""Whether to enable f-string magic""" | ||
|
||
tail_sampling: TailSamplingOptions | None | ||
"""Tail sampling options""" | ||
|
||
def _load_configuration( | ||
self, | ||
# note that there are no defaults here so that the only place | ||
|
@@ -360,6 +370,7 @@ def _load_configuration( | |
scrubbing_patterns: Sequence[str] | None, | ||
scrubbing_callback: ScrubCallback | None, | ||
inspect_arguments: bool | None, | ||
tail_sampling: TailSamplingOptions | None, | ||
) -> None: | ||
"""Merge the given parameters with the environment variables file configurations.""" | ||
param_manager = ParamManager.create(config_dir) | ||
|
@@ -414,6 +425,12 @@ def _load_configuration( | |
|
||
if get_version(pydantic.__version__) < get_version('2.5.0'): # pragma: no cover | ||
raise RuntimeError('The Pydantic plugin requires Pydantic 2.5.0 or newer.') | ||
|
||
if isinstance(tail_sampling, dict): | ||
# This is particularly for deserializing from a dict as in executors.py | ||
tail_sampling = TailSamplingOptions(**tail_sampling) # type: ignore | ||
self.tail_sampling = tail_sampling | ||
|
||
self.fast_shutdown = fast_shutdown | ||
|
||
self.id_generator = id_generator or RandomIdGenerator() | ||
|
@@ -457,6 +474,7 @@ def __init__( | |
scrubbing_patterns: Sequence[str] | None = None, | ||
scrubbing_callback: ScrubCallback | None = None, | ||
inspect_arguments: bool | None = None, | ||
tail_sampling: TailSamplingOptions | None = None, | ||
) -> None: | ||
"""Create a new LogfireConfig. | ||
|
||
|
@@ -490,6 +508,7 @@ def __init__( | |
scrubbing_patterns=scrubbing_patterns, | ||
scrubbing_callback=scrubbing_callback, | ||
inspect_arguments=inspect_arguments, | ||
tail_sampling=tail_sampling, | ||
) | ||
# initialize with no-ops so that we don't impact OTEL's global config just because logfire is installed | ||
# that is, we defer setting logfire as the otel global config until `configure` is called | ||
|
@@ -527,6 +546,7 @@ def configure( | |
scrubbing_patterns: Sequence[str] | None, | ||
scrubbing_callback: ScrubCallback | None, | ||
inspect_arguments: bool | None, | ||
tail_sampling: TailSamplingOptions | None, | ||
) -> None: | ||
with self._lock: | ||
self._initialized = False | ||
|
@@ -554,6 +574,7 @@ def configure( | |
scrubbing_patterns, | ||
scrubbing_callback, | ||
inspect_arguments, | ||
tail_sampling, | ||
) | ||
self.initialize() | ||
|
||
|
@@ -592,8 +613,15 @@ def _initialize(self) -> ProxyTracerProvider: | |
# Both recommend generating a UUID. | ||
resource = Resource({ResourceAttributes.SERVICE_INSTANCE_ID: uuid4().hex}).merge(resource) | ||
|
||
# Avoid using the usual sampler if we're using tail-based sampling. | ||
# The TailSamplingProcessor will handle the random sampling part as well. | ||
sampler = ( | ||
ParentBasedTraceIdRatio(self.trace_sample_rate) | ||
if self.trace_sample_rate < 1 and self.tail_sampling is None | ||
else None | ||
) | ||
Comment on lines
+616
to
+622
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How do we document / tell users they can't combine these? Is there a world where I want to head sample down to 10% (to reduce overhead in the SDK) and then tail sample down to 1%? There's still advantages to head sampling. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. They can combine them, see the PR body or the new
I don't know what this means. Tail sample down to a percentage? It sounds like you want to be able to discard most spans up front randomly regardless of whether tail-sampling would include them, so that a span only gets through if it's 'notable' AND 'lucky'. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes exactly |
||
tracer_provider = SDKTracerProvider( | ||
sampler=ParentBasedTraceIdRatio(self.trace_sample_rate), | ||
sampler=sampler, | ||
resource=resource, | ||
id_generator=self.id_generator, | ||
) | ||
|
@@ -607,7 +635,16 @@ def add_span_processor(span_processor: SpanProcessor) -> None: | |
# Most span processors added to the tracer provider should also be recorded in the `processors` list | ||
# so that they can be used by the final pending span processor. | ||
# This means that `tracer_provider.add_span_processor` should only appear in two places. | ||
span_processor = SpanProcessorWrapper(span_processor, self.scrubber) | ||
if self.tail_sampling: | ||
span_processor = TailSamplingProcessor( | ||
span_processor, | ||
self.tail_sampling, | ||
# If self.trace_sample_rate < 1 then that ratio of spans should be included randomly by this. | ||
# In that case the tracer provider doesn't need to do any sampling, see above. | ||
# Otherwise we're not using any random sampling, so 0% of spans should be included 'randomly'. | ||
self.trace_sample_rate if self.trace_sample_rate < 1 else 0, | ||
) | ||
span_processor = MainSpanProcessorWrapper(span_processor, self.scrubber) | ||
tracer_provider.add_span_processor(span_processor) | ||
processors.append(span_processor) | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,17 +16,18 @@ | |
) | ||
from ..scrubbing import Scrubber | ||
from ..utils import ReadableSpanDict, is_instrumentation_suppressed, span_to_dict, truncate_string | ||
from .wrapper import WrapperSpanProcessor | ||
|
||
|
||
class SpanProcessorWrapper(SpanProcessor): | ||
class MainSpanProcessorWrapper(WrapperSpanProcessor): | ||
"""Wrapper around other processors to intercept starting and ending spans with our own global logic. | ||
|
||
Suppresses starting/ending if the current context has a `suppress_instrumentation` value. | ||
Tweaks the send/receive span names generated by the ASGI middleware. | ||
""" | ||
|
||
def __init__(self, processor: SpanProcessor, scrubber: Scrubber) -> None: | ||
self.processor = processor | ||
super().__init__(processor) | ||
self.scrubber = scrubber | ||
|
||
def on_start( | ||
|
@@ -37,7 +38,7 @@ def on_start( | |
if is_instrumentation_suppressed(): | ||
return | ||
_set_log_level_on_asgi_send_receive_spans(span) | ||
self.processor.on_start(span, parent_context) | ||
super().on_start(span, parent_context) | ||
|
||
def on_end(self, span: ReadableSpan) -> None: | ||
if is_instrumentation_suppressed(): | ||
|
@@ -47,13 +48,7 @@ def on_end(self, span: ReadableSpan) -> None: | |
_tweak_http_spans(span_dict) | ||
self.scrubber.scrub_span(span_dict) | ||
span = ReadableSpan(**span_dict) | ||
self.processor.on_end(span) | ||
|
||
def shutdown(self) -> None: | ||
self.processor.shutdown() | ||
|
||
def force_flush(self, timeout_millis: int = 30000) -> bool: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is this the method we were using to get logfire to work with AWS lambda? If so, I guess we need it. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's moved to the base class. |
||
return self.processor.force_flush(timeout_millis) # pragma: no cover | ||
super().on_end(span) | ||
|
||
|
||
def _set_log_level_on_asgi_send_receive_spans(span: Span) -> None: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,145 @@ | ||
from __future__ import annotations | ||
|
||
import random | ||
from dataclasses import dataclass | ||
from functools import cached_property | ||
from threading import Lock | ||
|
||
from opentelemetry import context | ||
from opentelemetry.sdk.trace import ReadableSpan, Span, SpanProcessor | ||
|
||
from logfire._internal.constants import ( | ||
ATTRIBUTES_LOG_LEVEL_NUM_KEY, | ||
LEVEL_NUMBERS, | ||
ONE_SECOND_IN_NANOSECONDS, | ||
LevelName, | ||
) | ||
from logfire._internal.exporters.wrapper import WrapperSpanProcessor | ||
|
||
|
||
@dataclass
class TailSamplingOptions:
    """Criteria for including a whole trace when tail-based sampling is enabled."""

    level: LevelName | None = 'notice'
    """
    Include the trace if any span/log in it has a level at least this high.
    Set to `None` to disable including traces based on level.
    """

    duration: float | None = 1.0
    """
    Include the trace if any span/log in it lasts longer than this many seconds.
    Set to `None` to disable including traces based on duration.
    """
|
||
|
||
@dataclass
class TraceBuffer:
    """Arguments of `on_start` and `on_end` for spans in a single trace."""

    # Each entry holds exactly what was passed to `on_start`.
    started: list[tuple[Span, context.Context | None]]
    # Spans as received by `on_end`.
    ended: list[ReadableSpan]

    @cached_property
    def first_span(self) -> Span:
        # The root-most span seen so far: the first one whose start was buffered.
        first_started, _parent_context = self.started[0]
        return first_started
|
||
|
||
class TailSamplingProcessor(WrapperSpanProcessor):
    """Passes spans to the wrapped processor if any span in a trace meets the sampling criteria."""

    def __init__(self, processor: SpanProcessor, options: TailSamplingOptions, random_rate: float) -> None:
        """Wrap `processor`, buffering each trace until it meets the sampling criteria.

        Args:
            processor: The processor that buffered spans are forwarded to once a trace is included.
            options: Inclusion thresholds. `options.duration` is given in seconds and converted
                to nanoseconds here to match OTel span timestamps.
            random_rate: Probability (0 to 1) that a new trace is included up front,
                regardless of the criteria.
        """
        super().__init__(processor)
        # float('inf') means "never include based on this criterion".
        self.duration: float = (
            float('inf') if options.duration is None else options.duration * ONE_SECOND_IN_NANOSECONDS
        )
        self.level: float | int = float('inf') if options.level is None else LEVEL_NUMBERS[options.level]
        self.random_rate = random_rate

        # A TraceBuffer is typically created for each new trace.
        # If a span meets the sampling criteria, the buffer is dropped and all spans within are pushed
        # to the wrapped processor.
        # So when more spans arrive and there's no buffer, they get passed through immediately.
        # Keyed by trace_id.
        self.traces: dict[int, TraceBuffer] = {}

        # Code that touches self.traces and its contents should be protected by this lock.
        self.lock = Lock()

    def on_start(self, span: Span, parent_context: context.Context | None = None) -> None:
        """Buffer the span start, or forward it immediately if its trace is already included."""
        dropped = False
        buffer = None

        with self.lock:
            # span.context could supposedly be None, not sure how.
            if span.context:  # pragma: no branch
                trace_id = span.context.trace_id
                # If span.parent is None, it's the root span of a trace.
                # If random.random() <= self.random_rate, immediately include this trace,
                # meaning no buffer for it.
                if span.parent is None and random.random() > self.random_rate:
                    self.traces[trace_id] = TraceBuffer([], [])

                buffer = self.traces.get(trace_id)
                if buffer is not None:
                    # This trace's spans haven't met the criteria yet, so add this span to the buffer.
                    buffer.started.append((span, parent_context))
                    # check_span may decide the trace qualifies (e.g. by level) and drop the buffer.
                    dropped = self.check_span(span, buffer)
                # The opposite case is handled outside the lock since it may take some time.

        # This code may take longer since it calls the wrapped processor which might do anything.
        # It shouldn't be inside the lock to avoid blocking other threads.
        # Since it's not in the lock, it shouldn't touch self.traces or its contents.
        if buffer is None:
            super().on_start(span, parent_context)
        elif dropped:
            self.push_buffer(buffer)

    def on_end(self, span: ReadableSpan) -> None:
        """Buffer the span end, or forward it immediately if its trace is already included."""
        # This has a very similar structure and reasoning to on_start.

        dropped = False
        buffer = None

        with self.lock:
            if span.context:  # pragma: no branch
                trace_id = span.context.trace_id
                buffer = self.traces.get(trace_id)
                if buffer is not None:
                    buffer.ended.append(span)
                    dropped = self.check_span(span, buffer)
                    if span.parent is None:
                        # This is the root span, so the trace is hopefully complete.
                        # Delete the buffer to save memory.
                        self.traces.pop(trace_id, None)

        if buffer is None:
            super().on_end(span)
        elif dropped:
            self.push_buffer(buffer)

    def check_span(self, span: ReadableSpan, buffer: TraceBuffer) -> bool:
        """If the span meets the sampling criteria, drop the buffer and return True. Otherwise, return False."""
        # Called while holding self.lock (see on_start/on_end).
        # span.end_time and span.start_time are in nanoseconds and can be None.
        # If end_time is missing (span still open), its start_time is the best lower bound on duration.
        if (span.end_time or span.start_time or 0) - (buffer.first_span.start_time or float('inf')) > self.duration:
            self.drop_buffer(buffer)
            return True

        attributes = span.attributes or {}
        level = attributes.get(ATTRIBUTES_LOG_LEVEL_NUM_KEY)
        if not isinstance(level, int):
            # Spans without an explicit level are treated as 'info' for comparison purposes.
            level = LEVEL_NUMBERS['info']
        if level >= self.level:
            self.drop_buffer(buffer)
            return True

        return False

    def drop_buffer(self, buffer: TraceBuffer) -> None:
        """Stop buffering this trace so that future spans in it pass straight through.

        Called while holding self.lock. The caller is responsible for pushing the
        buffered spans to the wrapped processor (outside the lock) via push_buffer.
        """
        span_context = buffer.first_span.context
        assert span_context is not None
        del self.traces[span_context.trace_id]

    def push_buffer(self, buffer: TraceBuffer) -> None:
        """Replay all buffered on_start/on_end calls of an included trace to the wrapped processor.

        Called outside self.lock, since the wrapped processor might do anything.
        """
        for started in buffer.started:
            super().on_start(*started)
        for span in buffer.ended:
            super().on_end(span)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could just be a typeddict?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I tried that, and I've also wanted `ConsoleOptions` and `PydanticPlugin` to be typed dicts, but they seemed worse. They can't define defaults in the class, and passing in a plain dict is actually not that user friendly.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Makes sense