From b71328215387b13184075ec2bc6880060dbb90f7 Mon Sep 17 00:00:00 2001 From: Alex Hall Date: Thu, 8 Aug 2024 16:45:43 +0200 Subject: [PATCH 01/48] system_metrics.py --- .../_internal/integrations/system_metrics.py | 118 ++++++++++++++++++ 1 file changed, 118 insertions(+) create mode 100644 logfire/_internal/integrations/system_metrics.py diff --git a/logfire/_internal/integrations/system_metrics.py b/logfire/_internal/integrations/system_metrics.py new file mode 100644 index 000000000..3a336b061 --- /dev/null +++ b/logfire/_internal/integrations/system_metrics.py @@ -0,0 +1,118 @@ +import sys +from typing import Any, Dict, Iterable, List, Literal, Optional, Union + +from opentelemetry.metrics import MeterProvider + +try: + from opentelemetry.instrumentation.system_metrics import SystemMetricsInstrumentor +except ModuleNotFoundError: # pragma: no cover + raise RuntimeError( + '`logfire.instrument_system_metrics()` requires the `opentelemetry-instrumentation-system-metrics` package.\n' + 'You can install this with:\n' + " pip install 'logfire[system-metrics]'" + ) + +MetricName = Literal[ + 'system.cpu.time', + 'system.cpu.utilization', + 'system.memory.usage', + 'system.memory.utilization', + 'system.swap.usage', + 'system.swap.utilization', + 'system.disk.io', + 'system.disk.operations', + 'system.disk.time', + 'system.network.dropped.packets', + 'system.network.packets', + 'system.network.errors', + 'system.network.io', + 'system.network.connections', + 'system.thread_count', + 'process.runtime.memory', + 'process.runtime.cpu.time', + 'process.runtime.gc_count', +] + +ConfigString = Union[Literal['basic'], MetricName] +ConfigDict = Dict[MetricName, Optional[Iterable[str]]] +Config = Union[Literal['all'], ConfigString, Iterable[ConfigString], Dict[ConfigString, Optional[Iterable[str]]]] + + +# All the cpu_times fields provided by psutil (used by system_metrics) across all platforms, +# except for 'guest' and 'guest_nice' which are included in 'user' and 'nice' in 
Linux (see psutil._cpu_tot_time). +# Docs: https://psutil.readthedocs.io/en/latest/#psutil.cpu_times +CPU_FIELDS = 'idle user system irq softirq nice iowait steal interrupt dpc'.split() + +# All the virtual_memory fields provided by psutil across all platforms, +# except for 'percent' which can be calculated as `(total - available) / total * 100`. +# Docs: https://psutil.readthedocs.io/en/latest/#psutil.virtual_memory +MEMORY_FIELDS = 'total available used free active inactive buffers cached shared wired slab'.split() + +# Based on opentelemetry/instrumentation/system_metrics/__init__.py +DEFAULT_CONFIG: ConfigDict = { + 'system.cpu.time': CPU_FIELDS, + 'system.cpu.utilization': CPU_FIELDS, + 'system.memory.usage': MEMORY_FIELDS, + 'system.memory.utilization': MEMORY_FIELDS, + 'system.swap.usage': ['used', 'free'], + 'system.swap.utilization': ['used', 'free'], + 'system.disk.io': ['read', 'write'], + 'system.disk.operations': ['read', 'write'], + 'system.disk.time': ['read', 'write'], + 'system.network.dropped.packets': ['transmit', 'receive'], + 'system.network.packets': ['transmit', 'receive'], + 'system.network.errors': ['transmit', 'receive'], + 'system.network.io': ['transmit', 'receive'], + 'system.network.connections': ['family', 'type'], + 'system.thread_count': None, + 'process.runtime.memory': ['rss', 'vms'], + 'process.runtime.cpu.time': ['user', 'system'], + 'process.runtime.gc_count': None, +} + +if sys.platform == 'darwin': # pragma: no cover + # see https://github.com/giampaolo/psutil/issues/1219 + # upstream pr: https://github.com/open-telemetry/opentelemetry-python-contrib/pull/2008 + DEFAULT_CONFIG.pop('system.network.connections') + + +BASIC_METRICS: List[MetricName] = [ + 'system.thread_count', # used by process count + 'system.cpu.utilization', + 'system.memory.utilization', + 'system.swap.usage', +] + + +def parse_config(config: Config) -> ConfigDict: + if isinstance(config, str): + if config == 'all': + return DEFAULT_CONFIG + config = 
[config] + + config_dict: Dict[ConfigString, Optional[Iterable[str]]] + if isinstance(config, dict): + config_dict = config + else: + config_dict: Dict[ConfigString, Optional[Iterable[str]]] = {} + key: ConfigString + for key in config: + if key == 'basic': + config_dict[key] = None + else: + config_dict[key] = DEFAULT_CONFIG[key] + + result: ConfigDict = {} + for key, value in config_dict.items(): + if key == 'basic': + for metric in BASIC_METRICS: + result[metric] = DEFAULT_CONFIG[metric] + elif value is None: + result[key] = DEFAULT_CONFIG[key] + else: + result[key] = value + return result + + +def instrument_system_metrics(meter_provider: MeterProvider, config: Any = 'basic') -> None: + SystemMetricsInstrumentor(config=parse_config(config)).instrument(meter_provider=meter_provider) # type: ignore From 7400de8dcf38ab04e72ea5d19ac3d1b5f88375bb Mon Sep 17 00:00:00 2001 From: Alex Hall Date: Thu, 8 Aug 2024 17:07:57 +0200 Subject: [PATCH 02/48] use OTEL config for most metrics --- .../_internal/integrations/system_metrics.py | 28 ++++++++----------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/logfire/_internal/integrations/system_metrics.py b/logfire/_internal/integrations/system_metrics.py index 3a336b061..0069501ea 100644 --- a/logfire/_internal/integrations/system_metrics.py +++ b/logfire/_internal/integrations/system_metrics.py @@ -1,10 +1,13 @@ import sys -from typing import Any, Dict, Iterable, List, Literal, Optional, Union +from typing import Any, Dict, Iterable, List, Literal, Optional, Union, cast from opentelemetry.metrics import MeterProvider try: - from opentelemetry.instrumentation.system_metrics import SystemMetricsInstrumentor + from opentelemetry.instrumentation.system_metrics import ( + _DEFAULT_CONFIG, # type: ignore + SystemMetricsInstrumentor, + ) except ModuleNotFoundError: # pragma: no cover raise RuntimeError( '`logfire.instrument_system_metrics()` requires the `opentelemetry-instrumentation-system-metrics` package.\n' @@ 
-31,6 +34,10 @@ 'process.runtime.memory', 'process.runtime.cpu.time', 'process.runtime.gc_count', + 'process.runtime.thread_count', + 'process.runtime.cpu.utilization', + 'process.runtime.context_switches', + 'process.open_file_descriptor.count', ] ConfigString = Union[Literal['basic'], MetricName] @@ -50,24 +57,11 @@ # Based on opentelemetry/instrumentation/system_metrics/__init__.py DEFAULT_CONFIG: ConfigDict = { + **cast(ConfigDict, _DEFAULT_CONFIG), 'system.cpu.time': CPU_FIELDS, 'system.cpu.utilization': CPU_FIELDS, 'system.memory.usage': MEMORY_FIELDS, 'system.memory.utilization': MEMORY_FIELDS, - 'system.swap.usage': ['used', 'free'], - 'system.swap.utilization': ['used', 'free'], - 'system.disk.io': ['read', 'write'], - 'system.disk.operations': ['read', 'write'], - 'system.disk.time': ['read', 'write'], - 'system.network.dropped.packets': ['transmit', 'receive'], - 'system.network.packets': ['transmit', 'receive'], - 'system.network.errors': ['transmit', 'receive'], - 'system.network.io': ['transmit', 'receive'], - 'system.network.connections': ['family', 'type'], - 'system.thread_count': None, - 'process.runtime.memory': ['rss', 'vms'], - 'process.runtime.cpu.time': ['user', 'system'], - 'process.runtime.gc_count': None, } if sys.platform == 'darwin': # pragma: no cover @@ -94,7 +88,7 @@ def parse_config(config: Config) -> ConfigDict: if isinstance(config, dict): config_dict = config else: - config_dict: Dict[ConfigString, Optional[Iterable[str]]] = {} + config_dict = {} key: ConfigString for key in config: if key == 'basic': From c1c4e6b30e3dcb76c79899cd09d932a7821d7558 Mon Sep 17 00:00:00 2001 From: Alex Hall Date: Thu, 8 Aug 2024 17:41:45 +0200 Subject: [PATCH 03/48] tests --- logfire/__init__.py | 2 + .../_internal/integrations/system_metrics.py | 3 +- logfire/_internal/main.py | 12 ++ .../otel_integrations/test_system_metrics.py | 135 ++++++++++++++++++ tests/test_metrics.py | 22 --- 5 files changed, 151 insertions(+), 23 deletions(-) create mode 
100644 tests/otel_integrations/test_system_metrics.py diff --git a/logfire/__init__.py b/logfire/__init__.py index 47c18b0f3..7e469b417 100644 --- a/logfire/__init__.py +++ b/logfire/__init__.py @@ -39,6 +39,7 @@ instrument_redis = DEFAULT_LOGFIRE_INSTANCE.instrument_redis instrument_pymongo = DEFAULT_LOGFIRE_INSTANCE.instrument_pymongo instrument_mysql = DEFAULT_LOGFIRE_INSTANCE.instrument_mysql +instrument_system_metrics = DEFAULT_LOGFIRE_INSTANCE.instrument_system_metrics shutdown = DEFAULT_LOGFIRE_INSTANCE.shutdown with_tags = DEFAULT_LOGFIRE_INSTANCE.with_tags # with_trace_sample_rate = DEFAULT_LOGFIRE_INSTANCE.with_trace_sample_rate @@ -115,6 +116,7 @@ def loguru_handler() -> dict[str, Any]: 'instrument_redis', 'instrument_pymongo', 'instrument_mysql', + 'instrument_system_metrics', 'AutoTraceModule', 'with_tags', 'with_settings', diff --git a/logfire/_internal/integrations/system_metrics.py b/logfire/_internal/integrations/system_metrics.py index 0069501ea..489a78181 100644 --- a/logfire/_internal/integrations/system_metrics.py +++ b/logfire/_internal/integrations/system_metrics.py @@ -67,7 +67,7 @@ if sys.platform == 'darwin': # pragma: no cover # see https://github.com/giampaolo/psutil/issues/1219 # upstream pr: https://github.com/open-telemetry/opentelemetry-python-contrib/pull/2008 - DEFAULT_CONFIG.pop('system.network.connections') + DEFAULT_CONFIG.pop('system.network.connections', None) BASIC_METRICS: List[MetricName] = [ @@ -109,4 +109,5 @@ def parse_config(config: Config) -> ConfigDict: def instrument_system_metrics(meter_provider: MeterProvider, config: Any = 'basic') -> None: + SystemMetricsInstrumentor().uninstrument() # type: ignore SystemMetricsInstrumentor(config=parse_config(config)).instrument(meter_provider=meter_provider) # type: ignore diff --git a/logfire/_internal/main.py b/logfire/_internal/main.py index 192804f74..ad093b6c6 100644 --- a/logfire/_internal/main.py +++ b/logfire/_internal/main.py @@ -74,6 +74,7 @@ from .integrations.redis 
import RedisInstrumentKwargs from .integrations.sqlalchemy import SQLAlchemyInstrumentKwargs from .integrations.starlette import StarletteInstrumentKwargs + from .integrations.system_metrics import Config as SystemMetricsConfig from .utils import SysExcInfo # This is the type of the exc_info/_exc_info parameter of the log methods. @@ -1252,6 +1253,17 @@ def instrument_mysql( self._warn_if_not_initialized_for_instrumentation() return instrument_mysql(conn, **kwargs) + def instrument_system_metrics(self, config: SystemMetricsConfig = 'basic'): + """Instrument the system metrics. + + Args: + config: The system metrics configuration. + """ + from .integrations.system_metrics import instrument_system_metrics + + self._warn_if_not_initialized_for_instrumentation() + return instrument_system_metrics(self.config.get_meter_provider(), config) + def metric_counter(self, name: str, *, unit: str = '', description: str = '') -> Counter: """Create a counter metric. diff --git a/tests/otel_integrations/test_system_metrics.py b/tests/otel_integrations/test_system_metrics.py new file mode 100644 index 000000000..1473857c8 --- /dev/null +++ b/tests/otel_integrations/test_system_metrics.py @@ -0,0 +1,135 @@ +from __future__ import annotations + +from inline_snapshot import snapshot +from opentelemetry.sdk.metrics.export import InMemoryMetricReader + +import logfire +import logfire._internal.metrics +from logfire._internal.integrations.system_metrics import Config, parse_config +from tests.test_metrics import get_collected_metrics + + +def get_collected_metric_names(metrics_reader: InMemoryMetricReader) -> list[str]: + return sorted( + { + metric['name'] + for metric in get_collected_metrics(metrics_reader) + if metric['name'] != 'system.network.connections' + } + ) + + +def test_default_system_metrics_collection(metrics_reader: InMemoryMetricReader) -> None: + logfire.instrument_system_metrics() + assert get_collected_metric_names(metrics_reader) == snapshot( + [ + 
'system.cpu.utilization', + 'system.memory.utilization', + 'system.swap.usage', + 'system.thread_count', + ] + ) + + +def test_single_system_metric_collection(metrics_reader: InMemoryMetricReader) -> None: + logfire.instrument_system_metrics('system.cpu.time') + assert get_collected_metric_names(metrics_reader) == [ + 'system.cpu.time', + ] + + +def test_list_with_basic_system_metrics_collection(metrics_reader: InMemoryMetricReader) -> None: + logfire.instrument_system_metrics(['basic', 'system.cpu.time']) + assert get_collected_metric_names(metrics_reader) == snapshot( + [ + 'system.cpu.time', + 'system.cpu.utilization', + 'system.memory.utilization', + 'system.swap.usage', + 'system.thread_count', + ] + ) + + +def test_all_system_metrics_collection(metrics_reader: InMemoryMetricReader) -> None: + logfire.instrument_system_metrics('all') + assert get_collected_metric_names(metrics_reader) == snapshot( + [ + 'process.open_file_descriptor.count', + 'process.runtime.cpython.context_switches', + 'process.runtime.cpython.cpu.utilization', + 'process.runtime.cpython.cpu_time', + 'process.runtime.cpython.gc_count', + 'process.runtime.cpython.memory', + 'process.runtime.cpython.thread_count', + 'system.cpu.time', + 'system.cpu.utilization', + 'system.disk.io', + 'system.disk.operations', + 'system.disk.time', + 'system.memory.usage', + 'system.memory.utilization', + 'system.network.dropped_packets', + 'system.network.errors', + 'system.network.io', + 'system.network.packets', + 'system.swap.usage', + 'system.swap.utilization', + 'system.thread_count', + ] + ) + + +def test_dict_with_basic_system_metrics_collection(metrics_reader: InMemoryMetricReader) -> None: + config: Config = { + 'basic': None, + 'system.cpu.time': None, + 'system.cpu.utilization': ['idle'], + } + assert parse_config(config) == snapshot( + { + 'system.thread_count': None, + 'system.cpu.utilization': [ + 'idle', + ], + 'system.memory.utilization': [ + 'total', + 'available', + 'used', + 'free', + 
'active', + 'inactive', + 'buffers', + 'cached', + 'shared', + 'wired', + 'slab', + ], + 'system.swap.usage': [ + 'used', + 'free', + ], + 'system.cpu.time': [ + 'idle', + 'user', + 'system', + 'irq', + 'softirq', + 'nice', + 'iowait', + 'steal', + 'interrupt', + 'dpc', + ], + } + ) + logfire.instrument_system_metrics(config) + assert get_collected_metric_names(metrics_reader) == snapshot( + [ + 'system.cpu.time', + 'system.cpu.utilization', + 'system.memory.utilization', + 'system.swap.usage', + 'system.thread_count', + ] + ) diff --git a/tests/test_metrics.py b/tests/test_metrics.py index e62050c6c..5a3f0a04e 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -16,28 +16,6 @@ from logfire._internal.exporters.quiet_metrics import QuietMetricExporter -def test_system_metrics_collection() -> None: - metrics_reader = InMemoryMetricReader() - logfire.configure( - send_to_logfire=False, - additional_metric_readers=[metrics_reader], - # i.e. use the default value, in contrast to `False` which the automatic test fixture uses. - collect_system_metrics=None, - ) - metrics_collected = {metric['name'] for metric in get_collected_metrics(metrics_reader)} - - # collected metrics vary by platform, etc. 
- # assert that we at least collected _some_ of the metrics we expect - assert metrics_collected.issuperset( - { - 'system.swap.usage', - 'system.disk.operations', - 'system.memory.usage', - 'system.cpu.utilization', - } - ), metrics_collected - - def test_create_metric_counter(metrics_reader: InMemoryMetricReader) -> None: counter = logfire.metric_counter('counter') counter.add(1) From 13d0baa6c7c7b193601d229759c1eeef1b915290 Mon Sep 17 00:00:00 2001 From: Alex Hall Date: Thu, 8 Aug 2024 17:48:49 +0200 Subject: [PATCH 04/48] Update generated stubs --- logfire-api/logfire_api/__init__.py | 3 +++ logfire-api/logfire_api/__init__.pyi | 3 ++- logfire-api/logfire_api/_internal/main.pyi | 7 +++++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/logfire-api/logfire_api/__init__.py b/logfire-api/logfire_api/__init__.py index 396c1ea8f..02a4c60d4 100644 --- a/logfire-api/logfire_api/__init__.py +++ b/logfire-api/logfire_api/__init__.py @@ -123,6 +123,8 @@ def instrument_openai(self, *args, **kwargs) -> ContextManager[None]: def instrument_aiohttp_client(self, *args, **kwargs) -> None: ... + def instrument_system_metrics(self, *args, **kwargs) -> None: ... + def shutdown(self, *args, **kwargs) -> None: ... DEFAULT_LOGFIRE_INSTANCE = Logfire() @@ -158,6 +160,7 @@ def shutdown(self, *args, **kwargs) -> None: ... 
instrument_redis = DEFAULT_LOGFIRE_INSTANCE.instrument_redis instrument_pymongo = DEFAULT_LOGFIRE_INSTANCE.instrument_pymongo instrument_mysql = DEFAULT_LOGFIRE_INSTANCE.instrument_mysql + instrument_system_metrics = DEFAULT_LOGFIRE_INSTANCE.instrument_system_metrics shutdown = DEFAULT_LOGFIRE_INSTANCE.shutdown def no_auto_trace(x): diff --git a/logfire-api/logfire_api/__init__.pyi b/logfire-api/logfire_api/__init__.pyi index fd1c038c6..c7de505b8 100644 --- a/logfire-api/logfire_api/__init__.pyi +++ b/logfire-api/logfire_api/__init__.pyi @@ -11,7 +11,7 @@ from .integrations.logging import LogfireLoggingHandler as LogfireLoggingHandler from .integrations.structlog import LogfireProcessor as StructlogProcessor from .version import VERSION as VERSION -__all__ = ['Logfire', 'LogfireSpan', 'LevelName', 'ConsoleOptions', 'PydanticPlugin', 'configure', 'span', 'instrument', 'log', 'trace', 'debug', 'notice', 'info', 'warn', 'error', 'exception', 'fatal', 'force_flush', 'log_slow_async_callbacks', 'install_auto_tracing', 'instrument_fastapi', 'instrument_openai', 'instrument_anthropic', 'instrument_asyncpg', 'instrument_httpx', 'instrument_celery', 'instrument_requests', 'instrument_psycopg', 'instrument_django', 'instrument_flask', 'instrument_starlette', 'instrument_aiohttp_client', 'instrument_sqlalchemy', 'instrument_redis', 'instrument_pymongo', 'instrument_mysql', 'AutoTraceModule', 'with_tags', 'with_settings', 'shutdown', 'load_spans_from_file', 'no_auto_trace', 'METRICS_PREFERRED_TEMPORALITY', 'ScrubMatch', 'ScrubbingOptions', 'VERSION', 'suppress_instrumentation', 'StructlogProcessor', 'LogfireLoggingHandler', 'TailSamplingOptions'] +__all__ = ['Logfire', 'LogfireSpan', 'LevelName', 'ConsoleOptions', 'PydanticPlugin', 'configure', 'span', 'instrument', 'log', 'trace', 'debug', 'notice', 'info', 'warn', 'error', 'exception', 'fatal', 'force_flush', 'log_slow_async_callbacks', 'install_auto_tracing', 'instrument_fastapi', 'instrument_openai', 
'instrument_anthropic', 'instrument_asyncpg', 'instrument_httpx', 'instrument_celery', 'instrument_requests', 'instrument_psycopg', 'instrument_django', 'instrument_flask', 'instrument_starlette', 'instrument_aiohttp_client', 'instrument_sqlalchemy', 'instrument_redis', 'instrument_pymongo', 'instrument_mysql', 'instrument_system_metrics', 'AutoTraceModule', 'with_tags', 'with_settings', 'shutdown', 'load_spans_from_file', 'no_auto_trace', 'METRICS_PREFERRED_TEMPORALITY', 'ScrubMatch', 'ScrubbingOptions', 'VERSION', 'suppress_instrumentation', 'StructlogProcessor', 'LogfireLoggingHandler', 'TailSamplingOptions'] DEFAULT_LOGFIRE_INSTANCE = Logfire() span = DEFAULT_LOGFIRE_INSTANCE.span @@ -35,6 +35,7 @@ instrument_sqlalchemy = DEFAULT_LOGFIRE_INSTANCE.instrument_sqlalchemy instrument_redis = DEFAULT_LOGFIRE_INSTANCE.instrument_redis instrument_pymongo = DEFAULT_LOGFIRE_INSTANCE.instrument_pymongo instrument_mysql = DEFAULT_LOGFIRE_INSTANCE.instrument_mysql +instrument_system_metrics = DEFAULT_LOGFIRE_INSTANCE.instrument_system_metrics shutdown = DEFAULT_LOGFIRE_INSTANCE.shutdown with_tags = DEFAULT_LOGFIRE_INSTANCE.with_tags with_settings = DEFAULT_LOGFIRE_INSTANCE.with_settings diff --git a/logfire-api/logfire_api/_internal/main.pyi b/logfire-api/logfire_api/_internal/main.pyi index a902a257b..ecec2a257 100644 --- a/logfire-api/logfire_api/_internal/main.pyi +++ b/logfire-api/logfire_api/_internal/main.pyi @@ -18,6 +18,7 @@ from .integrations.pymongo import PymongoInstrumentKwargs as PymongoInstrumentKw from .integrations.redis import RedisInstrumentKwargs as RedisInstrumentKwargs from .integrations.sqlalchemy import SQLAlchemyInstrumentKwargs as SQLAlchemyInstrumentKwargs from .integrations.starlette import StarletteInstrumentKwargs as StarletteInstrumentKwargs +from .integrations.system_metrics import Config as SystemMetricsConfig from .json_encoder import logfire_json_dumps as logfire_json_dumps from .json_schema import JsonSchemaProperties as 
JsonSchemaProperties, attributes_json_schema as attributes_json_schema, attributes_json_schema_properties as attributes_json_schema_properties, create_json_schema as create_json_schema from .metrics import ProxyMeterProvider as ProxyMeterProvider @@ -643,6 +644,12 @@ class Logfire: If a connection is provided, returns the instrumented connection. If no connection is provided, returns None. """ + def instrument_system_metrics(self, config: SystemMetricsConfig = 'basic'): + """Instrument the system metrics. + + Args: + config: The system metrics configuration. + """ def metric_counter(self, name: str, *, unit: str = '', description: str = '') -> Counter: """Create a counter metric. From dd0538055fcff2478103eda28cca9adb0bddb299 Mon Sep 17 00:00:00 2001 From: Alex Hall Date: Thu, 8 Aug 2024 17:49:10 +0200 Subject: [PATCH 05/48] Update generated stubs --- .../_internal/integrations/system_metrics.pyi | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 logfire-api/logfire_api/_internal/integrations/system_metrics.pyi diff --git a/logfire-api/logfire_api/_internal/integrations/system_metrics.pyi b/logfire-api/logfire_api/_internal/integrations/system_metrics.pyi new file mode 100644 index 000000000..45010d652 --- /dev/null +++ b/logfire-api/logfire_api/_internal/integrations/system_metrics.pyi @@ -0,0 +1,15 @@ +from _typeshed import Incomplete +from opentelemetry.metrics import MeterProvider +from typing import Any, Iterable + +MetricName: Incomplete +ConfigString: Incomplete +ConfigDict = dict[MetricName, Iterable[str] | None] +Config: Incomplete +CPU_FIELDS: Incomplete +MEMORY_FIELDS: Incomplete +DEFAULT_CONFIG: ConfigDict +BASIC_METRICS: list[MetricName] + +def parse_config(config: Config) -> ConfigDict: ... +def instrument_system_metrics(meter_provider: MeterProvider, config: Any = 'basic') -> None: ... 
From ac2b5d71ec57a8456af2c6b323b4b6386480f365 Mon Sep 17 00:00:00 2001 From: Alex Hall Date: Thu, 8 Aug 2024 17:59:53 +0200 Subject: [PATCH 06/48] Remove collect_system_metrics and old code --- logfire/_internal/cli.py | 2 +- logfire/_internal/config.py | 32 ++++++++--------- logfire/_internal/config_params.py | 11 ------ logfire/_internal/metrics.py | 57 ------------------------------ tests/conftest.py | 1 - tests/test_configure.py | 27 ++++++-------- tests/test_secret_scrubbing.py | 2 -- 7 files changed, 27 insertions(+), 105 deletions(-) diff --git a/logfire/_internal/cli.py b/logfire/_internal/cli.py index 88064d372..dbff87715 100644 --- a/logfire/_internal/cli.py +++ b/logfire/_internal/cli.py @@ -124,7 +124,7 @@ def parse_backfill(args: argparse.Namespace) -> None: # pragma: no cover sys.exit(1) logfire_url = cast(str, args.logfire_url) - logfire.configure(data_dir=data_dir, base_url=logfire_url, collect_system_metrics=False) + logfire.configure(data_dir=data_dir, base_url=logfire_url) config = logfire_config.GLOBAL_CONFIG config.initialize() token = config.token diff --git a/logfire/_internal/config.py b/logfire/_internal/config.py index e41c5a279..88521899a 100644 --- a/logfire/_internal/config.py +++ b/logfire/_internal/config.py @@ -76,7 +76,7 @@ from .exporters.tail_sampling import TailSamplingOptions, TailSamplingProcessor from .exporters.test import TestExporter from .integrations.executors import instrument_executors -from .metrics import ProxyMeterProvider, configure_metrics +from .metrics import ProxyMeterProvider from .scrubbing import NOOP_SCRUBBER, BaseScrubber, Scrubber, ScrubbingOptions, ScrubCallback from .stack_info import warn_at_user_stacklevel from .tracer import PendingSpanProcessor, ProxyTracerProvider @@ -150,7 +150,7 @@ def configure( config_dir: Path | str | None = None, data_dir: Path | str | None = None, base_url: str | None = None, - collect_system_metrics: bool | None = None, + collect_system_metrics: None = None, id_generator: 
IdGenerator | None = None, ns_timestamp_generator: Callable[[], int] | None = None, processors: None = None, @@ -192,8 +192,7 @@ def configure( `LOGFIRE_CONFIG_DIR` environment variable, otherwise defaults to the current working directory. data_dir: Directory to store credentials, and logs. If `None` uses the `LOGFIRE_CREDENTIALS_DIR` environment variable, otherwise defaults to `'.logfire'`. base_url: Root URL for the Logfire API. If `None` uses the `LOGFIRE_BASE_URL` environment variable, otherwise defaults to https://logfire-api.pydantic.dev. - collect_system_metrics: Whether to collect system metrics like CPU and memory usage. If `None` uses the `LOGFIRE_COLLECT_SYSTEM_METRICS` environment variable, - otherwise defaults to `True`. + collect_system_metrics: Legacy argument, use `logfire.instrument_system_metrics()` instead. id_generator: Generator for span IDs. Defaults to `RandomIdGenerator()` from the OpenTelemetry SDK. ns_timestamp_generator: Generator for nanosecond timestamps. Defaults to [`time.time_ns`][time.time_ns] from the Python standard library. @@ -228,6 +227,19 @@ def configure( 'The `metric_readers` argument has been replaced by `additional_metric_readers`. ' 'Set `send_to_logfire=False` to disable the default metric reader.' ) + + if collect_system_metrics is False: + raise ValueError( + 'The `collect_system_metrics` argument has been removed. ' + 'System metrics are no longer collected by default.' + ) + + if collect_system_metrics is not None: + raise ValueError( + 'The `collect_system_metrics` argument has been removed. ' + 'Use `logfire.instrument_system_metrics()` instead.' 
+ ) + if scrubbing_callback or scrubbing_patterns: if scrubbing is not None: raise ValueError( @@ -253,7 +265,6 @@ def configure( show_summary=show_summary, config_dir=Path(config_dir) if config_dir else None, data_dir=Path(data_dir) if data_dir else None, - collect_system_metrics=collect_system_metrics, id_generator=id_generator, ns_timestamp_generator=ns_timestamp_generator, additional_span_processors=additional_span_processors, @@ -313,9 +324,6 @@ class _LogfireConfigData: data_dir: Path """The directory to store Logfire config in""" - collect_system_metrics: bool - """Whether to collect system metrics like CPU and memory usage""" - id_generator: IdGenerator """The ID generator to use""" @@ -359,7 +367,6 @@ def _load_configuration( show_summary: bool | None, config_dir: Path | None, data_dir: Path | None, - collect_system_metrics: bool | None, id_generator: IdGenerator | None, ns_timestamp_generator: Callable[[], int] | None, additional_span_processors: Sequence[SpanProcessor] | None, @@ -383,7 +390,6 @@ def _load_configuration( self.trace_sample_rate = param_manager.load_param('trace_sample_rate', trace_sample_rate) self.show_summary = param_manager.load_param('show_summary', show_summary) self.data_dir = param_manager.load_param('data_dir', data_dir) - self.collect_system_metrics = param_manager.load_param('collect_system_metrics', collect_system_metrics) self.inspect_arguments = param_manager.load_param('inspect_arguments', inspect_arguments) self.ignore_no_config = param_manager.load_param('ignore_no_config') if self.inspect_arguments and sys.version_info[:2] <= (3, 8): @@ -463,7 +469,6 @@ def __init__( show_summary: bool | None = None, config_dir: Path | None = None, data_dir: Path | None = None, - collect_system_metrics: bool | None = None, id_generator: IdGenerator | None = None, ns_timestamp_generator: Callable[[], int] | None = None, additional_span_processors: Sequence[SpanProcessor] | None = None, @@ -495,7 +500,6 @@ def __init__( 
show_summary=show_summary, config_dir=config_dir, data_dir=data_dir, - collect_system_metrics=collect_system_metrics, id_generator=id_generator, ns_timestamp_generator=ns_timestamp_generator, additional_span_processors=additional_span_processors, @@ -531,7 +535,6 @@ def configure( show_summary: bool | None, config_dir: Path | None, data_dir: Path | None, - collect_system_metrics: bool | None, id_generator: IdGenerator | None, ns_timestamp_generator: Callable[[], int] | None, additional_span_processors: Sequence[SpanProcessor] | None, @@ -557,7 +560,6 @@ def configure( show_summary, config_dir, data_dir, - collect_system_metrics, id_generator, ns_timestamp_generator, additional_span_processors, @@ -751,8 +753,6 @@ def check_token(): ) ], ) - if self.collect_system_metrics: - configure_metrics(meter_provider) # we need to shut down any existing providers to avoid leaking resources (like threads) # but if this takes longer than 100ms you should call `logfire.shutdown` before reconfiguring diff --git a/logfire/_internal/config_params.py b/logfire/_internal/config_params.py index 15daa7f95..3b0ad1436 100644 --- a/logfire/_internal/config_params.py +++ b/logfire/_internal/config_params.py @@ -17,14 +17,6 @@ from .exporters.console import ConsoleColorsValues from .utils import read_toml_file -try: - import opentelemetry.instrumentation.system_metrics # noqa: F401 # type: ignore - - COLLECT_SYSTEM_METRICS_DEFAULT = True -except ImportError: # pragma: no cover - COLLECT_SYSTEM_METRICS_DEFAULT = False # type: ignore - - T = TypeVar('T') slots_true = {'slots': True} if sys.version_info >= (3, 10) else {} @@ -77,8 +69,6 @@ class _DefaultCallback: """Whether to show the summary when a new project is created.""" CREDENTIALS_DIR = ConfigParam(env_vars=['LOGFIRE_CREDENTIALS_DIR'], allow_file_config=True, default='.logfire', tp=Path) """The directory where to store the configuration file.""" -COLLECT_SYSTEM_METRICS = ConfigParam(env_vars=['LOGFIRE_COLLECT_SYSTEM_METRICS'], 
allow_file_config=True, default=COLLECT_SYSTEM_METRICS_DEFAULT, tp=bool) -"""Whether to collect system metrics.""" CONSOLE = ConfigParam(env_vars=['LOGFIRE_CONSOLE'], allow_file_config=True, default=True, tp=bool) """Whether to enable/disable the console exporter.""" CONSOLE_COLORS = ConfigParam(env_vars=['LOGFIRE_CONSOLE_COLORS'], allow_file_config=True, default='auto', tp=ConsoleColorsValues) @@ -120,7 +110,6 @@ class _DefaultCallback: 'trace_sample_rate': TRACE_SAMPLE_RATE, 'show_summary': SHOW_SUMMARY, 'data_dir': CREDENTIALS_DIR, - 'collect_system_metrics': COLLECT_SYSTEM_METRICS, 'console': CONSOLE, 'console_colors': CONSOLE_COLORS, 'console_span_style': CONSOLE_SPAN_STYLE, diff --git a/logfire/_internal/metrics.py b/logfire/_internal/metrics.py index 09ff6b23d..27a873cb7 100644 --- a/logfire/_internal/metrics.py +++ b/logfire/_internal/metrics.py @@ -1,7 +1,6 @@ from __future__ import annotations import dataclasses -import sys from abc import ABC, abstractmethod from threading import Lock from typing import Any, Generic, Sequence, TypeVar @@ -30,62 +29,6 @@ except ImportError: # pragma: no cover Gauge = None -# All the cpu_times fields provided by psutil (used by system_metrics) across all platforms, -# except for 'guest' and 'guest_nice' which are included in 'user' and 'nice' in Linux (see psutil._cpu_tot_time). -# Docs: https://psutil.readthedocs.io/en/latest/#psutil.cpu_times -CPU_FIELDS = 'idle user system irq softirq nice iowait steal interrupt dpc'.split() - -# All the virtual_memory fields provided by psutil across all platforms, -# except for 'percent' which can be calculated as `(total - available) / total * 100`. 
-# Docs: https://psutil.readthedocs.io/en/latest/#psutil.virtual_memory -MEMORY_FIELDS = 'total available used free active inactive buffers cached shared wired slab'.split() - -# Based on opentelemetry/instrumentation/system_metrics/__init__.py -DEFAULT_CONFIG = { - 'system.cpu.time': CPU_FIELDS, - 'system.cpu.utilization': CPU_FIELDS, - 'system.memory.usage': MEMORY_FIELDS, - 'system.memory.utilization': MEMORY_FIELDS, - 'system.swap.usage': ['used', 'free'], - 'system.swap.utilization': ['used', 'free'], - 'system.disk.io': ['read', 'write'], - 'system.disk.operations': ['read', 'write'], - 'system.disk.time': ['read', 'write'], - 'system.network.dropped.packets': ['transmit', 'receive'], - 'system.network.packets': ['transmit', 'receive'], - 'system.network.errors': ['transmit', 'receive'], - 'system.network.io': ['transmit', 'receive'], - 'system.network.connections': ['family', 'type'], - 'system.thread_count': None, - 'process.runtime.memory': ['rss', 'vms'], - 'process.runtime.cpu.time': ['user', 'system'], - 'process.runtime.gc_count': None, -} - - -try: - from opentelemetry.instrumentation.system_metrics import SystemMetricsInstrumentor - - INSTRUMENTOR = SystemMetricsInstrumentor(config=DEFAULT_CONFIG) # type: ignore -except ImportError: # pragma: no cover - INSTRUMENTOR = None # type: ignore - -if sys.platform == 'darwin': # pragma: no cover - # see https://github.com/giampaolo/psutil/issues/1219 - # upstream pr: https://github.com/open-telemetry/opentelemetry-python-contrib/pull/2008 - DEFAULT_CONFIG.pop('system.network.connections') - - -def configure_metrics(meter_provider: MeterProvider) -> None: - if INSTRUMENTOR is None: # pragma: no cover - raise RuntimeError('Install logfire[system-metrics] to use `collect_system_metrics=True`.') - - # we need to call uninstrument() otherwise instrument() will do nothing - # even if the meter provider is different - if INSTRUMENTOR.is_instrumented_by_opentelemetry: - INSTRUMENTOR.uninstrument() # type: ignore - 
INSTRUMENTOR.instrument(meter_provider=meter_provider) # type: ignore - # The following proxy classes are adapted from OTEL's SDK @dataclasses.dataclass diff --git a/tests/conftest.py b/tests/conftest.py index c65bd242e..1b5420cbb 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -62,7 +62,6 @@ def config_kwargs( id_generator=id_generator, ns_timestamp_generator=time_generator, additional_span_processors=[SimpleSpanProcessor(exporter)], - collect_system_metrics=False, # Ensure that inspect_arguments doesn't break things in most versions # (it's off by default for <3.11) but it's completely forbidden for 3.8. inspect_arguments=sys.version_info[:2] >= (3, 9), diff --git a/tests/test_configure.py b/tests/test_configure.py index 23fb6985b..3856b3059 100644 --- a/tests/test_configure.py +++ b/tests/test_configure.py @@ -461,7 +461,6 @@ def test_read_config_from_pyproject_toml(tmp_path: Path) -> None: console_colors = "never" console_include_timestamp = false data_dir = "{tmp_path}" - collect_system_metrics = false pydantic_plugin_record = "metrics" pydantic_plugin_include = " test1, test2" pydantic_plugin_exclude = "test3 ,test4" @@ -480,7 +479,6 @@ def test_read_config_from_pyproject_toml(tmp_path: Path) -> None: assert GLOBAL_CONFIG.console.colors == 'never' assert GLOBAL_CONFIG.console.include_timestamps is False assert GLOBAL_CONFIG.data_dir == tmp_path - assert GLOBAL_CONFIG.collect_system_metrics is False assert GLOBAL_CONFIG.pydantic_plugin.record == 'metrics' assert GLOBAL_CONFIG.pydantic_plugin.include == {'test1', 'test2'} assert GLOBAL_CONFIG.pydantic_plugin.exclude == {'test3', 'test4'} @@ -553,7 +551,6 @@ def default_span_processor(exporter: SpanExporter) -> SimpleSpanProcessor: token='abc1', default_span_processor=default_span_processor, additional_metric_readers=[InMemoryMetricReader()], - collect_system_metrics=False, ) wait_for_check_token_thread() @@ -583,7 +580,6 @@ def test_configure_service_version(tmp_path: str) -> None: token='abc2', 
service_version='1.2.3', additional_metric_readers=[InMemoryMetricReader()], - collect_system_metrics=False, ) assert GLOBAL_CONFIG.service_version == '1.2.3' @@ -591,7 +587,6 @@ def test_configure_service_version(tmp_path: str) -> None: configure( token='abc3', additional_metric_readers=[InMemoryMetricReader()], - collect_system_metrics=False, ) assert GLOBAL_CONFIG.service_version == git_sha @@ -603,7 +598,6 @@ def test_configure_service_version(tmp_path: str) -> None: configure( token='abc4', additional_metric_readers=[InMemoryMetricReader()], - collect_system_metrics=False, ) assert GLOBAL_CONFIG.service_version is None finally: @@ -866,7 +860,7 @@ def test_initialize_project_use_existing_project_no_projects(tmp_dir_cwd: Path, } request_mocker.post('https://logfire-api.pydantic.dev/v1/projects/fake_org', [create_project_response]) - logfire.configure(send_to_logfire=True, collect_system_metrics=False) + logfire.configure(send_to_logfire=True) assert confirm_mock.mock_calls == [ call('The project will be created in the organization "fake_org". Continue?', default=True), @@ -901,7 +895,7 @@ def test_initialize_project_use_existing_project(tmp_dir_cwd: Path, tmp_path: Pa [create_project_response], ) - logfire.configure(send_to_logfire=True, collect_system_metrics=False) + logfire.configure(send_to_logfire=True) assert confirm_mock.mock_calls == [ call('Do you want to use one of your existing projects? 
', default=True), @@ -960,7 +954,6 @@ def test_initialize_project_not_using_existing_project( logfire.configure( send_to_logfire=True, - collect_system_metrics=False, ) assert confirm_mock.mock_calls == [ @@ -1001,7 +994,7 @@ def test_initialize_project_not_confirming_organization(tmp_path: Path) -> None: ) with pytest.raises(SystemExit): - logfire.configure(data_dir=tmp_path, send_to_logfire=True, collect_system_metrics=False) + logfire.configure(data_dir=tmp_path, send_to_logfire=True) assert confirm_mock.mock_calls == [ call('Do you want to use one of your existing projects? ', default=True), @@ -1078,7 +1071,7 @@ def test_initialize_project_create_project(tmp_dir_cwd: Path, tmp_path: Path, ca ], ) - logfire.configure(send_to_logfire=True, collect_system_metrics=False) + logfire.configure(send_to_logfire=True) for request in request_mocker.request_history: assert request.headers['Authorization'] == 'fake_user_token' @@ -1161,7 +1154,7 @@ def test_initialize_project_create_project_default_organization(tmp_dir_cwd: Pat [create_project_response], ) - logfire.configure(send_to_logfire=True, collect_system_metrics=False) + logfire.configure(send_to_logfire=True) assert prompt_mock.mock_calls == [ call( @@ -1193,7 +1186,7 @@ def test_send_to_logfire_true(tmp_path: Path) -> None: ) ) with pytest.raises(RuntimeError, match='^expected$'): - configure(send_to_logfire=True, console=False, data_dir=data_dir, collect_system_metrics=False) + configure(send_to_logfire=True, console=False, data_dir=data_dir) def test_send_to_logfire_false() -> None: @@ -1340,7 +1333,7 @@ def test_configure_fstring_python_38(): def test_default_exporters(monkeypatch: pytest.MonkeyPatch): monkeypatch.setattr(LogfireConfig, '_initialize_credentials_from_token', lambda *args: None) # type: ignore - logfire.configure(send_to_logfire=True, token='foo', collect_system_metrics=False) + logfire.configure(send_to_logfire=True, token='foo') [console_processor, send_to_logfire_processor, 
pending_span_processor] = get_span_processors() @@ -1382,7 +1375,7 @@ def test_custom_exporters(): def test_otel_exporter_otlp_endpoint_env_var(): # Setting this env var creates an OTLPSpanExporter and an OTLPMetricExporter with patch.dict(os.environ, {'OTEL_EXPORTER_OTLP_ENDPOINT': 'otel_endpoint'}): - logfire.configure(send_to_logfire=False, console=False, collect_system_metrics=False) + logfire.configure(send_to_logfire=False, console=False) [otel_processor] = get_span_processors() assert isinstance(otel_processor, MainSpanProcessorWrapper) @@ -1399,7 +1392,7 @@ def test_otel_exporter_otlp_endpoint_env_var(): def test_otel_traces_exporter_env_var(): # Setting OTEL_TRACES_EXPORTER to something other than otlp prevents creating an OTLPSpanExporter with patch.dict(os.environ, {'OTEL_EXPORTER_OTLP_ENDPOINT': 'otel_endpoint2', 'OTEL_TRACES_EXPORTER': 'grpc'}): - logfire.configure(send_to_logfire=False, console=False, collect_system_metrics=False) + logfire.configure(send_to_logfire=False, console=False) assert len(list(get_span_processors())) == 0 @@ -1440,7 +1433,7 @@ def test_otel_exporter_otlp_traces_endpoint_env_var(): def test_otel_exporter_otlp_metrics_endpoint_env_var(): # Setting just OTEL_EXPORTER_OTLP_METRICS_ENDPOINT only creates an OTLPMetricExporter with patch.dict(os.environ, {'OTEL_EXPORTER_OTLP_METRICS_ENDPOINT': 'otel_metrics_endpoint'}): - logfire.configure(send_to_logfire=False, console=False, collect_system_metrics=False) + logfire.configure(send_to_logfire=False, console=False) assert len(list(get_span_processors())) == 0 diff --git a/tests/test_secret_scrubbing.py b/tests/test_secret_scrubbing.py index 16b73b7d7..923fc8db5 100644 --- a/tests/test_secret_scrubbing.py +++ b/tests/test_secret_scrubbing.py @@ -231,7 +231,6 @@ def callback(match: logfire.ScrubMatch): id_generator=id_generator, ns_timestamp_generator=time_generator, additional_span_processors=[SimpleSpanProcessor(exporter)], - collect_system_metrics=False, ) # Note the values (or lack 
thereof) of each of these attributes in the exported span. @@ -279,7 +278,6 @@ def test_dont_scrub_resource( id_generator=id_generator, ns_timestamp_generator=time_generator, additional_span_processors=[SimpleSpanProcessor(exporter)], - collect_system_metrics=False, ) logfire.info('hi') assert dict(exporter.exported_spans[0].resource.attributes) == IsPartialDict( From f882a10930bc328bcbcc5001c72c6c101f663624 Mon Sep 17 00:00:00 2001 From: Alex Hall Date: Thu, 8 Aug 2024 18:01:52 +0200 Subject: [PATCH 07/48] Test config errors --- tests/test_configure.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/tests/test_configure.py b/tests/test_configure.py index 3856b3059..d05a0be88 100644 --- a/tests/test_configure.py +++ b/tests/test_configure.py @@ -11,6 +11,7 @@ from unittest import mock from unittest.mock import call, patch +import inline_snapshot.extra import pytest import requests_mock from inline_snapshot import snapshot @@ -1449,3 +1450,23 @@ def get_span_processors() -> Iterable[SpanProcessor]: def get_metric_readers() -> Iterable[SpanProcessor]: return get_meter_provider().provider._sdk_config.metric_readers # type: ignore + + +def test_collect_system_metrics_false(): + with inline_snapshot.extra.raises( + snapshot( + 'ValueError: The `collect_system_metrics` argument has been removed. ' + 'System metrics are no longer collected by default.' + ) + ): + logfire.configure(collect_system_metrics=False) # type: ignore + + +def test_collect_system_metrics_true(): + with inline_snapshot.extra.raises( + snapshot( + 'ValueError: The `collect_system_metrics` argument has been removed. ' + 'Use `logfire.instrument_system_metrics()` instead.' 
+ ) + ): + logfire.configure(collect_system_metrics=True) # type: ignore From 4a9ade1a5cff6e7fd3f68ed36d6b99242e19b2f6 Mon Sep 17 00:00:00 2001 From: Alex Hall Date: Thu, 8 Aug 2024 18:04:47 +0200 Subject: [PATCH 08/48] Update generated stubs --- logfire-api/logfire_api/_internal/config.pyi | 12 +++++------- logfire-api/logfire_api/_internal/config_params.pyi | 2 -- logfire-api/logfire_api/_internal/metrics.pyi | 6 ------ 3 files changed, 5 insertions(+), 15 deletions(-) diff --git a/logfire-api/logfire_api/_internal/config.pyi b/logfire-api/logfire_api/_internal/config.pyi index f064b3f68..7a7a1b1ba 100644 --- a/logfire-api/logfire_api/_internal/config.pyi +++ b/logfire-api/logfire_api/_internal/config.pyi @@ -14,7 +14,7 @@ from .exporters.remove_pending import RemovePendingSpansExporter as RemovePendin from .exporters.tail_sampling import TailSamplingOptions as TailSamplingOptions, TailSamplingProcessor as TailSamplingProcessor from .exporters.test import TestExporter as TestExporter from .integrations.executors import instrument_executors as instrument_executors -from .metrics import ProxyMeterProvider as ProxyMeterProvider, configure_metrics as configure_metrics +from .metrics import ProxyMeterProvider as ProxyMeterProvider from .scrubbing import BaseScrubber as BaseScrubber, NOOP_SCRUBBER as NOOP_SCRUBBER, ScrubCallback as ScrubCallback, Scrubber as Scrubber, ScrubbingOptions as ScrubbingOptions from .stack_info import warn_at_user_stacklevel as warn_at_user_stacklevel from .tracer import PendingSpanProcessor as PendingSpanProcessor, ProxyTracerProvider as ProxyTracerProvider @@ -54,7 +54,7 @@ class PydanticPlugin: include: set[str] = ... exclude: set[str] = ... 
-def configure(*, send_to_logfire: bool | Literal['if-token-present'] | None = None, token: str | None = None, project_name: str | None = None, service_name: str | None = None, service_version: str | None = None, trace_sample_rate: float | None = None, console: ConsoleOptions | Literal[False] | None = None, show_summary: bool | None = None, config_dir: Path | str | None = None, data_dir: Path | str | None = None, base_url: str | None = None, collect_system_metrics: bool | None = None, id_generator: IdGenerator | None = None, ns_timestamp_generator: Callable[[], int] | None = None, processors: None = None, additional_span_processors: Sequence[SpanProcessor] | None = None, default_span_processor: Callable[[SpanExporter], SpanProcessor] | None = None, metric_readers: None = None, additional_metric_readers: Sequence[MetricReader] | None = None, pydantic_plugin: PydanticPlugin | None = None, fast_shutdown: bool = False, scrubbing_patterns: Sequence[str] | None = None, scrubbing_callback: ScrubCallback | None = None, scrubbing: ScrubbingOptions | Literal[False] | None = None, inspect_arguments: bool | None = None, tail_sampling: TailSamplingOptions | None = None) -> None: +def configure(*, send_to_logfire: bool | Literal['if-token-present'] | None = None, token: str | None = None, project_name: str | None = None, service_name: str | None = None, service_version: str | None = None, trace_sample_rate: float | None = None, console: ConsoleOptions | Literal[False] | None = None, show_summary: bool | None = None, config_dir: Path | str | None = None, data_dir: Path | str | None = None, base_url: str | None = None, collect_system_metrics: None = None, id_generator: IdGenerator | None = None, ns_timestamp_generator: Callable[[], int] | None = None, processors: None = None, additional_span_processors: Sequence[SpanProcessor] | None = None, default_span_processor: Callable[[SpanExporter], SpanProcessor] | None = None, metric_readers: None = None, additional_metric_readers: 
Sequence[MetricReader] | None = None, pydantic_plugin: PydanticPlugin | None = None, fast_shutdown: bool = False, scrubbing_patterns: Sequence[str] | None = None, scrubbing_callback: ScrubCallback | None = None, scrubbing: ScrubbingOptions | Literal[False] | None = None, inspect_arguments: bool | None = None, tail_sampling: TailSamplingOptions | None = None) -> None: """Configure the logfire SDK. Args: @@ -81,8 +81,7 @@ def configure(*, send_to_logfire: bool | Literal['if-token-present'] | None = No `LOGFIRE_CONFIG_DIR` environment variable, otherwise defaults to the current working directory. data_dir: Directory to store credentials, and logs. If `None` uses the `LOGFIRE_CREDENTIALS_DIR` environment variable, otherwise defaults to `'.logfire'`. base_url: Root URL for the Logfire API. If `None` uses the `LOGFIRE_BASE_URL` environment variable, otherwise defaults to https://logfire-api.pydantic.dev. - collect_system_metrics: Whether to collect system metrics like CPU and memory usage. If `None` uses the `LOGFIRE_COLLECT_SYSTEM_METRICS` environment variable, - otherwise defaults to `True`. + collect_system_metrics: Legacy argument, use `logfire.instrument_system_metrics()` instead. id_generator: Generator for span IDs. Defaults to `RandomIdGenerator()` from the OpenTelemetry SDK. ns_timestamp_generator: Generator for nanosecond timestamps. Defaults to [`time.time_ns`][time.time_ns] from the Python standard library. 
@@ -127,7 +126,6 @@ class _LogfireConfigData: console: ConsoleOptions | Literal[False] | None show_summary: bool data_dir: Path - collect_system_metrics: bool id_generator: IdGenerator ns_timestamp_generator: Callable[[], int] additional_span_processors: Sequence[SpanProcessor] | None @@ -139,14 +137,14 @@ class _LogfireConfigData: tail_sampling: TailSamplingOptions | None class LogfireConfig(_LogfireConfigData): - def __init__(self, base_url: str | None = None, send_to_logfire: bool | None = None, token: str | None = None, project_name: str | None = None, service_name: str | None = None, service_version: str | None = None, trace_sample_rate: float | None = None, console: ConsoleOptions | Literal[False] | None = None, show_summary: bool | None = None, config_dir: Path | None = None, data_dir: Path | None = None, collect_system_metrics: bool | None = None, id_generator: IdGenerator | None = None, ns_timestamp_generator: Callable[[], int] | None = None, additional_span_processors: Sequence[SpanProcessor] | None = None, default_span_processor: Callable[[SpanExporter], SpanProcessor] | None = None, additional_metric_readers: Sequence[MetricReader] | None = None, pydantic_plugin: PydanticPlugin | None = None, fast_shutdown: bool = False, scrubbing: ScrubbingOptions | Literal[False] | None = None, inspect_arguments: bool | None = None, tail_sampling: TailSamplingOptions | None = None) -> None: + def __init__(self, base_url: str | None = None, send_to_logfire: bool | None = None, token: str | None = None, project_name: str | None = None, service_name: str | None = None, service_version: str | None = None, trace_sample_rate: float | None = None, console: ConsoleOptions | Literal[False] | None = None, show_summary: bool | None = None, config_dir: Path | None = None, data_dir: Path | None = None, id_generator: IdGenerator | None = None, ns_timestamp_generator: Callable[[], int] | None = None, additional_span_processors: Sequence[SpanProcessor] | None = None, 
default_span_processor: Callable[[SpanExporter], SpanProcessor] | None = None, additional_metric_readers: Sequence[MetricReader] | None = None, pydantic_plugin: PydanticPlugin | None = None, fast_shutdown: bool = False, scrubbing: ScrubbingOptions | Literal[False] | None = None, inspect_arguments: bool | None = None, tail_sampling: TailSamplingOptions | None = None) -> None: """Create a new LogfireConfig. Users should never need to call this directly, instead use `logfire.configure`. See `_LogfireConfigData` for parameter documentation. """ - def configure(self, base_url: str | None, send_to_logfire: bool | Literal['if-token-present'] | None, token: str | None, project_name: str | None, service_name: str | None, service_version: str | None, trace_sample_rate: float | None, console: ConsoleOptions | Literal[False] | None, show_summary: bool | None, config_dir: Path | None, data_dir: Path | None, collect_system_metrics: bool | None, id_generator: IdGenerator | None, ns_timestamp_generator: Callable[[], int] | None, additional_span_processors: Sequence[SpanProcessor] | None, default_span_processor: Callable[[SpanExporter], SpanProcessor] | None, additional_metric_readers: Sequence[MetricReader] | None, pydantic_plugin: PydanticPlugin | None, fast_shutdown: bool, scrubbing: ScrubbingOptions | Literal[False] | None, inspect_arguments: bool | None, tail_sampling: TailSamplingOptions | None) -> None: ... 
+ def configure(self, base_url: str | None, send_to_logfire: bool | Literal['if-token-present'] | None, token: str | None, project_name: str | None, service_name: str | None, service_version: str | None, trace_sample_rate: float | None, console: ConsoleOptions | Literal[False] | None, show_summary: bool | None, config_dir: Path | None, data_dir: Path | None, id_generator: IdGenerator | None, ns_timestamp_generator: Callable[[], int] | None, additional_span_processors: Sequence[SpanProcessor] | None, default_span_processor: Callable[[SpanExporter], SpanProcessor] | None, additional_metric_readers: Sequence[MetricReader] | None, pydantic_plugin: PydanticPlugin | None, fast_shutdown: bool, scrubbing: ScrubbingOptions | Literal[False] | None, inspect_arguments: bool | None, tail_sampling: TailSamplingOptions | None) -> None: ... def initialize(self) -> ProxyTracerProvider: """Configure internals to start exporting traces and metrics.""" def force_flush(self, timeout_millis: int = 30000) -> bool: diff --git a/logfire-api/logfire_api/_internal/config_params.pyi b/logfire-api/logfire_api/_internal/config_params.pyi index adb32310a..8780ce420 100644 --- a/logfire-api/logfire_api/_internal/config_params.pyi +++ b/logfire-api/logfire_api/_internal/config_params.pyi @@ -9,7 +9,6 @@ from logfire.exceptions import LogfireConfigError as LogfireConfigError from pathlib import Path from typing import Any, Callable, TypeVar -COLLECT_SYSTEM_METRICS_DEFAULT: bool T = TypeVar('T') slots_true: Incomplete PydanticPluginRecordValues: Incomplete @@ -38,7 +37,6 @@ SERVICE_NAME: Incomplete SERVICE_VERSION: Incomplete SHOW_SUMMARY: Incomplete CREDENTIALS_DIR: Incomplete -COLLECT_SYSTEM_METRICS: Incomplete CONSOLE: Incomplete CONSOLE_COLORS: Incomplete CONSOLE_SPAN_STYLE: Incomplete diff --git a/logfire-api/logfire_api/_internal/metrics.pyi b/logfire-api/logfire_api/_internal/metrics.pyi index eb09308f2..a5f56d253 100644 --- a/logfire-api/logfire_api/_internal/metrics.pyi +++ 
b/logfire-api/logfire_api/_internal/metrics.pyi @@ -8,12 +8,6 @@ from typing import Any, Generic, Sequence, TypeVar from weakref import WeakSet Gauge: Incomplete -CPU_FIELDS: Incomplete -MEMORY_FIELDS: Incomplete -DEFAULT_CONFIG: Incomplete -INSTRUMENTOR: Incomplete - -def configure_metrics(meter_provider: MeterProvider) -> None: ... @dataclasses.dataclass class ProxyMeterProvider(MeterProvider): From be7282c1162591fce1235bae117e0e05f679f72c Mon Sep 17 00:00:00 2001 From: Alex Hall Date: Tue, 13 Aug 2024 14:16:30 +0200 Subject: [PATCH 09/48] Don't uninstrument automatically --- logfire/_internal/integrations/system_metrics.py | 2 -- tests/otel_integrations/test_system_metrics.py | 6 ++++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/logfire/_internal/integrations/system_metrics.py b/logfire/_internal/integrations/system_metrics.py index 489a78181..698ea4319 100644 --- a/logfire/_internal/integrations/system_metrics.py +++ b/logfire/_internal/integrations/system_metrics.py @@ -55,7 +55,6 @@ # Docs: https://psutil.readthedocs.io/en/latest/#psutil.virtual_memory MEMORY_FIELDS = 'total available used free active inactive buffers cached shared wired slab'.split() -# Based on opentelemetry/instrumentation/system_metrics/__init__.py DEFAULT_CONFIG: ConfigDict = { **cast(ConfigDict, _DEFAULT_CONFIG), 'system.cpu.time': CPU_FIELDS, @@ -109,5 +108,4 @@ def parse_config(config: Config) -> ConfigDict: def instrument_system_metrics(meter_provider: MeterProvider, config: Any = 'basic') -> None: - SystemMetricsInstrumentor().uninstrument() # type: ignore SystemMetricsInstrumentor(config=parse_config(config)).instrument(meter_provider=meter_provider) # type: ignore diff --git a/tests/otel_integrations/test_system_metrics.py b/tests/otel_integrations/test_system_metrics.py index 1473857c8..cdecc4054 100644 --- a/tests/otel_integrations/test_system_metrics.py +++ b/tests/otel_integrations/test_system_metrics.py @@ -1,6 +1,7 @@ from __future__ import annotations 
from inline_snapshot import snapshot +from opentelemetry.instrumentation.system_metrics import SystemMetricsInstrumentor from opentelemetry.sdk.metrics.export import InMemoryMetricReader import logfire @@ -29,6 +30,7 @@ def test_default_system_metrics_collection(metrics_reader: InMemoryMetricReader) 'system.thread_count', ] ) + SystemMetricsInstrumentor().uninstrument() # type: ignore def test_single_system_metric_collection(metrics_reader: InMemoryMetricReader) -> None: @@ -36,6 +38,7 @@ def test_single_system_metric_collection(metrics_reader: InMemoryMetricReader) - assert get_collected_metric_names(metrics_reader) == [ 'system.cpu.time', ] + SystemMetricsInstrumentor().uninstrument() # type: ignore def test_list_with_basic_system_metrics_collection(metrics_reader: InMemoryMetricReader) -> None: @@ -49,6 +52,7 @@ def test_list_with_basic_system_metrics_collection(metrics_reader: InMemoryMetri 'system.thread_count', ] ) + SystemMetricsInstrumentor().uninstrument() # type: ignore def test_all_system_metrics_collection(metrics_reader: InMemoryMetricReader) -> None: @@ -78,6 +82,7 @@ def test_all_system_metrics_collection(metrics_reader: InMemoryMetricReader) -> 'system.thread_count', ] ) + SystemMetricsInstrumentor().uninstrument() # type: ignore def test_dict_with_basic_system_metrics_collection(metrics_reader: InMemoryMetricReader) -> None: @@ -133,3 +138,4 @@ def test_dict_with_basic_system_metrics_collection(metrics_reader: InMemoryMetri 'system.thread_count', ] ) + SystemMetricsInstrumentor().uninstrument() # type: ignore From ea40267420328cb95b1e6032848c09c1eccf38be Mon Sep 17 00:00:00 2001 From: Alex Hall Date: Wed, 14 Aug 2024 13:28:20 +0200 Subject: [PATCH 10/48] wip --- docs/integrations/system_metrics.md | 72 ++++++++++++++++++- .../_internal/integrations/system_metrics.py | 42 ++++++----- .../otel_integrations/test_system_metrics.py | 8 +-- 3 files changed, 96 insertions(+), 26 deletions(-) diff --git a/docs/integrations/system_metrics.md 
b/docs/integrations/system_metrics.md index 0cb811915..3cacec848 100644 --- a/docs/integrations/system_metrics.md +++ b/docs/integrations/system_metrics.md @@ -1,9 +1,77 @@ -By default, **Logfire** does not collect system metrics. +The [`logfire.instrument_system_metrics()`][logfire.Logfire.instrument_system_metrics] method can be used to collect system metrics with **Logfire**, such as CPU and memory usage. -To enable metrics, you need to install the `logfire[system-metrics]` extra: +## Installation + +Install `logfire` with the `system-metrics` extra: {{ install_logfire(extras=['system-metrics']) }} +## Usage + +```py +import logfire + +logfire.configure() + +logfire.instrument_system_metrics() +``` + +Then in your project, click on 'Dashboards' in the top bar, click 'New Dashboard', and select 'Basic System Metrics' from the dropdown. + +## Configuration + +By default, `instrument_system_metrics` collects only the metrics it needs to display the 'Basic System Metrics' dashboard. You can choose exactly which metrics to collect, and also how much information to collect about each metric, by passing an argument. +Here's what it looks like to specify the default configuration in full detail: + +```py +logfire.instrument_system_metrics({ + 'system.cpu.utilization': ['idle', 'iowait', 'user', 'system', 'irq', 'softirq'], + 'system.memory.utilization': ['available', 'used', 'free', 'active', 'inactive', 'buffers', 'cached', 'shared', 'wired', 'slab'], + 'system.swap.usage': ['used', 'free'], + # There are no fields to configure for 'system.thread_count', so the value is None. + 'system.thread_count': None, +}) +``` + +Each key here is a metric name. The values have different meanings for different metrics. For example, for `system.cpu.utilization`, the value is a list of CPU modes. So there will be a separate row for each CPU core saying what percentage of time it spent idle, another row for the time spent waiting for IO, etc. 
There are no fields to configure for `system.thread_count`, so the value is `None`. + +To make it convenient to tweak the defaults, the argument can have many different shapes. To demonstrate, the following are all equivalent to the default: + +```py +# If you don't need to configure the details of any metric, you can just pass a list of metric names. +logfire.instrument_system_metrics([ + 'system.cpu.utilization', + 'system.memory.utilization', + 'system.swap.usage', + 'system.thread_count', +]) + +# If you need to configure the details of *some* metrics but not all, +# you can pass a dict with values for the metrics you want to configure, +# and `None` for the other metrics you just want to include with the default configuration. +logfire.instrument_system_metrics({ + 'system.cpu.utilization': None, + 'system.memory.utilization': None, + 'system.swap.usage': None, + 'system.thread_count': None, +}) + + +# You can also pass a single metric name as a string, e.g: +# logfire.instrument_system_metrics('system.swap.usage') +# The string 'basic' is a special shortcut for the default metric names. +# You can also pass a list including 'basic' to add metrics to the default, e.g: +# logfire.instrument_system_metrics(['basic', 'system.network.io']) +logfire.instrument_system_metrics('basic') + +# Or you can use 'basic' as a dict key with the value `None` +# if you want to include and configure additional metrics. +# You cannot set the dict value of 'basic' to anything else since it contains different metrics. 
+logfire.instrument_system_metrics({'basic': None}) +``` + + + ### Available Metrics Logfire collects the following system metrics: diff --git a/logfire/_internal/integrations/system_metrics.py b/logfire/_internal/integrations/system_metrics.py index 698ea4319..9eb0ddcc6 100644 --- a/logfire/_internal/integrations/system_metrics.py +++ b/logfire/_internal/integrations/system_metrics.py @@ -42,13 +42,20 @@ ConfigString = Union[Literal['basic'], MetricName] ConfigDict = Dict[MetricName, Optional[Iterable[str]]] -Config = Union[Literal['all'], ConfigString, Iterable[ConfigString], Dict[ConfigString, Optional[Iterable[str]]]] +Config = Union[ + Literal['all'], + ConfigString, + Iterable[ConfigString], + Dict[Union[ConfigString, Literal['all']], Optional[Iterable[str]]], +] # All the cpu_times fields provided by psutil (used by system_metrics) across all platforms, # except for 'guest' and 'guest_nice' which are included in 'user' and 'nice' in Linux (see psutil._cpu_tot_time). # Docs: https://psutil.readthedocs.io/en/latest/#psutil.cpu_times -CPU_FIELDS = 'idle user system irq softirq nice iowait steal interrupt dpc'.split() +# TODO +# CPU_FIELDS = 'idle user system irq softirq nice iowait steal interrupt dpc'.split() +CPU_FIELDS = ['idle', 'iowait', 'user', 'system', 'irq', 'softirq'] # All the virtual_memory fields provided by psutil across all platforms, # except for 'percent' which can be calculated as `(total - available) / total * 100`. 
@@ -83,27 +90,26 @@ def parse_config(config: Config) -> ConfigDict: return DEFAULT_CONFIG config = [config] - config_dict: Dict[ConfigString, Optional[Iterable[str]]] + config_dict: Dict[Union[ConfigString, Literal['all']], Optional[Iterable[str]]] if isinstance(config, dict): - config_dict = config + config_dict = config.copy() # type: ignore else: - config_dict = {} - key: ConfigString - for key in config: - if key == 'basic': - config_dict[key] = None - else: - config_dict[key] = DEFAULT_CONFIG[key] + config_dict = {key: None for key in config} result: ConfigDict = {} + for shortcut in ['basic', 'all']: + if shortcut in config_dict: + del config_dict[shortcut] + if shortcut == 'basic': + for metric in BASIC_METRICS: + result[metric] = DEFAULT_CONFIG[metric] + elif shortcut == 'all': + result.update(DEFAULT_CONFIG) + for key, value in config_dict.items(): - if key == 'basic': - for metric in BASIC_METRICS: - result[metric] = DEFAULT_CONFIG[metric] - elif value is None: - result[key] = DEFAULT_CONFIG[key] - else: - result[key] = value + assert key not in ('basic', 'all') + result[key] = value or DEFAULT_CONFIG[key] + return result diff --git a/tests/otel_integrations/test_system_metrics.py b/tests/otel_integrations/test_system_metrics.py index cdecc4054..98bf6f0f3 100644 --- a/tests/otel_integrations/test_system_metrics.py +++ b/tests/otel_integrations/test_system_metrics.py @@ -87,9 +87,9 @@ def test_all_system_metrics_collection(metrics_reader: InMemoryMetricReader) -> def test_dict_with_basic_system_metrics_collection(metrics_reader: InMemoryMetricReader) -> None: config: Config = { - 'basic': None, 'system.cpu.time': None, 'system.cpu.utilization': ['idle'], + 'basic': None, } assert parse_config(config) == snapshot( { @@ -116,15 +116,11 @@ def test_dict_with_basic_system_metrics_collection(metrics_reader: InMemoryMetri ], 'system.cpu.time': [ 'idle', + 'iowait', 'user', 'system', 'irq', 'softirq', - 'nice', - 'iowait', - 'steal', - 'interrupt', - 'dpc', ], } 
) From 7b397d667c732cec60011eeb7b6fd2c505486932 Mon Sep 17 00:00:00 2001 From: Alex Hall Date: Thu, 15 Aug 2024 13:31:59 +0200 Subject: [PATCH 11/48] docs for new API --- docs/integrations/system_metrics.md | 69 +++++++++---------- .../_internal/integrations/system_metrics.py | 27 ++++---- 2 files changed, 46 insertions(+), 50 deletions(-) diff --git a/docs/integrations/system_metrics.md b/docs/integrations/system_metrics.md index 3cacec848..6c49b1e7d 100644 --- a/docs/integrations/system_metrics.md +++ b/docs/integrations/system_metrics.md @@ -20,57 +20,52 @@ Then in your project, click on 'Dashboards' in the top bar, click 'New Dashboard ## Configuration -By default, `instrument_system_metrics` collects only the metrics it needs to display the 'Basic System Metrics' dashboard. You can choose exactly which metrics to collect, and also how much information to collect about each metric, by passing an argument. -Here's what it looks like to specify the default configuration in full detail: +By default, `instrument_system_metrics` collects only the metrics it needs to display the 'Basic System Metrics' dashboard. You can choose exactly which metrics to collect and how much data to collect about each metric. The default is equivalent to this: ```py logfire.instrument_system_metrics({ - 'system.cpu.utilization': ['idle', 'iowait', 'user', 'system', 'irq', 'softirq'], - 'system.memory.utilization': ['available', 'used', 'free', 'active', 'inactive', 'buffers', 'cached', 'shared', 'wired', 'slab'], - 'system.swap.usage': ['used', 'free'], - # There are no fields to configure for 'system.thread_count', so the value is None. - 'system.thread_count': None, + 'logfire.system.cpu.simple_utilization': None, + 'system.memory.utilization': ['available'], + 'system.swap.utilization': ['used'], }) ``` -Each key here is a metric name. The values have different meanings for different metrics. For example, for `system.cpu.utilization`, the value is a list of CPU modes. 
So there will be a separate row for each CPU core saying what percentage of time it spent idle, another row for the time spent waiting for IO, etc. There are no fields to configure for `system.thread_count`, so the value is `None`. - -To make it convenient to tweak the defaults, the argument can have many different shapes. To demonstrate, the following are all equivalent to the default: +To collect lots of detailed data about all available metrics, use `logfire.instrument_system_metrics(base='full')`. This is equivalent to: ```py -# If you don't need to configure the details of any metric, you can just pass a list of metric names. -logfire.instrument_system_metrics([ - 'system.cpu.utilization', - 'system.memory.utilization', - 'system.swap.usage', - 'system.thread_count', -]) - -# If you need to configure the details of *some* metrics but not all, -# you can pass a dict with values for the metrics you want to configure, -# and `None` for the other metrics you just want to include with the default configuration. 
logfire.instrument_system_metrics({ - 'system.cpu.utilization': None, - 'system.memory.utilization': None, - 'system.swap.usage': None, + 'logfire.system.cpu.simple_utilization': None, + 'system.cpu.time': ['idle', 'user', 'system', 'irq', 'softirq', 'nice', 'iowait', 'steal', 'interrupt', 'dpc'], + 'system.cpu.utilization': ['idle', 'user', 'system', 'irq', 'softirq', 'nice', 'iowait', 'steal', 'interrupt', 'dpc'], + 'system.memory.usage': ['available', 'used', 'free', 'active', 'inactive', 'buffers', 'cached', 'shared', 'wired', 'slab', 'total'], + 'system.memory.utilization': ['available', 'used', 'free', 'active', 'inactive', 'buffers', 'cached', 'shared', 'wired', 'slab'], + 'system.swap.usage': ['used', 'free'], + 'system.swap.utilization': ['used'], + 'system.disk.io': ['read', 'write'], + 'system.disk.operations': ['read', 'write'], + 'system.disk.time': ['read', 'write'], + 'system.network.dropped.packets': ['transmit', 'receive'], + 'system.network.packets': ['transmit', 'receive'], + 'system.network.errors': ['transmit', 'receive'], + 'system.network.io': ['transmit', 'receive'], 'system.thread_count': None, + 'process.runtime.memory': ['rss', 'vms'], + 'process.runtime.cpu.time': ['user', 'system'], + 'process.runtime.gc_count': None, + 'process.runtime.thread_count': None, + 'process.runtime.cpu.utilization': None, + 'process.runtime.context_switches': ['involuntary', 'voluntary'], + 'process.open_file_descriptor.count': None, }) - - -# You can also pass a single metric name as a string, e.g: -# logfire.instrument_system_metrics('system.swap.usage') -# The string 'basic' is a special shortcut for the default metric names. -# You can also pass a list including 'basic' to add metrics to the default, e.g: -# logfire.instrument_system_metrics(['basic', 'system.network.io']) -logfire.instrument_system_metrics('basic') - -# Or you can use 'basic' as a dict key with the value `None` -# if you want to include and configure additional metrics. 
-# You cannot set the dict value of 'basic' to anything else since it contains different metrics. -logfire.instrument_system_metrics({'basic': None}) ``` +Each key here is a metric name. The values have different meanings for different metrics. For example, for `system.cpu.utilization`, the value is a list of CPU modes. So there will be a separate row for each CPU core saying what percentage of time it spent idle, another row for the time spent waiting for IO, etc. There are no fields to configure for `system.thread_count`, so the value is `None`. + +The first dict argument is merged with the base. For example, if you want to collect disk read operations (but not writes) you can write: +- `logfire.instrument_system_metrics({'system.disk.operations': ['read']})` to collect that data in addition to the basic defaults. +- `logfire.instrument_system_metrics({'system.disk.operations': ['read']}, base='full')` to collect detailed data about all metrics, excluding disk write operations. +- `logfire.instrument_system_metrics({'system.disk.operations': ['read']}, base=None)` to collect only disk read operations and nothing else. ### Available Metrics diff --git a/logfire/_internal/integrations/system_metrics.py b/logfire/_internal/integrations/system_metrics.py index 9eb0ddcc6..d5a7ab50f 100644 --- a/logfire/_internal/integrations/system_metrics.py +++ b/logfire/_internal/integrations/system_metrics.py @@ -16,6 +16,7 @@ ) MetricName = Literal[ + 'logfire.system.cpu.simple_utilization':None, 'system.cpu.time', 'system.cpu.utilization', 'system.memory.usage', @@ -49,32 +50,32 @@ Dict[Union[ConfigString, Literal['all']], Optional[Iterable[str]]], ] - # All the cpu_times fields provided by psutil (used by system_metrics) across all platforms, # except for 'guest' and 'guest_nice' which are included in 'user' and 'nice' in Linux (see psutil._cpu_tot_time). 
# Docs: https://psutil.readthedocs.io/en/latest/#psutil.cpu_times -# TODO -# CPU_FIELDS = 'idle user system irq softirq nice iowait steal interrupt dpc'.split() -CPU_FIELDS = ['idle', 'iowait', 'user', 'system', 'irq', 'softirq'] +CPU_FIELDS = 'idle user system irq softirq nice iowait steal interrupt dpc'.split() # All the virtual_memory fields provided by psutil across all platforms, # except for 'percent' which can be calculated as `(total - available) / total * 100`. # Docs: https://psutil.readthedocs.io/en/latest/#psutil.virtual_memory -MEMORY_FIELDS = 'total available used free active inactive buffers cached shared wired slab'.split() +MEMORY_FIELDS = 'available used free active inactive buffers cached shared wired slab'.split() -DEFAULT_CONFIG: ConfigDict = { +FULL_CONFIG: ConfigDict = { **cast(ConfigDict, _DEFAULT_CONFIG), + 'logfire.system.cpu.simple_utilization': None, 'system.cpu.time': CPU_FIELDS, 'system.cpu.utilization': CPU_FIELDS, - 'system.memory.usage': MEMORY_FIELDS, + 'system.memory.usage': MEMORY_FIELDS + ['total'], 'system.memory.utilization': MEMORY_FIELDS, + 'system.swap.utilization': ['used'], } +print(FULL_CONFIG) + if sys.platform == 'darwin': # pragma: no cover # see https://github.com/giampaolo/psutil/issues/1219 # upstream pr: https://github.com/open-telemetry/opentelemetry-python-contrib/pull/2008 - DEFAULT_CONFIG.pop('system.network.connections', None) - + FULL_CONFIG.pop('system.network.connections', None) BASIC_METRICS: List[MetricName] = [ 'system.thread_count', # used by process count @@ -87,7 +88,7 @@ def parse_config(config: Config) -> ConfigDict: if isinstance(config, str): if config == 'all': - return DEFAULT_CONFIG + return FULL_CONFIG config = [config] config_dict: Dict[Union[ConfigString, Literal['all']], Optional[Iterable[str]]] @@ -102,13 +103,13 @@ def parse_config(config: Config) -> ConfigDict: del config_dict[shortcut] if shortcut == 'basic': for metric in BASIC_METRICS: - result[metric] = DEFAULT_CONFIG[metric] + 
result[metric] = FULL_CONFIG[metric] elif shortcut == 'all': - result.update(DEFAULT_CONFIG) + result.update(FULL_CONFIG) for key, value in config_dict.items(): assert key not in ('basic', 'all') - result[key] = value or DEFAULT_CONFIG[key] + result[key] = value or FULL_CONFIG[key] return result From 48c31bd77e11e1ec0b289808d86084f3af4f11d7 Mon Sep 17 00:00:00 2001 From: Alex Hall Date: Thu, 15 Aug 2024 13:33:55 +0200 Subject: [PATCH 12/48] Remove available metrics section --- docs/integrations/system_metrics.md | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/docs/integrations/system_metrics.md b/docs/integrations/system_metrics.md index 6c49b1e7d..9bbcc0d2e 100644 --- a/docs/integrations/system_metrics.md +++ b/docs/integrations/system_metrics.md @@ -66,26 +66,3 @@ The first dict argument is merged with the base. For example, if you want to col - `logfire.instrument_system_metrics({'system.disk.operations': ['read']})` to collect that data in addition to the basic defaults. - `logfire.instrument_system_metrics({'system.disk.operations': ['read']}, base='full')` to collect detailed data about all metrics, excluding disk write operations. - `logfire.instrument_system_metrics({'system.disk.operations': ['read']}, base=None)` to collect only disk read operations and nothing else. - -### Available Metrics - -Logfire collects the following system metrics: - -* `system.cpu.time`: CPU time spent in different modes. -* `system.cpu.utilization`: CPU utilization in different modes. -* `system.memory.usage`: Memory usage. -* `system.memory.utilization`: Memory utilization in different modes. -* `system.swap.usage`: Swap usage. -* `system.swap.utilization`: Swap utilization -* `system.disk.io`: Disk I/O operations (read/write). -* `system.disk.operations`: Disk operations (read/write). -* `system.disk.time`: Disk time (read/write). -* `system.network.dropped.packets`: Dropped packets (transmit/receive). 
-* `system.network.packets`: Packets (transmit/receive). -* `system.network.errors`: Network errors (transmit/receive). -* `system.network.io`: Network I/O (transmit/receive). -* `system.network.connections`: Network connections (family/type). -* `system.thread_count`: Thread count. -* `process.runtime.memory`: Process memory usage. -* `process.runtime.cpu.time`: Process CPU time. -* `process.runtime.gc_count`: Process garbage collection count. From 68e3f28603ad7cf4641abd83cdf02c5e5c97d514 Mon Sep 17 00:00:00 2001 From: Alex Hall Date: Thu, 15 Aug 2024 14:17:52 +0200 Subject: [PATCH 13/48] Remove logfire prefix --- docs/integrations/system_metrics.md | 4 ++-- logfire/_internal/integrations/system_metrics.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/integrations/system_metrics.md b/docs/integrations/system_metrics.md index 9bbcc0d2e..eaff8ebf6 100644 --- a/docs/integrations/system_metrics.md +++ b/docs/integrations/system_metrics.md @@ -24,7 +24,7 @@ By default, `instrument_system_metrics` collects only the metrics it needs to di ```py logfire.instrument_system_metrics({ - 'logfire.system.cpu.simple_utilization': None, + 'system.cpu.simple_utilization': None, 'system.memory.utilization': ['available'], 'system.swap.utilization': ['used'], }) @@ -34,7 +34,7 @@ To collect lots of detailed data about all available metrics, use `logfire.instr ```py logfire.instrument_system_metrics({ - 'logfire.system.cpu.simple_utilization': None, + 'system.cpu.simple_utilization': None, 'system.cpu.time': ['idle', 'user', 'system', 'irq', 'softirq', 'nice', 'iowait', 'steal', 'interrupt', 'dpc'], 'system.cpu.utilization': ['idle', 'user', 'system', 'irq', 'softirq', 'nice', 'iowait', 'steal', 'interrupt', 'dpc'], 'system.memory.usage': ['available', 'used', 'free', 'active', 'inactive', 'buffers', 'cached', 'shared', 'wired', 'slab', 'total'], diff --git a/logfire/_internal/integrations/system_metrics.py 
b/logfire/_internal/integrations/system_metrics.py index d5a7ab50f..8330b7682 100644 --- a/logfire/_internal/integrations/system_metrics.py +++ b/logfire/_internal/integrations/system_metrics.py @@ -16,7 +16,7 @@ ) MetricName = Literal[ - 'logfire.system.cpu.simple_utilization':None, + 'system.cpu.simple_utilization', 'system.cpu.time', 'system.cpu.utilization', 'system.memory.usage', @@ -62,7 +62,7 @@ FULL_CONFIG: ConfigDict = { **cast(ConfigDict, _DEFAULT_CONFIG), - 'logfire.system.cpu.simple_utilization': None, + 'system.cpu.simple_utilization': None, 'system.cpu.time': CPU_FIELDS, 'system.cpu.utilization': CPU_FIELDS, 'system.memory.usage': MEMORY_FIELDS + ['total'], From 7a90d6dc88d9ea519256111fb385b37f52fb2419 Mon Sep 17 00:00:00 2001 From: Alex Hall Date: Thu, 15 Aug 2024 18:16:42 +0200 Subject: [PATCH 14/48] simpler API with smaller defaults --- .../_internal/integrations/system_metrics.py | 85 +++++++++---------- logfire/_internal/main.py | 9 +- .../otel_integrations/test_system_metrics.py | 83 +----------------- tests/test_metrics.py | 3 +- 4 files changed, 50 insertions(+), 130 deletions(-) diff --git a/logfire/_internal/integrations/system_metrics.py b/logfire/_internal/integrations/system_metrics.py index 8330b7682..604c46a8b 100644 --- a/logfire/_internal/integrations/system_metrics.py +++ b/logfire/_internal/integrations/system_metrics.py @@ -1,19 +1,25 @@ +from __future__ import annotations + import sys -from typing import Any, Dict, Iterable, List, Literal, Optional, Union, cast +from typing import TYPE_CHECKING, Dict, Iterable, Literal, Optional, cast + +from opentelemetry.metrics import CallbackOptions, Observation -from opentelemetry.metrics import MeterProvider +if TYPE_CHECKING: + from logfire import Logfire try: + import psutil from opentelemetry.instrumentation.system_metrics import ( _DEFAULT_CONFIG, # type: ignore SystemMetricsInstrumentor, ) -except ModuleNotFoundError: # pragma: no cover +except ModuleNotFoundError as e: # pragma: no 
cover raise RuntimeError( '`logfire.instrument_system_metrics()` requires the `opentelemetry-instrumentation-system-metrics` package.\n' 'You can install this with:\n' " pip install 'logfire[system-metrics]'" - ) + ) from e MetricName = Literal[ 'system.cpu.simple_utilization', @@ -41,14 +47,7 @@ 'process.open_file_descriptor.count', ] -ConfigString = Union[Literal['basic'], MetricName] -ConfigDict = Dict[MetricName, Optional[Iterable[str]]] -Config = Union[ - Literal['all'], - ConfigString, - Iterable[ConfigString], - Dict[Union[ConfigString, Literal['all']], Optional[Iterable[str]]], -] +Config = Dict[MetricName, Optional[Iterable[str]]] # All the cpu_times fields provided by psutil (used by system_metrics) across all platforms, # except for 'guest' and 'guest_nice' which are included in 'user' and 'nice' in Linux (see psutil._cpu_tot_time). @@ -60,8 +59,8 @@ # Docs: https://psutil.readthedocs.io/en/latest/#psutil.virtual_memory MEMORY_FIELDS = 'available used free active inactive buffers cached shared wired slab'.split() -FULL_CONFIG: ConfigDict = { - **cast(ConfigDict, _DEFAULT_CONFIG), +FULL_CONFIG: Config = { + **cast(Config, _DEFAULT_CONFIG), 'system.cpu.simple_utilization': None, 'system.cpu.time': CPU_FIELDS, 'system.cpu.utilization': CPU_FIELDS, @@ -70,49 +69,43 @@ 'system.swap.utilization': ['used'], } -print(FULL_CONFIG) - if sys.platform == 'darwin': # pragma: no cover # see https://github.com/giampaolo/psutil/issues/1219 # upstream pr: https://github.com/open-telemetry/opentelemetry-python-contrib/pull/2008 FULL_CONFIG.pop('system.network.connections', None) -BASIC_METRICS: List[MetricName] = [ - 'system.thread_count', # used by process count - 'system.cpu.utilization', - 'system.memory.utilization', - 'system.swap.usage', -] +BASIC_CONFIG: Config = { + 'system.cpu.simple_utilization': None, + 'system.memory.utilization': ['available'], + 'system.swap.utilization': ['used'], +} +Base = Literal['basic', 'full', None] -def parse_config(config: Config) 
-> ConfigDict: - if isinstance(config, str): - if config == 'all': - return FULL_CONFIG - config = [config] - config_dict: Dict[Union[ConfigString, Literal['all']], Optional[Iterable[str]]] - if isinstance(config, dict): - config_dict = config.copy() # type: ignore +def get_base_config(base: Base) -> Config: + if base == 'basic': + return BASIC_CONFIG + elif base == 'full': + return FULL_CONFIG + elif base is None: + return {} else: - config_dict = {key: None for key in config} + raise ValueError(f'Invalid base: {base}') - result: ConfigDict = {} - for shortcut in ['basic', 'all']: - if shortcut in config_dict: - del config_dict[shortcut] - if shortcut == 'basic': - for metric in BASIC_METRICS: - result[metric] = FULL_CONFIG[metric] - elif shortcut == 'all': - result.update(FULL_CONFIG) - for key, value in config_dict.items(): - assert key not in ('basic', 'all') - result[key] = value or FULL_CONFIG[key] +def simple_cpu_utilization_callback(_options: CallbackOptions) -> Iterable[Observation]: + yield Observation(psutil.cpu_percent() / 100) - return result +def instrument_system_metrics(logfire_instance: Logfire, config: Config | None = None, base: Base = 'basic'): + config = {**get_base_config(base), **(config or {})} + SystemMetricsInstrumentor(config=config).instrument() # type: ignore -def instrument_system_metrics(meter_provider: MeterProvider, config: Any = 'basic') -> None: - SystemMetricsInstrumentor(config=parse_config(config)).instrument(meter_provider=meter_provider) # type: ignore + if 'system.cpu.simple_utilization' in config: + logfire_instance.metric_gauge_callback( + 'system.cpu.simple_utilization', + [simple_cpu_utilization_callback], + description='System CPU utilization without attributes', + unit='1', + ) diff --git a/logfire/_internal/main.py b/logfire/_internal/main.py index ad093b6c6..1678a3a9a 100644 --- a/logfire/_internal/main.py +++ b/logfire/_internal/main.py @@ -74,7 +74,7 @@ from .integrations.redis import RedisInstrumentKwargs from 
.integrations.sqlalchemy import SQLAlchemyInstrumentKwargs from .integrations.starlette import StarletteInstrumentKwargs - from .integrations.system_metrics import Config as SystemMetricsConfig + from .integrations.system_metrics import Base as SystemMetricsBase, Config as SystemMetricsConfig from .utils import SysExcInfo # This is the type of the exc_info/_exc_info parameter of the log methods. @@ -1253,16 +1253,19 @@ def instrument_mysql( self._warn_if_not_initialized_for_instrumentation() return instrument_mysql(conn, **kwargs) - def instrument_system_metrics(self, config: SystemMetricsConfig = 'basic'): + def instrument_system_metrics( + self, config: SystemMetricsConfig | None = None, base: SystemMetricsBase = 'basic' + ) -> None: """Instrument the system metrics. Args: config: The system metrics configuration. + base: The system metrics base. """ from .integrations.system_metrics import instrument_system_metrics self._warn_if_not_initialized_for_instrumentation() - return instrument_system_metrics(self.config.get_meter_provider(), config) + return instrument_system_metrics(self, config, base) def metric_counter(self, name: str, *, unit: str = '', description: str = '') -> Counter: """Create a counter metric. 
diff --git a/tests/otel_integrations/test_system_metrics.py b/tests/otel_integrations/test_system_metrics.py index 98bf6f0f3..47810324d 100644 --- a/tests/otel_integrations/test_system_metrics.py +++ b/tests/otel_integrations/test_system_metrics.py @@ -6,7 +6,6 @@ import logfire import logfire._internal.metrics -from logfire._internal.integrations.system_metrics import Config, parse_config from tests.test_metrics import get_collected_metrics @@ -24,39 +23,16 @@ def test_default_system_metrics_collection(metrics_reader: InMemoryMetricReader) logfire.instrument_system_metrics() assert get_collected_metric_names(metrics_reader) == snapshot( [ - 'system.cpu.utilization', + 'system.cpu.simple_utilization', 'system.memory.utilization', - 'system.swap.usage', - 'system.thread_count', - ] - ) - SystemMetricsInstrumentor().uninstrument() # type: ignore - - -def test_single_system_metric_collection(metrics_reader: InMemoryMetricReader) -> None: - logfire.instrument_system_metrics('system.cpu.time') - assert get_collected_metric_names(metrics_reader) == [ - 'system.cpu.time', - ] - SystemMetricsInstrumentor().uninstrument() # type: ignore - - -def test_list_with_basic_system_metrics_collection(metrics_reader: InMemoryMetricReader) -> None: - logfire.instrument_system_metrics(['basic', 'system.cpu.time']) - assert get_collected_metric_names(metrics_reader) == snapshot( - [ - 'system.cpu.time', - 'system.cpu.utilization', - 'system.memory.utilization', - 'system.swap.usage', - 'system.thread_count', + 'system.swap.utilization', ] ) SystemMetricsInstrumentor().uninstrument() # type: ignore def test_all_system_metrics_collection(metrics_reader: InMemoryMetricReader) -> None: - logfire.instrument_system_metrics('all') + logfire.instrument_system_metrics(base='full') assert get_collected_metric_names(metrics_reader) == snapshot( [ 'process.open_file_descriptor.count', @@ -66,6 +42,7 @@ def test_all_system_metrics_collection(metrics_reader: InMemoryMetricReader) -> 
'process.runtime.cpython.gc_count', 'process.runtime.cpython.memory', 'process.runtime.cpython.thread_count', + 'system.cpu.simple_utilization', 'system.cpu.time', 'system.cpu.utilization', 'system.disk.io', @@ -83,55 +60,3 @@ def test_all_system_metrics_collection(metrics_reader: InMemoryMetricReader) -> ] ) SystemMetricsInstrumentor().uninstrument() # type: ignore - - -def test_dict_with_basic_system_metrics_collection(metrics_reader: InMemoryMetricReader) -> None: - config: Config = { - 'system.cpu.time': None, - 'system.cpu.utilization': ['idle'], - 'basic': None, - } - assert parse_config(config) == snapshot( - { - 'system.thread_count': None, - 'system.cpu.utilization': [ - 'idle', - ], - 'system.memory.utilization': [ - 'total', - 'available', - 'used', - 'free', - 'active', - 'inactive', - 'buffers', - 'cached', - 'shared', - 'wired', - 'slab', - ], - 'system.swap.usage': [ - 'used', - 'free', - ], - 'system.cpu.time': [ - 'idle', - 'iowait', - 'user', - 'system', - 'irq', - 'softirq', - ], - } - ) - logfire.instrument_system_metrics(config) - assert get_collected_metric_names(metrics_reader) == snapshot( - [ - 'system.cpu.time', - 'system.cpu.utilization', - 'system.memory.utilization', - 'system.swap.usage', - 'system.thread_count', - ] - ) - SystemMetricsInstrumentor().uninstrument() # type: ignore diff --git a/tests/test_metrics.py b/tests/test_metrics.py index 5a3f0a04e..4fd8565e0 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -305,8 +305,7 @@ def observable_counter(options: CallbackOptions): def get_collected_metrics(metrics_reader: InMemoryMetricReader) -> list[dict[str, Any]]: exported_metrics = json.loads(cast(MetricsData, metrics_reader.get_metrics_data()).to_json()) # type: ignore [resource_metric] = exported_metrics['resource_metrics'] - [scope_metric] = resource_metric['scope_metrics'] - return scope_metric['metrics'] + return [metric for scope_metric in resource_metric['scope_metrics'] for metric in scope_metric['metrics']] 
def test_quiet_metric_exporter(caplog: pytest.LogCaptureFixture) -> None: From 2ab5017dd3b77e067318f192b037443fc5323c8c Mon Sep 17 00:00:00 2001 From: Alex Hall Date: Thu, 15 Aug 2024 18:21:45 +0200 Subject: [PATCH 15/48] test bases --- .../otel_integrations/test_system_metrics.py | 80 +++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/tests/otel_integrations/test_system_metrics.py b/tests/otel_integrations/test_system_metrics.py index 47810324d..1b1360c70 100644 --- a/tests/otel_integrations/test_system_metrics.py +++ b/tests/otel_integrations/test_system_metrics.py @@ -1,11 +1,13 @@ from __future__ import annotations +import pytest from inline_snapshot import snapshot from opentelemetry.instrumentation.system_metrics import SystemMetricsInstrumentor from opentelemetry.sdk.metrics.export import InMemoryMetricReader import logfire import logfire._internal.metrics +from logfire._internal.integrations.system_metrics import get_base_config from tests.test_metrics import get_collected_metrics @@ -60,3 +62,81 @@ def test_all_system_metrics_collection(metrics_reader: InMemoryMetricReader) -> ] ) SystemMetricsInstrumentor().uninstrument() # type: ignore + + +def test_basic_base(): + assert get_base_config('basic') == { + 'system.cpu.simple_utilization': None, + 'system.memory.utilization': ['available'], + 'system.swap.utilization': ['used'], + }, 'Docs need to be updated if this test fails' + + +def test_full_base(): + assert get_base_config('full') == { + 'system.cpu.simple_utilization': None, + 'system.cpu.time': ['idle', 'user', 'system', 'irq', 'softirq', 'nice', 'iowait', 'steal', 'interrupt', 'dpc'], + 'system.cpu.utilization': [ + 'idle', + 'user', + 'system', + 'irq', + 'softirq', + 'nice', + 'iowait', + 'steal', + 'interrupt', + 'dpc', + ], + 'system.memory.usage': [ + 'available', + 'used', + 'free', + 'active', + 'inactive', + 'buffers', + 'cached', + 'shared', + 'wired', + 'slab', + 'total', + ], + 'system.memory.utilization': [ + 'available', 
+ 'used', + 'free', + 'active', + 'inactive', + 'buffers', + 'cached', + 'shared', + 'wired', + 'slab', + ], + 'system.swap.usage': ['used', 'free'], + 'system.swap.utilization': ['used'], + 'system.disk.io': ['read', 'write'], + 'system.disk.operations': ['read', 'write'], + 'system.disk.time': ['read', 'write'], + 'system.network.dropped.packets': ['transmit', 'receive'], + 'system.network.packets': ['transmit', 'receive'], + 'system.network.errors': ['transmit', 'receive'], + 'system.network.io': ['transmit', 'receive'], + 'system.thread_count': None, + 'process.runtime.memory': ['rss', 'vms'], + 'process.runtime.cpu.time': ['user', 'system'], + 'process.runtime.gc_count': None, + 'process.runtime.thread_count': None, + 'process.runtime.cpu.utilization': None, + 'process.runtime.context_switches': ['involuntary', 'voluntary'], + 'process.open_file_descriptor.count': None, + }, 'Docs need to be updated if this test fails' + + +def test_empty_base(): + assert get_base_config(None) == {} + + +def test_invalid_base(): + with pytest.raises(ValueError): + get_base_config('invalid') # type: ignore From 1087dc1eeb8f9cea186cf186951f848df33c7cf0 Mon Sep 17 00:00:00 2001 From: Alex Hall Date: Thu, 15 Aug 2024 18:26:18 +0200 Subject: [PATCH 16/48] test_custom_system_metrics_collection --- tests/otel_integrations/test_system_metrics.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/otel_integrations/test_system_metrics.py b/tests/otel_integrations/test_system_metrics.py index 1b1360c70..5b6849e0a 100644 --- a/tests/otel_integrations/test_system_metrics.py +++ b/tests/otel_integrations/test_system_metrics.py @@ -64,6 +64,12 @@ def test_all_system_metrics_collection(metrics_reader: InMemoryMetricReader) -> SystemMetricsInstrumentor().uninstrument() # type: ignore +def test_custom_system_metrics_collection(metrics_reader: InMemoryMetricReader) -> None: + logfire.instrument_system_metrics({'system.memory.utilization': ['available']}, base=None) + assert 
get_collected_metric_names(metrics_reader) == ['system.memory.utilization'] + SystemMetricsInstrumentor().uninstrument() # type: ignore + + def test_basic_base(): assert get_base_config('basic') == { 'system.cpu.simple_utilization': None, From 19fca49c0b92e6a819db55e72c11b1b9c448e19c Mon Sep 17 00:00:00 2001 From: Alex Hall Date: Thu, 15 Aug 2024 18:32:22 +0200 Subject: [PATCH 17/48] Fix test_full_base --- tests/otel_integrations/test_system_metrics.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/otel_integrations/test_system_metrics.py b/tests/otel_integrations/test_system_metrics.py index 5b6849e0a..1abeb8922 100644 --- a/tests/otel_integrations/test_system_metrics.py +++ b/tests/otel_integrations/test_system_metrics.py @@ -79,7 +79,9 @@ def test_basic_base(): def test_full_base(): - assert get_base_config('full') == { + config = get_base_config('full') + config.pop('system.network.connections', None) + assert config == { 'system.cpu.simple_utilization': None, 'system.cpu.time': ['idle', 'user', 'system', 'irq', 'softirq', 'nice', 'iowait', 'steal', 'interrupt', 'dpc'], 'system.cpu.utilization': [ From a3d533536e2171c20f57d046c561f3d12f7ee768 Mon Sep 17 00:00:00 2001 From: Alex Hall Date: Fri, 16 Aug 2024 13:32:00 +0200 Subject: [PATCH 18/48] smarter simple_utilization --- .../_internal/integrations/system_metrics.py | 31 +++++++++++++------ 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/logfire/_internal/integrations/system_metrics.py b/logfire/_internal/integrations/system_metrics.py index 604c46a8b..0b8d051df 100644 --- a/logfire/_internal/integrations/system_metrics.py +++ b/logfire/_internal/integrations/system_metrics.py @@ -1,5 +1,6 @@ from __future__ import annotations +import contextlib import sys from typing import TYPE_CHECKING, Dict, Iterable, Literal, Optional, cast @@ -94,18 +95,28 @@ def get_base_config(base: Base) -> Config: raise ValueError(f'Invalid base: {base}') -def 
simple_cpu_utilization_callback(_options: CallbackOptions) -> Iterable[Observation]: - yield Observation(psutil.cpu_percent() / 100) - - def instrument_system_metrics(logfire_instance: Logfire, config: Config | None = None, base: Base = 'basic'): config = {**get_base_config(base), **(config or {})} SystemMetricsInstrumentor(config=config).instrument() # type: ignore if 'system.cpu.simple_utilization' in config: - logfire_instance.metric_gauge_callback( - 'system.cpu.simple_utilization', - [simple_cpu_utilization_callback], - description='System CPU utilization without attributes', - unit='1', - ) + measure_simple_cpu_utilization(logfire_instance) + + +def measure_simple_cpu_utilization(logfire_instance: Logfire): + process = psutil.Process() + + def callback(_options: CallbackOptions) -> Iterable[Observation]: + percents: list[float] = [psutil.cpu_percent(), process.cpu_percent()] + with contextlib.suppress(Exception): + cpu_num: int = process.cpu_num() # type: ignore + if cpu_num > 0: + percents.append(psutil.cpu_percent(percpu=True)[cpu_num]) + yield Observation(max(percents) / 100) + + logfire_instance.metric_gauge_callback( + 'system.cpu.simple_utilization', + [callback], + description='System CPU utilization without attributes', + unit='1', + ) From c9e889b206507a96842ea38827632eca1f97fae6 Mon Sep 17 00:00:00 2001 From: Alex Hall Date: Fri, 16 Aug 2024 13:36:20 +0200 Subject: [PATCH 19/48] MetricName also needs to be updated --- tests/otel_integrations/test_system_metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/otel_integrations/test_system_metrics.py b/tests/otel_integrations/test_system_metrics.py index 1abeb8922..5ee078ef5 100644 --- a/tests/otel_integrations/test_system_metrics.py +++ b/tests/otel_integrations/test_system_metrics.py @@ -138,7 +138,7 @@ def test_full_base(): 'process.runtime.cpu.utilization': None, 'process.runtime.context_switches': ['involuntary', 'voluntary'], 'process.open_file_descriptor.count': None, 
- }, 'Docs need to be updated if this test fails' + }, 'Docs and the MetricName type need to be updated if this test fails' def test_empty_base(): From 34ed55c6f1050933fe2241e9c89690171cc79683 Mon Sep 17 00:00:00 2001 From: Alex Hall Date: Fri, 16 Aug 2024 13:38:00 +0200 Subject: [PATCH 20/48] pyright --- logfire/_internal/integrations/system_metrics.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/logfire/_internal/integrations/system_metrics.py b/logfire/_internal/integrations/system_metrics.py index 0b8d051df..3c47d51f7 100644 --- a/logfire/_internal/integrations/system_metrics.py +++ b/logfire/_internal/integrations/system_metrics.py @@ -109,9 +109,10 @@ def measure_simple_cpu_utilization(logfire_instance: Logfire): def callback(_options: CallbackOptions) -> Iterable[Observation]: percents: list[float] = [psutil.cpu_percent(), process.cpu_percent()] with contextlib.suppress(Exception): - cpu_num: int = process.cpu_num() # type: ignore - if cpu_num > 0: - percents.append(psutil.cpu_percent(percpu=True)[cpu_num]) + if not TYPE_CHECKING: + cpu_num = process.cpu_num() + if cpu_num > 0: + percents.append(psutil.cpu_percent(percpu=True)[cpu_num]) yield Observation(max(percents) / 100) logfire_instance.metric_gauge_callback( From fb9a7840e9fbd4e8f4dc59739d4bac3381392bac Mon Sep 17 00:00:00 2001 From: Alex Hall Date: Fri, 16 Aug 2024 13:56:50 +0200 Subject: [PATCH 21/48] pragma --- logfire/_internal/integrations/system_metrics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/logfire/_internal/integrations/system_metrics.py b/logfire/_internal/integrations/system_metrics.py index 3c47d51f7..abaeb0b2e 100644 --- a/logfire/_internal/integrations/system_metrics.py +++ b/logfire/_internal/integrations/system_metrics.py @@ -109,9 +109,9 @@ def measure_simple_cpu_utilization(logfire_instance: Logfire): def callback(_options: CallbackOptions) -> Iterable[Observation]: percents: list[float] = [psutil.cpu_percent(), 
process.cpu_percent()] with contextlib.suppress(Exception): - if not TYPE_CHECKING: + if not TYPE_CHECKING: # pragma: no branch cpu_num = process.cpu_num() - if cpu_num > 0: + if cpu_num > 0: # pragma: no branch percents.append(psutil.cpu_percent(percpu=True)[cpu_num]) yield Observation(max(percents) / 100) From 44c17c1e55f7012183c6e77479e9f116448e536b Mon Sep 17 00:00:00 2001 From: Alex Hall Date: Fri, 16 Aug 2024 15:20:46 +0200 Subject: [PATCH 22/48] Update generated stubs --- .../_internal/integrations/system_metrics.pyi | 22 +++++++++---------- logfire-api/logfire_api/_internal/main.pyi | 5 +++-- .../_internal/integrations/system_metrics.py | 6 ++--- 3 files changed, 17 insertions(+), 16 deletions(-) diff --git a/logfire-api/logfire_api/_internal/integrations/system_metrics.pyi b/logfire-api/logfire_api/_internal/integrations/system_metrics.pyi index 45010d652..e07ea7d7a 100644 --- a/logfire-api/logfire_api/_internal/integrations/system_metrics.pyi +++ b/logfire-api/logfire_api/_internal/integrations/system_metrics.pyi @@ -1,15 +1,15 @@ from _typeshed import Incomplete -from opentelemetry.metrics import MeterProvider -from typing import Any, Iterable +from logfire import Logfire as Logfire +from typing import Iterable MetricName: Incomplete -ConfigString: Incomplete -ConfigDict = dict[MetricName, Iterable[str] | None] -Config: Incomplete -CPU_FIELDS: Incomplete -MEMORY_FIELDS: Incomplete -DEFAULT_CONFIG: ConfigDict -BASIC_METRICS: list[MetricName] +Config = dict[MetricName, Iterable[str] | None] +CPU_FIELDS: list[str] +MEMORY_FIELDS: list[str] +FULL_CONFIG: Config +BASIC_CONFIG: Config +Base: Incomplete -def parse_config(config: Config) -> ConfigDict: ... -def instrument_system_metrics(meter_provider: MeterProvider, config: Any = 'basic') -> None: ... +def get_base_config(base: Base) -> Config: ... +def instrument_system_metrics(logfire_instance: Logfire, config: Config | None = None, base: Base = 'basic'): ... 
+def measure_simple_cpu_utilization(logfire_instance: Logfire): ... diff --git a/logfire-api/logfire_api/_internal/main.pyi b/logfire-api/logfire_api/_internal/main.pyi index ecec2a257..dc316449b 100644 --- a/logfire-api/logfire_api/_internal/main.pyi +++ b/logfire-api/logfire_api/_internal/main.pyi @@ -18,7 +18,7 @@ from .integrations.pymongo import PymongoInstrumentKwargs as PymongoInstrumentKw from .integrations.redis import RedisInstrumentKwargs as RedisInstrumentKwargs from .integrations.sqlalchemy import SQLAlchemyInstrumentKwargs as SQLAlchemyInstrumentKwargs from .integrations.starlette import StarletteInstrumentKwargs as StarletteInstrumentKwargs -from .integrations.system_metrics import Config as SystemMetricsConfig +from .integrations.system_metrics import Base as SystemMetricsBase, Config as SystemMetricsConfig from .json_encoder import logfire_json_dumps as logfire_json_dumps from .json_schema import JsonSchemaProperties as JsonSchemaProperties, attributes_json_schema as attributes_json_schema, attributes_json_schema_properties as attributes_json_schema_properties, create_json_schema as create_json_schema from .metrics import ProxyMeterProvider as ProxyMeterProvider @@ -644,11 +644,12 @@ class Logfire: If a connection is provided, returns the instrumented connection. If no connection is provided, returns None. """ - def instrument_system_metrics(self, config: SystemMetricsConfig = 'basic'): + def instrument_system_metrics(self, config: SystemMetricsConfig | None = None, base: SystemMetricsBase = 'basic') -> None: """Instrument the system metrics. Args: config: The system metrics configuration. + base: The system metrics base. """ def metric_counter(self, name: str, *, unit: str = '', description: str = '') -> Counter: """Create a counter metric. 
diff --git a/logfire/_internal/integrations/system_metrics.py b/logfire/_internal/integrations/system_metrics.py index abaeb0b2e..52319174f 100644 --- a/logfire/_internal/integrations/system_metrics.py +++ b/logfire/_internal/integrations/system_metrics.py @@ -2,7 +2,7 @@ import contextlib import sys -from typing import TYPE_CHECKING, Dict, Iterable, Literal, Optional, cast +from typing import TYPE_CHECKING, Dict, Iterable, Literal, LiteralString, Optional, cast from opentelemetry.metrics import CallbackOptions, Observation @@ -53,12 +53,12 @@ # All the cpu_times fields provided by psutil (used by system_metrics) across all platforms, # except for 'guest' and 'guest_nice' which are included in 'user' and 'nice' in Linux (see psutil._cpu_tot_time). # Docs: https://psutil.readthedocs.io/en/latest/#psutil.cpu_times -CPU_FIELDS = 'idle user system irq softirq nice iowait steal interrupt dpc'.split() +CPU_FIELDS: list[LiteralString] = 'idle user system irq softirq nice iowait steal interrupt dpc'.split() # All the virtual_memory fields provided by psutil across all platforms, # except for 'percent' which can be calculated as `(total - available) / total * 100`. 
# Docs: https://psutil.readthedocs.io/en/latest/#psutil.virtual_memory -MEMORY_FIELDS = 'available used free active inactive buffers cached shared wired slab'.split() +MEMORY_FIELDS: list[LiteralString] = 'available used free active inactive buffers cached shared wired slab'.split() FULL_CONFIG: Config = { **cast(Config, _DEFAULT_CONFIG), From 3ad02098bb44fd90934c0ad3c07e1c6a537425f3 Mon Sep 17 00:00:00 2001 From: Alex Hall Date: Fri, 16 Aug 2024 15:21:09 +0200 Subject: [PATCH 23/48] Update generated stubs --- .../logfire_api/_internal/integrations/system_metrics.pyi | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/logfire-api/logfire_api/_internal/integrations/system_metrics.pyi b/logfire-api/logfire_api/_internal/integrations/system_metrics.pyi index e07ea7d7a..7cf35e791 100644 --- a/logfire-api/logfire_api/_internal/integrations/system_metrics.pyi +++ b/logfire-api/logfire_api/_internal/integrations/system_metrics.pyi @@ -1,11 +1,11 @@ from _typeshed import Incomplete from logfire import Logfire as Logfire -from typing import Iterable +from typing import Iterable, LiteralString MetricName: Incomplete Config = dict[MetricName, Iterable[str] | None] -CPU_FIELDS: list[str] -MEMORY_FIELDS: list[str] +CPU_FIELDS: list[LiteralString] +MEMORY_FIELDS: list[LiteralString] FULL_CONFIG: Config BASIC_CONFIG: Config Base: Incomplete From 142ba7095e6eeb97a27f8732a102a5e8eca288b5 Mon Sep 17 00:00:00 2001 From: Alex Hall Date: Fri, 16 Aug 2024 15:28:50 +0200 Subject: [PATCH 24/48] 3.8 --- .../logfire_api/_internal/integrations/system_metrics.pyi | 3 ++- logfire/_internal/integrations/system_metrics.py | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/logfire-api/logfire_api/_internal/integrations/system_metrics.pyi b/logfire-api/logfire_api/_internal/integrations/system_metrics.pyi index 7cf35e791..78108d5e5 100644 --- a/logfire-api/logfire_api/_internal/integrations/system_metrics.pyi +++ 
b/logfire-api/logfire_api/_internal/integrations/system_metrics.pyi @@ -1,6 +1,7 @@ from _typeshed import Incomplete from logfire import Logfire as Logfire -from typing import Iterable, LiteralString +from typing import Iterable +from typing_extensions import LiteralString MetricName: Incomplete Config = dict[MetricName, Iterable[str] | None] diff --git a/logfire/_internal/integrations/system_metrics.py b/logfire/_internal/integrations/system_metrics.py index 52319174f..a8f839a06 100644 --- a/logfire/_internal/integrations/system_metrics.py +++ b/logfire/_internal/integrations/system_metrics.py @@ -2,11 +2,13 @@ import contextlib import sys -from typing import TYPE_CHECKING, Dict, Iterable, Literal, LiteralString, Optional, cast +from typing import TYPE_CHECKING, Dict, Iterable, Literal, Optional, cast from opentelemetry.metrics import CallbackOptions, Observation if TYPE_CHECKING: + from typing_extensions import LiteralString + from logfire import Logfire try: From 8c782899d88a8983d6a89084289f6176e603ad4e Mon Sep 17 00:00:00 2001 From: Alex Hall Date: Fri, 16 Aug 2024 16:46:49 +0200 Subject: [PATCH 25/48] rename dashboard --- docs/integrations/system_metrics.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/integrations/system_metrics.md b/docs/integrations/system_metrics.md index eaff8ebf6..92793268f 100644 --- a/docs/integrations/system_metrics.md +++ b/docs/integrations/system_metrics.md @@ -16,11 +16,11 @@ logfire.configure() logfire.instrument_system_metrics() ``` -Then in your project, click on 'Dashboards' in the top bar, click 'New Dashboard', and select 'Basic System Metrics' from the dropdown. +Then in your project, click on 'Dashboards' in the top bar, click 'New Dashboard', and select 'Basic System Metrics (Logfire)' from the dropdown. ## Configuration -By default, `instrument_system_metrics` collects only the metrics it needs to display the 'Basic System Metrics' dashboard. 
You can choose exactly which metrics to collect and how much data to collect about each metric. The default is equivalent to this: +By default, `instrument_system_metrics` collects only the metrics it needs to display the 'Basic System Metrics (Logfire)' dashboard. You can choose exactly which metrics to collect and how much data to collect about each metric. The default is equivalent to this: ```py logfire.instrument_system_metrics({ From d1dd843ffb574888e5c5b9a78f3120daaae149e9 Mon Sep 17 00:00:00 2001 From: Alex Hall Date: Fri, 16 Aug 2024 17:02:26 +0200 Subject: [PATCH 26/48] Document both basic system metrics dashboards --- docs/guides/web_ui/dashboards.md | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/docs/guides/web_ui/dashboards.md b/docs/guides/web_ui/dashboards.md index cb7c7f730..a3e7d6b21 100644 --- a/docs/guides/web_ui/dashboards.md +++ b/docs/guides/web_ui/dashboards.md @@ -19,14 +19,19 @@ This dashboard offers a high-level view of your web services' well-being. It lik * **Percent of 5XX Requests:** Percentage of requests that resulted in server errors (status codes in the 500 range). * **Log Type Ratio**: Breakdown of the different log types generated by your web service (e.g., info, warning, error). -## System Metrics +## Basic System Metrics -This dashboard focuses on system resource utilization, potentially including: +This dashboard shows essential system resource utilization metrics. It comes in two variants: + +- **Basic System Metrics (Logfire):** Uses the data exported by [`logfire.instrument_system_metrics()`](../../integrations/system_metrics.md). +- **Basic System Metrics (OpenTelemetry):** Uses data exported by any OpenTelemetry-based instrumentation following the standard semantic conventions. + +Both variants include the following metrics: -* **CPU Usage:** Percentage of processing power utilized by the system. -* **Memory Usage:** Amount of memory currently in use by the system. 
* **Number of Processes:** Total number of running processes on the system. -* **Swap Usage:** Amount of swap space currently in use by the system. +* **CPU Usage %:** Percentage of processing power utilized by the system. +* **Memory Usage %:** Percentage of memory currently in use by the system. +* **Swap Usage %:** Percentage of swap space currently in use by the system. ## Custom Dashboards From edb3ad8613fb767e7cf9d1ec9e03f3b5112e6e9d Mon Sep 17 00:00:00 2001 From: Alex Hall Date: Fri, 16 Aug 2024 17:11:18 +0200 Subject: [PATCH 27/48] update metrics docs --- .../onboarding_checklist/add_metrics.md | 29 +++++++------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/docs/guides/onboarding_checklist/add_metrics.md b/docs/guides/onboarding_checklist/add_metrics.md index 5c99975e2..d2fae589a 100644 --- a/docs/guides/onboarding_checklist/add_metrics.md +++ b/docs/guides/onboarding_checklist/add_metrics.md @@ -1,6 +1,16 @@ **Pydantic Logfire** can be used to collect metrics from your application and send them to a metrics backend. -Let's see how to create, and use metrics in your application. +Metrics are a great way to record numerical values where you want to see an aggregation of the data (e.g. over time), +rather than the individual values. + +## System Metrics + +The easiest way to start using metrics is to enable system metrics. +See the [System Metrics][system-metrics] documentation to learn more. + +## Manual Metrics + +Let's see how to create and use custom metrics in your application. ```py import logfire @@ -13,11 +23,6 @@ def send_message(): messages_sent.add(1) ``` -## Metric Types - -Metrics are a great way to record number values where you want to see an aggregation of the data (e.g. over time), -rather than the individual values. 
- ### Counter The Counter metric is particularly useful when you want to measure the frequency or occurrence of a certain @@ -250,18 +255,6 @@ logfire.metric_up_down_counter_callback( You can read more about the Up-Down Counter metric in the [OpenTelemetry documentation][up-down-counter-callback-metric]. -## System Metrics - -By default, **Logfire** does not collect system metrics. - -To enable metrics, you need just need install the `logfire[system-metrics]` extra: - -{{ install_logfire(extras=['system-metrics']) }} - -**Logfire** will automatically collect system metrics if the `logfire[system-metrics]` extra is installed. - -To know more about which system metrics are collected, check the [System Metrics][system-metrics] documentation. - [counter-metric]: https://opentelemetry.io/docs/specs/otel/metrics/api/#counter [histogram-metric]: https://opentelemetry.io/docs/specs/otel/metrics/api/#histogram [up-down-counter-metric]: https://opentelemetry.io/docs/specs/otel/metrics/api/#updowncounter From 86073d5111eaeecfb853a610224957c99ff6c618 Mon Sep 17 00:00:00 2001 From: Alex Hall Date: Fri, 16 Aug 2024 17:27:06 +0200 Subject: [PATCH 28/48] docstring --- logfire/_internal/main.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/logfire/_internal/main.py b/logfire/_internal/main.py index 1678a3a9a..ebc2a8a6a 100644 --- a/logfire/_internal/main.py +++ b/logfire/_internal/main.py @@ -1256,11 +1256,15 @@ def instrument_mysql( def instrument_system_metrics( self, config: SystemMetricsConfig | None = None, base: SystemMetricsBase = 'basic' ) -> None: - """Instrument the system metrics. + """Collect system metrics. + + See https://docs.pydantic.dev/logfire/integrations/system_metrics/ for more information. Args: - config: The system metrics configuration. - base: The system metrics base. + config: A dictionary where the keys are metric names + and the values are optional further configuration for that metric. 
+ base: A string indicating the base config dictionary which `config` will be merged with, + or `None` for an empty base config. """ from .integrations.system_metrics import instrument_system_metrics From 8aa1d1b425d935556312a61f22ece7dd5ad30062 Mon Sep 17 00:00:00 2001 From: Alex Hall Date: Fri, 16 Aug 2024 17:27:31 +0200 Subject: [PATCH 29/48] docstring --- logfire-api/logfire_api/_internal/main.pyi | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/logfire-api/logfire_api/_internal/main.pyi b/logfire-api/logfire_api/_internal/main.pyi index dc316449b..ca19532fb 100644 --- a/logfire-api/logfire_api/_internal/main.pyi +++ b/logfire-api/logfire_api/_internal/main.pyi @@ -645,11 +645,15 @@ class Logfire: """ def instrument_system_metrics(self, config: SystemMetricsConfig | None = None, base: SystemMetricsBase = 'basic') -> None: - """Instrument the system metrics. + """Collect system metrics. + + See https://docs.pydantic.dev/logfire/integrations/system_metrics/ for more information. Args: - config: The system metrics configuration. - base: The system metrics base. + config: A dictionary where the keys are metric names + and the values are optional further configuration for that metric. + base: A string indicating the base config dictionary which `config` will be merged with, + or `None` for an empty base config. """ def metric_counter(self, name: str, *, unit: str = '', description: str = '') -> Counter: """Create a counter metric. 
From 5ec79e59c5380dc56e1513fc401a21c4927c6bdc Mon Sep 17 00:00:00 2001 From: Alex Hall Date: Mon, 19 Aug 2024 12:24:22 +0200 Subject: [PATCH 30/48] comments --- .../_internal/integrations/system_metrics.py | 38 +++++++++++++++++-- 1 file changed, 35 insertions(+), 3 deletions(-) diff --git a/logfire/_internal/integrations/system_metrics.py b/logfire/_internal/integrations/system_metrics.py index a8f839a06..e1df5c5ad 100644 --- a/logfire/_internal/integrations/system_metrics.py +++ b/logfire/_internal/integrations/system_metrics.py @@ -67,8 +67,11 @@ 'system.cpu.simple_utilization': None, 'system.cpu.time': CPU_FIELDS, 'system.cpu.utilization': CPU_FIELDS, + # For usage, knowing the total amount of bytes available might be handy. 'system.memory.usage': MEMORY_FIELDS + ['total'], + # For utilization, the total is always just 1 (100%), so it's not included. 'system.memory.utilization': MEMORY_FIELDS, + # The 'free' utilization is not included because it's just 1 - 'used'. 'system.swap.utilization': ['used'], } @@ -79,6 +82,7 @@ BASIC_CONFIG: Config = { 'system.cpu.simple_utilization': None, + # The actually used memory ratio can be calculated as `1 - available`. 'system.memory.utilization': ['available'], 'system.swap.utilization': ['used'], } @@ -106,20 +110,48 @@ def instrument_system_metrics(logfire_instance: Logfire, config: Config | None = def measure_simple_cpu_utilization(logfire_instance: Logfire): + # The values of `process.cpu_percent()` are relative to the last time it was called. + # In particular, the first call will always return 0. + # So we need to call it once before we start measuring the actual utilization, + # and we need to use the same process object for all calls. + # TODO a similar problem exists with `psutil.cpu_percent()` + # and thus for the OTEL instrumentation as well, and there it's harder to fix because + # the previous values are separated by thread. 
+ # See https://github.com/open-telemetry/opentelemetry-python-contrib/issues/2797 process = psutil.Process() + process.cpu_percent() def callback(_options: CallbackOptions) -> Iterable[Observation]: - percents: list[float] = [psutil.cpu_percent(), process.cpu_percent()] + percents: list[float] = [ + # Average CPU usage across all cores. + # A high value is notable regardless of which core(s) this process is using. + psutil.cpu_percent(), + # CPU usage of this particular process. + # Can be greater than 100% if the process is using multiple cores. + # Will be less than 100% if multiple processes are using the same core, + # even if this process is using it at full capacity. + process.cpu_percent(), + ] + # CPU usage of the core this process is using, if available. + # This will be higher than `process.cpu_percent()` if multiple processes are using the same core. + # This requires `process.cpu_num()` which is only available on Linux, + # so we need to suppress the exception on other platforms. with contextlib.suppress(Exception): + # Whether `Process.cpu_num` exists depends on the platform, and this affects pyright. + # So we can't use `# type: ignore` here, because on Linux it's not needed. if not TYPE_CHECKING: # pragma: no branch cpu_num = process.cpu_num() - if cpu_num > 0: # pragma: no branch + # `cpu_num` can be -1 on some platforms according to psutil. 
+ if cpu_num >= 0: # pragma: no branch percents.append(psutil.cpu_percent(percpu=True)[cpu_num]) yield Observation(max(percents) / 100) logfire_instance.metric_gauge_callback( 'system.cpu.simple_utilization', [callback], - description='System CPU utilization without attributes', + description='Maximum of: ' + '(1) average CPU usage across all cores, ' + '(2) CPU usage of this process, ' + '(3) CPU usage of the core this process is using, if available.', unit='1', ) From 3fc0971411677742445f21e85ba1d97181fd72a7 Mon Sep 17 00:00:00 2001 From: Alex Hall Date: Mon, 19 Aug 2024 12:28:54 +0200 Subject: [PATCH 31/48] comments --- logfire/_internal/integrations/system_metrics.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/logfire/_internal/integrations/system_metrics.py b/logfire/_internal/integrations/system_metrics.py index e1df5c5ad..b9de32d95 100644 --- a/logfire/_internal/integrations/system_metrics.py +++ b/logfire/_internal/integrations/system_metrics.py @@ -144,6 +144,11 @@ def callback(_options: CallbackOptions) -> Iterable[Observation]: # `cpu_num` can be -1 on some platforms according to psutil. if cpu_num >= 0: # pragma: no branch percents.append(psutil.cpu_percent(percpu=True)[cpu_num]) + + # Return the highest of the three values. + # This means interpreting the value is not straightforward, + # but any unusual activity will show a notable spike, regardless of the infra setup. + # psutil returns a value from 0-100, OTEL values here are generally 0-1, so we divide by 100. 
yield Observation(max(percents) / 100) logfire_instance.metric_gauge_callback( From 2cc9e1ab6ad9ec089b7075959e0f4f1674be40f9 Mon Sep 17 00:00:00 2001 From: Alex Hall Date: Mon, 19 Aug 2024 12:44:45 +0200 Subject: [PATCH 32/48] uninstrument automatically --- .../otel_integrations/test_system_metrics.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/otel_integrations/test_system_metrics.py b/tests/otel_integrations/test_system_metrics.py index 5ee078ef5..0ede5e016 100644 --- a/tests/otel_integrations/test_system_metrics.py +++ b/tests/otel_integrations/test_system_metrics.py @@ -12,13 +12,16 @@ def get_collected_metric_names(metrics_reader: InMemoryMetricReader) -> list[str]: - return sorted( - { - metric['name'] - for metric in get_collected_metrics(metrics_reader) - if metric['name'] != 'system.network.connections' - } - ) + try: + return sorted( + { + metric['name'] + for metric in get_collected_metrics(metrics_reader) + if metric['name'] != 'system.network.connections' + } + ) + finally: + SystemMetricsInstrumentor().uninstrument() # type: ignore def test_default_system_metrics_collection(metrics_reader: InMemoryMetricReader) -> None: @@ -30,7 +33,6 @@ def test_default_system_metrics_collection(metrics_reader: InMemoryMetricReader) 'system.swap.utilization', ] ) - SystemMetricsInstrumentor().uninstrument() # type: ignore def test_all_system_metrics_collection(metrics_reader: InMemoryMetricReader) -> None: @@ -61,13 +63,11 @@ def test_all_system_metrics_collection(metrics_reader: InMemoryMetricReader) -> 'system.thread_count', ] ) - SystemMetricsInstrumentor().uninstrument() # type: ignore def test_custom_system_metrics_collection(metrics_reader: InMemoryMetricReader) -> None: logfire.instrument_system_metrics({'system.memory.utilization': ['available']}, base=None) assert get_collected_metric_names(metrics_reader) == ['system.memory.utilization'] - SystemMetricsInstrumentor().uninstrument() # type: ignore def 
test_basic_base(): From 8b6974157c760f12f2ea47572c9c5792de0c1691 Mon Sep 17 00:00:00 2001 From: Alex Hall Date: Mon, 19 Aug 2024 12:55:18 +0200 Subject: [PATCH 33/48] pin griffe --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index efed1a172..3c1cda0fb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -139,6 +139,7 @@ dev-dependencies = [ "celery>=5.4.0", "testcontainers", "mysql-connector-python~=8.0", + "griffe==0.48.0", ] [tool.rye.scripts] From 9bb6b4395bbe604ccf13f3375f4d7e2030e2f2eb Mon Sep 17 00:00:00 2001 From: Alex Hall Date: Mon, 19 Aug 2024 12:57:00 +0200 Subject: [PATCH 34/48] pin griffe --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index dd696fe50..15307d520 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -36,7 +36,7 @@ jobs: # see https://github.com/pydantic/logfire/pull/12 - run: pip install uv - run: uv pip install --system -r requirements.lock -r requirements-dev.lock - - run: uv pip install --system -U mkdocs-material mkdocstrings-python + - run: uv pip install --system -U mkdocs-material mkdocstrings-python griffe==0.48.0 env: UV_EXTRA_INDEX_URL: ${{ secrets.UV_EXTRA_INDEX_URL }} - run: | From f9d86ebaf756aa0358bd44693515e4b0c2cd2ce1 Mon Sep 17 00:00:00 2001 From: Alex Hall Date: Mon, 19 Aug 2024 13:02:05 +0200 Subject: [PATCH 35/48] pin griffe --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 6c1afe602..d1a7368a1 100644 --- a/Makefile +++ b/Makefile @@ -55,5 +55,5 @@ cf-pages-build: python3 -V python3 -m pip install uv python3 -m uv pip install --system -r requirements.lock -r requirements-dev.lock - python3 -m uv pip install --system --extra-index-url $(PPPR_URL) -U mkdocs-material mkdocstrings-python + python3 -m uv pip install --system --extra-index-url $(PPPR_URL) -U mkdocs-material 
mkdocstrings-python griffe==0.48.0 python3 -m mkdocs build From da3e3fa831aacdd9044a02f6c4388d90ebc2b009 Mon Sep 17 00:00:00 2001 From: Alex Hall Date: Mon, 19 Aug 2024 13:27:58 +0200 Subject: [PATCH 36/48] Update generated stubs --- .../_internal/integrations/system_metrics.pyi | 4 +-- .../_internal/integrations/system_metrics.py | 29 ++++++++++++++++++- 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/logfire-api/logfire_api/_internal/integrations/system_metrics.pyi b/logfire-api/logfire_api/_internal/integrations/system_metrics.pyi index 78108d5e5..36c651114 100644 --- a/logfire-api/logfire_api/_internal/integrations/system_metrics.pyi +++ b/logfire-api/logfire_api/_internal/integrations/system_metrics.pyi @@ -1,9 +1,9 @@ from _typeshed import Incomplete from logfire import Logfire as Logfire -from typing import Iterable +from typing import Iterable, Literal from typing_extensions import LiteralString -MetricName: Incomplete +MetricName: type[Literal['system.cpu.simple_utilization', 'system.cpu.time', 'system.cpu.utilization', 'system.memory.usage', 'system.memory.utilization', 'system.swap.usage', 'system.swap.utilization', 'system.disk.io', 'system.disk.operations', 'system.disk.time', 'system.network.dropped.packets', 'system.network.packets', 'system.network.errors', 'system.network.io', 'system.network.connections', 'system.thread_count', 'process.runtime.memory', 'process.runtime.cpu.time', 'process.runtime.gc_count', 'process.runtime.thread_count', 'process.runtime.cpu.utilization', 'process.runtime.context_switches', 'process.open_file_descriptor.count']] Config = dict[MetricName, Iterable[str] | None] CPU_FIELDS: list[LiteralString] MEMORY_FIELDS: list[LiteralString] diff --git a/logfire/_internal/integrations/system_metrics.py b/logfire/_internal/integrations/system_metrics.py index b9de32d95..e834d779b 100644 --- a/logfire/_internal/integrations/system_metrics.py +++ b/logfire/_internal/integrations/system_metrics.py @@ -24,7 +24,34 @@ " 
pip install 'logfire[system-metrics]'" ) from e -MetricName = Literal[ +# stubgen seems to need this redundant type declaration. +MetricName: type[ + Literal[ + 'system.cpu.simple_utilization', + 'system.cpu.time', + 'system.cpu.utilization', + 'system.memory.usage', + 'system.memory.utilization', + 'system.swap.usage', + 'system.swap.utilization', + 'system.disk.io', + 'system.disk.operations', + 'system.disk.time', + 'system.network.dropped.packets', + 'system.network.packets', + 'system.network.errors', + 'system.network.io', + 'system.network.connections', + 'system.thread_count', + 'process.runtime.memory', + 'process.runtime.cpu.time', + 'process.runtime.gc_count', + 'process.runtime.thread_count', + 'process.runtime.cpu.utilization', + 'process.runtime.context_switches', + 'process.open_file_descriptor.count', + ] +] = Literal[ # type: ignore # but pyright doesn't like it 'system.cpu.simple_utilization', 'system.cpu.time', 'system.cpu.utilization', From 389b9204cef871aced1fa328b692a767244ef552 Mon Sep 17 00:00:00 2001 From: Alex Hall Date: Mon, 19 Aug 2024 13:33:53 +0200 Subject: [PATCH 37/48] format link --- logfire/_internal/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/logfire/_internal/main.py b/logfire/_internal/main.py index ebc2a8a6a..1767b17d5 100644 --- a/logfire/_internal/main.py +++ b/logfire/_internal/main.py @@ -1258,7 +1258,7 @@ def instrument_system_metrics( ) -> None: """Collect system metrics. - See https://docs.pydantic.dev/logfire/integrations/system_metrics/ for more information. + See [the guide](https://docs.pydantic.dev/logfire/integrations/system_metrics/) for more information. 
Args: config: A dictionary where the keys are metric names From e953802ce21988230ef30d62a9c5f489599e3b9c Mon Sep 17 00:00:00 2001 From: Alex Hall Date: Mon, 19 Aug 2024 13:34:33 +0200 Subject: [PATCH 38/48] format link --- logfire-api/logfire_api/_internal/main.pyi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/logfire-api/logfire_api/_internal/main.pyi b/logfire-api/logfire_api/_internal/main.pyi index ca19532fb..c9b6efd81 100644 --- a/logfire-api/logfire_api/_internal/main.pyi +++ b/logfire-api/logfire_api/_internal/main.pyi @@ -647,7 +647,7 @@ class Logfire: def instrument_system_metrics(self, config: SystemMetricsConfig | None = None, base: SystemMetricsBase = 'basic') -> None: """Collect system metrics. - See https://docs.pydantic.dev/logfire/integrations/system_metrics/ for more information. + See [the guide](https://docs.pydantic.dev/logfire/integrations/system_metrics/) for more information. Args: config: A dictionary where the keys are metric names From e8b0e264d0894deabfaa2d79b97f821e4d2922bc Mon Sep 17 00:00:00 2001 From: Alex Hall Date: Tue, 20 Aug 2024 13:38:24 +0200 Subject: [PATCH 39/48] add popover explaining None value --- docs/integrations/system_metrics.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/integrations/system_metrics.md b/docs/integrations/system_metrics.md index 92793268f..e602717c0 100644 --- a/docs/integrations/system_metrics.md +++ b/docs/integrations/system_metrics.md @@ -24,12 +24,14 @@ By default, `instrument_system_metrics` collects only the metrics it needs to di ```py logfire.instrument_system_metrics({ - 'system.cpu.simple_utilization': None, + 'system.cpu.simple_utilization': None, # (1)! 'system.memory.utilization': ['available'], 'system.swap.utilization': ['used'], }) ``` +1. `None` simply means that there are no fields to configure for this metric. + To collect lots of detailed data about all available metrics, use `logfire.instrument_system_metrics(base='full')`. 
This is equivalent to: ```py From a35d6652a29071c809971e170675eb69d2ca8c44 Mon Sep 17 00:00:00 2001 From: Alex Hall Date: Tue, 20 Aug 2024 14:08:23 +0200 Subject: [PATCH 40/48] warn about costs --- docs/integrations/system_metrics.md | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/docs/integrations/system_metrics.md b/docs/integrations/system_metrics.md index e602717c0..47179e798 100644 --- a/docs/integrations/system_metrics.md +++ b/docs/integrations/system_metrics.md @@ -32,7 +32,17 @@ logfire.instrument_system_metrics({ 1. `None` simply means that there are no fields to configure for this metric. -To collect lots of detailed data about all available metrics, use `logfire.instrument_system_metrics(base='full')`. This is equivalent to: +To collect lots of detailed data about all available metrics, use `logfire.instrument_system_metrics(base='full')`. + +!!! warning + The amount of data collected by `base='full'` can be expensive, especially if you have many servers, + and this is easy to forget about. Be sure to monitor your usage and costs. + + The most expensive metrics are `system.cpu.utilization/time` which collect data for each core and each mode, + and `system.disk.*` which collect data for each disk device. This can result in hundreds of data points per minute. + The exact number depends on your machine. 
+ +`logfire.instrument_system_metrics(base='full')` is equivalent to: ```py logfire.instrument_system_metrics({ From cdbcb84aee77de9e9a137cbf88d3674e8ab11e91 Mon Sep 17 00:00:00 2001 From: Alex Hall Date: Tue, 20 Aug 2024 14:10:32 +0200 Subject: [PATCH 41/48] Link to guide in configure param docs --- logfire/_internal/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/logfire/_internal/config.py b/logfire/_internal/config.py index 88521899a..8b12438f2 100644 --- a/logfire/_internal/config.py +++ b/logfire/_internal/config.py @@ -192,7 +192,7 @@ def configure( `LOGFIRE_CONFIG_DIR` environment variable, otherwise defaults to the current working directory. data_dir: Directory to store credentials, and logs. If `None` uses the `LOGFIRE_CREDENTIALS_DIR` environment variable, otherwise defaults to `'.logfire'`. base_url: Root URL for the Logfire API. If `None` uses the `LOGFIRE_BASE_URL` environment variable, otherwise defaults to https://logfire-api.pydantic.dev. - collect_system_metrics: Legacy argument, use `logfire.instrument_system_metrics()` instead. + collect_system_metrics: Legacy argument, use [`logfire.instrument_system_metrics()`](https://docs.pydantic.dev/logfire/integrations/system_metrics/) instead. id_generator: Generator for span IDs. Defaults to `RandomIdGenerator()` from the OpenTelemetry SDK. ns_timestamp_generator: Generator for nanosecond timestamps. Defaults to [`time.time_ns`][time.time_ns] from the Python standard library. 
From d7f49021bfcf09bc56863e79b9283e6ed0dd97a5 Mon Sep 17 00:00:00 2001 From: Alex Hall Date: Wed, 21 Aug 2024 12:27:10 +0200 Subject: [PATCH 42/48] Split into two CPU metrics --- .../_internal/integrations/system_metrics.py | 46 ++----------------- .../otel_integrations/test_system_metrics.py | 2 + 2 files changed, 5 insertions(+), 43 deletions(-) diff --git a/logfire/_internal/integrations/system_metrics.py b/logfire/_internal/integrations/system_metrics.py index e834d779b..e2df808e4 100644 --- a/logfire/_internal/integrations/system_metrics.py +++ b/logfire/_internal/integrations/system_metrics.py @@ -1,6 +1,5 @@ from __future__ import annotations -import contextlib import sys from typing import TYPE_CHECKING, Dict, Iterable, Literal, Optional, cast @@ -108,6 +107,7 @@ FULL_CONFIG.pop('system.network.connections', None) BASIC_CONFIG: Config = { + 'process.runtime.cpu.utilization': None, 'system.cpu.simple_utilization': None, # The actually used memory ratio can be calculated as `1 - available`. 'system.memory.utilization': ['available'], @@ -137,53 +137,13 @@ def instrument_system_metrics(logfire_instance: Logfire, config: Config | None = def measure_simple_cpu_utilization(logfire_instance: Logfire): - # The values of `process.cpu_percent()` are relative to the last time it was called. - # In particular, the first call will always return 0. - # So we need to call it once before we start measuring the actual utilization, - # and we need to use the same process object for all calls. - # TODO a similar problem exists with `psutil.cpu_percent()` - # and thus for the OTEL instrumentation as well, and there it's harder to fix because - # the previous values are separated by thread. - # See https://github.com/open-telemetry/opentelemetry-python-contrib/issues/2797 - process = psutil.Process() - process.cpu_percent() - def callback(_options: CallbackOptions) -> Iterable[Observation]: - percents: list[float] = [ - # Average CPU usage across all cores. 
- # A high value is notable regardless of which core(s) this process is using. - psutil.cpu_percent(), - # CPU usage of this particular process. - # Can be greater than 100% if the process is using multiple cores. - # Will be less than 100% if multiple processes are using the same core, - # even if this process is using it at full capacity. - process.cpu_percent(), - ] - # CPU usage of the core this process is using, if available. - # This will be higher than `process.cpu_percent()` if multiple processes are using the same core. - # This requires `process.cpu_num()` which is only available on Linux, - # so we need to suppress the exception on other platforms. - with contextlib.suppress(Exception): - # Whether `Process.cpu_num` exists depends on the platform, and this affects pyright. - # So we can't use `# type: ignore` here, because on Linux it's not needed. - if not TYPE_CHECKING: # pragma: no branch - cpu_num = process.cpu_num() - # `cpu_num` can be -1 on some platforms according to psutil. - if cpu_num >= 0: # pragma: no branch - percents.append(psutil.cpu_percent(percpu=True)[cpu_num]) - - # Return the highest of the three values. - # This means interpreting the value is not straightforward, - # but any unusual activity will show a notable spike, regardless of the infra setup. # psutil returns a value from 0-100, OTEL values here are generally 0-1, so we divide by 100. 
- yield Observation(max(percents) / 100) + yield Observation(psutil.cpu_percent() / 100) logfire_instance.metric_gauge_callback( 'system.cpu.simple_utilization', [callback], - description='Maximum of: ' - '(1) average CPU usage across all cores, ' - '(2) CPU usage of this process, ' - '(3) CPU usage of the core this process is using, if available.', + description='Average CPU usage across all cores, as a fraction between 0 and 1.', unit='1', ) diff --git a/tests/otel_integrations/test_system_metrics.py b/tests/otel_integrations/test_system_metrics.py index 0ede5e016..a3626015d 100644 --- a/tests/otel_integrations/test_system_metrics.py +++ b/tests/otel_integrations/test_system_metrics.py @@ -28,6 +28,7 @@ def test_default_system_metrics_collection(metrics_reader: InMemoryMetricReader) logfire.instrument_system_metrics() assert get_collected_metric_names(metrics_reader) == snapshot( [ + 'process.runtime.cpython.cpu.utilization', 'system.cpu.simple_utilization', 'system.memory.utilization', 'system.swap.utilization', @@ -72,6 +73,7 @@ def test_custom_system_metrics_collection(metrics_reader: InMemoryMetricReader) def test_basic_base(): assert get_base_config('basic') == { + 'process.runtime.cpu.utilization': None, 'system.cpu.simple_utilization': None, 'system.memory.utilization': ['available'], 'system.swap.utilization': ['used'], From be5e9efbb032ca8451f23a7e802376e3dd2cddfb Mon Sep 17 00:00:00 2001 From: Alex Hall Date: Wed, 21 Aug 2024 12:42:29 +0200 Subject: [PATCH 43/48] document each basic metric --- docs/integrations/system_metrics.md | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/docs/integrations/system_metrics.md b/docs/integrations/system_metrics.md index 47179e798..b9dc28010 100644 --- a/docs/integrations/system_metrics.md +++ b/docs/integrations/system_metrics.md @@ -24,13 +24,17 @@ By default, `instrument_system_metrics` collects only the metrics it needs to di ```py logfire.instrument_system_metrics({ - 
'system.cpu.simple_utilization': None, # (1)! - 'system.memory.utilization': ['available'], - 'system.swap.utilization': ['used'], + 'process.runtime.cpu.utilization': None, # (1)! + 'system.cpu.simple_utilization': None, # (2)! + 'system.memory.utilization': ['available'], # (3)! + 'system.swap.utilization': ['used'], # (4)! }) ``` -1. `None` simply means that there are no fields to configure for this metric. +1. `process.runtime.cpu.utilization` is a name recognized by the OpenTelemetry library. The actual name of the metric exported will be `process.runtime.cpython.cpu.utilization` or a similar name depending on the Python implementation used. The `None` value means that there are no fields to configure for this metric. The value of this metric is [`psutil.Process().cpu_percent()`](https://psutil.readthedocs.io/en/latest/#psutil.Process.cpu_percent), i.e. the percentage of CPU time used by this process, where 100 means using 100% of a single CPU core. The value can be greater than 100 if the process uses multiple cores. +2. The `None` value means that there are no fields to configure for this metric. The value of this metric is [`psutil.cpu_percent()`](https://psutil.readthedocs.io/en/latest/#psutil.cpu_percent), i.e. the fraction of CPU time used by the whole system, where 1 means using 100% of all CPU cores. +3. The value here is a list of 'modes' of memory. The full list can be seen in the [`psutil` documentation](https://psutil.readthedocs.io/en/latest/#psutil.virtual_memory). `available` is "the memory that can be given instantly to processes without the system going into swap. This is calculated by summing different memory metrics that vary depending on the platform. It is supposed to be used to monitor actual memory usage in a cross platform fashion." The value of the metric is a number between 0 and 1, and subtracting the value from 1 gives the fraction of memory used. +4. This is the fraction of available swap used. 
The value is a number between 0 and 1. To collect lots of detailed data about all available metrics, use `logfire.instrument_system_metrics(base='full')`. From bae2c6d7f107b4c1ed3bd2dd98b5652574476485 Mon Sep 17 00:00:00 2001 From: Alex Hall Date: Wed, 21 Aug 2024 12:53:58 +0200 Subject: [PATCH 44/48] Apply review suggestions to docs --- docs/integrations/system_metrics.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/integrations/system_metrics.md b/docs/integrations/system_metrics.md index b9dc28010..cf537663f 100644 --- a/docs/integrations/system_metrics.md +++ b/docs/integrations/system_metrics.md @@ -40,11 +40,11 @@ To collect lots of detailed data about all available metrics, use `logfire.instr !!! warning The amount of data collected by `base='full'` can be expensive, especially if you have many servers, - and this is easy to forget about. Be sure to monitor your usage and costs. + and this is easy to forget about. If you enable this, be sure to monitor your usage and costs. The most expensive metrics are `system.cpu.utilization/time` which collect data for each core and each mode, - and `system.disk.*` which collect data for each disk device. This can result in hundreds of data points per minute. - The exact number depends on your machine. + and `system.disk.*` which collect data for each disk device. The exact number depends on the machine hardware, + but this can result in hundreds of data points per minute from each instrumented host. `logfire.instrument_system_metrics(base='full')` is equivalent to: @@ -77,7 +77,7 @@ logfire.instrument_system_metrics({ Each key here is a metric name. The values have different meanings for different metrics. For example, for `system.cpu.utilization`, the value is a list of CPU modes. So there will be a separate row for each CPU core saying what percentage of time it spent idle, another row for the time spent waiting for IO, etc. 
There are no fields to configure for `system.thread_count`, so the value is `None`. -The first dict argument is merged with the base. For example, if you want to collect disk read operations (but not writes) you can write: +For convenient customizability, the first dict argument is merged with the base. For example, if you want to collect disk read operations (but not writes) you can write: - `logfire.instrument_system_metrics({'system.disk.operations': ['read']})` to collect that data in addition to the basic defaults. - `logfire.instrument_system_metrics({'system.disk.operations': ['read']}, base='full')` to collect detailed data about all metrics, excluding disk write operations. From b2144ad0d78395d66aa5d835f970d94799f081b6 Mon Sep 17 00:00:00 2001 From: Alex Hall Date: Wed, 21 Aug 2024 14:04:39 +0200 Subject: [PATCH 45/48] Ensure process.runtime.cpu.utilization values don't start at 0 --- logfire/_internal/integrations/system_metrics.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/logfire/_internal/integrations/system_metrics.py b/logfire/_internal/integrations/system_metrics.py index e2df808e4..1540ccf7b 100644 --- a/logfire/_internal/integrations/system_metrics.py +++ b/logfire/_internal/integrations/system_metrics.py @@ -1,6 +1,7 @@ from __future__ import annotations import sys +from contextlib import suppress from typing import TYPE_CHECKING, Dict, Iterable, Literal, Optional, cast from opentelemetry.metrics import CallbackOptions, Observation @@ -130,11 +131,19 @@ def get_base_config(base: Base) -> Config: def instrument_system_metrics(logfire_instance: Logfire, config: Config | None = None, base: Base = 'basic'): config = {**get_base_config(base), **(config or {})} - SystemMetricsInstrumentor(config=config).instrument() # type: ignore + instrumentor = SystemMetricsInstrumentor(config=config) # type: ignore + instrumentor.instrument() # type: ignore if 'system.cpu.simple_utilization' in config: 
measure_simple_cpu_utilization(logfire_instance) + if 'process.runtime.cpu.utilization': + with suppress(Exception): + # https://github.com/open-telemetry/opentelemetry-python-contrib/issues/2797#issuecomment-2298749008 + # The first call to cpu_percent() returns 0 every time, + # so do that first call here rather than in the metric exporter to get meaningful values. + instrumentor._proc.cpu_percent() # type: ignore + def measure_simple_cpu_utilization(logfire_instance: Logfire): def callback(_options: CallbackOptions) -> Iterable[Observation]: From 90df9cac09672572c15f1ab82ceb1b03e8ff0b27 Mon Sep 17 00:00:00 2001 From: Alex Hall Date: Wed, 21 Aug 2024 14:14:22 +0200 Subject: [PATCH 46/48] fix check --- logfire/_internal/integrations/system_metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/logfire/_internal/integrations/system_metrics.py b/logfire/_internal/integrations/system_metrics.py index 1540ccf7b..167039e8f 100644 --- a/logfire/_internal/integrations/system_metrics.py +++ b/logfire/_internal/integrations/system_metrics.py @@ -137,7 +137,7 @@ def instrument_system_metrics(logfire_instance: Logfire, config: Config | None = if 'system.cpu.simple_utilization' in config: measure_simple_cpu_utilization(logfire_instance) - if 'process.runtime.cpu.utilization': + if 'process.runtime.cpu.utilization' in config: with suppress(Exception): # https://github.com/open-telemetry/opentelemetry-python-contrib/issues/2797#issuecomment-2298749008 # The first call to cpu_percent() returns 0 every time, From f0b8d860f6198036f655637cf61b0fe3fca25f3b Mon Sep 17 00:00:00 2001 From: Alex Hall Date: Thu, 22 Aug 2024 10:57:57 +0200 Subject: [PATCH 47/48] fix range of values of process.runtime.cpu.utilization --- docs/integrations/system_metrics.md | 4 +- .../_internal/integrations/system_metrics.py | 37 +++++++++++++++---- 2 files changed, 31 insertions(+), 10 deletions(-) diff --git a/docs/integrations/system_metrics.md b/docs/integrations/system_metrics.md 
index cf537663f..54856ce03 100644 --- a/docs/integrations/system_metrics.md +++ b/docs/integrations/system_metrics.md @@ -31,8 +31,8 @@ logfire.instrument_system_metrics({ }) ``` -1. `process.runtime.cpu.utilization` is a name recognized by the OpenTelemetry library. The actual name of the metric exported will be `process.runtime.cpython.cpu.utilization` or a similar name depending on the Python implementation used. The `None` value means that there are no fields to configure for this metric. The value of this metric is [`psutil.Process().cpu_percent()`](https://psutil.readthedocs.io/en/latest/#psutil.Process.cpu_percent), i.e. the percentage of CPU time used by this process, where 100 means using 100% of a single CPU core. The value can be greater than 100 if the process uses multiple cores. -2. The `None` value means that there are no fields to configure for this metric. The value of this metric is [`psutil.cpu_percent()`](https://psutil.readthedocs.io/en/latest/#psutil.cpu_percent), i.e. the fraction of CPU time used by the whole system, where 1 means using 100% of all CPU cores. +1. `process.runtime.cpu.utilization` will lead to exporting a metric that is actually named `process.runtime.cpython.cpu.utilization` or a similar name depending on the Python implementation used. The `None` value means that there are no fields to configure for this metric. The value of this metric is `[psutil.Process().cpu_percent()](https://psutil.readthedocs.io/en/latest/#psutil.Process.cpu_percent) / 100`, i.e. the fraction of CPU time used by this process, where 1 means using 100% of a single CPU core. The value can be greater than 1 if the process uses multiple cores. +2. The `None` value means that there are no fields to configure for this metric. The value of this metric is `[psutil.cpu_percent()](https://psutil.readthedocs.io/en/latest/#psutil.cpu_percent) / 100`, i.e. the fraction of CPU time used by the whole system, where 1 means using 100% of all CPU cores. 3. 
The value here is a list of 'modes' of memory. The full list can be seen in the [`psutil` documentation](https://psutil.readthedocs.io/en/latest/#psutil.virtual_memory). `available` is "the memory that can be given instantly to processes without the system going into swap. This is calculated by summing different memory metrics that vary depending on the platform. It is supposed to be used to monitor actual memory usage in a cross platform fashion." The value of the metric is a number between 0 and 1, and subtracting the value from 1 gives the fraction of memory used. 4. This is the fraction of available swap used. The value is a number between 0 and 1. diff --git a/logfire/_internal/integrations/system_metrics.py b/logfire/_internal/integrations/system_metrics.py index 167039e8f..b2ad889e0 100644 --- a/logfire/_internal/integrations/system_metrics.py +++ b/logfire/_internal/integrations/system_metrics.py @@ -1,7 +1,7 @@ from __future__ import annotations import sys -from contextlib import suppress +from platform import python_implementation from typing import TYPE_CHECKING, Dict, Iterable, Literal, Optional, cast from opentelemetry.metrics import CallbackOptions, Observation @@ -131,18 +131,17 @@ def get_base_config(base: Base) -> Config: def instrument_system_metrics(logfire_instance: Logfire, config: Config | None = None, base: Base = 'basic'): config = {**get_base_config(base), **(config or {})} - instrumentor = SystemMetricsInstrumentor(config=config) # type: ignore - instrumentor.instrument() # type: ignore if 'system.cpu.simple_utilization' in config: measure_simple_cpu_utilization(logfire_instance) if 'process.runtime.cpu.utilization' in config: - with suppress(Exception): - # https://github.com/open-telemetry/opentelemetry-python-contrib/issues/2797#issuecomment-2298749008 - # The first call to cpu_percent() returns 0 every time, - # so do that first call here rather than in the metric exporter to get meaningful values. 
- instrumentor._proc.cpu_percent() # type: ignore + # Override OTEL here, see comment in measure_process_runtime_cpu_utilization.<locals>.callback. + measure_process_runtime_cpu_utilization(logfire_instance) + del config['process.runtime.cpu.utilization'] + + instrumentor = SystemMetricsInstrumentor(config=config) # type: ignore + instrumentor.instrument() # type: ignore def measure_simple_cpu_utilization(logfire_instance: Logfire): @@ -156,3 +155,25 @@ def callback(_options: CallbackOptions) -> Iterable[Observation]: description='Average CPU usage across all cores, as a fraction between 0 and 1.', unit='1', ) + + +def measure_process_runtime_cpu_utilization(logfire_instance: Logfire): + process = psutil.Process() + # This first call always returns 0, do it here so that the first real measurement from an exporter + # will return a nonzero value. + process.cpu_percent() + + def callback(_options: CallbackOptions) -> Iterable[Observation]: + # psutil returns a value from 0-100, OTEL values here are generally 0-1, so we divide by 100. + # OTEL got this wrong: https://github.com/open-telemetry/opentelemetry-python-contrib/issues/2810 + # A fix has been merged there, but we need to know in the dashboard how to interpret the values. + # So the dashboard will assume a 0-100 range if the scope is 'opentelemetry.instrumentation.system_metrics', + # and a 0-1 range otherwise. In particular the scope will be 'logfire' if it comes from here. 
+ yield Observation(process.cpu_percent() / 100) + + logfire_instance.metric_gauge_callback( + f'process.runtime.{python_implementation().lower()}.cpu.utilization', + [callback], + description='Runtime CPU utilization', + unit='1', + ) From 2102b39a180d337e3f5e18143d40d8b812e0e63e Mon Sep 17 00:00:00 2001 From: Alex Hall Date: Thu, 22 Aug 2024 11:16:52 +0200 Subject: [PATCH 48/48] Update descriptions of dashboards --- docs/guides/web_ui/dashboards.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/guides/web_ui/dashboards.md b/docs/guides/web_ui/dashboards.md index a3e7d6b21..6b11f54f5 100644 --- a/docs/guides/web_ui/dashboards.md +++ b/docs/guides/web_ui/dashboards.md @@ -29,7 +29,8 @@ This dashboard shows essential system resource utilization metrics. It comes in Both variants include the following metrics: * **Number of Processes:** Total number of running processes on the system. -* **CPU Usage %:** Percentage of processing power utilized by the system. +* **System CPU usage %:** Percentage of total available processing power utilized by the whole system, i.e. the average across all CPU cores. +* **Process CPU usage %:** CPU used by a single process, where e.g. using 2 CPU cores to full capacity would result in a value of 200%. * **Memory Usage %:** Percentage of memory currently in use by the system. * **Swap Usage %:** Percentage of swap space currently in use by the system.