From fab65e65749903d7387b0a9ef2cf45b54b73594d Mon Sep 17 00:00:00 2001
From: Ivana Kellyerova
Date: Wed, 10 Apr 2024 14:05:52 +0200
Subject: [PATCH] feat(metrics): New normalization of keys, values, units
 (#2946)

---
 sentry_sdk/metrics.py |  39 ++++++++++++---
 tests/test_metrics.py | 113 +++++++++++++++++++++++++++++-------------
 2 files changed, 111 insertions(+), 41 deletions(-)

diff --git a/sentry_sdk/metrics.py b/sentry_sdk/metrics.py
index 57f44e6533..1e4f5a532e 100644
--- a/sentry_sdk/metrics.py
+++ b/sentry_sdk/metrics.py
@@ -54,8 +54,6 @@
 
 _in_metrics = ContextVar("in_metrics", default=False)
-_sanitize_key = partial(re.compile(r"[^a-zA-Z0-9_/.-]+").sub, "_")
-_sanitize_value = partial(re.compile(r"[^\w\d\s_:/@\.{}\[\]$-]+", re.UNICODE).sub, "")
 _set = set  # set is shadowed below
 
 
 GOOD_TRANSACTION_SOURCES = frozenset(
@@ -67,6 +65,32 @@
     ]
 )
 
+_sanitize_unit = partial(re.compile(r"[^a-zA-Z0-9_]+").sub, "")
+_sanitize_metric_key = partial(re.compile(r"[^a-zA-Z0-9_\-.]+").sub, "_")
+_sanitize_tag_key = partial(re.compile(r"[^a-zA-Z0-9_\-.\/]+").sub, "")
+_TAG_VALUE_SANITIZATION_TABLE = {
+    "\n": "\\n",
+    "\r": "\\r",
+    "\t": "\\t",
+    "\\": "\\\\",
+    "|": "\\u{7c}",
+    ",": "\\u{2c}",
+}
+
+
+def _sanitize_tag_value(value):
+    # type: (str) -> str
+    return "".join(
+        [
+            (
+                _TAG_VALUE_SANITIZATION_TABLE[char]
+                if char in _TAG_VALUE_SANITIZATION_TABLE
+                else char
+            )
+            for char in value
+        ]
+    )
+
 
 def get_code_location(stacklevel):
     # type: (int) -> Optional[Dict[str, Any]]
@@ -269,7 +293,8 @@ def _encode_metrics(flushable_buckets):
     for timestamp, buckets in flushable_buckets:
         for bucket_key, metric in iteritems(buckets):
             metric_type, metric_name, metric_unit, metric_tags = bucket_key
-            metric_name = _sanitize_key(metric_name)
+            metric_name = _sanitize_metric_key(metric_name)
+            metric_unit = _sanitize_unit(metric_unit)
             _write(metric_name.encode("utf-8"))
             _write(b"@")
             _write(metric_unit.encode("utf-8"))
@@ -285,7 +310,7 @@ def _encode_metrics(flushable_buckets):
                 _write(b"|#")
                 first = True
                 for tag_key, tag_value in metric_tags:
-                    tag_key = _sanitize_key(tag_key)
+                    tag_key = _sanitize_tag_key(tag_key)
                     if not tag_key:
                         continue
                     if first:
@@ -294,7 +319,7 @@ def _encode_metrics(flushable_buckets):
                         _write(b",")
                     _write(tag_key.encode("utf-8"))
                     _write(b":")
-                    _write(_sanitize_value(tag_value).encode("utf-8"))
+                    _write(_sanitize_tag_value(tag_value).encode("utf-8"))
 
             _write(b"|T")
             _write(str(timestamp).encode("ascii"))
@@ -309,7 +334,9 @@
 
     for key, loc in code_locations:
         metric_type, name, unit = key
-        mri = "{}:{}@{}".format(metric_type, _sanitize_key(name), unit)
+        mri = "{}:{}@{}".format(
+            metric_type, _sanitize_metric_key(name), _sanitize_unit(unit)
+        )
 
         loc["type"] = "location"
         mapping.setdefault(mri, []).append(loc)
diff --git a/tests/test_metrics.py b/tests/test_metrics.py
index 48b4436df0..741935615d 100644
--- a/tests/test_metrics.py
+++ b/tests/test_metrics.py
@@ -677,56 +677,99 @@ def test_metric_summaries(
 @minimum_python_37_with_gevent
 @pytest.mark.forked
-def test_tag_normalization(
-    sentry_init, capture_envelopes, maybe_monkeypatched_threading
+@pytest.mark.parametrize(
+    "metric_name,metric_unit,expected_name",
+    [
+        ("first-metric", "nano-second", "first-metric@nanosecond"),
+        ("another_metric?", "nano second", "another_metric_@nanosecond"),
+        (
+            "metric",
+            "nanosecond",
+            "metric@nanosecond",
+        ),
+        (
+            "my.amaze.metric I guess",
+            "nano|\nsecond",
+            "my.amaze.metric_I_guess@nanosecond",
+        ),
+        # fmt: off
+        (u"métríc", u"nanöseconď", u"m_tr_c@nansecon"),
+        # fmt: on
+    ],
+)
+def test_metric_name_normalization(
+    sentry_init,
+    capture_envelopes,
+    metric_name,
+    metric_unit,
+    expected_name,
+    maybe_monkeypatched_threading,
 ):
     sentry_init(
-        release="fun-release@1.0.0",
-        environment="not-fun-env",
         _experiments={"enable_metrics": True, "metric_code_locations": False},
     )
-    ts = time.time()
     envelopes = capture_envelopes()
 
-    # fmt: off
-    metrics.distribution("a", 1.0, tags={"foo-bar": "%$foo"}, timestamp=ts)
-    metrics.distribution("b", 1.0, tags={"foo$$$bar": "blah{}"}, timestamp=ts)
-    metrics.distribution("c", 1.0, tags={u"foö-bar": u"snöwmän"}, timestamp=ts)
-    metrics.distribution("d", 1.0, tags={"route": "GET /foo"}, timestamp=ts)
-    # fmt: on
+    metrics.distribution(metric_name, 1.0, unit=metric_unit)
+
     Hub.current.flush()
 
     (envelope,) = envelopes
 
     assert len(envelope.items) == 1
     assert envelope.items[0].headers["type"] == "statsd"
 
-    m = parse_metrics(envelope.items[0].payload.get_bytes())
+    parsed_metrics = parse_metrics(envelope.items[0].payload.get_bytes())
+    assert len(parsed_metrics) == 1
 
-    assert len(m) == 4
-    assert m[0][4] == {
-        "foo-bar": "$foo",
-        "release": "fun-release@1.0.0",
-        "environment": "not-fun-env",
-    }
+    name = parsed_metrics[0][1]
+    assert name == expected_name
 
-    assert m[1][4] == {
-        "foo_bar": "blah{}",
-        "release": "fun-release@1.0.0",
-        "environment": "not-fun-env",
-    }
 
-    # fmt: off
-    assert m[2][4] == {
-        "fo_-bar": u"snöwmän",
-        "release": "fun-release@1.0.0",
-        "environment": "not-fun-env",
-    }
-    assert m[3][4] == {
-        "release": "fun-release@1.0.0",
-        "environment": "not-fun-env",
-        "route": "GET /foo",
-    }
-    # fmt: on
+
+@minimum_python_37_with_gevent
+@pytest.mark.forked
+@pytest.mark.parametrize(
+    "metric_tag,expected_tag",
+    [
+        ({"f-oo|bar": "%$foo/"}, {"f-oobar": "%$foo/"}),
+        ({"foo$.$.$bar": "blah{}"}, {"foo..bar": "blah{}"}),
+        # fmt: off
+        ({u"foö-bar": u"snöwmän"}, {u"fo-bar": u"snöwmän"},),
+        # fmt: on
+        ({"route": "GET /foo"}, {"route": "GET /foo"}),
+        ({"__bar__": "this | or , that"}, {"__bar__": "this \\u{7c} or \\u{2c} that"}),
+        ({"foo/": "hello!\n\r\t\\"}, {"foo/": "hello!\\n\\r\\t\\\\"}),
+    ],
+)
+def test_metric_tag_normalization(
+    sentry_init,
+    capture_envelopes,
+    metric_tag,
+    expected_tag,
+    maybe_monkeypatched_threading,
+):
+    sentry_init(
+        _experiments={"enable_metrics": True, "metric_code_locations": False},
+    )
+    envelopes = capture_envelopes()
+
+    metrics.distribution("a", 1.0, tags=metric_tag)
+
+    Hub.current.flush()
+
+    (envelope,) = envelopes
+
+    assert len(envelope.items) == 1
+    assert envelope.items[0].headers["type"] == "statsd"
+
+    parsed_metrics = parse_metrics(envelope.items[0].payload.get_bytes())
+    assert len(parsed_metrics) == 1
+
+    tags = parsed_metrics[0][4]
+
+    expected_tag_key, expected_tag_value = expected_tag.popitem()
+    assert expected_tag_key in tags
+    assert tags[expected_tag_key] == expected_tag_value
 
 
 @minimum_python_37_with_gevent