feat(seer grouping): Add Seer-related ingest helpers (#70999)

This adds two helpers, `should_call_seer_for_grouping` and `get_seer_similar_issues`, to be used when we (maybe) call Seer as part of event ingestion. `should_call_seer_for_grouping` does exactly what you'd think given the name, right now only basing the decision on feature flags and whether or not the event has a usable title and/or stacktrace. In the future we'll also include rate limit and killswitch checks, and any other criteria which it makes sense to add. `get_seer_similar_issues` is a wrapper around `get_similarity_data_from_seer` (which is what actually makes the API call to Seer). It extracts request data from the given event, makes the request, pulls together metadata about the results, and if a matching group is found and the flag is on, pulls the `Group` record out of the database. (I chose to put the feature flag check there rather than in the code where the grouping actually happens so that we can save the trip to the database if we're not going to end up using the results for grouping.) Code to actually use these helpers is added in #71026.
getsentry · May 21, 2024 · 20be96b · 20be96b
1 parent d488992
commit 20be96b
Show file tree

Hide file tree

Showing 2 changed files with 261 additions and 0 deletions.
diff --git a/src/sentry/grouping/ingest/seer.py b/src/sentry/grouping/ingest/seer.py
@@ -0,0 +1,100 @@
+import logging
+from dataclasses import asdict
+
+from sentry import features
+from sentry.api.endpoints.group_similar_issues_embeddings import get_stacktrace_string
+from sentry.constants import PLACEHOLDER_EVENT_TITLES
+from sentry.eventstore.models import Event
+from sentry.grouping.grouping_info import get_grouping_info_from_variants
+from sentry.grouping.result import CalculatedHashes
+from sentry.models.group import Group
+from sentry.models.project import Project
+from sentry.seer.utils import (
+    SeerSimilarIssuesMetadata,
+    SimilarIssuesEmbeddingsRequest,
+    get_similarity_data_from_seer,
+)
+from sentry.utils.safe import get_path
+
+logger = logging.getLogger("sentry.events.grouping")
+
+
+def should_call_seer_for_grouping(event: Event, project: Project) -> bool:
+    """
+    Use event content, feature flags, rate limits, killswitches, seer health, etc. to determine
+    whether a call to Seer should be made.
+    """
+    # TODO: Implement rate limits, kill switches, other flags, etc (for now: content + flag checks only)
+    # TODO: Return False if the event has a custom fingerprint (check for both client- and server-side fingerprints)
+
+    # If an event has only one of our placeholder titles ("<untitled>", "<unknown>", etc.) and no
+    # stacktrace frames under either `exception` or `threads`, Seer has nothing to analyze: skip.
+    if (
+        event.title in PLACEHOLDER_EVENT_TITLES
+        and not get_path(event.data, "exception", "values", -1, "stacktrace", "frames")
+        and not get_path(event.data, "threads", "values", -1, "stacktrace", "frames")
+    ):
+        return False
+
+    return features.has("projects:similarity-embeddings-metadata", project) or features.has(
+        "projects:similarity-embeddings-grouping", project
+    )  # either flag being enabled for the project is enough to warrant the call
+
+
+def get_seer_similar_issues(
+    event: Event,
+    primary_hashes: CalculatedHashes,
+    num_neighbors: int = 1,
+) -> tuple[
+    dict[
+        str, str | list[dict[str, float | bool | int | str]]
+    ],  # a SeerSimilarIssuesMetadata instance, dictified
+    Group | None,
+]:
+    """
+    Ask Seer for the given event's nearest neighbor(s) and return the seer response data, sorted
+    with the best matches first, along with the group Seer decided the event should go in, if any,
+    or None if no neighbor was near enough.
+
+    Will also return `None` for the neighboring group if the `projects:similarity-embeddings-grouping`
+    feature flag is off.
+    """
+
+    # TODO: In our context, this can never happen. There are other scenarios in which `variants` can
+    # be `None`, but where we'll be using this (during ingestion) it's not possible. This check is
+    # primarily to satisfy mypy. Once we get rid of hierarchical hashing, we'll be able to
+    # make `variants` required in `CalculatedHashes`, meaning we can remove this check. (See note in
+    # `CalculatedHashes` class definition.)
+    if primary_hashes.variants is None:
+        raise Exception("Primary hashes missing variants data")
+
+    event_hash = primary_hashes.hashes[0]  # first primary hash is what we send to (and record for) Seer
+    stacktrace_string = get_stacktrace_string(
+        get_grouping_info_from_variants(primary_hashes.variants)
+    )
+
+    request_data: SimilarIssuesEmbeddingsRequest = {
+        "hash": event_hash,
+        "project_id": event.project.id,
+        "stacktrace": stacktrace_string,
+        "message": event.title,
+        "k": num_neighbors,
+    }
+
+    # Similar issues are returned with the closest match first, so `seer_results[0]` is the best one
+    seer_results = get_similarity_data_from_seer(request_data)
+
+    similar_issues_metadata = asdict(  # dictified to match the declared return type
+        SeerSimilarIssuesMetadata(request_hash=event_hash, results=seer_results)
+    )
+    parent_group = (  # only query the database when the result will actually be used for grouping
+        Group.objects.filter(id=seer_results[0].parent_group_id).first()
+        if (
+            seer_results
+            and seer_results[0].should_group
+            and features.has("projects:similarity-embeddings-grouping", event.project)
+        )
+        else None
+    )
+
+    return (similar_issues_metadata, parent_group)
diff --git a/tests/sentry/grouping/test_seer.py b/tests/sentry/grouping/test_seer.py
@@ -0,0 +1,161 @@
+from dataclasses import asdict
+from unittest.mock import patch
+
+from sentry.conf.server import SEER_SIMILARITY_MODEL_VERSION
+from sentry.eventstore.models import Event
+from sentry.grouping.ingest.seer import get_seer_similar_issues, should_call_seer_for_grouping
+from sentry.grouping.result import CalculatedHashes
+from sentry.seer.utils import SeerSimilarIssueData
+from sentry.testutils.cases import TestCase
+from sentry.testutils.helpers import Feature
+from sentry.testutils.helpers.eventprocessing import save_new_event
+from sentry.testutils.helpers.features import with_feature
+from sentry.utils.types import NonNone
+
+
+class ShouldCallSeerTest(TestCase):
+    # TODO: Add tests for rate limits, killswitches, etc once those are in place
+
+    def test_obeys_seer_similarity_flags(self):
+        for metadata_flag, grouping_flag, expected_result in [  # every combination of the two flags
+            (False, False, False),
+            (True, False, True),
+            (False, True, True),
+            (True, True, True),
+        ]:
+            with Feature(
+                {
+                    "projects:similarity-embeddings-metadata": metadata_flag,
+                    "projects:similarity-embeddings-grouping": grouping_flag,
+                }
+            ):
+                assert (
+                    should_call_seer_for_grouping(
+                        Event(
+                            project_id=self.project.id,
+                            event_id="11212012123120120415201309082013",
+                            data={"title": "Dogs are great!"},  # non-placeholder title, so only the flags decide
+                        ),
+                        self.project,
+                    )
+                    is expected_result
+                ), f"Case ({metadata_flag}, {grouping_flag}) failed."
+
+    @with_feature("projects:similarity-embeddings-grouping")
+    def test_says_no_for_garbage_event(self):  # flag is on, so a False result must come from the content check
+        assert (
+            should_call_seer_for_grouping(
+                Event(
+                    project_id=self.project.id,
+                    event_id="11212012123120120415201309082013",
+                    data={"title": "<untitled>"},  # placeholder title and no stacktrace data
+                ),
+                self.project,
+            )
+            is False
+        )
+
+
+class GetSeerSimilarIssuesTest(TestCase):
+    def setUp(self):
+        self.existing_event = save_new_event({"message": "Dogs are great!"}, self.project)  # its group is the candidate parent
+        self.new_event = Event(  # unsaved event standing in for the one being ingested
+            project_id=self.project.id,
+            event_id="11212012123120120415201309082013",
+            data={"message": "Adopt don't shop"},
+        )
+        self.new_event_hashes = CalculatedHashes(
+            hashes=["20130809201315042012311220122111"],
+            hierarchical_hashes=[],
+            tree_labels=[],
+            variants={},  # not None, so the function's `variants is None` check doesn't raise
+        )
+
+    @with_feature({"projects:similarity-embeddings-grouping": False})
+    def test_returns_metadata_but_no_group_if_seer_grouping_flag_off(self):
+        seer_result_data = SeerSimilarIssueData(
+            parent_hash=self.existing_event.get_primary_hash(),
+            parent_group_id=NonNone(self.existing_event.group_id),
+            stacktrace_distance=0.01,
+            message_distance=0.05,
+            should_group=True,
+        )
+        expected_metadata = {
+            "similarity_model_version": SEER_SIMILARITY_MODEL_VERSION,
+            "request_hash": self.new_event_hashes.hashes[0],
+            "results": [asdict(seer_result_data)],
+        }
+
+        with patch(  # stub the Seer call via the name imported into the module under test
+            "sentry.grouping.ingest.seer.get_similarity_data_from_seer",
+            return_value=[seer_result_data],
+        ):
+            assert get_seer_similar_issues(self.new_event, self.new_event_hashes) == (
+                expected_metadata,
+                None,  # No group returned, even though `should_group` is True
+            )
+
+    @with_feature("projects:similarity-embeddings-grouping")
+    def test_returns_metadata_and_group_if_sufficiently_close_group_found(self):
+        seer_result_data = SeerSimilarIssueData(
+            parent_hash=self.existing_event.get_primary_hash(),
+            parent_group_id=NonNone(self.existing_event.group_id),
+            stacktrace_distance=0.01,
+            message_distance=0.05,
+            should_group=True,
+        )
+        expected_metadata = {
+            "similarity_model_version": SEER_SIMILARITY_MODEL_VERSION,
+            "request_hash": self.new_event_hashes.hashes[0],
+            "results": [asdict(seer_result_data)],
+        }
+
+        with patch(  # stubbed Seer response: one match flagged `should_group=True`
+            "sentry.grouping.ingest.seer.get_similarity_data_from_seer",
+            return_value=[seer_result_data],
+        ):
+            assert get_seer_similar_issues(self.new_event, self.new_event_hashes) == (
+                expected_metadata,
+                self.existing_event.group,
+            )
+
+    @with_feature("projects:similarity-embeddings-grouping")
+    def test_returns_metadata_but_no_group_if_similar_group_insufficiently_close(self):
+        seer_result_data = SeerSimilarIssueData(
+            parent_hash=self.existing_event.get_primary_hash(),
+            parent_group_id=NonNone(self.existing_event.group_id),
+            stacktrace_distance=0.08,
+            message_distance=0.12,
+            should_group=False,
+        )
+        expected_metadata = {
+            "similarity_model_version": SEER_SIMILARITY_MODEL_VERSION,
+            "request_hash": self.new_event_hashes.hashes[0],
+            "results": [asdict(seer_result_data)],
+        }
+
+        with patch(  # stubbed Seer response: one match flagged `should_group=False`
+            "sentry.grouping.ingest.seer.get_similarity_data_from_seer",
+            return_value=[seer_result_data],
+        ):
+            assert get_seer_similar_issues(self.new_event, self.new_event_hashes) == (
+                expected_metadata,
+                None,
+            )
+
+    @with_feature("projects:similarity-embeddings-grouping")
+    def test_returns_no_group_and_empty_metadata_if_no_similar_group_found(self):
+        expected_metadata = {
+            "similarity_model_version": SEER_SIMILARITY_MODEL_VERSION,
+            "request_hash": self.new_event_hashes.hashes[0],
+            "results": [],
+        }
+
+        with patch(  # stubbed Seer response: no neighbors at all
+            "sentry.grouping.ingest.seer.get_similarity_data_from_seer",
+            return_value=[],
+        ):
+            assert get_seer_similar_issues(self.new_event, self.new_event_hashes) == (
+                expected_metadata,
+                None,
+            )