diff --git a/src/sentry/grouping/ingest/seer.py b/src/sentry/grouping/ingest/seer.py new file mode 100644 index 00000000000000..e05f000376625a --- /dev/null +++ b/src/sentry/grouping/ingest/seer.py @@ -0,0 +1,100 @@ +import logging +from dataclasses import asdict + +from sentry import features +from sentry.api.endpoints.group_similar_issues_embeddings import get_stacktrace_string +from sentry.constants import PLACEHOLDER_EVENT_TITLES +from sentry.eventstore.models import Event +from sentry.grouping.grouping_info import get_grouping_info_from_variants +from sentry.grouping.result import CalculatedHashes +from sentry.models.group import Group +from sentry.models.project import Project +from sentry.seer.utils import ( + SeerSimilarIssuesMetadata, + SimilarIssuesEmbeddingsRequest, + get_similarity_data_from_seer, +) +from sentry.utils.safe import get_path + +logger = logging.getLogger("sentry.events.grouping") + + +def should_call_seer_for_grouping(event: Event, project: Project) -> bool: + """ + Use event content, feature flags, rate limits, killswitches, seer health, etc. to determine + whether a call to Seer should be made. + """ + # TODO: Implement rate limits, kill switches, other flags, etc + # TODO: Return False if the event has a custom fingerprint (check for both client- and server-side fingerprints) + + # If an event has no stacktrace, and only one of our placeholder titles ("", + # "", etc.), there's no data for Seer to analyze, so no point in making the API call. + if ( + event.title in PLACEHOLDER_EVENT_TITLES + and not get_path(event.data, "exception", "values", -1, "stacktrace", "frames") + and not get_path(event.data, "threads", "values", -1, "stacktrace", "frames") + ): + return False + + return features.has("projects:similarity-embeddings-metadata", project) or features.has( + "projects:similarity-embeddings-grouping", project + ) + + +def get_seer_similar_issues( + event: Event, + primary_hashes: CalculatedHashes, + num_neighbors: int = 1, +) -> tuple[ + dict[ + str, str | list[dict[str, float | bool | int | str]] + ], # a SeerSimilarIssuesMetadata instance, dictified + Group | None, +]: + """ + Ask Seer for the given event's nearest neighbor(s) and return the seer response data, sorted + with the best matches first, along with the group Seer decided the event should go in, if any, + or None if no neighbor was near enough. + + Will also return `None` for the neighboring group if the `projects:similarity-embeddings-grouping` + feature flag is off. + """ + + # TODO: In our context, this can never happen. There are other scenarios in which `variants` can + # be `None`, but where we'll be using this (during ingestion) it's not possible. This check is + # primarily to satisfy mypy. Once we get rid of hierarchical hashing, we'll be able to + # make `variants` required in `CalculatedHashes`, meaning we can remove this check. (See note in + # `CalculatedHashes` class definition.) + if primary_hashes.variants is None: + raise Exception("Primary hashes missing variants data") + + event_hash = primary_hashes.hashes[0] + stacktrace_string = get_stacktrace_string( + get_grouping_info_from_variants(primary_hashes.variants) + ) + + request_data: SimilarIssuesEmbeddingsRequest = { + "hash": event_hash, + "project_id": event.project.id, + "stacktrace": stacktrace_string, + "message": event.title, + "k": num_neighbors, + } + + # Similar issues are returned with the closest match first + seer_results = get_similarity_data_from_seer(request_data) + + similar_issues_metadata = asdict( + SeerSimilarIssuesMetadata(request_hash=event_hash, results=seer_results) + ) + parent_group = ( + Group.objects.filter(id=seer_results[0].parent_group_id).first() + if ( + seer_results + and seer_results[0].should_group + and features.has("projects:similarity-embeddings-grouping", event.project) + ) + else None + ) + + return (similar_issues_metadata, parent_group) diff --git a/tests/sentry/grouping/test_seer.py b/tests/sentry/grouping/test_seer.py new file mode 100644 index 00000000000000..1af43c20ec61da --- /dev/null +++ b/tests/sentry/grouping/test_seer.py @@ -0,0 +1,161 @@ +from dataclasses import asdict +from unittest.mock import patch + +from sentry.conf.server import SEER_SIMILARITY_MODEL_VERSION +from sentry.eventstore.models import Event +from sentry.grouping.ingest.seer import get_seer_similar_issues, should_call_seer_for_grouping +from sentry.grouping.result import CalculatedHashes +from sentry.seer.utils import SeerSimilarIssueData +from sentry.testutils.cases import TestCase +from sentry.testutils.helpers import Feature +from sentry.testutils.helpers.eventprocessing import save_new_event +from sentry.testutils.helpers.features import with_feature +from sentry.utils.types import NonNone + + +class ShouldCallSeerTest(TestCase): + # TODO: Add tests for rate limits, killswitches, etc once those are in place + + def test_obeys_seer_similarity_flags(self): + for metadata_flag, grouping_flag, expected_result in [ + (False, False, False), + (True, False, True), + (False, True, True), + (True, True, True), + ]: + with Feature( + { + "projects:similarity-embeddings-metadata": metadata_flag, + "projects:similarity-embeddings-grouping": grouping_flag, + } + ): + assert ( + should_call_seer_for_grouping( + Event( + project_id=self.project.id, + event_id="11212012123120120415201309082013", + data={"title": "Dogs are great!"}, + ), + self.project, + ) + is expected_result + ), f"Case ({metadata_flag}, {grouping_flag}) failed." + + @with_feature("projects:similarity-embeddings-grouping") + def test_says_no_for_garbage_event(self): + assert ( + should_call_seer_for_grouping( + Event( + project_id=self.project.id, + event_id="11212012123120120415201309082013", + data={"title": ""}, + ), + self.project, + ) + is False + ) + + +class GetSeerSimilarIssuesTest(TestCase): + def setUp(self): + self.existing_event = save_new_event({"message": "Dogs are great!"}, self.project) + self.new_event = Event( + project_id=self.project.id, + event_id="11212012123120120415201309082013", + data={"message": "Adopt don't shop"}, + ) + self.new_event_hashes = CalculatedHashes( + hashes=["20130809201315042012311220122111"], + hierarchical_hashes=[], + tree_labels=[], + variants={}, + ) + + @with_feature({"projects:similarity-embeddings-grouping": False}) + def test_returns_metadata_but_no_group_if_seer_grouping_flag_off(self): + seer_result_data = SeerSimilarIssueData( + parent_hash=self.existing_event.get_primary_hash(), + parent_group_id=NonNone(self.existing_event.group_id), + stacktrace_distance=0.01, + message_distance=0.05, + should_group=True, + ) + expected_metadata = { + "similarity_model_version": SEER_SIMILARITY_MODEL_VERSION, + "request_hash": self.new_event_hashes.hashes[0], + "results": [asdict(seer_result_data)], + } + + with patch( + "sentry.grouping.ingest.seer.get_similarity_data_from_seer", + return_value=[seer_result_data], + ): + assert get_seer_similar_issues(self.new_event, self.new_event_hashes) == ( + expected_metadata, + None, # No group returned, even though `should_group` is True + ) + + @with_feature("projects:similarity-embeddings-grouping") + def test_returns_metadata_and_group_if_sufficiently_close_group_found(self): + seer_result_data = SeerSimilarIssueData( + parent_hash=self.existing_event.get_primary_hash(), + parent_group_id=NonNone(self.existing_event.group_id), + stacktrace_distance=0.01, + message_distance=0.05, + should_group=True, + ) + expected_metadata = { + "similarity_model_version": SEER_SIMILARITY_MODEL_VERSION, + "request_hash": self.new_event_hashes.hashes[0], + "results": [asdict(seer_result_data)], + } + + with patch( + "sentry.grouping.ingest.seer.get_similarity_data_from_seer", + return_value=[seer_result_data], + ): + assert get_seer_similar_issues(self.new_event, self.new_event_hashes) == ( + expected_metadata, + self.existing_event.group, + ) + + @with_feature("projects:similarity-embeddings-grouping") + def test_returns_metadata_but_no_group_if_similar_group_insufficiently_close(self): + seer_result_data = SeerSimilarIssueData( + parent_hash=self.existing_event.get_primary_hash(), + parent_group_id=NonNone(self.existing_event.group_id), + stacktrace_distance=0.08, + message_distance=0.12, + should_group=False, + ) + expected_metadata = { + "similarity_model_version": SEER_SIMILARITY_MODEL_VERSION, + "request_hash": self.new_event_hashes.hashes[0], + "results": [asdict(seer_result_data)], + } + + with patch( + "sentry.grouping.ingest.seer.get_similarity_data_from_seer", + return_value=[seer_result_data], + ): + assert get_seer_similar_issues(self.new_event, self.new_event_hashes) == ( + expected_metadata, + None, + ) + + @with_feature("projects:similarity-embeddings-grouping") + def test_returns_no_group_and_empty_metadata_if_no_similar_group_found(self): + expected_metadata = { + "similarity_model_version": SEER_SIMILARITY_MODEL_VERSION, + "request_hash": self.new_event_hashes.hashes[0], + "results": [], + } + + with patch( + "sentry.grouping.ingest.seer.get_similarity_data_from_seer", + return_value=[], + ): + assert get_seer_similar_issues(self.new_event, self.new_event_hashes) == ( + expected_metadata, + None, + )