-
-
Notifications
You must be signed in to change notification settings - Fork 4.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat(seer grouping): Add Seer-related ingest helpers #70999
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
import logging | ||
from dataclasses import asdict | ||
|
||
from sentry import features | ||
from sentry.api.endpoints.group_similar_issues_embeddings import get_stacktrace_string | ||
from sentry.constants import PLACEHOLDER_EVENT_TITLES | ||
from sentry.eventstore.models import Event | ||
from sentry.grouping.grouping_info import get_grouping_info_from_variants | ||
from sentry.grouping.result import CalculatedHashes | ||
from sentry.models.group import Group | ||
from sentry.models.project import Project | ||
from sentry.seer.utils import ( | ||
SeerSimilarIssuesMetadata, | ||
SimilarIssuesEmbeddingsRequest, | ||
get_similarity_data_from_seer, | ||
) | ||
from sentry.utils.safe import get_path | ||
|
||
logger = logging.getLogger("sentry.events.grouping") | ||
|
||
|
||
def should_call_seer_for_grouping(event: Event, project: Project) -> bool: | ||
""" | ||
Use event content, feature flags, rate limits, killswitches, seer health, etc. to determine | ||
whether a call to Seer should be made. | ||
""" | ||
# TODO: Implement rate limits, kill switches, other flags, etc | ||
# TODO: Return False if the event has a custom fingerprint (check for both client- and server-side fingerprints) | ||
|
||
# If an event has no stacktrace, and only one of our placeholder titles ("<untitled>", | ||
# "<unknown>", etc.), there's no data for Seer to analyze, so no point in making the API call. | ||
if ( | ||
event.title in PLACEHOLDER_EVENT_TITLES | ||
and not get_path(event.data, "exception", "values", -1, "stacktrace", "frames") | ||
and not get_path(event.data, "threads", "values", -1, "stacktrace", "frames") | ||
): | ||
return False | ||
|
||
return features.has("projects:similarity-embeddings-metadata", project) or features.has( | ||
"projects:similarity-embeddings-grouping", project | ||
) | ||
lobsterkatie marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
|
||
def get_seer_similar_issues( | ||
event: Event, | ||
primary_hashes: CalculatedHashes, | ||
num_neighbors: int = 1, | ||
) -> tuple[ | ||
dict[ | ||
str, str | list[dict[str, float | bool | int | str]] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. worth making a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. sigh This is why I say Python is the IHOP of typing. I agree this isn't great, but there's no way that I've been able to figure out (and googling suggests it's because no way exists) to create a typeddict out of a dataclass. So I can, but it'll just mean hard coding a second copy of all of the attributes from both the WDYT? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yeah totally. its good as is! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Googling is so 2010s :-P Just ask Copilot ;-) def dataclass_to_typeddict(cls):
return TypedDict(cls.__name__ + 'Dict', {k: v for k, v in get_type_hints(cls).items()})
SeerSimilarIssuesMetadataDict = dataclass_to_typeddict(SeerSimilarIssuesMetadata)
SeerSimilarIssueDataDict = dataclass_to_typeddict(SeerSimilarIssueData) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If only, @vartec. TIL about fields = {"a": int, "b": str}
DerivedTypedDict = TypedDict("DerivedTypedDict", fields) and then construct the dataclass from the same Here's an open issue about it in the mypy repo: python/mypy#4128. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. mypy is the worst... |
||
], # a SeerSimilarIssuesMetadata instance, dictified | ||
Group | None, | ||
]: | ||
""" | ||
Ask Seer for the given event's nearest neighbor(s) and return the seer response data, sorted | ||
with the best matches first, along with the group Seer decided the event should go in, if any, | ||
or None if no neighbor was near enough. | ||
|
||
Will also return `None` for the neighboring group if the `projects:similarity-embeddings-grouping` | ||
feature flag is off. | ||
""" | ||
|
||
# TODO: In our context, this can never happen. There are other scenarios in which `variants` can | ||
# be `None`, but where we'll be using this (during ingestion) it's not possible. This check is | ||
# primarily to satisfy mypy. Once we get rid of hierarchical hashing, we'll be able to | ||
# make `variants` required in `CalculatedHashes`, meaning we can remove this check. (See note in | ||
# `CalculatedHashes` class definition.) | ||
if primary_hashes.variants is None: | ||
raise Exception("Primary hashes missing variants data") | ||
lobsterkatie marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
event_hash = primary_hashes.hashes[0] | ||
stacktrace_string = get_stacktrace_string( | ||
get_grouping_info_from_variants(primary_hashes.variants) | ||
) | ||
|
||
request_data: SimilarIssuesEmbeddingsRequest = { | ||
"hash": event_hash, | ||
"project_id": event.project.id, | ||
"stacktrace": stacktrace_string, | ||
"message": event.title, | ||
"k": num_neighbors, | ||
} | ||
|
||
# Similar issues are returned with the closest match first | ||
seer_results = get_similarity_data_from_seer(request_data) | ||
|
||
similar_issues_metadata = asdict( | ||
SeerSimilarIssuesMetadata(request_hash=event_hash, results=seer_results) | ||
) | ||
parent_group = ( | ||
Group.objects.filter(id=seer_results[0].parent_group_id).first() | ||
if ( | ||
seer_results | ||
and seer_results[0].should_group | ||
and features.has("projects:similarity-embeddings-grouping", event.project) | ||
) | ||
else None | ||
) | ||
|
||
return (similar_issues_metadata, parent_group) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,161 @@ | ||
from dataclasses import asdict | ||
from unittest.mock import patch | ||
|
||
from sentry.conf.server import SEER_SIMILARITY_MODEL_VERSION | ||
from sentry.eventstore.models import Event | ||
from sentry.grouping.ingest.seer import get_seer_similar_issues, should_call_seer_for_grouping | ||
from sentry.grouping.result import CalculatedHashes | ||
from sentry.seer.utils import SeerSimilarIssueData | ||
from sentry.testutils.cases import TestCase | ||
from sentry.testutils.helpers import Feature | ||
from sentry.testutils.helpers.eventprocessing import save_new_event | ||
from sentry.testutils.helpers.features import with_feature | ||
from sentry.utils.types import NonNone | ||
|
||
|
||
class ShouldCallSeerTest(TestCase): | ||
# TODO: Add tests for rate limits, killswitches, etc once those are in place | ||
|
||
def test_obeys_seer_similarity_flags(self): | ||
for metadata_flag, grouping_flag, expected_result in [ | ||
(False, False, False), | ||
(True, False, True), | ||
(False, True, True), | ||
(True, True, True), | ||
]: | ||
with Feature( | ||
{ | ||
"projects:similarity-embeddings-metadata": metadata_flag, | ||
"projects:similarity-embeddings-grouping": grouping_flag, | ||
} | ||
): | ||
assert ( | ||
should_call_seer_for_grouping( | ||
Event( | ||
project_id=self.project.id, | ||
event_id="11212012123120120415201309082013", | ||
data={"title": "Dogs are great!"}, | ||
), | ||
self.project, | ||
) | ||
is expected_result | ||
), f"Case ({metadata_flag}, {grouping_flag}) failed." | ||
|
||
@with_feature("projects:similarity-embeddings-grouping") | ||
def test_says_no_for_garbage_event(self): | ||
assert ( | ||
should_call_seer_for_grouping( | ||
Event( | ||
project_id=self.project.id, | ||
event_id="11212012123120120415201309082013", | ||
data={"title": "<untitled>"}, | ||
), | ||
self.project, | ||
) | ||
is False | ||
) | ||
|
||
|
||
class GetSeerSimilarIssuesTest(TestCase): | ||
def setUp(self): | ||
self.existing_event = save_new_event({"message": "Dogs are great!"}, self.project) | ||
self.new_event = Event( | ||
project_id=self.project.id, | ||
event_id="11212012123120120415201309082013", | ||
data={"message": "Adopt don't shop"}, | ||
) | ||
self.new_event_hashes = CalculatedHashes( | ||
hashes=["20130809201315042012311220122111"], | ||
hierarchical_hashes=[], | ||
tree_labels=[], | ||
variants={}, | ||
) | ||
|
||
@with_feature({"projects:similarity-embeddings-grouping": False}) | ||
def test_returns_metadata_but_no_group_if_seer_grouping_flag_off(self): | ||
seer_result_data = SeerSimilarIssueData( | ||
parent_hash=self.existing_event.get_primary_hash(), | ||
parent_group_id=NonNone(self.existing_event.group_id), | ||
stacktrace_distance=0.01, | ||
message_distance=0.05, | ||
should_group=True, | ||
) | ||
expected_metadata = { | ||
"similarity_model_version": SEER_SIMILARITY_MODEL_VERSION, | ||
"request_hash": self.new_event_hashes.hashes[0], | ||
"results": [asdict(seer_result_data)], | ||
} | ||
|
||
with patch( | ||
"sentry.grouping.ingest.seer.get_similarity_data_from_seer", | ||
return_value=[seer_result_data], | ||
): | ||
assert get_seer_similar_issues(self.new_event, self.new_event_hashes) == ( | ||
expected_metadata, | ||
None, # No group returned, even though `should_group` is True | ||
) | ||
|
||
@with_feature("projects:similarity-embeddings-grouping") | ||
def test_returns_metadata_and_group_if_sufficiently_close_group_found(self): | ||
seer_result_data = SeerSimilarIssueData( | ||
parent_hash=self.existing_event.get_primary_hash(), | ||
parent_group_id=NonNone(self.existing_event.group_id), | ||
stacktrace_distance=0.01, | ||
message_distance=0.05, | ||
should_group=True, | ||
) | ||
expected_metadata = { | ||
"similarity_model_version": SEER_SIMILARITY_MODEL_VERSION, | ||
"request_hash": self.new_event_hashes.hashes[0], | ||
"results": [asdict(seer_result_data)], | ||
} | ||
|
||
with patch( | ||
"sentry.grouping.ingest.seer.get_similarity_data_from_seer", | ||
return_value=[seer_result_data], | ||
): | ||
assert get_seer_similar_issues(self.new_event, self.new_event_hashes) == ( | ||
expected_metadata, | ||
self.existing_event.group, | ||
) | ||
|
||
@with_feature("projects:similarity-embeddings-grouping") | ||
def test_returns_metadata_but_no_group_if_similar_group_insufficiently_close(self): | ||
seer_result_data = SeerSimilarIssueData( | ||
parent_hash=self.existing_event.get_primary_hash(), | ||
parent_group_id=NonNone(self.existing_event.group_id), | ||
stacktrace_distance=0.08, | ||
message_distance=0.12, | ||
should_group=False, | ||
) | ||
expected_metadata = { | ||
"similarity_model_version": SEER_SIMILARITY_MODEL_VERSION, | ||
"request_hash": self.new_event_hashes.hashes[0], | ||
"results": [asdict(seer_result_data)], | ||
} | ||
|
||
with patch( | ||
"sentry.grouping.ingest.seer.get_similarity_data_from_seer", | ||
return_value=[seer_result_data], | ||
): | ||
assert get_seer_similar_issues(self.new_event, self.new_event_hashes) == ( | ||
expected_metadata, | ||
None, | ||
) | ||
|
||
@with_feature("projects:similarity-embeddings-grouping") | ||
def test_returns_no_group_and_empty_metadata_if_no_similar_group_found(self): | ||
expected_metadata = { | ||
"similarity_model_version": SEER_SIMILARITY_MODEL_VERSION, | ||
"request_hash": self.new_event_hashes.hashes[0], | ||
"results": [], | ||
} | ||
|
||
with patch( | ||
"sentry.grouping.ingest.seer.get_similarity_data_from_seer", | ||
return_value=[], | ||
): | ||
assert get_seer_similar_issues(self.new_event, self.new_event_hashes) == ( | ||
expected_metadata, | ||
None, | ||
) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you add a comment explaining this condition, please?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.