Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(seer grouping): Add Seer-related ingest helpers #70999

Merged
merged 4 commits into from
May 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 100 additions & 0 deletions src/sentry/grouping/ingest/seer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
import logging
from dataclasses import asdict

from sentry import features
from sentry.api.endpoints.group_similar_issues_embeddings import get_stacktrace_string
from sentry.constants import PLACEHOLDER_EVENT_TITLES
from sentry.eventstore.models import Event
from sentry.grouping.grouping_info import get_grouping_info_from_variants
from sentry.grouping.result import CalculatedHashes
from sentry.models.group import Group
from sentry.models.project import Project
from sentry.seer.utils import (
SeerSimilarIssuesMetadata,
SimilarIssuesEmbeddingsRequest,
get_similarity_data_from_seer,
)
from sentry.utils.safe import get_path

logger = logging.getLogger("sentry.events.grouping")


def should_call_seer_for_grouping(event: Event, project: Project) -> bool:
"""
Use event content, feature flags, rate limits, killswitches, seer health, etc. to determine
whether a call to Seer should be made.
"""
# TODO: Implement rate limits, kill switches, other flags, etc
# TODO: Return False if the event has a custom fingerprint (check for both client- and server-side fingerprints)

# If an event has no stacktrace, and only one of our placeholder titles ("<untitled>",
# "<unknown>", etc.), there's no data for Seer to analyze, so no point in making the API call.
if (
event.title in PLACEHOLDER_EVENT_TITLES
and not get_path(event.data, "exception", "values", -1, "stacktrace", "frames")
and not get_path(event.data, "threads", "values", -1, "stacktrace", "frames")
):
Comment on lines +32 to +36
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a comment explaining this condition, please?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if (
event.title in PLACEHOLDER_EVENT_TITLES
and not get_path(event.data, "exception", "values", -1, "stacktrace", "frames")
and not get_path(event.data, "threads", "values", -1, "stacktrace", "frames")
):
if not (
get_path(event.data, "exception", "values", -1, "stacktrace", "frames")
or get_path(event.data, "threads", "values", -1, "stacktrace", "frames")
or event.title not in PLACEHOLDER_EVENT_TITLES
):

return False

return features.has("projects:similarity-embeddings-metadata", project) or features.has(
"projects:similarity-embeddings-grouping", project
)
lobsterkatie marked this conversation as resolved.
Show resolved Hide resolved


def get_seer_similar_issues(
event: Event,
primary_hashes: CalculatedHashes,
num_neighbors: int = 1,
) -> tuple[
dict[
str, str | list[dict[str, float | bool | int | str]]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

worth making a TypedDict for this type?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sigh This is why I say Python is the IHOP of typing. I agree this isn't great, but there's no way that I've been able to figure out (and googling suggests it's because no way exists) to create a typeddict out of a dataclass.

So I can, but it'll just mean hard coding a second copy of all of the attributes from both the SeerSimilarIssuesMetadata and SeerSimilarIssueData dataclasses, which feels worse to me (I think because a change to either one of those classes would be guaranteed to make the hypothetical typeddict need updating whereas there's a good chance they could be changed without this type needing to be any different).

WDYT?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah totally. its good as is!

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Googling is so 2010s :-P Just ask Copilot ;-)

def dataclass_to_typeddict(cls):
    return TypedDict(cls.__name__ + 'Dict', {k: v for k, v in get_type_hints(cls).items()})

SeerSimilarIssuesMetadataDict = dataclass_to_typeddict(SeerSimilarIssuesMetadata)
SeerSimilarIssueDataDict = dataclass_to_typeddict(SeerSimilarIssueData)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If only, @vartec. TIL about get_type_hints, but my very first attempt to wrestle with the dataclass/typeddict divide was to do something like this:

fields = {"a": int, "b": str}
DerivedTypedDict = TypedDict("DerivedTypedDict", fields)

and then construct the dataclass from the same fields value. Alas, mypy will have none of it: error: TypedDict() expects a dictionary literal as the second argument.

Here's an open issue about it in the mypy repo: python/mypy#4128.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

mypy is the worst...

], # a SeerSimilarIssuesMetadata instance, dictified
Group | None,
]:
"""
Ask Seer for the given event's nearest neighbor(s) and return the seer response data, sorted
with the best matches first, along with the group Seer decided the event should go in, if any,
or None if no neighbor was near enough.

Will also return `None` for the neighboring group if the `projects:similarity-embeddings-grouping`
feature flag is off.
"""

# TODO: In our context, this can never happen. There are other scenarios in which `variants` can
# be `None`, but where we'll be using this (during ingestion) it's not possible. This check is
# primarily to satisfy mypy. Once we get rid of hierarchical hashing, we'll be able to
# make `variants` required in `CalculatedHashes`, meaning we can remove this check. (See note in
# `CalculatedHashes` class definition.)
if primary_hashes.variants is None:
raise Exception("Primary hashes missing variants data")
lobsterkatie marked this conversation as resolved.
Show resolved Hide resolved

event_hash = primary_hashes.hashes[0]
stacktrace_string = get_stacktrace_string(
get_grouping_info_from_variants(primary_hashes.variants)
)

request_data: SimilarIssuesEmbeddingsRequest = {
"hash": event_hash,
"project_id": event.project.id,
"stacktrace": stacktrace_string,
"message": event.title,
"k": num_neighbors,
}

# Similar issues are returned with the closest match first
seer_results = get_similarity_data_from_seer(request_data)

similar_issues_metadata = asdict(
SeerSimilarIssuesMetadata(request_hash=event_hash, results=seer_results)
)
parent_group = (
Group.objects.filter(id=seer_results[0].parent_group_id).first()
if (
seer_results
and seer_results[0].should_group
and features.has("projects:similarity-embeddings-grouping", event.project)
)
else None
)

return (similar_issues_metadata, parent_group)
161 changes: 161 additions & 0 deletions tests/sentry/grouping/test_seer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
from dataclasses import asdict
from unittest.mock import patch

from sentry.conf.server import SEER_SIMILARITY_MODEL_VERSION
from sentry.eventstore.models import Event
from sentry.grouping.ingest.seer import get_seer_similar_issues, should_call_seer_for_grouping
from sentry.grouping.result import CalculatedHashes
from sentry.seer.utils import SeerSimilarIssueData
from sentry.testutils.cases import TestCase
from sentry.testutils.helpers import Feature
from sentry.testutils.helpers.eventprocessing import save_new_event
from sentry.testutils.helpers.features import with_feature
from sentry.utils.types import NonNone


class ShouldCallSeerTest(TestCase):
# TODO: Add tests for rate limits, killswitches, etc once those are in place

def test_obeys_seer_similarity_flags(self):
for metadata_flag, grouping_flag, expected_result in [
(False, False, False),
(True, False, True),
(False, True, True),
(True, True, True),
]:
with Feature(
{
"projects:similarity-embeddings-metadata": metadata_flag,
"projects:similarity-embeddings-grouping": grouping_flag,
}
):
assert (
should_call_seer_for_grouping(
Event(
project_id=self.project.id,
event_id="11212012123120120415201309082013",
data={"title": "Dogs are great!"},
),
self.project,
)
is expected_result
), f"Case ({metadata_flag}, {grouping_flag}) failed."

@with_feature("projects:similarity-embeddings-grouping")
def test_says_no_for_garbage_event(self):
assert (
should_call_seer_for_grouping(
Event(
project_id=self.project.id,
event_id="11212012123120120415201309082013",
data={"title": "<untitled>"},
),
self.project,
)
is False
)


class GetSeerSimilarIssuesTest(TestCase):
def setUp(self):
self.existing_event = save_new_event({"message": "Dogs are great!"}, self.project)
self.new_event = Event(
project_id=self.project.id,
event_id="11212012123120120415201309082013",
data={"message": "Adopt don't shop"},
)
self.new_event_hashes = CalculatedHashes(
hashes=["20130809201315042012311220122111"],
hierarchical_hashes=[],
tree_labels=[],
variants={},
)

@with_feature({"projects:similarity-embeddings-grouping": False})
def test_returns_metadata_but_no_group_if_seer_grouping_flag_off(self):
seer_result_data = SeerSimilarIssueData(
parent_hash=self.existing_event.get_primary_hash(),
parent_group_id=NonNone(self.existing_event.group_id),
stacktrace_distance=0.01,
message_distance=0.05,
should_group=True,
)
expected_metadata = {
"similarity_model_version": SEER_SIMILARITY_MODEL_VERSION,
"request_hash": self.new_event_hashes.hashes[0],
"results": [asdict(seer_result_data)],
}

with patch(
"sentry.grouping.ingest.seer.get_similarity_data_from_seer",
return_value=[seer_result_data],
):
assert get_seer_similar_issues(self.new_event, self.new_event_hashes) == (
expected_metadata,
None, # No group returned, even though `should_group` is True
)

@with_feature("projects:similarity-embeddings-grouping")
def test_returns_metadata_and_group_if_sufficiently_close_group_found(self):
seer_result_data = SeerSimilarIssueData(
parent_hash=self.existing_event.get_primary_hash(),
parent_group_id=NonNone(self.existing_event.group_id),
stacktrace_distance=0.01,
message_distance=0.05,
should_group=True,
)
expected_metadata = {
"similarity_model_version": SEER_SIMILARITY_MODEL_VERSION,
"request_hash": self.new_event_hashes.hashes[0],
"results": [asdict(seer_result_data)],
}

with patch(
"sentry.grouping.ingest.seer.get_similarity_data_from_seer",
return_value=[seer_result_data],
):
assert get_seer_similar_issues(self.new_event, self.new_event_hashes) == (
expected_metadata,
self.existing_event.group,
)

@with_feature("projects:similarity-embeddings-grouping")
def test_returns_metadata_but_no_group_if_similar_group_insufficiently_close(self):
seer_result_data = SeerSimilarIssueData(
parent_hash=self.existing_event.get_primary_hash(),
parent_group_id=NonNone(self.existing_event.group_id),
stacktrace_distance=0.08,
message_distance=0.12,
should_group=False,
)
expected_metadata = {
"similarity_model_version": SEER_SIMILARITY_MODEL_VERSION,
"request_hash": self.new_event_hashes.hashes[0],
"results": [asdict(seer_result_data)],
}

with patch(
"sentry.grouping.ingest.seer.get_similarity_data_from_seer",
return_value=[seer_result_data],
):
assert get_seer_similar_issues(self.new_event, self.new_event_hashes) == (
expected_metadata,
None,
)

@with_feature("projects:similarity-embeddings-grouping")
def test_returns_no_group_and_empty_metadata_if_no_similar_group_found(self):
expected_metadata = {
"similarity_model_version": SEER_SIMILARITY_MODEL_VERSION,
"request_hash": self.new_event_hashes.hashes[0],
"results": [],
}

with patch(
"sentry.grouping.ingest.seer.get_similarity_data_from_seer",
return_value=[],
):
assert get_seer_similar_issues(self.new_event, self.new_event_hashes) == (
expected_metadata,
None,
)
Loading