Skip to content

Commit

Permalink
feat(seer grouping): Add Seer-related ingest helpers (#70999)
Browse files Browse the repository at this point in the history
This adds two helpers, `should_call_seer_for_grouping` and `get_seer_similar_issues`, to be used when we (maybe) call Seer as part of event ingestion. 

`should_call_seer_for_grouping` does exactly what you'd think given the name, right now only basing the decision on feature flags and whether or not the event has a usable title and/or stacktrace. In the future we'll also include rate limit and killswitch checks, and any other criteria which it makes sense to add.

`get_seer_similar_issues` is a wrapper around `get_similarity_data_from_seer` (which is what actually makes the API call to Seer). It extracts request data from the given event, makes the request, pulls together metadata about the results, and if a matching group is found and the flag is on, pulls the `Group` record out of the database. (I chose to put the feature flag check there rather than in the code where the the the grouping actually happens so that we can save the trip to the database if we're not going to end up using the results for grouping.)

Code to actually use these helpers is added in #71026.
  • Loading branch information
lobsterkatie authored and cmanallen committed May 21, 2024
1 parent d488992 commit 20be96b
Show file tree
Hide file tree
Showing 2 changed files with 261 additions and 0 deletions.
100 changes: 100 additions & 0 deletions src/sentry/grouping/ingest/seer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
import logging
from dataclasses import asdict

from sentry import features
from sentry.api.endpoints.group_similar_issues_embeddings import get_stacktrace_string
from sentry.constants import PLACEHOLDER_EVENT_TITLES
from sentry.eventstore.models import Event
from sentry.grouping.grouping_info import get_grouping_info_from_variants
from sentry.grouping.result import CalculatedHashes
from sentry.models.group import Group
from sentry.models.project import Project
from sentry.seer.utils import (
SeerSimilarIssuesMetadata,
SimilarIssuesEmbeddingsRequest,
get_similarity_data_from_seer,
)
from sentry.utils.safe import get_path

logger = logging.getLogger("sentry.events.grouping")


def should_call_seer_for_grouping(event: Event, project: Project) -> bool:
"""
Use event content, feature flags, rate limits, killswitches, seer health, etc. to determine
whether a call to Seer should be made.
"""
# TODO: Implement rate limits, kill switches, other flags, etc
# TODO: Return False if the event has a custom fingerprint (check for both client- and server-side fingerprints)

# If an event has no stacktrace, and only one of our placeholder titles ("<untitled>",
# "<unknown>", etc.), there's no data for Seer to analyze, so no point in making the API call.
if (
event.title in PLACEHOLDER_EVENT_TITLES
and not get_path(event.data, "exception", "values", -1, "stacktrace", "frames")
and not get_path(event.data, "threads", "values", -1, "stacktrace", "frames")
):
return False

return features.has("projects:similarity-embeddings-metadata", project) or features.has(
"projects:similarity-embeddings-grouping", project
)


def get_seer_similar_issues(
event: Event,
primary_hashes: CalculatedHashes,
num_neighbors: int = 1,
) -> tuple[
dict[
str, str | list[dict[str, float | bool | int | str]]
], # a SeerSimilarIssuesMetadata instance, dictified
Group | None,
]:
"""
Ask Seer for the given event's nearest neighbor(s) and return the seer response data, sorted
with the best matches first, along with the group Seer decided the event should go in, if any,
or None if no neighbor was near enough.
Will also return `None` for the neighboring group if the `projects:similarity-embeddings-grouping`
feature flag is off.
"""

# TODO: In our context, this can never happen. There are other scenarios in which `variants` can
# be `None`, but where we'll be using this (during ingestion) it's not possible. This check is
# primarily to satisfy mypy. Once we get rid of hierarchical hashing, we'll be able to
# make `variants` required in `CalculatedHashes`, meaning we can remove this check. (See note in
# `CalculatedHashes` class definition.)
if primary_hashes.variants is None:
raise Exception("Primary hashes missing variants data")

event_hash = primary_hashes.hashes[0]
stacktrace_string = get_stacktrace_string(
get_grouping_info_from_variants(primary_hashes.variants)
)

request_data: SimilarIssuesEmbeddingsRequest = {
"hash": event_hash,
"project_id": event.project.id,
"stacktrace": stacktrace_string,
"message": event.title,
"k": num_neighbors,
}

# Similar issues are returned with the closest match first
seer_results = get_similarity_data_from_seer(request_data)

similar_issues_metadata = asdict(
SeerSimilarIssuesMetadata(request_hash=event_hash, results=seer_results)
)
parent_group = (
Group.objects.filter(id=seer_results[0].parent_group_id).first()
if (
seer_results
and seer_results[0].should_group
and features.has("projects:similarity-embeddings-grouping", event.project)
)
else None
)

return (similar_issues_metadata, parent_group)
161 changes: 161 additions & 0 deletions tests/sentry/grouping/test_seer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
from dataclasses import asdict
from unittest.mock import patch

from sentry.conf.server import SEER_SIMILARITY_MODEL_VERSION
from sentry.eventstore.models import Event
from sentry.grouping.ingest.seer import get_seer_similar_issues, should_call_seer_for_grouping
from sentry.grouping.result import CalculatedHashes
from sentry.seer.utils import SeerSimilarIssueData
from sentry.testutils.cases import TestCase
from sentry.testutils.helpers import Feature
from sentry.testutils.helpers.eventprocessing import save_new_event
from sentry.testutils.helpers.features import with_feature
from sentry.utils.types import NonNone


class ShouldCallSeerTest(TestCase):
# TODO: Add tests for rate limits, killswitches, etc once those are in place

def test_obeys_seer_similarity_flags(self):
for metadata_flag, grouping_flag, expected_result in [
(False, False, False),
(True, False, True),
(False, True, True),
(True, True, True),
]:
with Feature(
{
"projects:similarity-embeddings-metadata": metadata_flag,
"projects:similarity-embeddings-grouping": grouping_flag,
}
):
assert (
should_call_seer_for_grouping(
Event(
project_id=self.project.id,
event_id="11212012123120120415201309082013",
data={"title": "Dogs are great!"},
),
self.project,
)
is expected_result
), f"Case ({metadata_flag}, {grouping_flag}) failed."

@with_feature("projects:similarity-embeddings-grouping")
def test_says_no_for_garbage_event(self):
assert (
should_call_seer_for_grouping(
Event(
project_id=self.project.id,
event_id="11212012123120120415201309082013",
data={"title": "<untitled>"},
),
self.project,
)
is False
)


class GetSeerSimilarIssuesTest(TestCase):
def setUp(self):
self.existing_event = save_new_event({"message": "Dogs are great!"}, self.project)
self.new_event = Event(
project_id=self.project.id,
event_id="11212012123120120415201309082013",
data={"message": "Adopt don't shop"},
)
self.new_event_hashes = CalculatedHashes(
hashes=["20130809201315042012311220122111"],
hierarchical_hashes=[],
tree_labels=[],
variants={},
)

@with_feature({"projects:similarity-embeddings-grouping": False})
def test_returns_metadata_but_no_group_if_seer_grouping_flag_off(self):
seer_result_data = SeerSimilarIssueData(
parent_hash=self.existing_event.get_primary_hash(),
parent_group_id=NonNone(self.existing_event.group_id),
stacktrace_distance=0.01,
message_distance=0.05,
should_group=True,
)
expected_metadata = {
"similarity_model_version": SEER_SIMILARITY_MODEL_VERSION,
"request_hash": self.new_event_hashes.hashes[0],
"results": [asdict(seer_result_data)],
}

with patch(
"sentry.grouping.ingest.seer.get_similarity_data_from_seer",
return_value=[seer_result_data],
):
assert get_seer_similar_issues(self.new_event, self.new_event_hashes) == (
expected_metadata,
None, # No group returned, even though `should_group` is True
)

@with_feature("projects:similarity-embeddings-grouping")
def test_returns_metadata_and_group_if_sufficiently_close_group_found(self):
seer_result_data = SeerSimilarIssueData(
parent_hash=self.existing_event.get_primary_hash(),
parent_group_id=NonNone(self.existing_event.group_id),
stacktrace_distance=0.01,
message_distance=0.05,
should_group=True,
)
expected_metadata = {
"similarity_model_version": SEER_SIMILARITY_MODEL_VERSION,
"request_hash": self.new_event_hashes.hashes[0],
"results": [asdict(seer_result_data)],
}

with patch(
"sentry.grouping.ingest.seer.get_similarity_data_from_seer",
return_value=[seer_result_data],
):
assert get_seer_similar_issues(self.new_event, self.new_event_hashes) == (
expected_metadata,
self.existing_event.group,
)

@with_feature("projects:similarity-embeddings-grouping")
def test_returns_metadata_but_no_group_if_similar_group_insufficiently_close(self):
seer_result_data = SeerSimilarIssueData(
parent_hash=self.existing_event.get_primary_hash(),
parent_group_id=NonNone(self.existing_event.group_id),
stacktrace_distance=0.08,
message_distance=0.12,
should_group=False,
)
expected_metadata = {
"similarity_model_version": SEER_SIMILARITY_MODEL_VERSION,
"request_hash": self.new_event_hashes.hashes[0],
"results": [asdict(seer_result_data)],
}

with patch(
"sentry.grouping.ingest.seer.get_similarity_data_from_seer",
return_value=[seer_result_data],
):
assert get_seer_similar_issues(self.new_event, self.new_event_hashes) == (
expected_metadata,
None,
)

@with_feature("projects:similarity-embeddings-grouping")
def test_returns_no_group_and_empty_metadata_if_no_similar_group_found(self):
expected_metadata = {
"similarity_model_version": SEER_SIMILARITY_MODEL_VERSION,
"request_hash": self.new_event_hashes.hashes[0],
"results": [],
}

with patch(
"sentry.grouping.ingest.seer.get_similarity_data_from_seer",
return_value=[],
):
assert get_seer_similar_issues(self.new_event, self.new_event_hashes) == (
expected_metadata,
None,
)

0 comments on commit 20be96b

Please sign in to comment.