From ed99e6efcf59478ef77a0fc1ddef1386f85db0c9 Mon Sep 17 00:00:00 2001 From: Alex Zaslavsky Date: Mon, 31 Jul 2023 15:38:15 -0700 Subject: [PATCH] feat(backup): Add EmailObfuscatingComparator (#53763) This comparator is added to prevent sensitive data like user emails from leaking into logs, stack traces, and so on. Issue: getsentry/team-ospo#157 --- src/sentry/runner/commands/backup.py | 107 ++++++++++++++++++--- tests/sentry/backup/test_comparators.py | 121 ++++++++++++++++++++++-- 2 files changed, 208 insertions(+), 20 deletions(-) diff --git a/src/sentry/runner/commands/backup.py b/src/sentry/runner/commands/backup.py index 468d8b21c2230e..c08aa43d5b949e 100644 --- a/src/sentry/runner/commands/backup.py +++ b/src/sentry/runner/commands/backup.py @@ -5,7 +5,7 @@ from datetime import datetime, timedelta, timezone from difflib import unified_diff from io import StringIO -from typing import Dict, List, NamedTuple +from typing import Callable, Dict, List, Literal, NamedTuple import click from dateutil import parser @@ -79,7 +79,7 @@ class JSONScrubbingComparator(ABC): the `scrub(...)` methods are. This ensures that comparators that touch the same fields do not have their inputs mangled by one another.""" - def __init__(self, fields: list[str]): + def __init__(self, *fields: str): self.fields = fields def check(self, side: str, data: JSONData) -> None: @@ -125,15 +125,22 @@ def existence(self, on: InstanceID, left: JSONData, right: JSONData) -> list[Com ) return findings - def scrub(self, on: InstanceID, left: JSONData, right: JSONData) -> None: + def __scrub__( + self, + left: JSONData, + right: JSONData, + f: Callable[[list[str]], list[str]] | Callable[[list[str]], Literal[True]] = lambda _: True, + ) -> None: """Removes all of the fields compared by this comparator from the `fields` dict, so that the - remaining fields may be compared for equality. + remaining fields may be compared for equality. Public callers should use the inheritance-safe wrapper, `scrub`, rather than using this internal method directly. Parameters: - on: An `InstanceID` that must be shared by both versions of the JSON model being compared. - left: One of the models being compared (usually the "before") version. - right: The other model it is being compared against (usually the "after" or post-processed version). + - f: Optional helper method that populates the RHS of the scrubbed entry. If this is + omitted, the scrubbed entry defaults to `True`. """ self.check("left", left) @@ -142,11 +149,24 @@ def scrub(self, on: InstanceID, left: JSONData, right: JSONData) -> None: left["scrubbed"] = {} if "scrubbed" not in right: right["scrubbed"] = {} + for field in self.fields: - del left["fields"][field] - left["scrubbed"][f"{self.get_kind()}::{field}"] = True - del right["fields"][field] - right["scrubbed"][f"{self.get_kind()}::{field}"] = True + for side in [left, right]: + if field not in side["fields"]: + continue + value = side["fields"][field] + if not value: + continue + value = [value] if isinstance(value, str) else value + del side["fields"][field] + side["scrubbed"][f"{self.get_kind()}::{field}"] = f(value) + + def scrub( + self, + left: JSONData, + right: JSONData, + ) -> None: + self.__scrub__(left, right) def get_kind(self) -> str: """A unique identifier for this particular derivation of JSONScrubbingComparator, which will @@ -160,7 +180,7 @@ class DateUpdatedComparator(JSONScrubbingComparator): date that is greater than (ie, occurs after) the specified field's left input.""" def __init__(self, field: str): - super().__init__([field]) + super().__init__(field) self.field = field def compare(self, on: InstanceID, left: JSONData, right: JSONData) -> list[ComparatorFinding]: @@ -182,14 +202,79 @@ def compare(self, on: InstanceID, left: JSONData, right: JSONData) -> list[Compa return [] +class EmailObfuscatingComparator(JSONScrubbingComparator): + """Comparator that compares emails, but then safely truncates them to ensure that they + do not leak out in logs, stack traces, etc.""" + + def __init__(self, *fields: str): + super().__init__(*fields) + + def compare(self, on: InstanceID, left: JSONData, right: JSONData) -> list[ComparatorFinding]: + findings = [] + for f in self.fields: + if f not in left["fields"] and f not in right["fields"]: + continue + + lv = left["fields"][f] + rv = right["fields"][f] + if lv != rv: + lv = self.truncate([lv] if isinstance(lv, str) else lv)[0] + rv = self.truncate([rv] if isinstance(rv, str) else rv)[0] + findings.append( + ComparatorFinding( + kind=self.get_kind(), + on=on, + reason=f"""the left `{f}` value ("{lv}") on `{on}` was not equal to the + right value ("{rv}")""", + ) + ) + return findings + + def scrub( + self, + left: JSONData, + right: JSONData, + ) -> None: + super().__scrub__(left, right, self.truncate) + + @staticmethod + def truncate(data: list[str]) -> list[str]: + truncated = [] + for d in data: + parts = d.split("@") + if len(parts) == 2: + username = parts[0] + domain = parts[1] + truncated.append(f"{username[0]}...@...{domain[-6:]}") + else: + truncated.append(d) + return truncated + + ComparatorList = List[JSONScrubbingComparator] ComparatorMap = Dict[str, ComparatorList] DEFAULT_COMPARATORS: ComparatorMap = { + "sentry.apitoken": [EmailObfuscatingComparator("user")], + "sentry.apiapplication": [EmailObfuscatingComparator("owner")], + "sentry.apiauthorization": [EmailObfuscatingComparator("user")], + "sentry.authidentity": [EmailObfuscatingComparator("user")], + "sentry.authenticator": [EmailObfuscatingComparator("user")], + "sentry.email": [EmailObfuscatingComparator("email")], "sentry.alertrule": [DateUpdatedComparator("date_modified")], "sentry.incidenttrigger": [DateUpdatedComparator("date_modified")], + "sentry.organizationmember": [EmailObfuscatingComparator("user_email")], "sentry.querysubscription": [DateUpdatedComparator("date_updated")], + "sentry.sentryapp": [EmailObfuscatingComparator("creator_user", "creator_label", "proxy_user")], + "sentry.user": [EmailObfuscatingComparator("email", "username")], + "sentry.useremail": [EmailObfuscatingComparator("email", "user")], + "sentry.userip": [EmailObfuscatingComparator("user")], + "sentry.useroption": [EmailObfuscatingComparator("user")], + "sentry.userpermission": [EmailObfuscatingComparator("user")], "sentry.userrole": [DateUpdatedComparator("date_updated")], - "sentry.userroleuser": [DateUpdatedComparator("date_updated")], + "sentry.userroleuser": [ + DateUpdatedComparator("date_updated"), + EmailObfuscatingComparator("user"), + ], } @@ -260,7 +345,7 @@ def json_lines(obj: JSONData) -> list[str]: if res: findings.extend(res) for cmp in comparators[id.model]: - cmp.scrub(id, exp, act) + cmp.scrub(exp, act) # Finally, perform a diff on the remaining JSON. diff = list(unified_diff(json_lines(exp["fields"]), json_lines(act["fields"]), n=3)) diff --git a/tests/sentry/backup/test_comparators.py b/tests/sentry/backup/test_comparators.py index 074c83bf32978f..fc19f43e3db9bd 100644 --- a/tests/sentry/backup/test_comparators.py +++ b/tests/sentry/backup/test_comparators.py @@ -1,10 +1,15 @@ -from sentry.runner.commands.backup import DateUpdatedComparator, InstanceID +from sentry.runner.commands.backup import ( + DateUpdatedComparator, + EmailObfuscatingComparator, + InstanceID, +) +from sentry.utils.json import JSONData def test_good_comparator_both_sides_existing(): cmp = DateUpdatedComparator("my_date_field") id = InstanceID("test", 1) - present = { + present: JSONData = { "model": "test", "pk": 1, "fields": { @@ -17,7 +22,7 @@ def test_good_comparator_both_sides_existing(): def test_good_comparator_neither_side_existing(): cmp = DateUpdatedComparator("my_date_field") id = InstanceID("test", 1) - missing = { + missing: JSONData = { "model": "test", "pk": 1, "fields": {}, @@ -28,14 +33,14 @@ def test_good_comparator_neither_side_existing(): def test_bad_comparator_only_one_side_existing(): cmp = DateUpdatedComparator("my_date_field") id = InstanceID("test", 1) - present = { + present: JSONData = { "model": "test", "pk": 1, "fields": { "my_date_field": "2023-06-22T23:12:34.567Z", }, } - missing = { + missing: JSONData = { "model": "test", "pk": 1, "fields": {}, @@ -60,14 +65,14 @@ def test_bad_comparator_only_one_side_existing(): def test_good_date_updated_comparator(): cmp = DateUpdatedComparator("my_date_field") id = InstanceID("test", 1) - left = { + left: JSONData = { "model": "test", "pk": 1, "fields": { "my_date_field": "2023-06-22T23:00:00.123Z", }, } - right = { + right: JSONData = { "model": "test", "pk": 1, "fields": { @@ -80,14 +85,14 @@ def test_good_date_updated_comparator(): def test_bad_date_updated_comparator(): cmp = DateUpdatedComparator("my_date_field") id = InstanceID("test", 1) - left = { + left: JSONData = { "model": "test", "pk": 1, "fields": { "my_date_field": "2023-06-22T23:12:34.567Z", }, } - right = { + right: JSONData = { "model": "test", "pk": 1, "fields": { @@ -99,3 +104,101 @@ def test_bad_date_updated_comparator(): assert res[0] assert res[0].on == id assert res[0].kind == "DateUpdatedComparator" + + +def test_good_email_obfuscating_comparator(): + cmp = EmailObfuscatingComparator("one_email", "many_emails") + id = InstanceID("test", 1) + model = { + "model": "test", + "pk": 1, + "fields": { + "one_email": "a@example.com", + "many_emails": [ + "b@example.com", + "c@example.com", + ], + }, + } + assert not cmp.compare(id, model, model) + + +def test_bad_email_obfuscating_comparator(): + cmp = EmailObfuscatingComparator("one_email", "many_emails") + id = InstanceID("test", 1) + left: JSONData = { + "model": "test", + "pk": 1, + "fields": { + "one_email": "alpha@example.com", + "many_emails": [ + "bravo@example.com", + "charlie@example.com", + ], + }, + } + right: JSONData = { + "model": "test", + "pk": 1, + "fields": { + "one_email": "alice@testing.com", + "many_emails": [ + "brian@testing.com", + "charlie@example.com", + ], + }, + } + res = cmp.compare(id, left, right) + assert res + + assert res[0] + assert res[0].on == id + assert res[0].kind == "EmailObfuscatingComparator" + assert "a...@...le.com" in res[0].reason + assert "a...@...ng.com" in res[0].reason + + assert res[1] + assert res[1].on == id + assert res[1].kind == "EmailObfuscatingComparator" + assert "b...@...le.com" in res[1].reason + assert "b...@...ng.com" in res[1].reason + + +def test_goo_email_obfuscating_comparator_scrubbed(): + cmp = EmailObfuscatingComparator("one_email", "many_emails") + left: JSONData = { + "model": "test", + "pk": 1, + "fields": { + "one_email": "alpha@example.com", + "many_emails": [ + "bravo@example.com", + "charlie@example.com", + ], + }, + } + right: JSONData = { + "model": "test", + "pk": 1, + "fields": { + "one_email": "alice@testing.com", + "many_emails": [ + "brian@testing.com", + "charlie@example.com", + ], + }, + } + cmp.scrub(left, right) + assert left["scrubbed"] + assert left["scrubbed"]["EmailObfuscatingComparator::one_email"] == ["a...@...le.com"] + assert left["scrubbed"]["EmailObfuscatingComparator::many_emails"] == [ + "b...@...le.com", + "c...@...le.com", + ] + + assert right["scrubbed"] + assert right["scrubbed"]["EmailObfuscatingComparator::one_email"] == ["a...@...ng.com"] + assert right["scrubbed"]["EmailObfuscatingComparator::many_emails"] == [ + "b...@...ng.com", + "c...@...le.com", + ]