From 4c71260e73512cf344c9dc05a441915ada73be2f Mon Sep 17 00:00:00 2001
From: Alex Zaslavsky <azaslavsky@users.noreply.github.com>
Date: Fri, 21 Jul 2023 10:10:06 -0700
Subject: [PATCH] ref(backup): Do deepcopy before validation (#53346)

Previously, all validation mutated the inputs in-place. This was
performant (no need to make deep copies of two potentially large JSON
blobs), but potentially bug prone if not held carefully, since the
validation itself mutated the inputs via JSONScrubbingComparators. This
means that the post-comparison input objects MUST NOT be re-used, since
they are meaningfully different from the inputs that were declared
valid.

Instead, we now do deepcopies under the hood, leaving the originally
passed-in inputs untouched.

Issue: getsentry/team-ospo#156
---
 src/sentry/runner/commands/backup.py    | 13 +++++++++++--
 tests/sentry/backup/test_correctness.py | 10 ++++------
 2 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/src/sentry/runner/commands/backup.py b/src/sentry/runner/commands/backup.py
index 510d0efeceddfc..309f2741bbaeba 100644
--- a/src/sentry/runner/commands/backup.py
+++ b/src/sentry/runner/commands/backup.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 from abc import ABC, abstractmethod
+from copy import deepcopy
 from datetime import datetime, timedelta, timezone
 from difflib import unified_diff
 from io import StringIO
@@ -159,10 +160,13 @@ def compare(self, on: InstanceID, left: JSONData, right: JSONData) -> Comparator
 
 
 def validate(
-    expect: JSONData, actual: JSONData, comparators: ComparatorMap = DEFAULT_COMPARATORS
+    expect: JSONData,
+    actual: JSONData,
+    comparators: ComparatorMap = DEFAULT_COMPARATORS,
 ) -> ComparatorFindings:
     """Ensures that originally imported data correctly matches actual outputted data, and produces a
-    list of reasons why not when it doesn't"""
+    list of reasons why not when it doesn't.
+    """
 
     def json_lines(obj: JSONData) -> list[str]:
         """Take a JSONData object and pretty-print it as JSON."""
@@ -176,6 +180,11 @@ def json_lines(obj: JSONData) -> list[str]:
         id = InstanceID(model["model"], model["pk"])
         exp_models[id] = model
 
+    # Because we may be scrubbing data from the objects as we compare them, we may (optionally) make
+    # deep copies to start to avoid potentially mangling the input data.
+    expect = deepcopy(expect)
+    actual = deepcopy(actual)
+
     # Ensure that the actual JSON contains no duplicates - we assume that the expected JSON did not.
     for model in actual:
         id = InstanceID(model["model"], model["pk"])
diff --git a/tests/sentry/backup/test_correctness.py b/tests/sentry/backup/test_correctness.py
index b047ade1dbbfbe..403b21fe049084 100644
--- a/tests/sentry/backup/test_correctness.py
+++ b/tests/sentry/backup/test_correctness.py
@@ -20,7 +20,9 @@
 
 
 def import_export_then_validate(
-    tmp_path: Path, fixture_file_name: str, map: ComparatorMap = EMPTY_COMPARATORS_FOR_TESTING
+    tmp_path: Path,
+    fixture_file_name: str,
+    map: ComparatorMap = EMPTY_COMPARATORS_FOR_TESTING,
 ) -> None:
     """Test helper that validates that data imported from a fixture `.json` file correctly matches
     the actual outputted export data."""
@@ -34,11 +36,7 @@ def import_export_then_validate(
         rv = CliRunner().invoke(import_, [str(fixture_file_path)])
         assert rv.exit_code == 0, rv.output
 
-    res = validate(
-        expect,
-        tmp_export_to_file(tmp_path.joinpath("tmp_test_file.json")),
-        map,
-    )
+    res = validate(expect, tmp_export_to_file(tmp_path.joinpath("tmp_test_file.json")), map)
     if res.findings:
         raise ValidationError(res)