Fix bug causing missing datamap rows (#1124)

* Update export.py
* Fix mistakenly dropped datamap rows
* Add tests for the bug that fail against the original state
* Fix test failure with grouping in delete_df
* Fix mypy reassignment issues, update changelog

Co-authored-by: SteveDMurphy <[email protected]>
ethyca · Sep 29, 2022 · 6c723e6 · 6c723e6
1 parent adc0b34
commit 6c723e6
Show file tree

Hide file tree

Showing 3 changed files with 123 additions and 7 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -44,6 +44,7 @@ The types of changes are:
 ### Fixed
 
 * Fixed the "help" link in the UI header [#1078](https://github.com/ethyca/fides/pull/1078)
+* Fixed a bug where rows were being excluded from a data map [#1124](https://github.com/ethyca/fides/pull/1124)
 
 ### Security
 

diff --git a/src/fidesctl/ctl/core/export.py b/src/fidesctl/ctl/core/export.py
@@ -389,10 +389,22 @@ def build_joined_dataframe(
     # restructure the joined dataframe to represent system and dataset data categories appropriately
     joined_df = union_data_categories_in_joined_dataframe(joined_df)
 
-    delete_df = joined_df[["system.name", "unioned_data_categories"]][
-        joined_df.groupby(["system.name", "unioned_data_categories"])[
-            "dataset.name"
-        ].transform("count")
+    delete_df = joined_df[
+        [
+            "system.name",
+            "system.privacy_declaration.data_use.name",
+            "system.privacy_declaration.data_subjects.name",
+            "unioned_data_categories",
+        ]
+    ][
+        joined_df.groupby(
+            [
+                "system.name",
+                "system.privacy_declaration.data_use.name",
+                "system.privacy_declaration.data_subjects.name",
+                "unioned_data_categories",
+            ]
+        )["dataset.name"].transform("count")
         > 1
     ].drop_duplicates()
 
@@ -438,6 +450,7 @@ def export_datamap(
             get_server_resource(url, "organization", organization_fides_key, headers)
         ]
     }
+    # Verify this isn't dropping records on the joins
     for resource_type in ["system", "dataset", "data_subject", "data_use"]:
         server_resources = list_server_resources(
             url,

diff --git a/tests/ctl/core/test_export.py b/tests/ctl/core/test_export.py
@@ -1,5 +1,5 @@
 # pylint: disable=missing-docstring, redefined-outer-name
-from typing import Generator
+from typing import Dict, Generator
 
 import pytest
 from fideslang.models import (
@@ -18,7 +18,7 @@
 
 
 @pytest.fixture()
-def test_sample_system_taxonomy() -> Generator:
+def test_sample_system_taxonomy() -> Generator[Dict, None, None]:
     yield {
         "system": [
             System(
@@ -36,7 +36,7 @@ def test_sample_system_taxonomy() -> Generator:
                         data_use="provide.service",
                         data_qualifier="aggregated.anonymized",
                         data_subjects=["customer"],
-                        dataset_references=["users_dataset"],
+                        dataset_references=["test_dataset"],
                     )
                 ],
             )
@@ -45,6 +45,12 @@ def test_sample_system_taxonomy() -> Generator:
         "data_use": [
             DataUse(fides_key="provide.service", name="System", parent_key="provide")
         ],
+        "organization": [
+            Organization(
+                fides_key="default_organization",
+                security_policy="https://www.google.com/",
+            )
+        ],
     }
 
 
@@ -125,3 +131,99 @@ def test_organization_records_to_export() -> None:
         [Organization(fides_key="default_organization")]
     )
     assert len(output_list) == 5
+
+
+@pytest.mark.unit
+def test_joined_datamap_export_system_only(
+    test_sample_system_taxonomy: Dict,
+    test_config: FidesctlConfig,
+) -> None:
+    """
+    Asserts that a system exported without any datasets yields exactly two rows
+    """
+    sample_taxonomy: Dict = test_sample_system_taxonomy
+    sample_taxonomy["dataset"] = []
+    output_list = export.build_joined_dataframe(sample_taxonomy)
+    assert len(output_list) == 2
+
+
+@pytest.mark.unit
+def test_joined_datamap_export_system_dataset_overlap(
+    test_sample_system_taxonomy: Dict,
+    test_sample_dataset_taxonomy: Generator,
+    test_config: FidesctlConfig,
+) -> None:
+    """
+    Checks that joining the sample system with the sample dataset yields five rows
+    """
+    taxonomy: Dict = test_sample_system_taxonomy
+    taxonomy["dataset"] = test_sample_dataset_taxonomy
+    joined_rows = export.build_joined_dataframe(taxonomy)
+    assert len(joined_rows) == 5
+
+
+@pytest.mark.unit
+def test_joined_datamap_export_system_dataset_common(
+    test_sample_system_taxonomy: Dict,
+    test_config: FidesctlConfig,
+) -> None:
+    """
+    Asserts that duplicate rows are removed from the export, leaving two rows
+    """
+    # Build the dataset bottom-up: fields, then the collection, then the dataset
+    email_field = DatasetField(
+        name="test_field_1",
+        data_categories=["user.contact.email"],
+        data_qualifier="aggregated.anonymized",
+        retention="No retention policy",
+    )
+    name_field = DatasetField(
+        name="test_field_2",
+        data_categories=["user.contact.name"],
+        data_qualifier="aggregated.anonymized",
+    )
+    collection = DatasetCollection(
+        name="test_collection",
+        data_categories=[],
+        fields=[email_field, name_field],
+    )
+
+    dataset = Dataset(
+        fides_key="test_dataset",
+        name="test dataset",
+        description="dataset for testing",
+        dataset_categories=[],
+        collections=[collection],
+    )
+
+    taxonomy: Dict = test_sample_system_taxonomy
+    taxonomy["dataset"] = [dataset]
+    joined_rows = export.build_joined_dataframe(taxonomy)
+    assert len(joined_rows) == 2
+
+
+@pytest.mark.unit
+def test_joined_datamap_export_system_multiple_declarations_overlap(
+    test_sample_system_taxonomy: Dict,
+    test_config: FidesctlConfig,
+) -> None:
+    """
+    Checks the exported row count when a system has a second privacy declaration
+    """
+    taxonomy: Dict = test_sample_system_taxonomy
+    prospect_subject = DataSubject(fides_key="prospect", name="prospect")
+    second_declaration = PrivacyDeclaration(
+        name="privacy_declaration_2",
+        data_categories=[
+            "user.contact.email",
+            "user.contact.name",
+        ],
+        data_use="provide.service",
+        data_qualifier="aggregated.anonymized",
+        data_subjects=["prospect"],
+    )
+    taxonomy["dataset"] = []
+    taxonomy["data_subject"].append(prospect_subject)
+    taxonomy["system"][0].privacy_declarations.append(second_declaration)
+    joined_rows = export.build_joined_dataframe(taxonomy)
+    assert len(joined_rows) == 4