diff --git a/src/fidesops/graph/data_type.py b/src/fidesops/graph/data_type.py
index 15ce96517..b9a1a234f 100644
--- a/src/fidesops/graph/data_type.py
+++ b/src/fidesops/graph/data_type.py
@@ -221,6 +221,50 @@ def parse_data_type_string(type_string: Optional[str]) -> Tuple[Optional[str], b
     return type_string[:idx], True
 
 
+def to_data_type_string(data_type: str, is_array: bool) -> str:
+    """
+    Appends [] to the data type if it is an array.
+    """
+    if data_type == DataType.no_op.name:
+        return data_type
+    return data_type + "[]" if is_array else data_type
+
+
+def get_data_type(value: Any) -> Tuple[Optional[str], bool]:
+    """
+    Returns the simple or array type of the given value.
+    """
+
+    data_type = DataType.no_op.name
+    is_array = False
+
+    # cannot assume data type for missing or empty values
+    if value in (None, "", [], {}):
+        return data_type, is_array
+
+    if isinstance(value, bool):
+        data_type = DataType.boolean.name
+    elif isinstance(value, int):
+        data_type = DataType.integer.name
+    elif isinstance(value, float):
+        data_type = DataType.float.name
+    elif isinstance(value, str):
+        data_type = DataType.string.name
+    elif isinstance(value, dict):
+        data_type = DataType.object.name
+    elif isinstance(value, list):
+        is_array = True
+        if all(isinstance(item, int) for item in value):
+            data_type = DataType.integer.name
+        elif all(isinstance(item, float) for item in value):
+            data_type = DataType.float.name
+        elif all(isinstance(item, str) for item in value):
+            data_type = DataType.string.name
+        elif all(isinstance(item, dict) for item in value):
+            data_type = DataType.object.name
+    return data_type, is_array
+
+
 if __name__ == "__main__":
     v = DataType.no_op.value
     for x in v.__dict__:
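Note: the expected behavior of the two helpers added above can be summarized with a few assertions (illustrative only, not part of the patch; it assumes the import path used elsewhere in this diff):

# sanity check of get_data_type / to_data_type_string, mirroring the branches above
from fidesops.graph.data_type import get_data_type, to_data_type_string

assert get_data_type(True) == ("boolean", False)
assert get_data_type(1) == ("integer", False)
assert get_data_type(2.0) == ("float", False)
assert get_data_type("test") == ("string", False)
assert get_data_type({"city": "Springfield"}) == ("object", False)
assert get_data_type([1, 2]) == ("integer", True)
assert get_data_type(None) == ("no_op", False)  # empty values stay untyped
assert get_data_type([]) == ("no_op", False)

assert to_data_type_string("integer", is_array=True) == "integer[]"
assert to_data_type_string("string", is_array=False) == "string"
assert to_data_type_string("no_op", is_array=True) == "no_op"  # no_op never gets []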
diff --git a/tests/test_helpers/dataset_utils.py b/tests/test_helpers/dataset_utils.py
new file mode 100644
index 000000000..bc6d256b5
--- /dev/null
+++ b/tests/test_helpers/dataset_utils.py
@@ -0,0 +1,229 @@
+import re
+from typing import Any, Dict, Iterable, List, Optional
+
+import yaml
+
+from fidesops.graph.config import Collection, Field, FieldPath, ObjectField, ScalarField
+from fidesops.graph.data_type import DataType, get_data_type, to_data_type_string
+from fidesops.models.connectionconfig import ConnectionConfig
+from fidesops.models.datasetconfig import DatasetConfig, convert_dataset_to_graph
+from fidesops.schemas.dataset import FidesopsDataset
+from fidesops.util.collection_util import Row
+
+SAAS_DATASET_DIRECTORY = "data/saas/dataset/"
+
+
+def update_dataset(
+    connection_config: ConnectionConfig,
+    dataset_config: DatasetConfig,
+    api_data: Dict[str, List[Row]],
+    file_name: str,
+):
+    """
+    Helper function to update the dataset in the given dataset_config
+    with api_data and write the formatted result to the specified file.
+    """
+
+    generated_dataset = generate_dataset(
+        dataset_config.dataset,
+        api_data,
+        [endpoint["name"] for endpoint in connection_config.saas_config["endpoints"]],
+    )
+
+    # the yaml library doesn't allow us to just reformat
+    # the data_categories field so we fix it with a regex
+    #
+    # data_categories:
+    #   - system.operations
+    #
+    # data_categories: [system.operations]
+    #
+    with open(f"{SAAS_DATASET_DIRECTORY}{file_name}", "w") as dataset_file:
+        dataset_file.write(
+            re.sub(
+                r"(data_categories:)\n\s+- ([^\n]+)",
+                r"\1 [\2]",
+                yaml.dump(
+                    {"dataset": [generated_dataset]},
+                    default_flow_style=False,
+                    sort_keys=False,
+                    indent=2,
+                ),
+            )
+        )
+
+
+def generate_dataset(
+    existing_dataset: Dict[str, Any],
+    api_data: Dict[str, List[Row]],
+    collection_order: Optional[List[str]] = None,
+):
+    """
+    Generates a dataset which is an aggregate of the existing dataset and
+    any new fields generated from the API data. Orders the collections
+    based on the order of collection_order.
+    """
+
+    # preserve the collection order in the dataset if a collection order is not provided
+    if not collection_order:
+        collection_order = [
+            collection["name"] for collection in existing_dataset["collections"]
+        ]
+
+    # remove the dataset name from the keys in the api_data map before passing
+    # into generate_collections
+    generated_collections = generate_collections(
+        {
+            collection_name.replace(f"{existing_dataset['fides_key']}:", ""): collection
+            for collection_name, collection in api_data.items()
+        },
+        existing_dataset,
+    )
+
+    return {
+        "fides_key": existing_dataset["fides_key"],
+        "name": existing_dataset["name"],
+        "description": existing_dataset["description"],
+        "collections": [
+            {
+                "name": collection["name"],
+                "fields": collection["fields"],
+            }
+            for collection in sorted(
+                generated_collections,
+                key=lambda collection: collection_order.index(collection["name"]),
+            )
+        ],
+    }
+
+
+def generate_collections(
+    api_data: Dict[str, List[Row]], dataset: Optional[Dict[str, Any]] = None
+) -> List[Dict[str, Any]]:
+    """
+    Generates a list of collections based on the response data or returns
+    the existing collections if no API data is available.
+    """
+
+    # convert FidesopsDataset to Dataset to be able to use the Collection helpers
+    collection_map = {}
+    if dataset:
+        graph = convert_dataset_to_graph(
+            FidesopsDataset(**dataset), dataset["fides_key"]
+        )
+        collection_map = {
+            collection.name: collection for collection in graph.collections
+        }
+
+    collections = []
+    for collection_name in set().union(
+        api_data.keys(),
+        collection_map.keys(),
+    ):
+        if len(rows := api_data.get(collection_name, [])):
+            fields = generate_fields(rows[0], collection_name, collection_map)
+        else:
+            fields = get_simple_fields(collection_map.get(collection_name).fields)
+
+        collections.append(
+            {
+                "name": collection_name,
+                "fields": fields,
+            }
+        )
+
+    return collections
+
+
+def generate_fields(
+    row: Dict[str, Any], parent_path: str, field_map: Dict[str, Collection]
+) -> List[Dict[str, Dict]]:
+    """
+    Generates a simplified version of dataset fields based on the row data.
+    Maintains the current path of the traversal to determine if the field
+    exists in the existing dataset. If it does, existing attributes
+    are preserved instead of generating them from the row data.
+    """
+
+    fields = []
+    for key, value in row.items():
+        # increment path
+        current_path = f"{parent_path}.{key}"
+        # initialize field
+        field = {"name": key}
+        # derive data_type based on row data
+        data_type, is_array = get_data_type(value)
+
+        # only values of type object or object[] should have sub-fields defined
+        # additionally object and object[] cannot have data_categories
+        if data_type == DataType.object.name and not is_array:
+            field["fidesops_meta"] = {"data_type": data_type}
+            field["fields"] = generate_fields(value, current_path, field_map)
+        elif data_type == DataType.object.name and is_array:
+            field["fidesops_meta"] = {"data_type": to_data_type_string(data_type, True)}
+            field["fields"] = generate_fields(value[0], current_path, field_map)
+        else:
+            if existing_field := get_existing_field(field_map, current_path):
+                if isinstance(existing_field, ScalarField):
+                    # field exists, copy existing data categories and data_type (if available)
+                    field["data_categories"] = existing_field.data_categories or [
+                        "system.operations"
+                    ]
+                    data_type = (
+                        existing_field.data_type()
+                        if existing_field.data_type() != "None"
+                        else data_type
+                    )
+                    if data_type:
+                        field["fidesops_meta"] = {"data_type": data_type}
+                elif isinstance(existing_field, ObjectField):
+                    # the existing field has a more complex type than what we could derive
+                    # from the API response, we need to copy the fields too instead of just
+                    # the data_categories and data_type
+                    field["fidesops_meta"] = {
+                        "data_type": to_data_type_string(
+                            DataType.object.name, isinstance(value, list)
+                        )
+                    }
+                    field["fields"] = get_simple_fields(existing_field.fields.values())
+            else:
+                # we don't have this field in our dataset, use the default category
+                # and the derived data_type
+                field["data_categories"] = ["system.operations"]
+                # we don't assume the data_type for empty strings, empty lists,
+                # empty dicts, or nulls
+                if data_type != DataType.no_op.name:
+                    field["fidesops_meta"] = {
+                        "data_type": to_data_type_string(data_type, is_array)
+                    }
+        fields.append(field)
+    return fields
+
+
+def get_existing_field(field_map: Dict[str, Collection], path: str) -> Optional[Field]:
+    """
+    Lookup existing field by collection name and field path.
+    """
+    collection_name, field_path = path.split(".", 1)
+    if collection := field_map.get(collection_name):
+        return collection.field_dict.get(FieldPath.parse(field_path))
+    return None
+
+
+def get_simple_fields(fields: Iterable[Field]) -> List[Dict[str, Any]]:
+    """
+    Converts dataset fields into simple dictionaries with only
+    name, data_category, and data_type.
+    """
+
+    object_list = []
+    for field in fields:
+        object = {"name": field.name}
+        if field.data_categories:
+            object["data_categories"] = field.data_categories
+        if field.data_type() != "None":
+            object["fidesops_meta"] = {"data_type": field.data_type()}
+        if isinstance(field, ObjectField) and field.fields:
+            object["fields"] = get_simple_fields(field.fields.values())
+        object_list.append(object)
+    return object_list
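For reference, the re.sub call in update_dataset above rewrites the block-style data_categories that yaml.dump emits into the inline form used in the checked-in SaaS dataset files. A minimal sketch of that transformation in isolation (illustrative only, not part of the patch):

# show the before/after of the data_categories reformatting done in update_dataset
import re
import yaml

dumped = yaml.dump(
    {"fields": [{"name": "a", "data_categories": ["system.operations"]}]},
    default_flow_style=False,
    sort_keys=False,
    indent=2,
)
# yaml.dump renders the list block-style:
#   data_categories:
#   - system.operations
reformatted = re.sub(r"(data_categories:)\n\s+- ([^\n]+)", r"\1 [\2]", dumped)
# after the substitution it reads:
#   data_categories: [system.operations]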
diff --git a/tests/test_helpers/test_dataset_utils.py b/tests/test_helpers/test_dataset_utils.py
new file mode 100644
index 000000000..8d7a3620e
--- /dev/null
+++ b/tests/test_helpers/test_dataset_utils.py
@@ -0,0 +1,667 @@
+from tests.test_helpers.dataset_utils import generate_collections, generate_dataset
+
+
+class TestGenerateCollections:
+    def test_empty_values(self):
+        api_data = {"user": [{"a": None, "b": "", "c": [], "d": {}}]}
+        assert generate_collections(api_data) == [
+            {
+                "name": "user",
+                "fields": [
+                    {"name": "a", "data_categories": ["system.operations"]},
+                    {"name": "b", "data_categories": ["system.operations"]},
+                    {"name": "c", "data_categories": ["system.operations"]},
+                    {"name": "d", "data_categories": ["system.operations"]},
+                ],
+            }
+        ]
+
+    def test_boolean_true(self):
+        api_data = {"user": [{"active": True}]}
+        assert generate_collections(api_data) == [
+            {
+                "name": "user",
+                "fields": [
+                    {
+                        "name": "active",
+                        "data_categories": ["system.operations"],
+                        "fidesops_meta": {"data_type": "boolean"},
+                    }
+                ],
+            }
+        ]
+
+    def test_boolean_false(self):
+        api_data = {"user": [{"active": False}]}
+        assert generate_collections(api_data) == [
+            {
+                "name": "user",
+                "fields": [
+                    {
+                        "name": "active",
+                        "data_categories": ["system.operations"],
+                        "fidesops_meta": {"data_type": "boolean"},
+                    }
+                ],
+            }
+        ]
+
+    def test_integer(self):
+        api_data = {"user": [{"id": 1}]}
+        assert generate_collections(api_data) == [
+            {
+                "name": "user",
+                "fields": [
+                    {
+                        "name": "id",
+                        "data_categories": ["system.operations"],
+                        "fidesops_meta": {"data_type": "integer"},
+                    }
+                ],
+            }
+        ]
+
+    def test_zero(self):
+        api_data = {"user": [{"id": 0}]}
+        assert generate_collections(api_data) == [
+            {
+                "name": "user",
+                "fields": [
+                    {
+                        "name": "id",
+                        "data_categories": ["system.operations"],
+                        "fidesops_meta": {"data_type": "integer"},
+                    }
+                ],
+            }
+        ]
+
+    def test_float(self):
+        api_data = {"user": [{"balance": 2.0}]}
+        assert generate_collections(api_data) == [
+            {
+                "name": "user",
+                "fields": [
+                    {
+                        "name": "balance",
+                        "data_categories": ["system.operations"],
+                        "fidesops_meta": {"data_type": "float"},
+                    }
+                ],
+            }
+        ]
+
+    def test_float_zero(self):
+        api_data = {"user": [{"balance": 0.0}]}
+        assert generate_collections(api_data) == [
+            {
+                "name": "user",
+                "fields": [
+                    {
+                        "name": "balance",
+                        "data_categories": ["system.operations"],
+                        "fidesops_meta": {"data_type": "float"},
+                    }
+                ],
+            }
+        ]
+
+    def test_string(self):
+        api_data = {"user": [{"first_name": "test"}]}
+        assert generate_collections(api_data) == [
+            {
+                "name": "user",
+                "fields": [
+                    {
+                        "name": "first_name",
+                        "data_categories": ["system.operations"],
+                        "fidesops_meta": {"data_type": "string"},
+                    }
+                ],
+            }
+        ]
+
+    def test_object(self):
+        api_data = {
+            "user": [{"address": {"street1": "123 Fake St.", "city": "Springfield"}}]
+        }
+        assert generate_collections(api_data) == [
+            {
+                "name": "user",
+                "fields": [
+                    {
+                        "name": "address",
+                        "fidesops_meta": {"data_type": "object"},
+                        "fields": [
+                            {
+                                "name": "street1",
+                                "data_categories": ["system.operations"],
+                                "fidesops_meta": {"data_type": "string"},
+                            },
+                            {
+                                "name": "city",
+                                "data_categories": ["system.operations"],
+                                "fidesops_meta": {"data_type": "string"},
+                            },
+                        ],
+                    }
+                ],
+            }
+        ]
+
+    def test_integer_list(self):
+        api_data = {"user": [{"ids": [1]}]}
+        assert generate_collections(api_data) == [
+            {
+                "name": "user",
+                "fields": [
+                    {
+                        "name": "ids",
+                        "data_categories": ["system.operations"],
+                        "fidesops_meta": {"data_type": "integer[]"},
+                    }
+                ],
+            }
+        ]
+
+    def test_float_list(self):
+        api_data = {"user": [{"times": [2.0]}]}
+        assert generate_collections(api_data) == [
+            {
+                "name": "user",
+                "fields": [
+                    {
+                        "name": "times",
+                        "data_categories": ["system.operations"],
+                        "fidesops_meta": {"data_type": "float[]"},
+                    }
+                ],
+            }
+        ]
+
+    def test_string_list(self):
+        api_data = {"user": [{"names": ["first last"]}]}
+        assert generate_collections(api_data) == [
+            {
+                "name": "user",
+                "fields": [
+                    {
+                        "name": "names",
+                        "data_categories": ["system.operations"],
+                        "fidesops_meta": {"data_type": "string[]"},
+                    }
+                ],
+            }
+        ]
+
+    def test_object_list(self):
+        api_data = {
+            "user": [
+                {
+                    "bank_accounts": [
+                        {"bank_name": "Wells Fargo", "status": "active"},
+                        {"bank_name": "Schools First", "status": "active"},
+                    ]
+                }
+            ]
+        }
+        assert generate_collections(api_data) == [
+            {
+                "name": "user",
+                "fields": [
+                    {
+                        "name": "bank_accounts",
+                        "fidesops_meta": {"data_type": "object[]"},
+                        "fields": [
+                            {
+                                "name": "bank_name",
+                                "data_categories": ["system.operations"],
+                                "fidesops_meta": {"data_type": "string"},
+                            },
+                            {
+                                "name": "status",
+                                "data_categories": ["system.operations"],
+                                "fidesops_meta": {"data_type": "string"},
+                            },
+                        ],
+                    }
+                ],
+            }
+        ]
+
+
+class TestGenerateDataset:
+    def test_update_existing_name_only_field(self):
+        """
+        Ensures that an existing field with only a name is updated
+        with a default data_category and data_type
+        """
+
+        existing_dataset = {
+            "fides_key": "example",
+            "name": "Example Dataset",
+            "description": "An example dataset",
+            "collections": [
+                {
+                    "name": "user",
+                    "fields": [{"name": "a"}],
+                }
+            ],
+        }
+        api_data = {"user": [{"a": "property"}]}
+
+        generated_dataset = generate_dataset(existing_dataset, api_data)
+
+        assert generated_dataset == {
+            "fides_key": "example",
+            "name": "Example Dataset",
+            "description": "An example dataset",
+            "collections": [
+                {
+                    "name": "user",
+                    "fields": [
+                        {
+                            "name": "a",
+                            "data_categories": ["system.operations"],
+                            "fidesops_meta": {"data_type": "string"},
+                        }
+                    ],
+                }
+            ],
+        }
+
+    def test_update_existing_field_with_category(self):
+        """
+        Ensures that an existing field with an already defined data_category isn't overwritten
+        """
+
+        existing_dataset = {
+            "fides_key": "example",
+            "name": "Example Dataset",
+            "description": "An example dataset",
+            "collections": [
+                {
+                    "name": "user",
+                    "fields": [{"name": "a", "data_categories": ["user.provided"]}],
+                }
+            ],
+        }
+        api_data = {"user": [{"a": "property"}]}
+
+        assert generate_dataset(existing_dataset, api_data) == {
+            "fides_key": "example",
+            "name": "Example Dataset",
+            "description": "An example dataset",
+            "collections": [
+                {
+                    "name": "user",
+                    "fields": [
+                        {
+                            "name": "a",
+                            "data_categories": ["user.provided"],
+                            "fidesops_meta": {"data_type": "string"},
+                        }
+                    ],
+                }
+            ],
+        }
+
+    def test_update_existing_scalar_field_to_object_field(self):
+        """
+        Ensures that an existing scalar field's data type and data_category are updated
+        if the generated field is an object type
+        """
+
+        existing_dataset = {
+            "fides_key": "example",
+            "name": "Example Dataset",
+            "description": "An example dataset",
+            "collections": [
+                {
+                    "name": "user",
+                    "fields": [
+                        {
+                            "name": "a",
+                            "data_categories": ["user.provided"],
+                            "fidesops_meta": {"data_type": "string"},
+                        }
+                    ],
+                }
+            ],
+        }
+        api_data = {"user": [{"a": {"first": "A", "last": "B"}}]}
+
+        assert generate_dataset(existing_dataset, api_data) == {
+            "fides_key": "example",
+            "name": "Example Dataset",
+            "description": "An example dataset",
+            "collections": [
+                {
+                    "name": "user",
+                    "fields": [
+                        {
+                            "name": "a",
+                            "fidesops_meta": {"data_type": "object"},
+                            "fields": [
+                                {
+                                    "name": "first",
+                                    "data_categories": ["system.operations"],
+                                    "fidesops_meta": {"data_type": "string"},
+                                },
+                                {
+                                    "name": "last",
+                                    "data_categories": ["system.operations"],
+                                    "fidesops_meta": {"data_type": "string"},
+                                },
+                            ],
+                        }
+                    ],
+                }
+            ],
+        }
+
+    def test_keep_existing_object_field(self):
+        """
+        Ensures that an existing object field isn't overwritten if the field
+        from the API response is empty
+        """
+
+        existing_dataset = {
+            "fides_key": "example",
+            "name": "Example Dataset",
+            "description": "An example dataset",
+            "collections": [
+                {
+                    "name": "user",
+                    "fields": [
+                        {
+                            "name": "a",
+                            "fidesops_meta": {"data_type": "object"},
+                            "fields": [
+                                {
+                                    "name": "first",
+                                    "data_categories": ["system.operations"],
+                                    "fidesops_meta": {"data_type": "string"},
+                                },
+                                {
+                                    "name": "last",
+                                    "data_categories": ["system.operations"],
+                                    "fidesops_meta": {"data_type": "string"},
+                                },
+                                {
+                                    "name": "address",
+                                    "fidesops_meta": {"data_type": "object"},
+                                    "fields": [
+                                        {
+                                            "name": "city",
+                                            "data_categories": ["system.operations"],
+                                            "fidesops_meta": {"data_type": "string"},
+                                        },
+                                        {
+                                            "name": "state",
+                                            "data_categories": ["system.operations"],
+                                            "fidesops_meta": {"data_type": "string"},
+                                        },
+                                    ],
+                                },
+                            ],
+                        }
+                    ],
+                }
+            ],
+        }
+        api_data = {"user": [{"a": None}]}
+
+        assert generate_dataset(existing_dataset, api_data) == {
+            "fides_key": "example",
+            "name": "Example Dataset",
+            "description": "An example dataset",
+            "collections": [
+                {
+                    "name": "user",
+                    "fields": [
+                        {
+                            "name": "a",
+                            "fidesops_meta": {"data_type": "object"},
+                            "fields": [
+                                {
+                                    "name": "first",
+                                    "data_categories": ["system.operations"],
+                                    "fidesops_meta": {"data_type": "string"},
+                                },
+                                {
+                                    "name": "last",
+                                    "data_categories": ["system.operations"],
+                                    "fidesops_meta": {"data_type": "string"},
+                                },
+                                {
+                                    "name": "address",
+                                    "fidesops_meta": {"data_type": "object"},
+                                    "fields": [
+                                        {
+                                            "name": "city",
+                                            "data_categories": ["system.operations"],
+                                            "fidesops_meta": {"data_type": "string"},
+                                        },
+                                        {
+                                            "name": "state",
+                                            "data_categories": ["system.operations"],
+                                            "fidesops_meta": {"data_type": "string"},
+                                        },
+                                    ],
+                                },
+                            ],
+                        }
+                    ],
+                }
+            ],
+        }
+
+    def test_keep_existing_scalar_field(self):
+        """
+        Ensures that an existing scalar field isn't overwritten if the field
+        from the API response is empty
+        """
+
+        existing_dataset = {
+            "fides_key": "example",
+            "name": "Example Dataset",
+            "description": "An example dataset",
+            "collections": [
+                {
+                    "name": "user",
+                    "fields": [
+                        {
+                            "name": "a",
+                            "data_categories": ["user.provided"],
+                            "fidesops_meta": {"data_type": "string"},
+                        }
+                    ],
+                }
+            ],
+        }
+        api_data = {"user": [{"a": None}]}
+
+        assert generate_dataset(existing_dataset, api_data) == {
+            "fides_key": "example",
+            "name": "Example Dataset",
+            "description": "An example dataset",
+            "collections": [
+                {
+                    "name": "user",
+                    "fields": [
+                        {
+                            "name": "a",
+                            "data_categories": ["user.provided"],
+                            "fidesops_meta": {"data_type": "string"},
+                        }
+                    ],
+                }
+            ],
+        }
+
+    def test_missing_collection(self):
+        """
+        Ensures that an existing collection is preserved if the API data is empty for a collection
+        """
+
+        existing_dataset = {
+            "fides_key": "example",
+            "name": "Example Dataset",
+            "description": "An example dataset",
+            "collections": [
+                {
+                    "name": "user",
+                    "fields": [
+                        {
+                            "name": "a",
+                            "data_categories": ["user.provided"],
+                            "fidesops_meta": {"data_type": "string"},
+                        }
+                    ],
+                }
+            ],
+        }
+        api_data = {}
+
+        assert generate_dataset(existing_dataset, api_data) == {
+            "fides_key": "example",
+            "name": "Example Dataset",
+            "description": "An example dataset",
+            "collections": [
+                {
+                    "name": "user",
+                    "fields": [
+                        {
+                            "name": "a",
+                            "data_categories": ["user.provided"],
+                            "fidesops_meta": {"data_type": "string"},
+                        }
+                    ],
+                }
+            ],
+        }
+
+    def test_collection_order(self):
+        """
+        Ensures that the collection order matches the order in the existing dataset
+        """
+
+        existing_dataset = {
+            "fides_key": "example",
+            "name": "Example Dataset",
+            "description": "An example dataset",
+            "collections": [
+                {
+                    "name": "user",
+                    "fields": [
+                        {
+                            "name": "a",
+                            "data_categories": ["user.provided"],
+                            "fidesops_meta": {"data_type": "string"},
+                        }
+                    ],
+                },
+                {
+                    "name": "posts",
+                    "fields": [
+                        {
+                            "name": "b",
+                            "data_categories": ["user.provided"],
+                            "fidesops_meta": {"data_type": "string"},
+                        }
+                    ],
+                },
+            ],
+        }
+        api_data = {}
+
+        assert generate_dataset(existing_dataset, api_data) == {
+            "fides_key": "example",
+            "name": "Example Dataset",
+            "description": "An example dataset",
+            "collections": [
+                {
+                    "name": "user",
+                    "fields": [
+                        {
+                            "name": "a",
+                            "data_categories": ["user.provided"],
+                            "fidesops_meta": {"data_type": "string"},
+                        }
+                    ],
+                },
+                {
+                    "name": "posts",
+                    "fields": [
+                        {
+                            "name": "b",
+                            "data_categories": ["user.provided"],
+                            "fidesops_meta": {"data_type": "string"},
+                        }
+                    ],
+                },
+            ],
+        }
+
+    def test_collection_order_override(self):
+        """
+        Ensures that the collection order matches the provided order
+        """
+
+        existing_dataset = {
+            "fides_key": "example",
+            "name": "Example Dataset",
+            "description": "An example dataset",
+            "collections": [
+                {
+                    "name": "user",
+                    "fields": [
+                        {
+                            "name": "a",
+                            "data_categories": ["user.provided"],
+                            "fidesops_meta": {"data_type": "string"},
+                        }
+                    ],
+                },
+                {
+                    "name": "posts",
+                    "fields": [
+                        {
+                            "name": "b",
+                            "data_categories": ["user.provided"],
+                            "fidesops_meta": {"data_type": "string"},
+                        }
+                    ],
+                },
+            ],
+        }
+        api_data = {}
+
+        assert generate_dataset(existing_dataset, api_data, ["posts", "user"]) == {
+            "fides_key": "example",
+            "name": "Example Dataset",
+            "description": "An example dataset",
+            "collections": [
+                {
+                    "name": "posts",
+                    "fields": [
+                        {
+                            "name": "b",
+                            "data_categories": ["user.provided"],
+                            "fidesops_meta": {"data_type": "string"},
+                        }
+                    ],
+                },
+                {
+                    "name": "user",
+                    "fields": [
+                        {
+                            "name": "a",
+                            "data_categories": ["user.provided"],
+                            "fidesops_meta": {"data_type": "string"},
+                        }
+                    ],
+                },
+            ],
+        }
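The tests above only exercise generate_collections and generate_dataset directly; update_dataset is the entry point a connector test would call to regenerate a checked-in SaaS dataset file. A minimal sketch of that usage, assuming hypothetical fixture names, connector, and file name (the call shape follows the update_dataset signature in this patch):

# illustrative only, not part of the patch
from tests.test_helpers.dataset_utils import update_dataset

def test_regenerate_example_connector_dataset(connection_config, dataset_config, access_results):
    # access_results is expected to be keyed like graph results,
    # e.g. {"<fides_key>:member": [ {...row...} ]}; generate_dataset strips the
    # "<fides_key>:" prefix before matching collection names.
    update_dataset(
        connection_config,
        dataset_config,
        access_results,
        "example_connector_dataset.yml",  # written under data/saas/dataset/
    )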