diff --git a/README.md b/README.md index 0e2fa38a0..92cc7d174 100644 --- a/README.md +++ b/README.md @@ -55,35 +55,78 @@ information for Jane across tables in both the postgres and mongo databases. ```json { - "postgres_example_test_dataset:customer": [ - { - "email": "jane@example.com", - "name": "Jane Customer" - } - ], - "postgres_example_test_dataset:address": [ - { - "city": "Example Mountain", - "house": 1111, - "state": "TX", - "street": "Example Place", - "zip": "54321" - } - ], - "postgres_example_test_dataset:payment_card": [ - { - "ccn": 373719391, - "code": 222, - "name": "Example Card 3" - } - ], - "mongo_test:customer_details": [ - { - "gender": "female", - "birthday": "1990-02-28T00:00:00" - } - ] + "mongo_test:flights": [ + { + "passenger_information": { + "full_name": "Jane Customer" + } + } + ], + "mongo_test:payment_card": [ + { + "ccn": "987654321", + "name": "Example Card 2", + "code": "123" + } + ], + "postgres_example_test_dataset:address": [ + { + "zip": "54321", + "street": "Example Place", + "state": "TX", + "city": "Example Mountain", + "house": 1111 + } + ], + "mongo_test:customer_details": [ + { + "birthday": "1990-02-28T00:00:00", + "gender": "female", + "children": [ + "Erica Example" + ] + } + ], + "postgres_example_test_dataset:customer": [ + { + "email": "jane@example.com", + "name": "Jane Customer" + } + ], + "postgres_example_test_dataset:payment_card": [ + { + "ccn": 373719391, + "name": "Example Card 3", + "code": 222 + } + ], + "mongo_test:employee": [ + { + "email": "employee-2@example.com", + "name": "Jane Employee" + } + ], + "mongo_test:conversations": [ + { + "thread": [ + { + "chat_name": "Jane C" + } + ] + }, + { + "thread": [ + { + "chat_name": "Jane C" + }, + { + "chat_name": "Jane C" + } + ] + } + ] } + ``` ### Step Four: Create an Erasure Policy diff --git a/data/dataset/mongo_example_test_dataset.yml b/data/dataset/mongo_example_test_dataset.yml index 673619a36..983280208 100644 --- a/data/dataset/mongo_example_test_dataset.yml +++ b/data/dataset/mongo_example_test_dataset.yml @@ -25,6 +25,8 @@ dataset: fidesops_meta: data_type: string - name: workplace_info + fidesops_meta: + data_type: object fields: - name: employer fidesops_meta: @@ -33,8 +35,51 @@ dataset: data_categories: [ user.provided.identifiable.job_title ] fidesops_meta: data_type: string + - name: direct_reports + data_categories: [ user.provided.identifiable.name ] + fidesops_meta: + data_type: string[] + - name: emergency_contacts + fidesops_meta: + data_type: object[] + fields: + - name: name + data_categories: [ user.provided.identifiable.name ] + fidesops_meta: + data_type: string + - name: relationship + fidesops_meta: + data_type: string + - name: phone + data_categories: [ user.provided.identifiable.contact.phone_number ] + fidesops_meta: + data_type: string + - name: children + data_categories: [ user.provided.identifiable.childrens ] + fidesops_meta: + data_type: string[] + - name: travel_identifiers + fidesops_meta: + data_type: string[] + data_categories: [system.operations] + - name: comments + fidesops_meta: + data_type: object[] + fields: + - name: comment_id + fidesops_meta: + data_type: string + references: + - dataset: mongo_test + field: conversations.thread.comment + direction: to - name: internal_customer_profile fields: + - name: _id + data_categories: [ system.operations ] + fidesops_meta: + primary_key: True + data_type: object_id - name: customer_identifiers fields: - name: internal_id @@ -44,6 +89,11 @@ dataset: - dataset: mongo_test field: 
customer_feedback.customer_information.internal_customer_id direction: from + - name: derived_emails + data_categories: [user.derived] + fidesops_meta: + data_type: string[] + identity: email - name: derived_interests data_categories: [ user.derived ] fidesops_meta: @@ -81,3 +131,110 @@ dataset: data_categories: [ user.provided.nonidentifiable ] fidesops_meta: data_type: string + - name: flights + fields: + - name: _id + data_categories: [ system.operations ] + fidesops_meta: + primary_key: True + data_type: object_id + - name: passenger_information + fields: + - name: passenger_ids + fidesops_meta: + data_type: string[] + references: + - dataset: mongo_test + field: customer_details.travel_identifiers + direction: from + - name: full_name + data_categories: [user.provided.identifiable.name] + fidesops_meta: + data_type: string + - name: flight_no + - name: date + - name: pilots + data_categories: [ system.operations ] + fidesops_meta: + data_type: string[] + - name: plane + data_categories: [ system.operations ] + fidesops_meta: + data_type: integer + - name: conversations + fidesops_meta: + data_type: object[] + fields: + - name: thread + fields: + - name: comment + fidesops_meta: + data_type: string + - name: message + fidesops_meta: + data_type: string + - name: chat_name + data_categories: [ user.provided.identifiable.name ] + fidesops_meta: + data_type: string + - name: employee + fields: + - name: email + data_categories: [ user.provided.identifiable.contact.email ] + fidesops_meta: + identity: email + data_type: string + - name: id + data_categories: [ user.derived.identifiable.unique_id ] + fidesops_meta: + primary_key: True + references: + - dataset: mongo_test + field: flights.pilots + direction: from + - name: name + data_categories: [ user.provided.identifiable.name ] + fidesops_meta: + data_type: string + - name: aircraft + fields: + - name: _id + data_categories: [ system.operations ] + fidesops_meta: + primary_key: True + data_type: object_id + - name: planes + data_categories: [ system.operations ] + fidesops_meta: + data_type: string[] + references: + - dataset: mongo_test + field: flights.plane + direction: from + - name: model + data_categories: [ system.operations ] + fidesops_meta: + data_type: string + - name: payment_card + fields: + - name: billing_address_id + data_categories: [ system.operations ] + - name: ccn + data_categories: [ user.provided.identifiable.financial.account_number ] + fidesops_meta: + references: + - dataset: mongo_test + field: conversations.thread.ccn + direction: from + - name: code + data_categories: [ user.provided.identifiable.financial ] + - name: customer_id + data_categories: [ user.derived.identifiable.unique_id ] + - name: id + data_categories: [ system.operations ] + fidesops_meta: + primary_key: True + - name: name + data_categories: [ user.provided.identifiable.financial ] + - name: preferred + data_categories: [ user.provided.nonidentifiable ] diff --git a/data/nosql/mongo-init.js b/data/nosql/mongo-init.js index 4e8321496..9416109be 100644 --- a/data/nosql/mongo-init.js +++ b/data/nosql/mongo-init.js @@ -16,10 +16,18 @@ db.customer_details.insert([ "customer_id": 1, "gender": "male", "birthday": new ISODate("1988-01-10"), - "workplace_info": { + "workplace_info": { // Discovered object field "employer": "Mountain Baking Company", - "position": "Chief Strategist" - } + "position": "Chief Strategist", + "direct_reports": ["Robbie Margo", "Sully Hunter"] // Discovered nested array of scalars + }, + "emergency_contacts": [ // 
Discovered array of objects + {"name": "June Customer", "relationship": "mother", "phone": "444-444-4444"}, + {"name": "Josh Customer", "relationship": "brother", "phone": "111-111-111"}, + ], + "children": ["Christopher Customer", "Courtney Customer"], // Discovered array of scalars + "travel_identifiers": ["A111-11111", "B111-11111"], // References a nested array field, flights.passenger_information.passenger_ids + "comments": [{"comment_id": "com_0001"}, {"comment_id": "com_0003"}, {"comment_id": "com_0005"}] // Array of objects references a nested object field, mongo_test.conversations.thread.comment }, { "customer_id": 2, @@ -27,22 +35,32 @@ db.customer_details.insert([ "birthday": new ISODate("1985-03-05"), "workplace_info": { "employer": "Incline Software Company", - "position": "Software Engineer" - } + "position": "Software Engineer", + "direct_reports": ["Langdon Jeanne", "Dorothy Faron"] + }, + "emergency_contacts": [ + {"name": "Jesse Customer", "relationship": "spouse", "phone": "111-111-1111"}, + {"name": "Jonathan Customer", "relationship": "brother", "phone": "222-222-2222"} + ], + "children": ["Connie Customer"], + "travel_identifiers": ["C222-222222"] }, { "customer_id": 3, "gender": "female", - "birthday": new ISODate("1990-02-28") + "birthday": new ISODate("1990-02-28"), + "travel_identifiers": ["D111-11111"], + "children": ["Erica Example"], + "comments": [{"comment_id": "com_0002"}, {"comment_id": "com_0004"}, {"comment_id": "com_0006"}] } ]); db.customer_feedback.insert([ { "customer_information": { - "email": "customer-1@example.com", + "email": "customer-1@example.com", // Nested identity "phone": "333-333-3333", - "internal_customer_id": "cust_001" + "internal_customer_id": "cust_001" // References nested field internal_customer_profile.customer_identifiers.internal_id }, "rating": 3, "date": new ISODate("2022-01-05"), @@ -58,22 +76,82 @@ db.customer_feedback.insert([ "date": new ISODate("2022-01-10"), "message": "Customer service rep was very helpful and answered all my questions." 
} -]) +]); db.internal_customer_profile.insert([ { "customer_identifiers": { - "internal_id": "cust_001" + "internal_id": "cust_001" // Nested field referenced by another nested field (customer_information.internal_customer_id) }, - "derived_interests": ["marketing", "food"] + "derived_interests": ["marketing", "food"] // Discovered simple array }, { "customer_identifiers": { "internal_id": "cust_002" }, "derived_interests": ["programming", "hiking", "skateboarding"] + }, + { + "customer_identifiers": { + "internal_id": "cust_003", + "derived_emails": ["jane1@example.com", "jane@example.com"] // Identity within an array field + }, + "derived_interests": ["interior design", "travel", "photography"] + } +]); + +db.conversations.insert([ + { + "thread": [ + {"comment": "com_0001", "message": "hello, testing in-flight chat feature", "chat_name": "John C", "ccn": "123456789"}, // ccn points to mongo_test:payment_card + {"comment": "com_0002", "message": "yep, got your message, looks like it works", "chat_name": "Jane C", "ccn": "987654321"} + ] + }, + { + "thread": [ + {"comment": "com_0003", "message": "can I borrow your headphones?", "chat_name": "John C", "ccn": "123456789"}, + {"comment": "com_0004", "message": "no, sorry I'm using them.", "chat_name": "Jane C", "ccn": "987654321"}, + {"comment": "com_0005", "message": "did you bring anything to read?", "chat_name": "John C", "ccn": "123456789"}, + {"comment": "com_0006", "message": "try reading the informational brochure in the seat pouch.", "chat_name": "Jane C"} + ] + }, + { + "thread": [ + {"comment": "com_0007", "message": "Flight attendants, prepare for take-off please.", "chat_name": "Pilot 1"}, + {"comment": "com_0008", "message": "Airliner A, runway 12 cleared for takeoff", "chat_name": "ATC 2"}, + ] } -]) +]); + +db.flights.insert([ + { + "passenger_information": { + "passenger_ids": ["old_travel_number", "A111-11111"], // Array field referenced by another array field (customer_details.travel_identifiers) + "full_name": "John Customer" + }, + "flight_no": "AA230", + "date": "2021-01-01", + "pilots": ["1", "2"], // Array field referencing a scalar field mongo_test.employee.id + "plane": NumberInt(10002) // Scalar field referenced *by* an array field mongo_test.aircraft.planes + }, + { + "passenger_information": { + "passenger_ids": ["E111-11111", "D111-11111"], + "full_name": "Jane Customer" + }, + "flight_no": "AA240", + "date": "2021-02-01", + "pilots": ["2"], + "plane": NumberInt(30005) + } +]); + + +db.aircraft.insert([ + {"model": "Airbus A350", "planes": ["10001", "10002", "10003", "10004", "10005"]}, + {"model": "Boeing 747-8", "planes": ["30005", "30006", "30007"]} +]); + db.employee.insert([ { @@ -102,11 +180,11 @@ db.employee.insert([ }, "foreign_id": "000000000000000000000002" } -]) +]); db.customer.insert([ { - "id": "1", + "id": NumberInt(1), "email": "customer-1@example.com", "name": "John Customer", "created": Date("2020-04-01 11:47:42"), @@ -132,7 +210,7 @@ db.customer.insert([ ] }, { - "id": "2", + "id": NumberInt(2), "email": "customer-2@example.com", "name": "Jill Customer", "created": Date("2020-04-01 11:47:42"), @@ -153,7 +231,7 @@ db.customer.insert([ ] }, { - "id": "3", + "id": NumberInt(3), "email": "customer-3@example.com", "name": null, "address": null, @@ -166,7 +244,7 @@ db.customer.insert([ ] } -]) +]); @@ -177,7 +255,7 @@ db.payment_card.insert([ "ccn": "123456789", "code": "321", "preferred": true, - "customer_id": "1", + "customer_id": NumberInt(1), "billing_address": { "house": 123, "street": 
"Example Street", @@ -192,7 +270,7 @@ db.payment_card.insert([ "ccn": "987654321", "code": "123", "preferred": false, - "customer_id": "2", + "customer_id": NumberInt(2), "billing_address": { "house": 123, "street": "Example Street", @@ -201,19 +279,19 @@ db.payment_card.insert([ "zip": "12345" } } -]) +]); db.product.insert([ {"id": "1", "name": "Example Product 1", "price": 10}, {"id": "2", "name": "Example Product 2", "price": 20}, {"id": "3", "name": "Example Product 2", "price": 50} -]) +]); db.orders.insert([ { "id": "ord_aaa-aaa", - "customer_id": "1", + "customer_id": NumberInt(1), "shipping_address": { "house": 4, "street": "Example Lane", @@ -228,7 +306,7 @@ db.orders.insert([ }, { "id": "ord_bbb-bbb", - "customer_id": "2", + "customer_id": NumberInt(2), "shipping_address": { "house": 123, "street": "Example Street", @@ -243,7 +321,7 @@ db.orders.insert([ }, { "id": "ord_ccc-ccc", - "customer_id": "1", + "customer_id": NumberInt(1), "shipping_address": { "house": 123, "street": "Example Street", @@ -261,7 +339,7 @@ db.orders.insert([ }, { "id": "ord_ddd-ddd", - "customer_id": "1", + "customer_id": NumberInt(1), "shipping_address": { "house": 123, "street": "Example Street", @@ -275,20 +353,20 @@ db.orders.insert([ ] }, -]) +]); db.reports.insert([ {"email": "admin-account@example.com", "name": "Monthly Report", "year": 2021, "month": 8, "total_visits": 100}, {"email": "admin-account@example.com", "name": "Monthly Report", "year": 2021, "month": 9, "total_visits": 100}, {"email": "admin-account@example.com", "name": "Monthly Report", "year": 2021, "month": 10, "total_visits": 100}, {"email": "admin-account@example.com", "name": "Monthly Report", "year": 2021, "month": 11, "total_visits": 100} -]) +]); db.composite_pk_test.insert([ - {"id_a":1, "id_b":10, "description":"linked to customer 1", "customer_id":"1"}, - {"id_a":1, "id_b":11, "description":"linked to customer 2", "customer_id":"2"}, - {"id_a":2, "id_b":10, "description":"linked to customer 3", "customer_id":"3"} - ]) + {"id_a":1, "id_b":10, "description":"linked to customer 1", "customer_id": NumberInt(1)}, + {"id_a":1, "id_b":11, "description":"linked to customer 2", "customer_id": NumberInt(2)}, + {"id_a":2, "id_b":10, "description":"linked to customer 3", "customer_id": NumberInt(3)} +]); //values to support test by specific objectId search @@ -296,4 +374,4 @@ db.type_link_test.insert([ {"_id":ObjectId("000000000000000000000001"), "name":"v1", "key":1, "email":"test1@example.com"}, {"_id":ObjectId("000000000000000000000002"), "name":"v2", "key":2, "email":"test1@example.com"}, {"_id":ObjectId("000000000000000000000003"), "name":"v3", "key":3, "email":"test1@example.com"} -]) +]); diff --git a/docs/fidesops/docs/guides/complex_fields.md b/docs/fidesops/docs/guides/complex_fields.md new file mode 100644 index 000000000..62101634f --- /dev/null +++ b/docs/fidesops/docs/guides/complex_fields.md @@ -0,0 +1,279 @@ +# Complex Fields + +Fidesops can retrieve data from complex objects and arrays in MongoDB (*access* requests only; erasure support coming soon), although +this involves annotating your dataset files to let fidesops know about your complex data. + +In this section we'll cover: + +- How to annotate your dataset to describe object fields +- How to reference a nested object +- How to annotate array fields +- How to reference resources in arrays +- Assumptions made with array queries +- Example query traversal referencing complex fields + +## How do I annotate an object field? 
+To declare an `object` field, define nested fields underneath that field. You can optionally
+add the `data_type: object` annotation, but the object type will be inferred from the presence of the nested fields. In the example below,
+`workplace_info` is an object field with two nested fields: `employer` and `position`.
+
+Data categories cannot be specified at the `object` level due to potential conflicts with nested fields. Instead,
+annotate the scalar fields within the object field. Here, the `workplace_info.position` field has the `data_category`
+`user.provided.identifiable.job_title`.
+
+```yaml
+dataset:
+  - fides_key: mongo_nested_object_example
+    name: Mongo Example with Nested Objects
+    description: Example of a Mongo dataset that contains 'details' about customers defined in the 'postgres_example_test_dataset'
+    collections:
+      - name: customer_details
+        fields:
+          - ...
+          - name: workplace_info
+            fidesops_meta:
+              data_type: object
+            fields:
+              - name: employer
+                fidesops_meta:
+                  data_type: string
+              - name: position
+                data_categories: [ user.provided.identifiable.job_title ]
+                fidesops_meta:
+                  data_type: string
+              - name: id
+```
+
+## How do I reference a nested field?
+
+To define a relationship between a field on one collection and a nested field on another collection, use dot notation
+in the `fidesops_meta` references for as many levels as necessary.
+
+In the example below, we might add a separate `customer` collection that references the nested
+`workplace_info.id` field in the `customer_details` collection. Under references, this field is denoted
+by its dot-separated path: `customer_details.workplace_info.id`.
+
+If we preferred, we could instead define this relationship on the `customer_details.workplace_info.id` field itself,
+with a direction of `from`, a field of `mydatabase.customer.workplace_id`, and a dataset of `mydatabase`.
+
+```yaml
+dataset:
+  - fides_key: mydatabase
+    name: internal database
+    description: our internal database of customer data
+    collections:
+      - name: customer
+        fields:
+          - name: workplace_id
+            data_categories: [system.operations]
+            fidesops_meta:
+              references:
+                - dataset: mongo_nested_object_example
+                  field: customer_details.workplace_info.id
+                  direction: to
+    ...
+```
+
+## How do I denote an array field?
+
+There is no dedicated `array` type, since arrays can store scalar values or objects. Instead, an array is denoted
+by appending `[]` to the field's data type.
+
+### What if I have an array of scalar values?
+
+In this example, our `mydatabase:customer` collection has a `travel_identifiers` field that is an array of strings,
+described by `data_type: string[]`. An array of integers would be described by `data_type: integer[]`.
+
+```yaml
+dataset:
+  - fides_key: mydatabase
+    name: internal database
+    description: our internal database of customer data
+    collections:
+      - name: customer
+        fields:
+          - ...
+          - name: travel_identifiers
+            fidesops_meta:
+              data_type: string[]
+              data_categories: [system.operations]
+```
+
+### How do I describe a nested array?
+
+In this example, our `mydatabase:customer` collection has a nested `workplace_info.direct_reports` array of strings.
+In other words, we have a `workplace_info` object field with sub-fields `employer`, `position`, and `direct_reports`,
+where `direct_reports` is an array.
+
+We define `direct_reports` as a sub-field of `workplace_info` and give it the data type `string[]`:
+
+```yaml
+dataset:
+  - fides_key: mydatabase
+    name: internal database
+    description: our internal database of customer data
+    collections:
+      - name: customer
+        fields:
+          - name: workplace_info
+            fidesops_meta:
+              data_type: object
+            fields:
+              - name: employer
+                fidesops_meta:
+                  data_type: string
+              - name: position
+                data_categories: [ user.provided.identifiable.job_title ]
+                fidesops_meta:
+                  data_type: string
+              - name: direct_reports
+                data_categories: [ user.provided.identifiable.name ]
+                fidesops_meta:
+                  data_type: string[]
+```
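+
+For reference, the `customer_details` seed data in `data/nosql/mongo-init.js` (earlier in this diff) contains
+documents of exactly this shape, for example:
+
+```json
+{
+  "customer_id": 1,
+  "workplace_info": {
+    "employer": "Mountain Baking Company",
+    "position": "Chief Strategist",
+    "direct_reports": ["Robbie Margo", "Sully Hunter"]
+  }
+}
+```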
+
+### What if my collection contains an array of objects?
+
+In this example, our `mydatabase:customer` collection has an `emergency_contacts` field that is an array of objects, or
+embedded documents, denoted by `data_type: object[]`. Each object in the `emergency_contacts` array can contain a
+`name`, `relationship`, and `phone` field.
+
+```yaml
+dataset:
+  - fides_key: mydatabase
+    name: internal database
+    description: our internal database of customer data
+    collections:
+      - name: customer
+        fields:
+          - name: emergency_contacts
+            fidesops_meta:
+              data_type: object[]
+            fields:
+              - name: name
+                data_categories: [ user.provided.identifiable.name ]
+                fidesops_meta:
+                  data_type: string
+              - name: relationship
+                fidesops_meta:
+                  data_type: string
+              - name: phone
+                data_categories: [ user.provided.identifiable.contact.phone_number ]
+                fidesops_meta:
+                  data_type: string
+```
+
+### How do I reference an array field?
+
+In general, reference an array field as you would any other field. You cannot currently reference a specific index in an
+array field, but you can point a field to an array field, and fidesops will search for matches within that array.
+
+In this example, `mydatabase:flights.plane` is an integer field that is used to look up records that match an integer
+in the `mydatabase:aircraft.planes` array.
+
+```yaml
+dataset:
+  - fides_key: mydatabase
+    name: internal database
+    description: our internal database of customer data
+    collections:
+      - name: flights
+        fields:
+          - ...
+          - name: passenger_information
+            fields:
+              - name: passenger_ids
+                fidesops_meta:
+                  data_type: string[]
+          - name: plane
+            data_categories: [ system.operations ]
+            fidesops_meta:
+              data_type: integer
+      - name: aircraft
+        fields:
+          - name: _id
+            data_categories: [ system.operations ]
+            fidesops_meta:
+              primary_key: True
+              data_type: object_id
+          - name: planes
+            data_categories: [ system.operations ]
+            fidesops_meta:
+              data_type: integer[]
+              references:
+                - dataset: mydatabase
+                  field: flights.plane
+                  direction: from
+          - name: model
+            data_categories: [ system.operations ]
+            fidesops_meta:
+              data_type: string
+```
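+
+Under the hood, this relies on MongoDB's array-matching semantics: a filter on an array field matches a document
+if *any* element of the array matches. As a rough sketch of the kind of lookup this produces (the connection string
+and values below are hypothetical; fidesops builds the actual query internally):
+
+```python
+from pymongo import MongoClient
+
+client = MongoClient("mongodb://localhost:27017")  # hypothetical connection
+db = client["mydatabase"]
+
+# Values collected from the upstream flights.plane field
+plane_values = [10002, 30005]
+
+# Matches any aircraft document whose "planes" array contains
+# at least one of the supplied values
+for doc in db["aircraft"].find({"planes": {"$in": plane_values}}):
+    print(doc["model"], doc["planes"])
+```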
+
+In this more complicated example, a field in an array of objects is used to look up a different field in an array of
+objects in another collection. Potentially, multiple values from `mydatabase:customer.comments.comment_id` can be used
+to query for corresponding values in `mydatabase:conversations.thread.comment`. Because this field is in an array of
+objects, multiple matches may be found.
+
+```yaml
+dataset:
+  - fides_key: mydatabase
+    name: internal database
+    description: our internal database of customer data
+    collections:
+      - name: customer
+        fields:
+          - name: comments
+            fidesops_meta:
+              data_type: object[]
+            fields:
+              - name: comment_id
+                fidesops_meta:
+                  data_type: string
+                  references:
+                    - dataset: mydatabase
+                      field: conversations.thread.comment
+                      direction: to
+      - name: conversations
+        fidesops_meta:
+          data_type: object[]
+        fields:
+          - name: thread
+            fields:
+              - name: comment
+                fidesops_meta:
+                  data_type: string
+              - name: message
+                fidesops_meta:
+                  data_type: string
+              - name: chat_name
+                data_categories: [ user.provided.identifiable.name ]
+                fidesops_meta:
+                  data_type: string
+```
+
+## How does array querying work?
+
+Array querying makes some assumptions that may or may not fit how your data is structured. If an array is an entry
+point into a collection (in other words, another collection references its array field), there is ambiguity around how
+the queries should be built: for example, AND versus OR, and whether only the matched indices or the matched embedded
+documents within arrays should be considered.
+
+### Assumptions
+
+1. If an array is the entry point into a node, we will search for corresponding matches across the entire array. You cannot specify a certain index.
+2. Everything is effectively an "OR" query. Data returned from multiple array fields is flattened before being passed into the next collection.
+    1. For example, say Collection A returned the values [1, 2, 3] and Collection B returned the values [4, 5, 6]. Collection C has an array field that depends on both Collection A and Collection B. We search Collection C's array field and return any record whose array contains one of the values [1, 2, 3, 4, 5, 6].
+3. If an array field is an entry point to a node, only the matching indices in that array are considered, both for access requests (and, eventually, erasures) and for subsequent queries on dependent collections where applicable.
+    1. For example, if a query on Collection A matched only indices 0 and 1 in an array, only the data located at indices 0 and 1 is used to query data on a dependent collection C.
+
+### Can I see a more detailed example of a query traversal with complex objects?
+
+This is an example traversal created from our test `postgres_example` and `mongo_test` datasets.
+Multiple collections point to or from complex objects and arrays. See `mongo_example_test_dataset.yml` for more information.
+
+![Postgres and Mongo Query Traversal](../img/mongo_and_postgres_complex.png)
\ No newline at end of file
diff --git a/docs/fidesops/docs/guides/datasets.md b/docs/fidesops/docs/guides/datasets.md
index fbddd2119..83946d6df 100644
--- a/docs/fidesops/docs/guides/datasets.md
+++ b/docs/fidesops/docs/guides/datasets.md
@@ -108,82 +108,4 @@ dataset:
 - `primary_key` (_Optional_): A boolean value that means that Fidesops will treat this field as a unique row identifier for generating update statements. If multiple fields are marked as primary keys the combination of their values will be treated as a combined key. In SQL terms, we'd issue a query that looked like `SELECT ... FROM TABLE WHERE primary_key_name_1 = value1 AND primary_key_name_2 = value2`. If no primary key is specified for any field on a collection, no updates will be generated against that collection.
- `data_type` (_Optional - Required only when processing erasure requests for masking strategies other than Null rewrite_): An indication of type of data held by this field. Data types are used to convert values to the appropriate type when those values are used in queries. Data types are also used to generate the appropriate masked value when running erasures, since Fidesops needs to know the type of data expected by the field in order to generate an appropriate masked value. Available datatypes are `string`, `integer`, `float`, `boolean`, `object_id`. `object` types are also supported for MongoDB. - `length` (_Optional_): An indicator of field length. - -#### Object fields -To declare an `object` field, you should define nested fields underneath that field. You can optionally -add the `data_type: object` annotation, but the object type will be inferred by the presence of the nested fields. In the example below, -`workplace_info` is an object field with two nested fields: `employer` and `position`. - -Data categories cannot be specified at the `object` level due to potential conflicts with nested fields. Instead, -annotate the scalar fields within the object field. Here, the `workplace_info.position` field has `data_category` -`user.provided.identifiable.job_title`. - - -```yaml -dataset: - - fides_key: mongo_nested_object_example - name: Mongo Example with Nested Objects - description: Example of a Mongo dataset that contains 'details' about customers defined in the 'postgres_example_test_dataset' - collections: - - name: customer_details - fields: - - name: _id - data_categories: [system.operations] - fidesops_meta: - primary_key: True - - name: customer_id - data_categories: [user.derived.identifiable.unique_id] - fidesops_meta: - references: - - dataset: postgres_example_test_dataset - field: customer.id - direction: from - - name: workplace_info - fidesops_meta: - data_type: object - fields: - - name: employer - fidesops_meta: - data_type: string - - name: position - data_categories: [ user.provided.identifiable.job_title ] - fidesops_meta: - data_type: string - - name: id -``` - -##### Referencing a nested field - -To define a relationship between a field on one collection and a nested field on another collection, use dot notation -in the `fidesops_meta` references for as many levels are necessary. - -In the example below, we might add another column to our `customer` collection that references -the nested field `workplace_info.id` field in the `customer_details` collection. -Under references, this field is denoted by `..` name, or -`customer_details.workplace_info.id`. - -If we preferred, we could instead define this relationship on the `customer_details.workplace_info.id` field itself, -with a direction of `from`, with field `mydatabase.customer.workplace_id`, and dataset `mydatabase`. - -``` -dataset: - - fides_key: mydatabase - name: internal database - description: our internal database of customer data - collections: - - name: customer - fields: - - name: workplace_id - data_categories: [system.operations] - fidesops_meta: - references: - - dataset: mongo_nested_object_example - field: customer_details.workplace_info.id - direction: to - ... - -``` - -Note that we currently support access requests on object fields in MongoDB only. Support for -nested erasures, as well as support for array fields is underway. 
\ No newline at end of file diff --git a/docs/fidesops/docs/img/mongo_and_postgres_complex.png b/docs/fidesops/docs/img/mongo_and_postgres_complex.png new file mode 100644 index 000000000..3dfcb4d65 Binary files /dev/null and b/docs/fidesops/docs/img/mongo_and_postgres_complex.png differ diff --git a/docs/fidesops/docs/postman/Fidesops.postman_collection.json b/docs/fidesops/docs/postman/Fidesops.postman_collection.json index 191ed4081..0de9ee44a 100644 --- a/docs/fidesops/docs/postman/Fidesops.postman_collection.json +++ b/docs/fidesops/docs/postman/Fidesops.postman_collection.json @@ -1,6 +1,6 @@ { "info": { - "_postman_id": "a7ecd1c1-4126-4b6a-b8f1-c0f83878f8e0", + "_postman_id": "7b6bc89c-35fe-4361-bd02-ef34653caef6", "name": "Fidesops", "schema": "https://schema.getpostman.com/json/collection/v2.1.0/collection.json" }, @@ -316,7 +316,7 @@ "header": [], "body": { "mode": "raw", - "raw": "[\n {\"name\": \"Application PostgreSQL DB\",\n \"key\": \"{{postgres_key}}\",\n \"connection_type\": \"postgres\",\n \"access\": \"read\"\n}]", + "raw": "[\n {\"name\": \"Application PostgreSQL DB\",\n \"key\": \"{{postgres_key}}\",\n \"connection_type\": \"postgres\",\n \"access\": \"write\"\n}]", "options": { "raw": { "language": "json" @@ -353,7 +353,7 @@ "header": [], "body": { "mode": "raw", - "raw": "[\n {\n \"name\": \"My Mongo DB\",\n \"key\": \"{{mongo_key}}\",\n \"connection_type\": \"mongodb\",\n \"access\": \"read\"\n }\n]", + "raw": "[\n {\n \"name\": \"My Mongo DB\",\n \"key\": \"{{mongo_key}}\",\n \"connection_type\": \"mongodb\",\n \"access\": \"write\"\n }\n]", "options": { "raw": { "language": "json" @@ -504,7 +504,7 @@ "header": [], "body": { "mode": "raw", - "raw": "[\n {\n \"fides_key\": \"mongo_test\",\n \"name\": \"Mongo Example Test Dataset\",\n \"description\": \"Example of a Mongo dataset\",\n \"collections\": [\n {\n \"name\": \"internal_customer_profile\",\n \"fields\": [\n {\n \"name\": \"customer_identifiers\",\n \"fields\": [\n {\n \"name\": \"internal_id\",\n \"fidesops_meta\": {\n \"data_type\": \"string\",\n \"references\": [\n {\n \"dataset\": \"mongo_test\",\n \"field\": \"customer_feedback.customer_information.internal_customer_id\",\n \"direction\": \"from\"\n }\n ]\n }\n }\n ]\n },\n {\n \"name\": \"derived_interests\",\n \"data_categories\": [\n \"user.derived\"\n ],\n \"fidesops_meta\": {\n \"data_type\": \"string[]\"\n }\n }\n ]\n },\n {\n \"name\": \"customer_feedback\",\n \"fields\": [\n {\n \"name\": \"_id\",\n \"data_categories\": [\n \"system.operations\"\n ],\n \"fidesops_meta\": {\n \"primary_key\": true\n }\n },\n {\n \"name\": \"customer_information\",\n \"fields\": [\n {\n \"name\": \"email\",\n \"fidesops_meta\": {\n \"data_type\": \"string\",\n \"identity\": \"email\"\n }\n },\n {\n \"name\": \"phone\",\n \"data_categories\": [\n \"user.provided.identifiable.contact.phone_number\"\n ],\n \"fidesops_meta\": {\n \"data_type\": \"string\"\n }\n }\n ]\n },\n {\n \"name\": \"rating\",\n \"data_categories\": [\n \"user.provided.nonidentifiable\"\n ],\n \"fidesops_meta\": {\n \"data_type\": \"integer\"\n }\n },\n {\n \"name\": \"date\",\n \"fidesops_meta\": {\n \"data_type\": \"string\"\n }\n },\n {\n \"name\": \"message\",\n \"data_categories\": [\n \"user.provided.nonidentifiable\"\n ],\n \"fidesops_meta\": {\n \"data_type\": \"string\"\n }\n }\n ]\n },\n {\n \"name\": \"customer_details\",\n \"fields\": [\n {\n \"name\": \"_id\",\n \"data_categories\": [\n \"system.operations\"\n ],\n \"fidesops_meta\": {\n \"primary_key\": true\n }\n },\n {\n 
\"name\": \"customer_id\",\n \"data_categories\": [\n \"user.derived.identifiable.unique_id\"\n ],\n \"fidesops_meta\": {\n \"references\": [\n {\n \"dataset\": \"postgres_example\",\n \"field\": \"customer.id\",\n \"direction\": \"from\"\n }\n ]\n }\n },\n {\n \"name\": \"gender\",\n \"data_categories\": [\n \"user.provided.identifiable.gender\"\n ]\n },\n {\n \"name\": \"birthday\",\n \"data_categories\": [\n \"user.provided.identifiable.date_of_birth\"\n ]\n },\n {\n \"name\": \"workplace_info\",\n \"fields\": [\n {\n \"name\": \"employer\",\n \"fidesops_meta\": {\n \"data_type\": \"string\"\n }\n },\n {\n \"name\": \"position\",\n \"data_categories\": [\n \"user.provided.identifiable.job_title\"\n ],\n \"fidesops_meta\": {\n \"data_type\": \"string\"\n }\n }\n ]\n }\n ]\n }\n ]\n }\n]", + "raw": "[\n {\n \"fides_key\":\"mongo_test\",\n \"name\":\"Mongo Example Test Dataset\",\n \"description\":\"Example of a Mongo dataset that contains 'details' about customers defined in the 'postgres_example'\",\n \"collections\":[\n {\n \"name\":\"customer_details\",\n \"fields\":[\n {\n \"name\":\"_id\",\n \"data_categories\":[\n \"system.operations\"\n ],\n \"fidesops_meta\":{\n \"primary_key\":true\n }\n },\n {\n \"name\":\"customer_id\",\n \"data_categories\":[\n \"user.derived.identifiable.unique_id\"\n ],\n \"fidesops_meta\":{\n \"references\":[\n {\n \"dataset\":\"postgres_example\",\n \"field\":\"customer.id\",\n \"direction\":\"from\"\n }\n ]\n }\n },\n {\n \"name\":\"gender\",\n \"data_categories\":[\n \"user.provided.identifiable.gender\"\n ],\n \"fidesops_meta\":{\n \"data_type\":\"string\"\n }\n },\n {\n \"name\":\"birthday\",\n \"data_categories\":[\n \"user.provided.identifiable.date_of_birth\"\n ],\n \"fidesops_meta\":{\n \"data_type\":\"string\"\n }\n },\n {\n \"name\":\"workplace_info\",\n \"fidesops_meta\":{\n \"data_type\":\"object\"\n },\n \"fields\":[\n {\n \"name\":\"employer\",\n \"fidesops_meta\":{\n \"data_type\":\"string\"\n }\n },\n {\n \"name\":\"position\",\n \"data_categories\":[\n \"user.provided.identifiable.job_title\"\n ],\n \"fidesops_meta\":{\n \"data_type\":\"string\"\n }\n },\n {\n \"name\":\"direct_reports\",\n \"data_categories\":[\n \"user.provided.identifiable.name\"\n ],\n \"fidesops_meta\":{\n \"data_type\":\"string[]\"\n }\n }\n ]\n },\n {\n \"name\":\"emergency_contacts\",\n \"fidesops_meta\":{\n \"data_type\":\"object[]\"\n },\n \"fields\":[\n {\n \"name\":\"name\",\n \"data_categories\":[\n \"user.provided.identifiable.name\"\n ],\n \"fidesops_meta\":{\n \"data_type\":\"string\"\n }\n },\n {\n \"name\":\"relationship\",\n \"fidesops_meta\":{\n \"data_type\":\"string\"\n }\n },\n {\n \"name\":\"phone\",\n \"data_categories\":[\n \"user.provided.identifiable.contact.phone_number\"\n ],\n \"fidesops_meta\":{\n \"data_type\":\"string\"\n }\n }\n ]\n },\n {\n \"name\":\"children\",\n \"data_categories\":[\n \"user.provided.identifiable.childrens\"\n ],\n \"fidesops_meta\":{\n \"data_type\":\"string[]\"\n }\n },\n {\n \"name\":\"travel_identifiers\",\n \"fidesops_meta\":{\n \"data_type\":\"string[]\",\n \"data_categories\":[\n \"system.operations\"\n ]\n }\n },\n {\n \"name\":\"comments\",\n \"fidesops_meta\":{\n \"data_type\":\"object[]\"\n },\n \"fields\":[\n {\n \"name\":\"comment_id\",\n \"fidesops_meta\":{\n \"data_type\":\"string\",\n \"references\":[\n {\n \"dataset\":\"mongo_test\",\n \"field\":\"conversations.thread.comment\",\n \"direction\":\"to\"\n }\n ]\n }\n }\n ]\n }\n ]\n },\n {\n \"name\":\"internal_customer_profile\",\n \"fields\":[\n {\n 
\"name\":\"_id\",\n \"data_categories\":[\n \"system.operations\"\n ],\n \"fidesops_meta\":{\n \"primary_key\":true,\n \"data_type\":\"object_id\"\n }\n },\n {\n \"name\":\"customer_identifiers\",\n \"fields\":[\n {\n \"name\":\"internal_id\",\n \"fidesops_meta\":{\n \"data_type\":\"string\",\n \"references\":[\n {\n \"dataset\":\"mongo_test\",\n \"field\":\"customer_feedback.customer_information.internal_customer_id\",\n \"direction\":\"from\"\n }\n ]\n }\n },\n {\n \"name\":\"derived_emails\",\n \"data_categories\":[\n \"user.derived\"\n ],\n \"fidesops_meta\":{\n \"data_type\":\"string[]\",\n \"identity\":\"email\"\n }\n }\n ]\n },\n {\n \"name\":\"derived_interests\",\n \"data_categories\":[\n \"user.derived\"\n ],\n \"fidesops_meta\":{\n \"data_type\":\"string[]\"\n }\n }\n ]\n },\n {\n \"name\":\"customer_feedback\",\n \"fields\":[\n {\n \"name\":\"_id\",\n \"data_categories\":[\n \"system.operations\"\n ],\n \"fidesops_meta\":{\n \"primary_key\":true,\n \"data_type\":\"object_id\"\n }\n },\n {\n \"name\":\"customer_information\",\n \"fields\":[\n {\n \"name\":\"email\",\n \"fidesops_meta\":{\n \"identity\":\"email\",\n \"data_type\":\"string\"\n }\n },\n {\n \"name\":\"phone\",\n \"data_categories\":[\n \"user.provided.identifiable.contact.phone_number\"\n ],\n \"fidesops_meta\":{\n \"data_type\":\"string\"\n }\n },\n {\n \"name\":\"internal_customer_id\",\n \"data_categories\":[\n \"system.operations\"\n ],\n \"fidesops_meta\":{\n \"data_type\":\"string\"\n }\n }\n ]\n },\n {\n \"name\":\"rating\",\n \"data_categories\":[\n \"user.provided.nonidentifiable\"\n ],\n \"fidesops_meta\":{\n \"data_type\":\"integer\"\n }\n },\n {\n \"name\":\"date\",\n \"data_categories\":[\n \"system.operations\"\n ],\n \"fidesops_meta\":{\n \"data_type\":\"string\"\n }\n },\n {\n \"name\":\"message\",\n \"data_categories\":[\n \"user.provided.nonidentifiable\"\n ],\n \"fidesops_meta\":{\n \"data_type\":\"string\"\n }\n }\n ]\n },\n {\n \"name\":\"flights\",\n \"fields\":[\n {\n \"name\":\"_id\",\n \"data_categories\":[\n \"system.operations\"\n ],\n \"fidesops_meta\":{\n \"primary_key\":true,\n \"data_type\":\"object_id\"\n }\n },\n {\n \"name\":\"passenger_information\",\n \"fields\":[\n {\n \"name\":\"passenger_ids\",\n \"fidesops_meta\":{\n \"data_type\":\"string[]\",\n \"references\":[\n {\n \"dataset\":\"mongo_test\",\n \"field\":\"customer_details.travel_identifiers\",\n \"direction\":\"from\"\n }\n ]\n }\n },\n {\n \"name\":\"full_name\",\n \"data_categories\":[\n \"user.provided.identifiable.name\"\n ],\n \"fidesops_meta\":{\n \"data_type\":\"string\"\n }\n }\n ]\n },\n {\n \"name\":\"flight_no\"\n },\n {\n \"name\":\"date\"\n },\n {\n \"name\":\"pilots\",\n \"data_categories\":[\n \"system.operations\"\n ],\n \"fidesops_meta\":{\n \"data_type\":\"string[]\"\n }\n },\n {\n \"name\":\"plane\",\n \"data_categories\":[\n \"system.operations\"\n ],\n \"fidesops_meta\":{\n \"data_type\":\"integer\"\n }\n }\n ]\n },\n {\n \"name\":\"conversations\",\n \"fidesops_meta\":{\n \"data_type\":\"object[]\"\n },\n \"fields\":[\n {\n \"name\":\"thread\",\n \"fields\":[\n {\n \"name\":\"comment\",\n \"fidesops_meta\":{\n \"data_type\":\"string\"\n }\n },\n {\n \"name\":\"message\",\n \"fidesops_meta\":{\n \"data_type\":\"string\"\n }\n },\n {\n \"name\":\"chat_name\",\n \"data_categories\":[\n \"user.provided.identifiable.name\"\n ],\n \"fidesops_meta\":{\n \"data_type\":\"string\"\n }\n }\n ]\n }\n ]\n },\n {\n \"name\":\"employee\",\n \"fields\":[\n {\n \"name\":\"email\",\n \"data_categories\":[\n 
\"user.provided.identifiable.contact.email\"\n ],\n \"fidesops_meta\":{\n \"identity\":\"email\",\n \"data_type\":\"string\"\n }\n },\n {\n \"name\":\"id\",\n \"data_categories\":[\n \"user.derived.identifiable.unique_id\"\n ],\n \"fidesops_meta\":{\n \"primary_key\":true,\n \"references\":[\n {\n \"dataset\":\"mongo_test\",\n \"field\":\"flights.pilots\",\n \"direction\":\"from\"\n }\n ]\n }\n },\n {\n \"name\":\"name\",\n \"data_categories\":[\n \"user.provided.identifiable.name\"\n ],\n \"fidesops_meta\":{\n \"data_type\":\"string\"\n }\n }\n ]\n },\n {\n \"name\":\"aircraft\",\n \"fields\":[\n {\n \"name\":\"_id\",\n \"data_categories\":[\n \"system.operations\"\n ],\n \"fidesops_meta\":{\n \"primary_key\":true,\n \"data_type\":\"object_id\"\n }\n },\n {\n \"name\":\"planes\",\n \"data_categories\":[\n \"system.operations\"\n ],\n \"fidesops_meta\":{\n \"data_type\":\"integer[]\",\n \"references\":[\n {\n \"dataset\":\"mongo_test\",\n \"field\":\"flights.plane\",\n \"direction\":\"from\"\n }\n ]\n }\n },\n {\n \"name\":\"model\",\n \"data_categories\":[\n \"system.operations\"\n ],\n \"fidesops_meta\":{\n \"data_type\":\"string\"\n }\n }\n ]\n },\n {\n \"name\":\"payment_card\",\n \"fields\":[\n {\n \"name\":\"billing_address_id\",\n \"data_categories\":[\n \"system.operations\"\n ]\n },\n {\n \"name\":\"ccn\",\n \"data_categories\":[\n \"user.provided.identifiable.financial.account_number\"\n ],\n \"fidesops_meta\":{\n \"references\":[\n {\n \"dataset\":\"mongo_test\",\n \"field\":\"conversations.thread.ccn\",\n \"direction\":\"from\"\n }\n ]\n }\n },\n {\n \"name\":\"code\",\n \"data_categories\":[\n \"user.provided.identifiable.financial\"\n ]\n },\n {\n \"name\":\"customer_id\",\n \"data_categories\":[\n \"user.derived.identifiable.unique_id\"\n ]\n },\n {\n \"name\":\"id\",\n \"data_categories\":[\n \"system.operations\"\n ],\n \"fidesops_meta\":{\n \"primary_key\":true\n }\n },\n {\n \"name\":\"name\",\n \"data_categories\":[\n \"user.provided.identifiable.financial\"\n ]\n },\n {\n \"name\":\"preferred\",\n \"data_categories\":[\n \"user.provided.nonidentifiable\"\n ]\n }\n ]\n }\n ]\n}\n]", "options": { "raw": { "language": "json" @@ -705,7 +705,7 @@ "header": [], "body": { "mode": "raw", - "raw": "[\n {\n \"requested_at\": \"2021-08-30T16:09:37.359Z\",\n \"identity\": {\"email\": \"customer-2@example.com\"},\n \"policy_key\": \"{{separate_policy_key}}\"\n }\n]", + "raw": "[\n {\n \"requested_at\": \"2021-08-30T16:09:37.359Z\",\n \"identity\": {\"email\": \"customer-1@example.com\"},\n \"policy_key\": \"{{separate_policy_key}}\"\n }\n]", "options": { "raw": { "language": "json" @@ -2192,8 +2192,7 @@ }, { "key": "mysql_key", - "value": "app_mysql_db", - "type": "string" + "value": "app_mysql_db" }, { "key": "mongo_key", diff --git a/docs/fidesops/mkdocs.yml b/docs/fidesops/mkdocs.yml index a0a0921d2..a083ef62f 100644 --- a/docs/fidesops/mkdocs.yml +++ b/docs/fidesops/mkdocs.yml @@ -17,6 +17,7 @@ nav: - Authenticate with OAuth: guides/oauth.md - Connect SQL and NoSQL Databases: guides/database_connectors.md - Configure Datasets: guides/datasets.md + - Annotate Complex Fields: guides/complex_fields.md - Preview Query Execution: guides/query_execution.md - Create Request Policies: guides/policies.md - Configure Data Masking Strategies: guides/masking_strategies.md diff --git a/src/fidesops/graph/traversal.py b/src/fidesops/graph/traversal.py index 685457e45..f52532ee8 100644 --- a/src/fidesops/graph/traversal.py +++ b/src/fidesops/graph/traversal.py @@ -13,6 +13,7 @@ Collection, 
ROOT_COLLECTION_ADDRESS, FieldPath, + Field, ) from fidesops.graph.graph import Node, Edge, DatasetGraph from fidesops.util.logger import NotPii @@ -86,6 +87,32 @@ def outgoing_edges(self) -> Set[Edge]: for _, self_field_path, child_field_path in tuples } + @property + def query_field_paths(self) -> Set[FieldPath]: + """ + All of the possible field paths that we can query for possible filter values. + These are field paths that are the ends of incoming edges. + """ + return {edge.f2.field_path for edge in self.incoming_edges()} + + def typed_filtered_values(self, input_data: Dict[str, List[Any]]) -> Dict[str, Any]: + """ + Return a filtered list of key/value sets of data items that are both in + the list of incoming edge fields, and contain data in the input data set. + + The values are cast based on field types, if those types are specified. + """ + out = {} + for key, values in input_data.items(): + path: FieldPath = FieldPath.parse(key) + field: Field = self.node.collection.field(path) + if field and path in self.query_field_paths and isinstance(values, list): + cast_values = [field.cast(v) for v in values] + filtered = list(filter(lambda x: x is not None, cast_values)) + if filtered: + out[key] = filtered + return out + def can_run_given(self, remaining_node_keys: Set[CollectionAddress]) -> bool: """True if finished_node_keys covers all the nodes that this traversal_node is waiting for. If all nodes this traversal_node is waiting for have finished, it's ok for this traversal_node to run. diff --git a/src/fidesops/service/connectors/mongodb_connector.py b/src/fidesops/service/connectors/mongodb_connector.py index 553e9ccef..44a033d0e 100644 --- a/src/fidesops/service/connectors/mongodb_connector.py +++ b/src/fidesops/service/connectors/mongodb_connector.py @@ -104,7 +104,7 @@ def retrieve_data( logger.info(f"Starting data retrieval for {node.address}") for row in collection.find(query_data, fields): rows.append(row) - logger.info(f"Found {len(rows)} on {node.address}") + logger.info(f"Found {len(rows)} rows on {node.address}") return rows def mask_data( diff --git a/src/fidesops/service/connectors/query_config.py b/src/fidesops/service/connectors/query_config.py index 44b7ab163..c4c52ab63 100644 --- a/src/fidesops/service/connectors/query_config.py +++ b/src/fidesops/service/connectors/query_config.py @@ -1,7 +1,7 @@ import logging import re from abc import ABC, abstractmethod -from typing import Dict, Any, List, Set, Optional, Generic, TypeVar, Tuple +from typing import Dict, Any, List, Optional, Generic, TypeVar, Tuple from sqlalchemy import text from sqlalchemy.sql.elements import TextClause @@ -80,32 +80,6 @@ def primary_key_field_paths(self) -> Dict[FieldPath, Field]: if field.primary_key } - @property - def query_field_paths(self) -> Set[FieldPath]: - """ - All of the possible field paths that we can query for possible filter values. - These are field paths that are the ends of incoming edges. - """ - return {edge.f2.field_path for edge in self.node.incoming_edges()} - - def typed_filtered_values(self, input_data: Dict[str, List[Any]]) -> Dict[str, Any]: - """ - Return a filtered list of key/value sets of data items that are both in - the list of incoming edge fields, and contain data in the input data set. - - The values are cast based on field types, if those types are specified. 
- """ - out = {} - for key, values in input_data.items(): - path: FieldPath = FieldPath.parse(key) - field: Field = self.node.node.collection.field(path) - if field and path in self.query_field_paths and isinstance(values, list): - cast_values = [field.cast(v) for v in values] - filtered = list(filter(lambda x: x is not None, cast_values)) - if filtered: - out[key] = filtered - return out - def query_sources(self) -> Dict[str, List[CollectionAddress]]: """Display the input collection(s) for each query key for display purposes. @@ -179,7 +153,9 @@ def update_value_map( masking_override, null_masking, strategy ): logger.warning( - f"Unable to generate a query for field {rule_field_path.string_path}: data_type is either not present on the field or not supported for the {strategy_config['strategy']} masking strategy. Received data type: {masking_override.data_type_converter.name}" + f"Unable to generate a query for field {rule_field_path.string_path}: data_type is either not " + f"present on the field or not supported for the {strategy_config['strategy']} masking " + f"strategy. Received data type: {masking_override.data_type_converter.name}" ) continue val: Any = rule_field_path.retrieve_from(row) @@ -303,7 +279,7 @@ def generate_query( policy: Optional[Policy] = None, ) -> Optional[TextClause]: """Generate a retrieval query""" - filtered_data: Dict[str, Any] = self.typed_filtered_values(input_data) + filtered_data: Dict[str, Any] = self.node.typed_filtered_values(input_data) if filtered_data: clauses = [] @@ -420,7 +396,7 @@ def generate_query( # pylint: disable=R0914 ) -> Optional[TextClause]: """Generate a retrieval query""" - filtered_data = self.typed_filtered_values(input_data) + filtered_data = self.node.typed_filtered_values(input_data) if filtered_data: clauses = [] @@ -549,7 +525,7 @@ def transform_query_pairs(pairs: Dict[str, Any]) -> Dict[str, Any]: return {"$or": [dict([(k, v)]) for k, v in pairs.items()]} if input_data: - filtered_data: Dict[str, Any] = self.typed_filtered_values(input_data) + filtered_data: Dict[str, Any] = self.node.typed_filtered_values(input_data) if filtered_data: query_pairs = {} for string_field_path, data in filtered_data.items(): diff --git a/src/fidesops/task/consolidate_query_matches.py b/src/fidesops/task/consolidate_query_matches.py new file mode 100644 index 000000000..e789d6811 --- /dev/null +++ b/src/fidesops/task/consolidate_query_matches.py @@ -0,0 +1,40 @@ +from typing import Optional, List, Any, Dict + +from fidesops.graph.config import FieldPath + + +def consolidate_query_matches( + row: Dict[str, Any], + target_path: FieldPath, + flattened_matches: Optional[List] = None, +) -> List[Any]: + """ + Recursively consolidates values along from the target_path into a single array. + + A target_path can point to a single scalar value, an array, or even multiple values within arrays of embedded + documents. We consolidate all values into a single flattened list, that are used in subsequent queries + to locate records in another collection. + + :param row: Retrieved record from dataset + :param target_path: FieldPath to applicable field + :param flattened_matches: Recursive list where matched value(s) from the target_path are recursively added + :return: A consolidated flattened list of matched values. 
+ """ + if flattened_matches is None: + flattened_matches = [] + + if isinstance(row, list): + for elem in row: + consolidate_query_matches(elem, target_path, flattened_matches) + + elif isinstance(row, dict): + for key, value in row.items(): + if target_path.levels and key == target_path.levels[0]: + consolidate_query_matches( + value, FieldPath(*target_path.levels[1:]), flattened_matches + ) + + else: + flattened_matches.append(row) + + return flattened_matches diff --git a/src/fidesops/task/filter_element_match.py b/src/fidesops/task/filter_element_match.py new file mode 100644 index 000000000..af306a8f9 --- /dev/null +++ b/src/fidesops/task/filter_element_match.py @@ -0,0 +1,127 @@ +import copy +import logging +from collections import defaultdict + +from typing import List, Any, Dict + +import pydash + +from fidesops.graph.config import FieldPath +from fidesops.task.refine_target_path import ( + build_incoming_refined_target_paths, + DetailedPath, +) + +logger = logging.getLogger(__name__) + + +def filter_element_match( + row: Dict[str, Any], query_paths: Dict[FieldPath, List[Any]] +) -> Dict[str, Any]: + """ + Modifies row in place to remove unmatched array elements or unmatched embedded documents within arrays. + + :param row: Record retrieved from a dataset + :param query_paths: FieldPaths mapped to query values + :return: A modified record with array elements potentially eliminated if array data was targeted by a query path + + :Example: + The given row was retrieved from a dataset because an element in row["A"] matched 2 or an element in + row["C"]["D"] matched 5. row["A"] is filtered to just contain the matching element, and row["C"] is filtered + to just contain the objects where "D" = 5. Non-array elements should not be touched. + + filter_element_match( + row={"A": [1, 2, 3], "B": 2, "C": [{"D": 3, "E": 4}, {"D": 5, "E": 6}, {"D": 5, "E": 7}]}, + query_paths={FieldPath("A"): [2], FieldPath("C, "D"): 5} + ) + + {"A": [2], "B": 2, "C": [{"D": 5, "E": 6}, {"D": 5, "E": 7}]} + """ + detailed_target_paths: List[DetailedPath] = build_incoming_refined_target_paths( + row, query_paths + ) + + array_paths_to_preserve: Dict[str, List[int]] = _expand_array_paths_to_preserve( + detailed_target_paths + ) + + return _remove_paths_from_row(row, array_paths_to_preserve) + + +def _remove_paths_from_row( + row: Dict[str, Any], preserve_indices: Dict[str, List[int]] +) -> Dict[str, Any]: + """ + Used by filter_element_match, remove array elements from row that are not specified in preserve_indices + + :param row: Record retrieved from a dataset + :param preserve_indices: A dictionary of dot-separated paths to arrays, where the values are the list of indices + we want to *keep* + :return: A filtered row that has removed non-specified indices from arrays + + :Example: + The first element in row["A"]["B"] was the only one specified to preserve, so we remove the other two. 
+ _remove_paths_from_row({"A": {"B": [{"C": "D"}, {"C": "F"}, {"C": "G"}]}}, {"A.B": [0]}) + + {'A': {'B': [{'C': 'D'}]}} + """ + desc_path_length: Dict[ + str, List[int] + ] = dict( # We want to remove elements from the deepest paths first + sorted( + preserve_indices.items(), + key=lambda item: item[0].count("."), + reverse=True, + ) + ) + for path, preserve_index_list in desc_path_length.items(): + matched_array: List = pydash.objects.get(row, path) + if matched_array is None: + # This case shouldn't happen - if this gets logged, we've done something wrong + logger.info( + f"_remove_paths_from_row call: Path {path} in row {row} not found." + ) + continue + # Loop through array in reverse to delete indices + for i, _ in reversed(list(enumerate(matched_array))): + if i not in preserve_index_list: + matched_array.pop(i) + + return row + + +def _expand_array_paths_to_preserve(paths: List[DetailedPath]) -> Dict[str, List[int]]: + """ + Used by "filter_element_match" - Returns a dictionary of string paths mapped to array indices that we want + to preserve. + + :param paths: A list of lists of detailed paths (containing strings and array indices) to elements that matched query criteria + :return: A dict where the keys are a dot-separated path to an array, and the values are a list of indices + in that array that we want to keep. If there are no indices in the original path, that path will be ignored. + Some paths may be expanded into multiple paths where there are multiple levels of indices (arrays of arrays). + + :Example: + _expand_array_paths_to_preserve([["F", 1, 2], ["F", 1, 3], ["G", "H"], ["L", 1, "M"]]) + {'F': [1], 'F.1': [2, 3], 'L': [1]} + + This data will be used to remove all elements from row["F"][1] that are not at index 2, and 3. + We'll then remove all elements from "F" that are not at index [1], and all elements from "L" that are not at index 1. + + """ + # Break path into multiple paths if array elements in path + expanded: List[DetailedPath] = [] + for path in paths: + while path != [] and not isinstance(path[-1], int): + path.pop() + new_path: DetailedPath = [] + for elem in path: + new_path.append(elem) + if isinstance(elem, int) and new_path not in expanded: + expanded.append(copy.deepcopy(new_path)) + + # Combine paths where the key is a dot-separated path to the array, and the value are the indices + # of the array we want to preserve + merge_paths: Dict[str, List[int]] = defaultdict(list) + for path in expanded: + merge_paths[".".join(map(str, path[0:-1]))].append(path[-1]) # type: ignore + return merge_paths diff --git a/src/fidesops/task/filter_results.py b/src/fidesops/task/filter_results.py new file mode 100644 index 000000000..855ee0f03 --- /dev/null +++ b/src/fidesops/task/filter_results.py @@ -0,0 +1,83 @@ +import logging +from typing import List, Dict, Any, Union + +from fidesops.graph.config import FieldPath + +logger = logging.getLogger(__name__) + + +def select_and_save_field(saved: Any, row: Any, target_path: FieldPath) -> Dict: + """Extract the data located along the given `target_path` from the row and add to the "saved" dictionary. + + Entire rows are returned from your collections; this function will incrementally just pull the PII from the rows, + by retrieving data along target_paths to relevant data categories. + + To use, pass in an empty dict for "saved" and loop through a list of FieldPaths you want, + continuing to pass in the ever-growing new "saved" dict that was returned from the previous iteration. 
+ + :param saved: Call with an empty dict to start, it will recursively update as data along the target_path is added to it. + :param row: Call with retrieved row to start, it will recursively be called with a variety of object types until we + reach the most deeply nested value. + :param target_path: FieldPath to the data we want to retrieve + + :return: modified saved dictionary with given field path if found + """ + + def _defaultdict_or_array(resource: Any) -> Any: + """Helper for building new nested resource - can return an empty dict, empty array or resource itself""" + return type(resource)() if isinstance(resource, (list, dict)) else resource + + if isinstance(row, list): + for i, elem in enumerate(row): + try: + saved[i] = select_and_save_field(saved[i], elem, target_path) + except IndexError: + saved.append( + select_and_save_field( + _defaultdict_or_array(elem), elem, target_path + ) + ) + + elif isinstance(row, dict): + for key in row: + if key == target_path.levels[0]: + if key not in saved: + saved[key] = _defaultdict_or_array(row[key]) + saved[key] = select_and_save_field( + saved[key], row[key], FieldPath(*target_path.levels[1:]) + ) + return saved + + +RecursiveRow = Union[Dict[Any, Any], List[Any]] + + +def remove_empty_containers(row: RecursiveRow) -> RecursiveRow: + """ + Recursively updates row in place to remove empty dictionaries and empty arrays at any level in collection or + from embedded collections in arrays. + + `select_and_save_field` recursively builds a nested structure based on desired field paths. + If no input data was found along a deeply nested field path, we may have empty dicts to clean up + before supplying response to user. Also empty arrays and empty dicts do not contain PII. + + :param row: Pass in retrieved row, and it will recursively go through objects and arrays and filter out empty collections. + :return: Updated row with empty objects and arrays removed + """ + if isinstance(row, dict): + for key, value in row.copy().items(): + if isinstance(value, (dict, list)): + value = remove_empty_containers(value) + + if value in [{}, []]: + del row[key] + + elif isinstance(row, list): + for index, elem in reversed(list(enumerate(row))): + if isinstance(elem, (dict, list)): + elem = remove_empty_containers(elem) + + if elem in [{}, []]: + row.pop(index) + + return row diff --git a/src/fidesops/task/graph_task.py b/src/fidesops/task/graph_task.py index e3739bb66..23371aa56 100644 --- a/src/fidesops/task/graph_task.py +++ b/src/fidesops/task/graph_task.py @@ -8,7 +8,6 @@ from time import sleep from typing import List, Dict, Any, Tuple, Callable, Optional, Set -import pandas as pd import dask from dask.threaded import get @@ -26,10 +25,12 @@ from fidesops.models.privacy_request import PrivacyRequest, ExecutionLogStatus from fidesops.schemas.shared_schemas import FidesOpsKey from fidesops.service.connectors import BaseConnector +from fidesops.task.consolidate_query_matches import consolidate_query_matches +from fidesops.task.filter_element_match import filter_element_match +from fidesops.task.filter_results import select_and_save_field, remove_empty_containers from fidesops.task.task_resources import TaskResources from fidesops.util.collection_util import partition, append from fidesops.util.logger import NotPii -from fidesops.util.nested_utils import unflatten_dict logger = logging.getLogger(__name__) @@ -137,13 +138,14 @@ def to_dask_input_data(self, *data: List[Row]) -> Dict[str, List[Any]]: These outputs should correspond to the input key order. 
Any nested fields are converted into dot-separated paths in the return. - table1: [{x:1, y:A}, {x:2, y:B}], table2: [{x:3},{x:4}], table3: [{z: {a: C}}] + table1: [{x:1, y:A}, {x:2, y:B}], table2: [{x:3},{x:4}], table3: [{z: {a: C}, "y": [4, 5]}] where table1.x => self.id, table1.y=> self.name, table2.x=>self.id table3.z.a => self.contact.address + table3.y => self.contact.email becomes - {id:[1,2,3,4], name:["A","B"], contact.address:["C"]} + {id:[1,2,3,4], name:["A","B"], contact.address:["C"], "contact.email": [4, 5]} """ if not len(data) == len(self.input_keys): logger.warning( @@ -160,11 +162,16 @@ def to_dask_input_data(self, *data: List[Row]) -> Dict[str, List[Any]]: Tuple[FieldPath, FieldPath] ] = self.incoming_field_path_map[collection_address] + logger.info( + f"Consolidating incoming data into {self.traversal_node.node.address} from {collection_address}." + ) for row in rowset: for foreign_field_path, local_field_path in field_mappings: - new_value = foreign_field_path.retrieve_from(row) - if new_value: - append(output, local_field_path.string_path, new_value) + new_values: List = consolidate_query_matches( + row=row, target_path=foreign_field_path + ) + if new_values: + append(output, local_field_path.string_path, new_values) return output def update_status( @@ -226,9 +233,26 @@ def log_end( @retry(action_type=ActionType.access, default_return=[]) def access_request(self, *inputs: List[Row]) -> List[Row]: """Run access request""" + formatted_input_data = self.to_dask_input_data(*inputs) output = self.connector.retrieve_data( - self.traversal_node, self.resources.policy, self.to_dask_input_data(*inputs) + self.traversal_node, self.resources.policy, formatted_input_data ) + + coerced_input_data = self.traversal_node.typed_filtered_values( + formatted_input_data # Cast incoming values to the correct type + ) + for row in output: + # Post-process in code: filter out embedded documents and array elements that did not match the query + logger.info( + f"Filtering row in {self.traversal_node.node.address} for matching array elements." + ) + filter_element_match( + row, + { + FieldPath.parse(field): inputs + for field, inputs in coerced_input_data.items() + }, + ) self.resources.cache_object(f"access_request__{self.key}", output) self.log_end(ActionType.access) return output @@ -381,16 +405,23 @@ def termination_fn(*dependent_values: int) -> Tuple[int, ...]: def filter_data_categories( - access_request_results: Dict[str, Optional[Any]], + access_request_results: Dict[str, List[Dict[str, Optional[Any]]]], target_categories: Set[str], data_category_fields: Dict[CollectionAddress, Dict[FidesOpsKey, List[FieldPath]]], ) -> Dict[str, List[Dict[str, Optional[Any]]]]: """Filter access request results to only return fields associated with the target data categories - and subcategories + and subcategories. - For example, if data category "user.provided.identifiable.contact" is specified on one of the rule targets, - all fields on subcategories also apply, so ["user.provided.identifiable.contact.city", + Regarding subcategories: if data category "user.provided.identifiable.contact" is specified on one of the rule + targets, for example, all fields on its subcategories also apply, so ["user.provided.identifiable.contact.city", "user.provided.identifiable.contact.street", ...], etc.
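+ + :Example: (a hypothetical single-collection result "db:users", to show subcategory matching) + filter_data_categories( + {"db:users": [{"email": "jane@example.com", "last_seen": "2021-04-01"}]}, + {"user.provided.identifiable"}, + {CollectionAddress("db", "users"): { + "user.provided.identifiable.contact.email": [FieldPath("email")], + "system.operations": [FieldPath("last_seen")], + }}, + ) + + {"db:users": [{"email": "jane@example.com"}]}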
+ + :param access_request_results: Dictionary of access request results for each of your collections + :param target_categories: A set of data categories that we'd like to extract from access_request_results + :param data_category_fields: Data categories mapped to applicable fields for each collection + + :return: Filtered access request results that only contain fields matching the desired data categories. + TODO move to fidesops.task.filter_results.py, leaving for now to make the diff more clear """ logger.info( "Filtering Access Request results to return fields associated with data categories" @@ -417,18 +448,11 @@ def filter_data_categories( if not target_field_paths: continue - # Normalize nested data into a flat dataframe - df: pd.DataFrame = pd.json_normalize(results, sep=".") - # Only keep intersection of dataframe columns and target field paths - df = df[ - df.columns & [field_path.string_path for field_path in target_field_paths] - ] - # Turn the filtered results back into a list of dictionaries - filtered_flattened_results: List[Dict[str, Optional[Any]]] = df.to_dict( - orient="records" - ) - for row in filtered_flattened_results: - # For each row, unflatten the dictionary - filtered_access_results[node_address].append(unflatten_dict(row)) + for row in results: + filtered_results: Dict[str, Any] = {} + for field_path in target_field_paths: + select_and_save_field(filtered_results, row, field_path) + remove_empty_containers(filtered_results) + filtered_access_results[node_address].append(filtered_results) return filtered_access_results diff --git a/src/fidesops/task/refine_target_path.py b/src/fidesops/task/refine_target_path.py new file mode 100644 index 000000000..f0fffc66d --- /dev/null +++ b/src/fidesops/task/refine_target_path.py @@ -0,0 +1,147 @@ +import logging +from typing import Dict, Any, List, Union + +from fidesops.graph.config import FieldPath + +Level = Union[str, int] +DetailedPath = List[ + Level +] # A more detailed path to a field, potentially containing indices + +logger = logging.getLogger(__name__) + + +def build_incoming_refined_target_paths( + row: Dict[str, Any], query_paths: Dict[FieldPath, List[Any]] +) -> List[DetailedPath]: + """ + Return a list of more detailed path(s) to the matched data that caused that row to be returned. Runs + recursive `refine_target_path` for each FieldPath in query_paths. If there are no array paths, the paths + will not change. + + :param row: Record retrieved from a dataset + :param query_paths: FieldPaths mapped to query values + :return: A list of lists containing more detailed paths to the matched data + + :Example: + row = { + "A": [1, 2], + "B": 2, + "C": [{"D": 3, "E": 5}, {"D": 3, "E": 4}, {"D": 3, "E": 4}], + "G": 3, + } + incoming_paths= {FieldPath("A"): [2], FieldPath("C", "E"): [4], FieldPath("G"): [3]} + + build_incoming_refined_target_paths(row, incoming_paths) + [["G"], ["A", 1], ["C", 1, "E"], ["C", 2, "E"]] + """ + found_paths: List[DetailedPath] = [] + for target_path, only in query_paths.items(): + path = refine_target_path(row, list(target_path.levels), only) + if path: + if isinstance(path[0], list): + found_paths.extend(path) + else: + found_paths.append(path) + found_paths.sort(key=len) + return found_paths + + +def refine_target_path( + row: Dict[str, Any], target_path: List[str], only: List[Any] +) -> DetailedPath: # Can also return a list of DetailedPaths if there are multiple matches. + """ + Recursively modify the target_path to be more detailed path(s) to the referenced data. 
Instead of just strings, + the path will be expanded to include indices where applicable. + + :param row: Record retrieved from a dataset + :param target_path: A list of strings representing the path to the desired field. + :param only: A list of values that were used to build the query. + :return: A list or a list of lists containing more detailed path(s) to the data in "only". If there + was just one path, we return one list. + + :Example: + In this example, path ["A", "B", "C"] points to two elements that match values "F" or "G". We update + the path to insert the indices to locate the appropriate value. + + refine_target_path({"A": {"B": [{"C": "D"}, {"C": "F"}, {"C": "G"}]}}, ["A", "B", "C"], only=["F", "G"]) + + [["A", "B", 1, "C"], ["A", "B", 2, "C"]] + """ + try: + current_level = target_path[0] + current_elem = row[current_level] + except KeyError: # FieldPath not found in record, this is expected to happen when data doesn't exist in collection + return [] + except (IndexError, TypeError): # No field path or invalid field path + logger.warning(f"Error with locating {target_path} on row") + return [] + + if isinstance(current_elem, dict): + next_levels = refine_target_path(current_elem, target_path[1:], only) + return _update_path(current_level, next_levels) + + if isinstance(current_elem, list): + next_levels = _enter_array(current_elem, target_path[1:], only) + return _update_path(current_level, next_levels) + + # Simple case - value is a scalar + return [current_level] if _match_found(current_elem, only) else [] + + +def _enter_array(array: List[Any], field_path: List[str], only: List[Any]) -> List: + """ + Used by recursive "refine_target_path" whenever arrays are encountered in the row. + """ + results: List[Any] = [] + for index, elem in enumerate(array): + current_result = [] + + if field_path: + next_result = refine_target_path(elem, field_path, only) + current_result = _update_path(index, next_result) + else: + if isinstance(elem, list): + next_result = _enter_array( + elem, field_path, only + ) # Nested enter_array calls needed for lists in lists + current_result = _update_path(index, next_result) + else: + if _match_found(elem, only): + current_result = [index] + + if current_result: # Match found at lower level + if isinstance(current_result[0], list): + # Keeps nested lists at most list of lists + results.extend(current_result) + else: + results.append(current_result) + + return results[0] if len(results) == 1 else results + + +def _match_found(elem: Any, only: List[Any]) -> bool: + """The given scalar element matches one of the input values""" + return elem in only + + +def _update_path(current_level: Level, deeper_levels: DetailedPath) -> DetailedPath: + """ + Used by "refine_target_path" and "_enter_array" to recursively build a + more refined target path to the desired data. 
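+ + :Example: + _update_path("A", [0, "B"]) + ["A", 0, "B"] + + If the deeper levels contain multiple matched paths, the current level is prepended to each one: + _update_path("A", [[0, "B"], [2, "B"]]) + [["A", 0, "B"], ["A", 2, "B"]]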
+ """ + if not deeper_levels: + # Element did not contain a match + return [] + + if isinstance(deeper_levels[0], list): + result = [] + for item in deeper_levels: + # Builds multiple possible paths + result.append(_update_path(current_level, item)) + return result + + # Consolidates current level with deeper levels + result = [current_level] + result.extend(deeper_levels) + return result diff --git a/src/fidesops/task/task_resources.py b/src/fidesops/task/task_resources.py index c4e421082..1a8ad6587 100644 --- a/src/fidesops/task/task_resources.py +++ b/src/fidesops/task/task_resources.py @@ -96,11 +96,11 @@ def __init__( self.connections = Connections() def __enter__(self) -> "TaskResources": - """Support 'with' useage for closing resources""" + """Support 'with' usage for closing resources""" return self def __exit__(self, _type: Any, value: Any, traceback: Any) -> None: - """Support 'with' useage for closing resources""" + """Support 'with' usage for closing resources""" self.close() def cache_object(self, key: str, value: Any) -> None: diff --git a/src/fidesops/util/collection_util.py b/src/fidesops/util/collection_util.py index 3f4c3ca08..2ee334e78 100644 --- a/src/fidesops/util/collection_util.py +++ b/src/fidesops/util/collection_util.py @@ -17,17 +17,21 @@ def merge_dicts(*dicts: Dict[T, U]) -> Dict[T, U]: return {} -def append(d: Dict[T, List[U]], key: T, val: U) -> None: +def append(d: Dict[T, List[U]], key: T, value: U) -> None: """Append to values stored under a dictionary key. append({},"A",1) sets dict to {"A":[1]} append({"A":[1],"A",2) sets dict to {"A":[1,2]} + append({"A":[1],"A",[2, 3, 4]) sets dict to {"A":[1, 2, 3, 4]} """ - if val: + if value: if key in d: - d[key].append(val) + if isinstance(value, list): + d[key].extend(value) + else: + d[key].append(value) else: - d[key] = [val] + d[key] = value if isinstance(value, list) else [value] def partition(_iterable: Iterable[T], extractor: Callable[[T], U]) -> Dict[U, List[T]]: diff --git a/src/fidesops/util/nested_utils.py b/src/fidesops/util/nested_utils.py deleted file mode 100644 index 28cb3b52d..000000000 --- a/src/fidesops/util/nested_utils.py +++ /dev/null @@ -1,45 +0,0 @@ -from collections import defaultdict -from typing import Dict, Any - -from fidesops.common_exceptions import FidesopsException - - -def unflatten_dict(input_row: Dict[str, Any], separator: str = ".") -> Dict[str, Any]: - """ - Takes a dictionary that has been normalized to level 1 and reconstitutes: - - { - "a": 1, - "b.c": 2, - "b.d": 3 - } - - and turns it into: - {"a": 1, "b": {"c": 2, "d": 3} - """ - - def _create_dict() -> defaultdict: - """Can create a defaultdict at every level""" - return defaultdict(_create_dict) - - unpacked_results = _create_dict() - - for key, value in input_row.items(): - if isinstance(value, dict): - raise FidesopsException( - "`unflatten_dict` expects a flattened dictionary as input." 
- ) - - levels = key.split(separator) - subdict = unpacked_results - for level in levels[:-1]: - subdict = subdict[level] - - try: - subdict[levels[-1]] = value - except TypeError as exc: - raise FidesopsException( - f"Error unflattening dictionary, conflicting levels detected: {exc}" - ) - - return unpacked_results diff --git a/tests/api/v1/endpoints/test_dataset_endpoints.py b/tests/api/v1/endpoints/test_dataset_endpoints.py index a6443dd56..e629504a4 100644 --- a/tests/api/v1/endpoints/test_dataset_endpoints.py +++ b/tests/api/v1/endpoints/test_dataset_endpoints.py @@ -39,7 +39,7 @@ def test_example_datasets(example_datasets): assert example_datasets[0]["fides_key"] == "postgres_example_test_dataset" assert len(example_datasets[0]["collections"]) == 11 assert example_datasets[1]["fides_key"] == "mongo_test" - assert len(example_datasets[1]["collections"]) == 3 + assert len(example_datasets[1]["collections"]) == 8 assert example_datasets[2]["fides_key"] == "snowflake_example_test_dataset" assert len(example_datasets[2]["collections"]) == 11 assert example_datasets[3]["fides_key"] == "redshift_example_test_dataset" @@ -465,7 +465,7 @@ def test_patch_datasets_bulk_create( assert mongo_dataset["fides_key"] == "mongo_test" assert mongo_dataset["name"] == "Mongo Example Test Dataset" assert "Example of a Mongo dataset" in mongo_dataset["description"] - assert len(mongo_dataset["collections"]) == 3 + assert len(mongo_dataset["collections"]) == 8 # Check the mssql dataset mssql_dataset = response_body["succeeded"][4] diff --git a/tests/fixtures/application_fixtures.py b/tests/fixtures/application_fixtures.py index 82e4434a2..2d52c0bb4 100644 --- a/tests/fixtures/application_fixtures.py +++ b/tests/fixtures/application_fixtures.py @@ -1135,3 +1135,66 @@ def privacy_request_runner( cache=cache, privacy_request=privacy_request, ) + + +@pytest.fixture(scope="function") +def sample_data(): + return { + "_id": 12345, + "thread": [ + { + "comment": "com_0001", + "message": "hello, testing in-flight chat feature", + "chat_name": "John", + "messages": {}, + }, + { + "comment": "com_0002", + "message": "yep, got your message, looks like it works", + "chat_name": "Jane", + }, + {"comment": "com_0002", "message": "hello!", "chat_name": "Jeanne"}, + ], + "snacks": ["pizza", "chips"], + "seats": {"first_choice": "A2", "second_choice": "B3"}, + "upgrades": { + "magazines": ["Time", "People"], + "books": ["Once upon a Time", "SICP"], + "earplugs": True, + }, + "other_flights": [ + {"DFW": ["11 AM", "12 PM"], "CHO": ["12 PM", "1 PM"]}, + {"DFW": ["2 AM", "12 PM"], "CHO": ["2 PM", "1 PM"]}, + {"DFW": ["3 AM", "2 AM"], "CHO": ["2 PM", "1:30 PM"]}, + ], + "months": { + "july": [ + { + "activities": ["swimming", "hiking"], + "crops": ["watermelon", "cheese", "grapes"], + }, + {"activities": ["tubing"], "crops": ["corn"]}, + ], + "march": [ + { + "activities": ["skiing", "bobsledding"], + "crops": ["swiss chard", "swiss chard"], + }, + {"activities": ["hiking"], "crops": ["spinach"]}, + ], + }, + "hello": [1, 2, 3, 4, 2], + "weights": [[1, 2], [3, 4]], + "toppings": [[["pepperoni", "salami"], ["pepperoni", "cheese", "cheese"]]], + "A": {"C": [{"M": ["p", "n", "n"]}]}, + "C": [["A", "B", "C", "B"], ["G", "H", "B", "B"]], # Double lists + "D": [ + [["A", "B", "C", "B"], ["G", "H", "B", "B"]], + [["A", "B", "C", "B"], ["G", "H", "B", "B"]], + ], # Triple lists + "E": [[["B"], [["A", "B", "C", "B"], ["G", "H", "B", "B"]]]], # Irregular lists + "F": [ + "a", + ["1", "a", [["z", "a", "a"]]], + ], # Lists elems are 
different types, not officially supported + } diff --git a/tests/graph/graph_test_util.py b/tests/graph/graph_test_util.py index b730cdb13..205cdc890 100644 --- a/tests/graph/graph_test_util.py +++ b/tests/graph/graph_test_util.py @@ -15,9 +15,7 @@ from fidesops.models.privacy_request import PrivacyRequest from fidesops.service.connectors import BaseConnector, MongoDBConnector from fidesops.service.connectors.sql_connector import SQLConnector -from fidesops.service.masking.strategy.masking_strategy_nullify import ( - NullMaskingStrategy, -) + from fidesops.task.graph_task import GraphTask from fidesops.task.task_resources import TaskResources from ..fixtures.application_fixtures import faker diff --git a/tests/graph/test_config.py b/tests/graph/test_config.py index 559a606b6..c3726a1e0 100644 --- a/tests/graph/test_config.py +++ b/tests/graph/test_config.py @@ -427,15 +427,18 @@ def test_parse(self): assert FieldPath.parse("a.b.c.d.e") == FieldPath("a", "b", "c", "d", "e") def test_retrieve_from(self): - input_data = {"A": {"B": {"C": 2}}} + input_data = {"A": {"B": {"C": 2, "E": [1, 2, 3]}}} - assert FieldPath("A").retrieve_from(input_data) == {"B": {"C": 2}} + assert FieldPath("A").retrieve_from(input_data) == {"B": {"C": 2, "E": [1, 2, 3]}} - assert FieldPath("A", "B").retrieve_from(input_data) == {"C": 2} + assert FieldPath("A", "B").retrieve_from(input_data) == {"C": 2, "E": [1, 2, 3]} assert FieldPath("A", "B", "C").retrieve_from(input_data) == 2 assert ( FieldPath("D").retrieve_from(input_data) is None ) # FieldPath not in input data + assert FieldPath("A", "B", "E").retrieve_from(input_data) == [1, 2, 3] + assert FieldPath().retrieve_from(input_data) is None # No levels specified + diff --git a/tests/integration_tests/test_mongo_task.py b/tests/integration_tests/test_mongo_task.py index 28a7ddf66..8a87f5d8e 100644 --- a/tests/integration_tests/test_mongo_task.py +++ b/tests/integration_tests/test_mongo_task.py @@ -8,7 +8,12 @@ import pytest from bson import ObjectId -from fidesops.graph.config import FieldAddress, ScalarField, Collection, Dataset +from fidesops.graph.config import ( + FieldAddress, + ScalarField, + Collection, + Dataset, +) from fidesops.graph.data_type import ( IntTypeConverter, StringTypeConverter, @@ -26,6 +31,7 @@ from fidesops.service.connectors import get_connector from fidesops.task import graph_task from fidesops.task.graph_task import filter_data_categories + from ..graph.graph_test_util import assert_rows_match, erasure_policy, field from ..task.traversal_data import ( integration_db_graph, @@ -102,6 +108,7 @@ def test_combined_erasure_task( access_request_data, ) + # TODO complex erasures not yet addressed assert x == { "postgres_example:customer": 1, "postgres_example:orders": 0, @@ -112,6 +119,10 @@ def test_combined_erasure_task( "mongo_test:customer_feedback": 1, "mongo_test:customer_details": 1, "mongo_test:internal_customer_profile": 1, + "mongo_test:aircraft": 0, + "mongo_test:conversations": 0, + "mongo_test:employee": 0, + "mongo_test:flights": 0, } rerun_access = graph_task.run_access_request( @@ -132,7 +143,7 @@ def test_combined_erasure_task( is not None ) - # This will change when array handling is added - array was just set to None + # TODO This will change when array handling is added - array was just set to None assert ( rerun_access["mongo_test:internal_customer_profile"][0]["derived_interests"] is None @@ -268,7 +279,7 @@ def test_composite_key_erasure( ), ScalarField( name="customer_id", - data_type_converter=StringTypeConverter(), + 
data_type_converter=IntTypeConverter(), references=[(FieldAddress("mongo_test", "customer", "id"), "from")], ), ], @@ -291,8 +302,8 @@ def test_composite_key_erasure( customer = access_request_data["mongo_test:customer"][0] composite_pk_test = access_request_data["mongo_test:composite_pk_test"][0] - assert customer["id"] == "1" - assert composite_pk_test["customer_id"] == "1" + assert customer["id"] == 1 + assert composite_pk_test["customer_id"] == 1 # erasure erasure = graph_task.run_erasure( @@ -398,7 +409,7 @@ def test_access_erasure_type_conversion( @pytest.mark.integration -def test_filter_on_data_categories_mongo( +def test_object_querying_mongo( db, privacy_request, example_datasets, @@ -461,10 +472,20 @@ def test_filter_on_data_categories_mongo( dataset_graph.data_category_field_mapping, ) assert len(filtered_results["mongo_test:customer_details"]) == 1 + + # Array of embedded emergency contacts returned, array of children, nested array of workplace_info.direct_reports assert filtered_results["mongo_test:customer_details"][0] == { - "birthday": datetime(1988, 1, 10), + "birthday": datetime(1988, 1, 10, 0, 0), "gender": "male", - "workplace_info": {"position": "Chief Strategist"}, + "children": ["Christopher Customer", "Courtney Customer"], + "emergency_contacts": [ + {"name": "June Customer", "phone": "444-444-4444"}, + {"name": "Josh Customer", "phone": "111-111-111"}, + ], + "workplace_info": { + "position": "Chief Strategist", + "direct_reports": ["Robbie Margo", "Sully Hunter"], + }, } # Includes data retrieved from a nested field that was joined with a nested field from another table @@ -474,11 +495,131 @@ def test_filter_on_data_categories_mongo( target_categories, dataset_graph.data_category_field_mapping, ) + + # Test for accessing array assert filtered_results["mongo_test:internal_customer_profile"][0] == { "derived_interests": ["marketing", "food"] } +@pytest.mark.integration +def test_array_querying_mongo( + db, + privacy_request, + example_datasets, + policy, + integration_mongodb_config, + integration_postgres_config, +): + + postgres_config = copy.copy(integration_postgres_config) + + dataset_postgres = FidesopsDataset(**example_datasets[0]) + graph = convert_dataset_to_graph(dataset_postgres, integration_postgres_config.key) + dataset_mongo = FidesopsDataset(**example_datasets[1]) + mongo_graph = convert_dataset_to_graph( + dataset_mongo, integration_mongodb_config.key + ) + dataset_graph = DatasetGraph(*[graph, mongo_graph]) + + access_request_results = graph_task.run_access_request( + privacy_request, + policy, + dataset_graph, + [postgres_config, integration_mongodb_config], + {"email": "jane@example.com"}, + ) + + target_categories = {"user.derived"} + filtered_results = filter_data_categories( + access_request_results, + target_categories, + dataset_graph.data_category_field_mapping, + ) + # Array field mongo_test:internal_customer_profile.customer_identifiers contains identity + # Only matching identity returned + assert filtered_results["mongo_test:internal_customer_profile"][0][ + "customer_identifiers" + ]["derived_emails"] == ["jane@example.com"] + + # # Entire derived_interests array returned - this is not an identity or "to" reference field + assert filtered_results["mongo_test:internal_customer_profile"][0][ + "derived_interests" + ] == ["interior design", "travel", "photography"] + + filtered_identifiable = filter_data_categories( + access_request_results, + {"user.provided.identifiable"}, + dataset_graph.data_category_field_mapping, + ) + + # Includes 
array field + assert filtered_identifiable["mongo_test:customer_details"] == [ + { + "birthday": datetime(1990, 2, 28, 0, 0), + "gender": "female", + "children": ["Erica Example"], + } + ] + + # items in array mongo_test:customer_details.travel_identifiers used to lookup matching array elements + # in mongo_test:flights:passenger_information.passenger_ids. passenger_information.full_name has relevant + # data category. + assert len(filtered_identifiable["mongo_test:flights"]) == 1 + assert filtered_identifiable["mongo_test:flights"][0] == { + "passenger_information": {"full_name": "Jane Customer"} + } + + # Nested customer_details:comments.comment_id field used to find embedded objects conversations.thread.comment + # fields. Only matching embedded documents queried for relevant data categories + assert filtered_identifiable["mongo_test:conversations"] == [ + {"thread": [{"chat_name": "Jane C"}]}, + {"thread": [{"chat_name": "Jane C"}, {"chat_name": "Jane C"}]}, + ] + + # Integer field mongo_test:flights.plane used to locate only matching elem in mongo_test:aircraft:planes array field + assert access_request_results["mongo_test:aircraft"][0]["planes"] == ['30005'] + # Filtered out, however, because there's no relevant matched data category + assert filtered_identifiable["mongo_test:aircraft"] == [] + + # Values in mongo_test:flights:pilots array field used to locate scalar field in mongo_test:employee.id + assert filtered_identifiable["mongo_test:employee"] == [ + {"email": "employee-2@example.com", "name": "Jane Employee"} + ] + + # No data for identity in this collection + assert access_request_results["mongo_test:customer_feedback"] == [] + + # Only matched embedded document in mongo_test:conversations.thread.ccn used to locate mongo_test:payment_card + assert filtered_identifiable["mongo_test:payment_card"] == [{'code': '123', 'name': 'Example Card 2', 'ccn': '987654321'}] + + # Run again with different email + access_request_results = graph_task.run_access_request( + privacy_request, + policy, + dataset_graph, + [postgres_config, integration_mongodb_config], + {"email": "customer-1@example.com"}, + ) + filtered_identifiable = filter_data_categories( + access_request_results, + {"user.provided.identifiable"}, + dataset_graph.data_category_field_mapping, + ) + + # Two values in mongo_test:flights:pilots array field mapped to mongo_test:employee ids + assert filtered_identifiable["mongo_test:employee"] == [ + {"name": "Jack Employee", "email": "employee-1@example.com"}, + {"name": "Jane Employee", "email": "employee-2@example.com"}, + ] + + # Only embedded documents matching mongo_test:conversations.thread.comment returned + assert filtered_identifiable["mongo_test:conversations"] == [ + {"thread": [{"chat_name": "John C"}]}, + {"thread": [{"chat_name": "John C"}, {"chat_name": "John C"}]}, + ] + + @pytest.mark.integration_mongodb @pytest.mark.integration class TestRetrievingDataMongo: @@ -490,7 +631,12 @@ def connector(self, integration_mongodb_config): def traversal_node(self, example_datasets, integration_mongodb_config): dataset = FidesopsDataset(**example_datasets[1]) graph = convert_dataset_to_graph(dataset, integration_mongodb_config.key) - node = Node(graph, graph.collections[0]) # customer collection + customer_details_collection = None + for collection in graph.collections: + if collection.name == "customer_details": + customer_details_collection = collection + break + node = Node(graph, customer_details_collection) traversal_node = TraversalNode(node) return traversal_node 
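+ + # Sketch, not part of the original suite: a sanity check that the loop above selected + # the intended collection (assumes Node exposes the chosen collection as `node.collection`) + def test_fixture_targets_customer_details(self, traversal_node): + assert traversal_node.node.collection.name == "customer_details"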
diff --git a/tests/service/connectors/test_queryconfig.py b/tests/service/connectors/test_queryconfig.py index f356adf91..9c877de57 100644 --- a/tests/service/connectors/test_queryconfig.py +++ b/tests/service/connectors/test_queryconfig.py @@ -19,7 +19,6 @@ from fidesops.schemas.masking.masking_secrets import MaskingSecretCache, SecretType from fidesops.service.connectors.query_config import ( - QueryConfig, SQLQueryConfig, MongoQueryConfig, ) @@ -54,8 +53,8 @@ class TestSQLQueryConfig: def test_extract_query_components(self): - def found_query_keys(qconfig: QueryConfig, values: Dict[str, Any]) -> Set[str]: - return set(qconfig.typed_filtered_values(values).keys()) + def found_query_keys(node: TraversalNode, values: Dict[str, Any]) -> Set[str]: + return set(node.typed_filtered_values(values).keys()) config = SQLQueryConfig(payment_card_node) assert config.field_map().keys() == { @@ -68,12 +67,15 @@ def found_query_keys(qconfig: QueryConfig, values: Dict[str, Any]) -> Set[str]: "billing_address_id", ] } - assert config.query_field_paths == {FieldPath("id"), FieldPath("customer_id")} + assert payment_card_node.query_field_paths == { + FieldPath("id"), + FieldPath("customer_id"), + } # values exist for all query keys assert ( found_query_keys( - config, + payment_card_node, { "id": ["A"], "customer_id": ["V"], @@ -85,7 +87,7 @@ def found_query_keys(qconfig: QueryConfig, values: Dict[str, Any]) -> Set[str]: # with no values OR an empty set, these are omitted assert ( found_query_keys( - config, + payment_card_node, { "id": ["A"], "customer_id": [], @@ -94,14 +96,15 @@ def found_query_keys(qconfig: QueryConfig, values: Dict[str, Any]) -> Set[str]: ) == {"id"} ) - assert found_query_keys(config, {"id": ["A"], "ignore_me": ["X"]}) == {"id"} - assert found_query_keys(config, {"ignore_me": ["X"]}) == set() - assert found_query_keys(config, {}) == set() + assert found_query_keys( + payment_card_node, {"id": ["A"], "ignore_me": ["X"]} + ) == {"id"} + assert found_query_keys(payment_card_node, {"ignore_me": ["X"]}) == set() + assert found_query_keys(payment_card_node, {}) == set() def test_typed_filtered_values(self): - config = SQLQueryConfig(payment_card_node) assert ( - config.typed_filtered_values( + payment_card_node.typed_filtered_values( { "id": ["A"], "customer_id": ["V"], @@ -112,7 +115,7 @@ def test_typed_filtered_values(self): ) assert ( - config.typed_filtered_values( + payment_card_node.typed_filtered_values( { "id": ["A"], "customer_id": [], @@ -122,16 +125,18 @@ def test_typed_filtered_values(self): == {"id": ["A"]} ) - assert config.typed_filtered_values({"id": ["A"], "ignore_me": ["X"]}) == { - "id": ["A"] - } + assert payment_card_node.typed_filtered_values( + {"id": ["A"], "ignore_me": ["X"]} + ) == {"id": ["A"]} - assert config.typed_filtered_values({"id": [], "customer_id": ["V"]}) == { - "customer_id": ["V"] - } + assert payment_card_node.typed_filtered_values( + {"id": [], "customer_id": ["V"]} + ) == {"customer_id": ["V"]} # test for type casting: id has type "string": - assert config.typed_filtered_values({"id": [1]}) == {"id": ["1"]} - assert config.typed_filtered_values({"id": [1, 2]}) == {"id": ["1", "2"]} + assert payment_card_node.typed_filtered_values({"id": [1]}) == {"id": ["1"]} + assert payment_card_node.typed_filtered_values({"id": [1, 2]}) == { + "id": ["1", "2"] + } def test_generated_sql_query(self): """Test that the generated query depends on the input set""" @@ -404,24 +409,21 @@ def test_primary_key_field_paths(self, customer_details_node): def 
test_nested_query_field_paths( self, customer_details_node, customer_feedback_node ): - config = SQLQueryConfig(customer_details_node) - assert config.query_field_paths == { + assert customer_details_node.query_field_paths == { FieldPath("customer_id"), } - other_config = SQLQueryConfig(customer_feedback_node) - assert other_config.query_field_paths == { + assert customer_feedback_node.query_field_paths == { FieldPath("customer_information", "email") } def test_nested_typed_filtered_values(self, customer_feedback_node): """Identity data is located on a nested object""" - config = SQLQueryConfig(customer_feedback_node) input_data = { "customer_information.email": ["test@example.com"], "ignore": ["abcde"], } - assert config.typed_filtered_values(input_data) == { + assert customer_feedback_node.typed_filtered_values(input_data) == { "customer_information.email": ["test@example.com"] } @@ -472,14 +474,19 @@ def test_generate_query( ] config = MongoQueryConfig(customer_details) input_data = {"customer_id": [1]} - # Tuple of query, projection - Projection is specifying fields at the top-level. Nested data will be filtered later. + # Tuple of query, projection - Projection is specifying fields at the top-level. Nested data will + # be filtered later. assert config.generate_query(input_data, policy) == ( {"customer_id": 1}, { "_id": 1, "birthday": 1, + "comments": 1, "customer_id": 1, + "emergency_contacts": 1, + "children": 1, "gender": 1, + "travel_identifiers": 1, "workplace_info": 1, }, ) @@ -513,7 +520,12 @@ def test_generate_update_stmt_multiple_fields( "gender": "male", "customer_id": 1, "_id": 1, - "workplace_info": {"position": "Chief Strategist"}, + "workplace_info": { + "position": "Chief Strategist", + "direct_reports": ["Robbie Margo", "Sully Hunter"], + }, + "emergency_contacts": [{"name": "June Customer", "phone": "444-444-4444"}], + "children": ["Christopher Customer", "Courtney Customer"], } # Make target more broad @@ -525,8 +537,17 @@ row, erasure_policy, privacy_request ) assert mongo_statement[0] == {"_id": 1} + # TODO lots of this update statement is wrong assert mongo_statement[1] == { - "$set": {"workplace_info.position": None, "birthday": None, "gender": None} + "$set": { + "birthday": None, + "emergency_contacts.name": None, + "workplace_info.direct_reports": None, + "emergency_contacts.phone": None, + "gender": None, + "workplace_info.position": None, + "children": None, + } } def test_generate_update_stmt_multiple_rules( @@ -558,6 +579,12 @@ "gender": "male", "customer_id": 1, "_id": 1, + "workplace_info": { + "position": "Chief Strategist", + "direct_reports": ["Robbie Margo", "Sully Hunter"], + }, + "emergency_contacts": [{"name": "June Customer", "phone": "444-444-4444"}], + "children": ["Christopher Customer", "Courtney Customer"], } rule = erasure_policy_two_rules.rules[0] @@ -577,11 +604,15 @@ } target = rule_two.targets[0] target.data_category = DataCategory("user.provided.identifiable.gender").value + # cache secrets for hash strategy + secret = MaskingSecretCache[str]( + secret="adobo", masking_strategy=HASH, secret_type=SecretType.salt + ) + cache_secret(secret, privacy_request.id) mongo_statement = config.generate_update_stmt( row, erasure_policy_two_rules, privacy_request ) - print(mongo_statement) assert mongo_statement[0] == {"_id": 1} assert len(mongo_statement[1]["$set"]["gender"]) == 30 assert
mongo_statement[1]["$set"]["birthday"] == HashMaskingStrategy( diff --git a/tests/service/privacy_request/request_runner_service_test.py b/tests/service/privacy_request/request_runner_service_test.py index f42233ec5..b82f9ef43 100644 --- a/tests/service/privacy_request/request_runner_service_test.py +++ b/tests/service/privacy_request/request_runner_service_test.py @@ -1,5 +1,3 @@ -import json -import logging import time from typing import Any, Dict, List, Set from unittest import mock diff --git a/tests/task/test_consolidate_query_matches.py b/tests/task/test_consolidate_query_matches.py new file mode 100644 index 000000000..91d35a034 --- /dev/null +++ b/tests/task/test_consolidate_query_matches.py @@ -0,0 +1,50 @@ +from fidesops.graph.config import FieldPath +from fidesops.task.consolidate_query_matches import consolidate_query_matches + + +def test_consolidate_query_matches(): + # Matching scalar returned + input_data = {"B": 55} + target_path = FieldPath("B") + assert consolidate_query_matches(input_data, target_path) == [55] + + # Matching array returned as-is + input_data = {"A": [1, 2, 3]} + target_path = FieldPath("A") + assert consolidate_query_matches(input_data, target_path) == [1, 2, 3] + + # Array of embedded objects have multiple matching sub-paths merged + field_path = FieldPath("A", "B") + input_data = {"A": [{"B": 1, "C": 2}, {"B": 3, "C": 4}, {"B": 5, "C": 6}]} + assert consolidate_query_matches(input_data, field_path) == [1, 3, 5] + + # Nested array returned + input_data = {"A": {"B": {"C": [9, 8, 7]}}, "D": {"E": {"F": "G"}}} + field_path = FieldPath("A", "B", "C") + assert consolidate_query_matches(input_data, field_path) == [9, 8, 7] + + # Array of arrays are merged + input_data = {"A": [[5, 6], [7, 8], [9, 10]], "B": [[5, 6], [7, 8], [9, 10]]} + field_path = FieldPath("A") + assert consolidate_query_matches(input_data, field_path) == [5, 6, 7, 8, 9, 10] + + # Array of arrays of embedded objects are merged + input_data = { + "A": [ + [{"B": 1, "C": 2, "D": [3]}, {}], + [{"B": 3, "C": 4, "D": [5]}, {"B": 77, "C": 88, "D": [99]}], + ], + "B": [[5, 6], [7, 8], [9, 10]], + } + field_path = FieldPath("A", "D") + assert consolidate_query_matches(input_data, field_path) == [3, 5, 99] + + # Target path doesn't exist in data + field_path = FieldPath("A", "E", "X") + input_data = {"A": [{"B": 1, "C": 2}, {"B": 3, "C": 4}, {"B": 5, "C": 6}]} + assert consolidate_query_matches(input_data, field_path) == [] + + # No field path + field_path = FieldPath() + input_data = {"A": [{"B": 1, "C": 2}, {"B": 3, "C": 4}, {"B": 5, "C": 6}]} + assert consolidate_query_matches(input_data, field_path) == [] diff --git a/tests/task/test_filter_element_match.py b/tests/task/test_filter_element_match.py new file mode 100644 index 000000000..bc6c2cd18 --- /dev/null +++ b/tests/task/test_filter_element_match.py @@ -0,0 +1,243 @@ +import copy + +import pytest + +from fidesops.graph.config import FieldPath +from fidesops.task.filter_element_match import ( + _expand_array_paths_to_preserve, + filter_element_match, + _remove_paths_from_row, +) + + +class TestFilterElementMatch: + def test_filter_element_match_no_paths(self): + row = {"A": "B", "C": "D"} + query_paths = {} + assert filter_element_match(copy.deepcopy(row), query_paths) == row + + def test_filter_element_match_no_record(self): + row = {} + query_paths = {FieldPath("A", "B", "C"): [1, 2]} + assert filter_element_match(row, query_paths) == {} + + def test_object_match_no_change(self): + row = {"A": "B", "C": {"D": {"E": "F", "G": "H"}}} + 
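# Both query paths match scalar values and no arrays are involved, so the row comes back unchanged +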
query_paths = {FieldPath("A"): ["B"], FieldPath("C", "D", "E"): ["F"]} + assert filter_element_match(row, query_paths) == row + + def test_array_match(self): + row = { + "A": ["b", "c", "d", "e"], + "C": {"D": {"E": ["g", "h", "i", "j"], "G": "H"}}, + "J": ["K", "L", "M"], + } + + query_paths = {FieldPath("A"): ["c", "d"], FieldPath("C", "D", "E"): ["h", "i"]} + assert filter_element_match(row, query_paths) == { + "A": ["c", "d"], + "C": {"D": {"E": ["h", "i"], "G": "H"}}, + "J": ["K", "L", "M"], + } + + def test_multiple_embedded_objects_match(self): + row = { + "A": ["b", "c", "d", "e"], + "C": {"D": {"E": ["g", "h", "i", "j"], "G": "H"}}, + "J": [ + {"K": 1, "J": 2}, + {"K": 3, "J": 4}, + {"K": 1, "J": 6}, + {"K": 2, "J": 4}, + ], + } + query_paths = {FieldPath("J", "K"): [2], FieldPath("J", "J"): [4]} + assert filter_element_match(row, query_paths) == { + "A": ["b", "c", "d", "e"], + "C": {"D": {"E": ["g", "h", "i", "j"], "G": "H"}}, + "J": [{"K": 3, "J": 4}, {"K": 2, "J": 4}], + } + + def test_filter_element_large_data(self, sample_data): + incoming_paths = { + FieldPath( + "F", + ): ["a"], + FieldPath("snacks"): ["pizza"], + FieldPath("thread", "comment"): ["com_0002"], + } + + filtered_record = filter_element_match(sample_data, incoming_paths) + assert filtered_record == { + "_id": 12345, + "thread": [ + { + "comment": "com_0002", + "message": "yep, got your message, looks like it works", + "chat_name": "Jane", + }, + {"comment": "com_0002", "message": "hello!", "chat_name": "Jeanne"}, + ], + "snacks": ["pizza"], + "seats": {"first_choice": "A2", "second_choice": "B3"}, + "upgrades": { + "magazines": ["Time", "People"], + "books": ["Once upon a Time", "SICP"], + "earplugs": True, + }, + "other_flights": [ + {"DFW": ["11 AM", "12 PM"], "CHO": ["12 PM", "1 PM"]}, + {"DFW": ["2 AM", "12 PM"], "CHO": ["2 PM", "1 PM"]}, + {"DFW": ["3 AM", "2 AM"], "CHO": ["2 PM", "1:30 PM"]}, + ], + "months": { + "july": [ + { + "activities": ["swimming", "hiking"], + "crops": ["watermelon", "cheese", "grapes"], + }, + {"activities": ["tubing"], "crops": ["corn"]}, + ], + "march": [ + { + "activities": ["skiing", "bobsledding"], + "crops": ["swiss chard", "swiss chard"], + }, + {"activities": ["hiking"], "crops": ["spinach"]}, + ], + }, + "hello": [1, 2, 3, 4, 2], + "weights": [[1, 2], [3, 4]], + "toppings": [[["pepperoni", "salami"], ["pepperoni", "cheese", "cheese"]]], + "A": {"C": [{"M": ["p", "n", "n"]}]}, + "C": [["A", "B", "C", "B"], ["G", "H", "B", "B"]], + "D": [ + [["A", "B", "C", "B"], ["G", "H", "B", "B"]], + [["A", "B", "C", "B"], ["G", "H", "B", "B"]], + ], + "E": [[["B"], [["A", "B", "C", "B"], ["G", "H", "B", "B"]]]], + "F": ["a", ["a", [["a", "a"]]]], + } + + +class TestRemovePathsFromRow: + """Test sub-method remove_paths_from_row""" + + @pytest.fixture(scope="function") + def row(self): + return { + "A": "a", + "B": [1, 2, 3, 4], + "C": {"D": [4, 5, 6], "E": [7, 8, 9]}, + "D": [{"F": "g", "J": "j"}, {"F": "h", "J": "k"}, {"F": "i"}], + "E": [[[1, 3, 2], [4, 5, 6]]], + "G": [{"H": [{"I": {"J": ["a", "b", "c", "D"]}}]}], + } + + def test_path_does_not_exist(self, row): + modified_row = _remove_paths_from_row(row, {"E.F.G.H.1": [2]}) + assert row == modified_row + + def test_edge_case_index_not_in_row(self, row): + """This shouldn't be hit, this is for completeness.""" + row = _remove_paths_from_row(row, {"B": [10]}) + assert row == row # No change + + def test_remove_top_level_array_indices(self, row): + row = _remove_paths_from_row(row, {"B": [1, 3]}) + assert row["B"] == [2, 4] + + def 
test_remove_index_from_nested_array(self, row): + row = _remove_paths_from_row(row, {"C.D": [1]}) + assert row["C"]["D"] == [5] + + def test_remove_nested_document(self, row): + row = _remove_paths_from_row(row, {"D": [2]}) + assert row["D"] == [{"F": "i"}] + + def test_remove_element_in_array_of_arrays(self, row): + row = _remove_paths_from_row(row, {"E.0.1": [2]}) + assert row["E"] == [[[1, 3, 2], [6]]] + + def test_deeply_nested_path(self, row): + row = _remove_paths_from_row(row, {"G.0.H.0.I.J": [0, 2]}) + assert row["G"] == [{"H": [{"I": {"J": ["a", "c"]}}]}] + + +class TestExpandArrayPathsToPreserve: + """Test sub-method of filter_element_match""" + + def test_no_array_paths(self): + """All of these paths are to nested fields in objects, no arrays are included here.""" + expanded_field_paths = [ + ["A", "B"], + ["A", "C"], + ["A", "D", "E", "F"], + ["A", "G", "E", "I"], + ] + + assert _expand_array_paths_to_preserve(expanded_field_paths) == {} + + def test_array_at_deepest_level(self): + expanded_field_paths = [["A", "B", 1], ["A", "B", 19]] + assert _expand_array_paths_to_preserve(expanded_field_paths) == {"A.B": [1, 19]} + + def test_array_of_objects(self): + """Removing all indices from resource["A"]["B"] except for 0 and 1""" + expanded_field_paths = [ + ["A", "B", 0, "C"], + ["A", "B", 0, "D"], + ["A", "B", 1, "C"], + ] + + assert _expand_array_paths_to_preserve(expanded_field_paths) == {"A.B": [0, 1]} + + def test_no_paths(self): + """No detailed paths given, so there are no array indices to preserve.""" + expanded_field_paths = [] + + assert _expand_array_paths_to_preserve(expanded_field_paths) == {} + + def test_multiple_levels_of_paths(self): + expanded_field_paths = [ + ["A", 1], + ["B", "C", 2], + ["B", "D", 3], + ["C", "D", "E", 5], + ["D", 1, "E", "F", "G"], + ["D", 2, "E", "F"], + ["E"], + ] + + assert _expand_array_paths_to_preserve(expanded_field_paths) == { + "A": [1], + "B.C": [2], + "B.D": [3], + "C.D.E": [5], + "D": [1, 2], + } + + def test_multiple_matching_embedded_objects(self): + expanded_field_paths = [["J", 3, "K"], ["J", 1, "J"], ["J", 3, "J"]] + assert _expand_array_paths_to_preserve(expanded_field_paths) == {"J": [3, 1]} + + def test_nested_arrays_of_arrays(self): + expanded_field_paths = [ + ["F", 0], + ["snacks", 0], + ["F", 1, 1], + ["thread", 1, "comment"], + ["thread", 2, "comment"], + ["F", 1, 2, 0, 1], + ["F", 1, 2, 0, 2], + ["Non", "integer"], + ] + + assert _expand_array_paths_to_preserve(expanded_field_paths) == { + "F": [0, 1], + "snacks": [0], + "F.1": [1, 2], + "thread": [1, 2], + "F.1.2": [0], + "F.1.2.0": [1, 2], + } diff --git a/tests/task/test_filter_results.py b/tests/task/test_filter_results.py new file mode 100644 index 000000000..324840d49 --- /dev/null +++ b/tests/task/test_filter_results.py @@ -0,0 +1,923 @@ +import copy +from datetime import datetime + +from bson import ObjectId + +from fidesops.graph.config import FieldPath, CollectionAddress +from fidesops.task.filter_results import ( + select_and_save_field, + remove_empty_containers, +) +from fidesops.task.graph_task import filter_data_categories + + +def test_select_and_save_field(): + final_results = {} + flat = { + "A": "a", + "B": "b", + "C": ["d", "e", "f"], + "D": ["g", "h", "i", "j"], + "E": { + "F": "g", + "H": "i", + "J": {"K": {"L": {"M": ["m", "n", "o"], "P": "p"}}, "N": {"O": "o"}}, + }, + "F": [{"G": "g", "H": "h"}, {"G": "h", "H": "i"}, {"G": "i", "H": "j"}], + "H": [ + [ + {"M": [1, 2, 3], "N": "n"}, + {"M": [3, 2, 1], "N": "o"}, + {"M": [1, 1, 1], "N": "p"}, + ],
[ + {"M": [4, 5, 6], "N": "q"}, + {"M": [2, 2, 2], "N": "s"}, + {"M": [], "N": "u"}, + ], + [ + {"M": [7, 8, 9], "N": "w"}, + {"M": [6, 6, 6], "N": "y"}, + {"M": [2], "N": "z"}, + ], + ], + "I": { + "X": [ + {"J": "j", "K": ["k"]}, + {"J": "m", "K": ["customer@example.com", "customer-1@example.com"]}, + ], + "Y": [{"J": "l", "K": ["n"]}, {"J": "m", "K": ["customer@example.com"]}], + "Z": [{"J": "m", "K": ["n"]}], + }, + "J": { + "K": { + "L": { + "M": { + "N": { + "O": ["customer@example.com", "customer@gmail.com"], + "P": ["customer@yahoo.com", "customer@customer.com"], + } + } + } + } + }, + "K": [{"L": "l", "M": "m"}, {"L": "n", "M": "o"}], + } + + # Test simple scalar field selected + assert select_and_save_field(final_results, flat, FieldPath("A")) == {"A": "a"} + # Test array field selected, and added to final results + assert select_and_save_field(final_results, flat, FieldPath("C")) == { + "A": "a", + "C": ["d", "e", "f"], + } + + # Test array field selected and added to results + assert select_and_save_field(final_results, flat, FieldPath("D")) == { + "A": "a", + "C": ["d", "e", "f"], + "D": ["g", "h", "i", "j"], + } + # Test nested field selected and added to final results + assert select_and_save_field(final_results, flat, FieldPath("E", "F")) == { + "A": "a", + "C": ["d", "e", "f"], + "D": ["g", "h", "i", "j"], + "E": {"F": "g"}, + } + # Test select field not in results - no error + assert select_and_save_field( + final_results, flat, FieldPath("E", "F", "Z", "X") + ) == { + "A": "a", + "C": ["d", "e", "f"], + "D": ["g", "h", "i", "j"], + "E": {"F": "g"}, + } + + # Test more deeply nested scalar from previous dict + assert select_and_save_field( + final_results, + flat, + FieldPath("E", "J", "K", "L", "M"), + ) == { + "A": "a", + "C": ["d", "e", "f"], + "D": ["g", "h", "i", "j"], + "E": {"F": "g", "J": {"K": {"L": {"M": ["m", "n", "o"]}}}}, + } + + # Test get matching dict key for each element in array + assert select_and_save_field(final_results, flat, FieldPath("F", "G")) == { + "A": "a", + "C": ["d", "e", "f"], + "D": ["g", "h", "i", "j"], + "E": {"F": "g", "J": {"K": {"L": {"M": ["m", "n", "o"]}}}}, + "F": [{"G": "g"}, {"G": "h"}, {"G": "i"}], + } + + # Test get nested fields inside nested arrays + assert select_and_save_field(final_results, flat, FieldPath("H", "N")) == { + "A": "a", + "C": ["d", "e", "f"], + "D": ["g", "h", "i", "j"], + "E": {"F": "g", "J": {"K": {"L": {"M": ["m", "n", "o"]}}}}, + "F": [{"G": "g"}, {"G": "h"}, {"G": "i"}], + "H": [ + [{"N": "n"}, {"N": "o"}, {"N": "p"}], + [{"N": "q"}, {"N": "s"}, {"N": "u"}], + [{"N": "w"}, {"N": "y"}, {"N": "z"}], + ], + } + + # Test get nested fields inside nested arrays + assert select_and_save_field(final_results, flat, FieldPath("H", "M")) == { + "A": "a", + "C": ["d", "e", "f"], + "D": ["g", "h", "i", "j"], + "E": {"F": "g", "J": {"K": {"L": {"M": ["m", "n", "o"]}}}}, + "F": [{"G": "g"}, {"G": "h"}, {"G": "i"}], + "H": [ + [ + {"N": "n", "M": [1, 2, 3]}, + {"N": "o", "M": [3, 2, 1]}, + {"N": "p", "M": [1, 1, 1]}, + ], + [ + {"N": "q", "M": [4, 5, 6]}, + {"N": "s", "M": [2, 2, 2]}, + {"N": "u", "M": []}, + ], + [ + {"N": "w", "M": [7, 8, 9]}, + {"N": "y", "M": [6, 6, 6]}, + {"N": "z", "M": [2]}, + ], + ], + } + + # Test get dict of array of dict fields + assert select_and_save_field(final_results, flat, FieldPath("I", "X", "J")) == { + "A": "a", + "C": ["d", "e", "f"], + "D": ["g", "h", "i", "j"], + "E": {"F": "g", "J": {"K": {"L": {"M": ["m", "n", "o"]}}}}, + "F": [{"G": "g"}, {"G": "h"}, {"G": "i"}], + "H": [ + [ 
+ {"N": "n", "M": [1, 2, 3]}, + {"N": "o", "M": [3, 2, 1]}, + {"N": "p", "M": [1, 1, 1]}, + ], + [ + {"N": "q", "M": [4, 5, 6]}, + {"N": "s", "M": [2, 2, 2]}, + {"N": "u", "M": []}, + ], + [ + {"N": "w", "M": [7, 8, 9]}, + {"N": "y", "M": [6, 6, 6]}, + {"N": "z", "M": [2]}, + ], + ], + "I": {"X": [{"J": "j"}, {"J": "m"}]}, + } + + # Test get deeply nested array field with only matching data, array in arrays + assert select_and_save_field(final_results, flat, FieldPath("I", "X", "K")) == { + "A": "a", + "C": ["d", "e", "f"], + "D": ["g", "h", "i", "j"], + "E": {"F": "g", "J": {"K": {"L": {"M": ["m", "n", "o"]}}}}, + "F": [{"G": "g"}, {"G": "h"}, {"G": "i"}], + "H": [ + [ + {"N": "n", "M": [1, 2, 3]}, + {"N": "o", "M": [3, 2, 1]}, + {"N": "p", "M": [1, 1, 1]}, + ], + [ + {"N": "q", "M": [4, 5, 6]}, + {"N": "s", "M": [2, 2, 2]}, + {"N": "u", "M": []}, + ], + [ + {"N": "w", "M": [7, 8, 9]}, + {"N": "y", "M": [6, 6, 6]}, + {"N": "z", "M": [2]}, + ], + ], + "I": { + "X": [ + {"J": "j", "K": ["k"]}, + {"J": "m", "K": ["customer@example.com", "customer-1@example.com"]}, + ] + }, + } + + # Get deeply nested array inside of dicts, with only matching data + assert select_and_save_field(final_results, flat, FieldPath("I", "Y", "K")) == { + "A": "a", + "C": ["d", "e", "f"], + "D": ["g", "h", "i", "j"], + "E": {"F": "g", "J": {"K": {"L": {"M": ["m", "n", "o"]}}}}, + "F": [{"G": "g"}, {"G": "h"}, {"G": "i"}], + "H": [ + [ + {"N": "n", "M": [1, 2, 3]}, + {"N": "o", "M": [3, 2, 1]}, + {"N": "p", "M": [1, 1, 1]}, + ], + [ + {"N": "q", "M": [4, 5, 6]}, + {"N": "s", "M": [2, 2, 2]}, + {"N": "u", "M": []}, + ], + [ + {"N": "w", "M": [7, 8, 9]}, + {"N": "y", "M": [6, 6, 6]}, + {"N": "z", "M": [2]}, + ], + ], + "I": { + "X": [ + {"J": "j", "K": ["k"]}, + {"J": "m", "K": ["customer@example.com", "customer-1@example.com"]}, + ], + "Y": [{"K": ["n"]}, {"K": ["customer@example.com"]}], + }, + } + + assert select_and_save_field( + final_results, + flat, + FieldPath("J", "K", "L", "M", "N", "O"), + ) == { + "A": "a", + "C": ["d", "e", "f"], + "D": ["g", "h", "i", "j"], + "E": {"F": "g", "J": {"K": {"L": {"M": ["m", "n", "o"]}}}}, + "F": [{"G": "g"}, {"G": "h"}, {"G": "i"}], + "H": [ + [ + {"N": "n", "M": [1, 2, 3]}, + {"N": "o", "M": [3, 2, 1]}, + {"N": "p", "M": [1, 1, 1]}, + ], + [ + {"N": "q", "M": [4, 5, 6]}, + {"N": "s", "M": [2, 2, 2]}, + {"N": "u", "M": []}, + ], + [ + {"N": "w", "M": [7, 8, 9]}, + {"N": "y", "M": [6, 6, 6]}, + {"N": "z", "M": [2]}, + ], + ], + "I": { + "X": [ + {"J": "j", "K": ["k"]}, + {"J": "m", "K": ["customer@example.com", "customer-1@example.com"]}, + ], + "Y": [{"K": ["n"]}, {"K": ["customer@example.com"]}], + }, + "J": { + "K": { + "L": {"M": {"N": {"O": ["customer@example.com", "customer@gmail.com"]}}} + } + }, + } + + # Test "only" param does not apply to regular scalar fields + assert select_and_save_field(final_results, flat, FieldPath("B")) == { + "A": "a", + "C": ["d", "e", "f"], + "D": ["g", "h", "i", "j"], + "E": {"F": "g", "J": {"K": {"L": {"M": ["m", "n", "o"]}}}}, + "F": [{"G": "g"}, {"G": "h"}, {"G": "i"}], + "H": [ + [ + {"N": "n", "M": [1, 2, 3]}, + {"N": "o", "M": [3, 2, 1]}, + {"N": "p", "M": [1, 1, 1]}, + ], + [ + {"N": "q", "M": [4, 5, 6]}, + {"N": "s", "M": [2, 2, 2]}, + {"N": "u", "M": []}, + ], + [ + {"N": "w", "M": [7, 8, 9]}, + {"N": "y", "M": [6, 6, 6]}, + {"N": "z", "M": [2]}, + ], + ], + "I": { + "X": [ + {"J": "j", "K": ["k"]}, + {"J": "m", "K": ["customer@example.com", "customer-1@example.com"]}, + ], + "Y": [{"K": ["n"]}, {"K": ["customer@example.com"]}], + 
+        },
+        "J": {
+            "K": {
+                "L": {"M": {"N": {"O": ["customer@example.com", "customer@gmail.com"]}}}
+            }
+        },
+        "B": "b",
+    }
+
+    assert select_and_save_field(final_results, flat, FieldPath("K", "L")) == {
+        "A": "a",
+        "C": ["d", "e", "f"],
+        "D": ["g", "h", "i", "j"],
+        "E": {"F": "g", "J": {"K": {"L": {"M": ["m", "n", "o"]}}}},
+        "F": [{"G": "g"}, {"G": "h"}, {"G": "i"}],
+        "H": [
+            [
+                {"N": "n", "M": [1, 2, 3]},
+                {"N": "o", "M": [3, 2, 1]},
+                {"N": "p", "M": [1, 1, 1]},
+            ],
+            [
+                {"N": "q", "M": [4, 5, 6]},
+                {"N": "s", "M": [2, 2, 2]},
+                {"N": "u", "M": []},
+            ],
+            [
+                {"N": "w", "M": [7, 8, 9]},
+                {"N": "y", "M": [6, 6, 6]},
+                {"N": "z", "M": [2]},
+            ],
+        ],
+        "I": {
+            "X": [
+                {"J": "j", "K": ["k"]},
+                {"J": "m", "K": ["customer@example.com", "customer-1@example.com"]},
+            ],
+            "Y": [{"K": ["n"]}, {"K": ["customer@example.com"]}],
+        },
+        "J": {
+            "K": {
+                "L": {"M": {"N": {"O": ["customer@example.com", "customer@gmail.com"]}}}
+            }
+        },
+        "B": "b",
+        "K": [{"L": "l"}, {"L": "n"}],
+    }
+
+
+def test_remove_empty_containers():
+    # No empty dicts or arrays
+    orig = {"A": {"B": {"C": 0}, "G": {"H": None}}, "I": 0, "J": False}
+    results = copy.deepcopy(orig)
+    remove_empty_containers(results)
+    assert results == orig
+
+    # Empty arrays
+    results = {"A": [], "B": [], "C": False}
+    remove_empty_containers(results)
+    assert results == {"C": False}
+
+    # Empty dicts
+    results = {"A": {}, "B": {}, "C": {}}
+    remove_empty_containers(results)
+    assert results == {}
+
+    # Empty array removed, which causes "C" key to be popped, which causes "B" to be popped
+    orig = {"A": {"B": {"C": []}, "G": {"H": None}}, "I": 0, "J": False}
+    results = copy.deepcopy(orig)
+    remove_empty_containers(results)
+    assert results == {"A": {"G": {"H": None}}, "I": 0, "J": False}
+
+    # Deeply nested empty array and deeply nested empty dict removed - G - H - I levels gone and A - B - C levels gone
+    orig = {"A": {"B": {"C": []}, "G": {"H": {"I": {}}}}, "J": 0}
+    results = copy.deepcopy(orig)
+    remove_empty_containers(results)
+    assert results == {"J": 0}
+
+    orig = {
+        "A": [{"B": "C", "D": {}}, {"B": "G", "D": {}}, {"B": "J", "D": {"J": "K"}}]
+    }
+    results = copy.deepcopy(orig)
+    remove_empty_containers(results)
+    assert results == {"A": [{"B": "C"}, {"B": "G"}, {"B": "J", "D": {"J": "K"}}]}
+
+    # Empty dict returns original empty dict
+    orig = {}
+    results = copy.deepcopy(orig)
+    remove_empty_containers(results)
+    assert results == {}
+
+    # Empty dict returned
+    orig = {"A": {}}
+    results = copy.deepcopy(orig)
+    remove_empty_containers(results)
+    assert results == {}
+
+    # Removing multiple levels of empty arrays and empty dicts
+    orig = {"A": [[{"B": "C", "D": [{"F": {}}, {"G": []}]}, {"B": "D"}, {"B": "G"}]]}
+    results = copy.deepcopy(orig)
+    remove_empty_containers(results)
+    assert results == {"A": [[{"B": "C"}, {"B": "D"}, {"B": "G"}]]}
+
+
+def test_filter_data_categories():
+    """Test different combinations of data categories to ensure the access_request_results are filtered properly"""
+    access_request_results = {
+        "postgres_example:supplies": [
+            {
+                "foods": {
+                    "vegetables": True,
+                    "fruits": {
+                        "apples": True,
+                        "oranges": False,
+                        "berries": {"strawberries": True, "blueberries": False},
+                    },
+                    "grains": {"rice": False, "wheat": True},
+                },
+                "clothing": True,
+            }
+        ]
+    }
+
+    data_category_fields = {
+        CollectionAddress("postgres_example", "supplies"): {
+            "A": [FieldPath("foods", "fruits", "apples"), FieldPath("clothing")],
+            "B": [FieldPath("foods", "vegetables")],
+            "C": [
+                FieldPath("foods", "grains", "rice"),
+                FieldPath("foods", "grains", "wheat"),
+            ],
+            "D": [],
+            "E": [
+                FieldPath("foods", "fruits", "berries", "strawberries"),
+                FieldPath("foods", "fruits", "oranges"),
+            ],
+        }
+    }
+
+    only_a_categories = filter_data_categories(
+        copy.deepcopy(access_request_results), {"A"}, data_category_fields
+    )
+
+    assert only_a_categories == {
+        "postgres_example:supplies": [
+            {"foods": {"fruits": {"apples": True}}, "clothing": True}
+        ]
+    }
+
+    only_b_categories = filter_data_categories(
+        copy.deepcopy(access_request_results), {"B"}, data_category_fields
+    )
+    assert only_b_categories == {
+        "postgres_example:supplies": [
+            {
+                "foods": {
+                    "vegetables": True,
+                }
+            }
+        ]
+    }
+
+    only_c_categories = filter_data_categories(
+        copy.deepcopy(access_request_results), {"C"}, data_category_fields
+    )
+    assert only_c_categories == {
+        "postgres_example:supplies": [
+            {"foods": {"grains": {"rice": False, "wheat": True}}}
+        ]
+    }
+
+    only_d_categories = filter_data_categories(
+        copy.deepcopy(access_request_results), {"D"}, data_category_fields
+    )
+    assert only_d_categories == {}
+
+    only_e_categories = filter_data_categories(
+        copy.deepcopy(access_request_results), {"E"}, data_category_fields
+    )
+    assert only_e_categories == {
+        "postgres_example:supplies": [
+            {
+                "foods": {
+                    "fruits": {
+                        "oranges": False,
+                        "berries": {"strawberries": True},
+                    }
+                }
+            }
+        ]
+    }
+
+
+def test_filter_data_categories_arrays():
+    access_request_results = {
+        "postgres_example:flights": [
+            {
+                "people": {
+                    "passenger_ids": [222, 445, 311, 4444],
+                    "pilot_ids": [123, 12, 112],
+                },
+                "flight_number": 101,
+            }
+        ]
+    }
+
+    data_category_fields = {
+        CollectionAddress("postgres_example", "flights"): {
+            "A": [FieldPath("people", "passenger_ids")],
+            "B": [FieldPath("people", "pilot_ids")],
+        }
+    }
+
+    only_a_category = filter_data_categories(
+        copy.deepcopy(access_request_results), {"A"}, data_category_fields
+    )
+
+    # Nested array field retrieved
+    assert only_a_category == {
+        "postgres_example:flights": [
+            {"people": {"passenger_ids": [222, 445, 311, 4444]}}
+        ]
+    }
+
+    only_b_category = filter_data_categories(
+        copy.deepcopy(access_request_results), {"B"}, data_category_fields
+    )
+    assert only_b_category == {
+        "postgres_example:flights": [{"people": {"pilot_ids": [123, 12, 112]}}]
+    }
+
+
+def test_filter_data_categories_limited_results():
+    """
+    Test scenario where the related data for a given identity is only a small subset of all the annotated fields.
+    """
+    jane_results = {
+        "mongo_test:customer_details": [
+            {
+                "_id": ObjectId("61f2bc8d6362fd78d72d8791"),
+                "customer_id": 3.0,
+                "gender": "female",
+                "birthday": datetime(1990, 2, 28, 0, 0),
+            }
+        ],
+        "postgres_example:order_item": [],
+        "postgres_example:report": [],
+        "postgres_example:orders": [
+            {"customer_id": 3, "id": "ord_ddd-eee", "shipping_address_id": 4}
+        ],
+        "postgres_example:employee": [],
+        "postgres_example:address": [
+            {
+                "city": "Example Mountain",
+                "house": 1111,
+                "id": 4,
+                "state": "TX",
+                "street": "Example Place",
+                "zip": "54321",
+            }
+        ],
+        "postgres_example:visit": [],
+        "postgres_example:product": [],
+        "postgres_example:customer": [
+            {
+                "address_id": 4,
+                "created": datetime(2020, 4, 1, 11, 47, 42),
+                "email": "jane@example.com",
+                "id": 3,
+                "name": "Jane Customer",
+            }
+        ],
+        "postgres_example:service_request": [],
+        "postgres_example:payment_card": [
+            {
+                "billing_address_id": 4,
+                "ccn": 373719391,
+                "code": 222,
+                "customer_id": 3,
+                "id": "pay_ccc-ccc",
+                "name": "Example Card 3",
+                "preferred": False,
+            }
+        ],
+        "mongo_test:customer_feedback": [],
+        "postgres_example:login": [
+            {"customer_id": 3, "id": 8, "time": datetime(2021, 1, 6, 1, 0)}
+        ],
+        "mongo_test:internal_customer_profile": [],
+    }
+
+    target_categories = {"user.provided.identifiable"}
+
+    data_category_fields = {
+        CollectionAddress.from_string("postgres_example:address"): {
+            "user.provided.identifiable.contact.city": [FieldPath("city")],
+            "user.provided.identifiable.contact.street": [
+                FieldPath("house"),
+                FieldPath("street"),
+            ],
+            "system.operations": [FieldPath("id")],
+            "user.provided.identifiable.contact.state": [FieldPath("state")],
+            "user.provided.identifiable.contact.postal_code": [FieldPath("zip")],
+        },
+        CollectionAddress.from_string("postgres_example:customer"): {
+            "system.operations": [FieldPath("address_id"), FieldPath("created")],
+            "user.provided.identifiable.contact.email": [FieldPath("email")],
+            "user.derived.identifiable.unique_id": [FieldPath("id")],
+            "user.provided.identifiable.name": [FieldPath("name")],
+        },
+        CollectionAddress.from_string("postgres_example:employee"): {
+            "system.operations": [FieldPath("address_id")],
+            "user.provided.identifiable.contact.email": [FieldPath("email")],
+            "user.derived.identifiable.unique_id": [FieldPath("id")],
+            "user.provided.identifiable.name": [FieldPath("name")],
+        },
+        CollectionAddress.from_string("postgres_example:login"): {
+            "user.derived.identifiable.unique_id": [FieldPath("customer_id")],
+            "system.operations": [FieldPath("id")],
+            "user.derived.nonidentifiable.sensor": [FieldPath("time")],
+        },
+        CollectionAddress.from_string("postgres_example:order_item"): {
+            "system.operations": [
+                FieldPath("order_id"),
+                FieldPath("product_id"),
+                FieldPath("quantity"),
+            ]
+        },
+        CollectionAddress.from_string("postgres_example:orders"): {
+            "user.derived.identifiable.unique_id": [FieldPath("customer_id")],
+            "system.operations": [
+                FieldPath("id"),
+                FieldPath("shipping_address_id"),
+            ],
+        },
+        CollectionAddress.from_string("postgres_example:payment_card"): {
+            "system.operations": [
+                FieldPath("billing_address_id"),
+                FieldPath("id"),
+            ],
+            "user.provided.identifiable.financial.account_number": [FieldPath("ccn")],
+            "user.provided.identifiable.financial": [
+                FieldPath("code"),
+                FieldPath("name"),
+            ],
+            "user.derived.identifiable.unique_id": [FieldPath("customer_id")],
+            "user.provided.nonidentifiable": [FieldPath("preferred")],
+        },
+        CollectionAddress.from_string("postgres_example:product"): {
+            "system.operations": [
+                FieldPath("id"),
+                FieldPath("name"),
+                FieldPath("price"),
+            ]
+        },
+        CollectionAddress.from_string("postgres_example:report"): {
+            "user.provided.identifiable.contact.email": [FieldPath("email")],
+            "system.operations": [
+                FieldPath("id"),
+                FieldPath("month"),
+                FieldPath("name"),
+                FieldPath("total_visits"),
+                FieldPath("year"),
+            ],
+        },
+        CollectionAddress.from_string("postgres_example:service_request"): {
+            "user.provided.identifiable.contact.email": [FieldPath("alt_email")],
+            "system.operations": [
+                FieldPath("closed"),
+                FieldPath("email"),
+                FieldPath("id"),
+                FieldPath("opened"),
+            ],
+            "user.derived.identifiable.unique_id": [FieldPath("employee_id")],
+        },
+        CollectionAddress.from_string("postgres_example:visit"): {
+            "user.provided.identifiable.contact.email": [FieldPath("email")],
+            "system.operations": [FieldPath("last_visit")],
+        },
+        CollectionAddress.from_string("mongo_test:customer_details"): {
+            "system.operations": [FieldPath("_id")],
+            "user.provided.identifiable.date_of_birth": [FieldPath("birthday")],
+            "user.derived.identifiable.unique_id": [FieldPath("customer_id")],
+            "user.provided.identifiable.gender": [FieldPath("gender")],
+            "user.provided.identifiable.job_title": [
+                FieldPath("workplace_info", "position")
+            ],
+        },
+        CollectionAddress.from_string("mongo_test:customer_feedback"): {
+            "system.operations": [FieldPath("_id")],
+            "user.provided.identifiable.contact.phone_number": [
+                FieldPath("customer_information", "phone")
+            ],
+            "user.provided.nonidentifiable": [
+                FieldPath("message"),
+                FieldPath("rating"),
+            ],
+        },
+        CollectionAddress.from_string("mongo_test:internal_customer_profile"): {
+            "user.derived": [FieldPath("derived_interests")]
+        },
+    }
+
+    filtered_results = filter_data_categories(
+        copy.deepcopy(jane_results), target_categories, data_category_fields
+    )
+
+    assert filtered_results == {
+        "mongo_test:customer_details": [
+            {"gender": "female", "birthday": datetime(1990, 2, 28)}
+        ],
+        "postgres_example:address": [
+            {
+                "city": "Example Mountain",
+                "house": 1111,
+                "state": "TX",
+                "street": "Example Place",
+                "zip": "54321",
+            }
+        ],
+        "postgres_example:customer": [
+            {"email": "jane@example.com", "name": "Jane Customer"}
+        ],
+        "postgres_example:payment_card": [
+            {"ccn": 373719391, "code": 222, "name": "Example Card 3"}
+        ],
+    }
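For readers skimming the patch, the pruning behavior these new filtering tests pin down can be pictured with a small standalone sketch. The helper below is invented for illustration only (the name `prune_to_paths` and its signature are not part of fidesops); it merely mimics the dict-path pruning that `filter_data_categories` performs for the field paths of the matched categories.

```python
import copy
from typing import Any, Dict, List, Tuple

# Hypothetical helper, for intuition only -- not the fidesops implementation.
def prune_to_paths(row: Dict[str, Any], paths: List[Tuple[str, ...]]) -> Dict[str, Any]:
    """Keep only the subtrees of `row` named by the given field paths."""
    out: Dict[str, Any] = {}
    for path in paths:
        src: Any = row
        dst = out
        for i, key in enumerate(path):
            if not isinstance(src, dict) or key not in src:
                break  # path not present in this row; nothing to keep
            if i == len(path) - 1:
                dst[key] = copy.deepcopy(src[key])  # leaf reached: copy subtree
            else:
                src = src[key]
                dst = dst.setdefault(key, {})
    return out

# Mirrors the spirit of test_filter_data_categories: only category "A" paths survive.
row = {"foods": {"vegetables": True, "fruits": {"apples": True}}, "clothing": True}
assert prune_to_paths(row, [("foods", "fruits", "apples"), ("clothing",)]) == {
    "foods": {"fruits": {"apples": True}},
    "clothing": True,
}
```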
diff --git a/tests/task/test_graph_task.py b/tests/task/test_graph_task.py
index f56692dec..c357d8e2f 100644
--- a/tests/task/test_graph_task.py
+++ b/tests/task/test_graph_task.py
@@ -1,26 +1,19 @@
-import copy
-from datetime import datetime
-from typing import Dict, Any
+import pytest
 
 import dask
 from bson import ObjectId
 
 from fidesops.graph.config import (
     CollectionAddress,
-    FieldPath,
 )
 from fidesops.graph.graph import DatasetGraph
-from fidesops.graph.traversal import Traversal, TraversalNode
+from fidesops.graph.traversal import Traversal
 from fidesops.models.connectionconfig import ConnectionConfig, ConnectionType
-from fidesops.models.datasetconfig import convert_dataset_to_graph
 from fidesops.models.policy import Policy
-from fidesops.schemas.dataset import FidesopsDataset
 from fidesops.task.graph_task import (
     collect_queries,
     TaskResources,
     EMPTY_REQUEST,
-    filter_data_categories,
-    GraphTask,
 )
 from .traversal_data import sample_traversal, combined_mongo_postgresql_graph
 from ..graph.graph_test_util import (
@@ -37,62 +30,198 @@
 ]
 
 
-def test_to_dask_input_data_scalar() -> None:
-    t = sample_traversal()
-    n = t.traversal_node_dict[CollectionAddress("mysql", "Address")]
+class TestToDaskInput:
+    @pytest.fixture(scope="function")
+    def combined_traversal_node_dict(
+        self, integration_mongodb_config, integration_postgres_config
+    ):
+        mongo_dataset, postgres_dataset = combined_mongo_postgresql_graph(
+            integration_postgres_config, integration_mongodb_config
+        )
+        graph = DatasetGraph(mongo_dataset, postgres_dataset)
+        identity = {"email": "customer-1@example.com"}
+        combined_traversal = Traversal(graph, identity)
+        return combined_traversal.traversal_node_dict
+
+    @pytest.fixture(scope="function")
+    def make_graph_task(self, integration_mongodb_config, integration_postgres_config):
+        def task(node):
+            return MockMongoTask(
+                node,
+                TaskResources(
+                    EMPTY_REQUEST,
+                    Policy(),
+                    [integration_postgres_config, integration_mongodb_config],
+                ),
+            )
 
-    task = MockSqlTask(n, TaskResources(EMPTY_REQUEST, Policy(), connection_configs))
-    customers_data = [
-        {"contact_address_id": 31, "foo": "X"},
-        {"contact_address_id": 32, "foo": "Y"},
-    ]
-    orders_data = [
-        {"billing_address_id": 1, "shipping_address_id": 2},
-        {"billing_address_id": 11, "shipping_address_id": 22},
-    ]
-    v = task.to_dask_input_data(customers_data, orders_data)
-    assert set(v["id"]) == {31, 32, 1, 2, 11, 22}
+        return task
 
+    def test_to_dask_input_data_scalar(self) -> None:
+        t = sample_traversal()
+        n = t.traversal_node_dict[CollectionAddress("mysql", "Address")]
 
-def test_to_dask_input_data_nested(
-    integration_postgres_config, integration_mongodb_config
-):
+        task = MockSqlTask(
+            n, TaskResources(EMPTY_REQUEST, Policy(), connection_configs)
+        )
+        customers_data = [
+            {"contact_address_id": 31, "foo": "X"},
+            {"contact_address_id": 32, "foo": "Y"},
+        ]
+        orders_data = [
+            {"billing_address_id": 1, "shipping_address_id": 2},
+            {"billing_address_id": 11, "shipping_address_id": 22},
+        ]
+        v = task.to_dask_input_data(customers_data, orders_data)
+        assert set(v["id"]) == {31, 32, 1, 2, 11, 22}
+
+    def test_to_dask_input_nested_identity(
+        self, combined_traversal_node_dict, make_graph_task
+    ):
+        """
+        Identity data used to locate a record on a nested email field
+
+        Customer feedback node has one input:
+        ROOT.email -> customer_feedback.customer_information.email
+        """
+        node = combined_traversal_node_dict[
+            CollectionAddress("mongo_test", "customer_feedback")
+        ]
+        root_email_input = [{"email": "customer-1@example.com"}]
+        assert make_graph_task(node).to_dask_input_data(root_email_input) == {
+            "customer_information.email": ["customer-1@example.com"],
+        }
 
-    mongo_dataset, postgres_dataset = combined_mongo_postgresql_graph(
-        integration_postgres_config, integration_mongodb_config
-    )
-    graph = DatasetGraph(mongo_dataset, postgres_dataset)
-    identity = {"email": "customer-1@example.com"}
-    combined_traversal = Traversal(graph, identity)
-    n = combined_traversal.traversal_node_dict[
-        CollectionAddress("mongo_test", "internal_customer_profile")
-    ]
-
-    customer_feedback_data = [
-        {
-            "_id": ObjectId("61eb388ecfb4a3721238a39b"),
-            "customer_information": {
-                "email": "customer-1@example.com",
-                "phone": "333-333-3333",
-                "internal_customer_id": "cust_001",
+    def test_to_dask_input_customer_feedback_collection(
+        self, combined_traversal_node_dict, make_graph_task
+    ):
+        """
+        Nested internal_customer_id used to locate record matching nested internal_id
+
+        Internal customer profile node has two inputs:
+        ROOT.email -> internal_customer_profile.customer_identifiers.derived_emails (string[])
+        customer_feedback.customer_information.internal_customer_id -> internal_customer_profile.customer_identifiers.internal_id
+        """
+        node = combined_traversal_node_dict[
+            CollectionAddress("mongo_test", "internal_customer_profile")
+        ]
+        internal_customer_profile_task = make_graph_task(node)
+        root_email_input = [{"email": "customer-1@example.com"}]
+        customer_feedback_input = [
+            {
+                "_id": ObjectId("61eb388ecfb4a3721238a39b"),
+                "customer_information": {
+                    "email": "customer-1@example.com",
+                    "phone": "333-333-3333",
+                    "internal_customer_id": "cust_001",
+                },
+            }
+        ]
+
+        assert internal_customer_profile_task.to_dask_input_data(
+            root_email_input, customer_feedback_input
+        ) == {
+            "customer_identifiers.derived_emails": ["customer-1@example.com"],
+            "customer_identifiers.internal_id": ["cust_001"],
+        }
+
+    def test_to_dask_input_flights_collection(
+        self, make_graph_task, combined_traversal_node_dict
+    ):
+        """
+        Array of strings used to locate record with matching value in nested array of strings
+
+        Flights node has one input:
+        mongo_test.customer_details.travel_identifiers -> mongo_test.flights.passenger_information.passenger_ids
+        """
+        node = combined_traversal_node_dict[CollectionAddress("mongo_test", "flights")]
+        task = make_graph_task(node)
+        truncated_customer_details_output = [
+            {
+                "_id": ObjectId("61f422e0ddc2559e0c300e95"),
+                "travel_identifiers": ["A111-11111", "B111-11111"],
+            },
+            {
+                "_id": ObjectId("61f422e0ddc2559e0c300e95"),
+                "travel_identifiers": ["C111-11111"],
             },
-            "rating": 3.0,
-            "date": datetime(2022, 1, 5, 0, 0),
-            "message": "Product was cracked!",
+        ]
+        assert task.to_dask_input_data(truncated_customer_details_output) == {
+            "passenger_information.passenger_ids": [
+                "A111-11111",
+                "B111-11111",
+                "C111-11111",
+            ],
+        }
+
+    def test_to_dask_input_aircraft_collection(
+        self, make_graph_task, combined_traversal_node_dict
+    ):
+        """
+        Integer used to locate record with matching value in array of integers
+
+        Aircraft node has one input:
+        mongo_test:flights.plane -> mongo_test:aircraft.planes
+        """
+        node = combined_traversal_node_dict[CollectionAddress("mongo_test", "aircraft")]
+        task = make_graph_task(node)
+        truncated_flights_output = [
+            {"pilots": ["1", "2"], "plane": 10002.0},
+            {"pilots": ["3", "4"], "plane": 101010},
+        ]
+        assert task.to_dask_input_data(truncated_flights_output) == {
+            "planes": [10002, 101010],
         }
-    ]
-    task = MockMongoTask(
-        n,
-        TaskResources(
-            EMPTY_REQUEST,
-            Policy(),
-            [integration_postgres_config, integration_mongodb_config],
-        ),
-    )
-    dask_input_data = task.to_dask_input_data(customer_feedback_data)
-    # Output of function returns nested keys as dot-separated where applicable.
-    assert dask_input_data == {"customer_identifiers.internal_id": ["cust_001"]}
+
+    def test_to_dask_input_employee_collection(
+        self, make_graph_task, combined_traversal_node_dict
+    ):
+        """
+        Array of string ids used to locate records with a matching string id
+
+        Mongo employee node has two inputs:
+        root.email -> mongo_test.employee.email
+        mongo_test.flights.pilots -> mongo_test.employee.id
+        """
+        node = combined_traversal_node_dict[CollectionAddress("mongo_test", "employee")]
+        task = make_graph_task(node)
+        root_email_input = [{"email": "customer-1@example.com"}]
+        truncated_flights_output = [
+            {"pilots": ["1", "2"], "plane": 10002.0},
+            {"pilots": ["3", "4"], "plane": 101010},
+        ]
+        assert task.to_dask_input_data(root_email_input, truncated_flights_output) == {
+            "id": ["1", "2", "3", "4"],
+            "email": ["customer-1@example.com"],
+        }
+
+    def test_to_dask_input_conversation_collection(
+        self, make_graph_task, combined_traversal_node_dict
+    ):
+        """
+        Strings drawn from an array of embedded objects used to locate records
+        within another array of embedded objects
+
+        Mongo conversation node has one input:
+        mongo_test:customer_details.comments.comment_id -> mongo_test:conversations.thread.comment
+        """
+        node = combined_traversal_node_dict[
+            CollectionAddress("mongo_test", "conversations")
+        ]
+        task = make_graph_task(node)
+        truncated_customer_details_output = [
+            {
+                "comments": [
+                    {"comment_id": "com_0001"},
+                    {"comment_id": "com_0003"},
+                    {"comment_id": "com_0005"},
+                ]
+            },
+            {"comments": [{"comment_id": "com_0007"}]},
+        ]
+
+        assert task.to_dask_input_data(truncated_customer_details_output) == {
+            "thread.comment": ["com_0001", "com_0003", "com_0005", "com_0007"],
+        }
 
 
 def test_sql_dry_run_queries() -> None:
@@ -160,462 +289,3 @@ def test_mongo_dry_run_queries() -> None:
         env[CollectionAddress("postgres", "address")]
         == "db.postgres.address.find({'id': {'$in': [?, ?]}}, {'id': 1, 'street': 1, 'city': 1, 'state': 1, 'zip': 1})"
     )
-
-
-def test_filter_data_categories():
-    """Test different combinations of data categories to ensure the access_request_results are filtered properly"""
-    access_request_results = {
-        "postgres_example:supplies": [
-            {
-                "foods": {
-                    "vegetables": True,
-                    "fruits": {
-                        "apples": True,
-                        "oranges": False,
-                        "berries": {"strawberries": True, "blueberries": False},
-                    },
-                    "grains": {"rice": False, "wheat": True},
-                },
-                "clothing": True,
-            }
-        ]
-    }
-
-    data_category_fields = {
-        CollectionAddress("postgres_example", "supplies"): {
-            "A": [FieldPath("foods", "fruits", "apples"), FieldPath("clothing")],
-            "B": [FieldPath("foods", "vegetables")],
-            "C": [
-                FieldPath("foods", "grains", "rice"),
-                FieldPath("foods", "grains", "wheat"),
-            ],
-            "D": [],
-            "E": [
-                FieldPath("foods", "fruits", "berries", "strawberries"),
-                FieldPath("foods", "fruits", "oranges"),
-            ],
-        }
-    }
-
-    only_a_categories = filter_data_categories(
-        copy.deepcopy(access_request_results), {"A"}, data_category_fields
-    )
-
-    assert only_a_categories == {
-        "postgres_example:supplies": [
-            {"foods": {"fruits": {"apples": True}}, "clothing": True}
-        ]
-    }
-
-    only_b_categories = filter_data_categories(
-        copy.deepcopy(access_request_results), {"B"}, data_category_fields
-    )
-    assert only_b_categories == {
-        "postgres_example:supplies": [
-            {
-                "foods": {
-                    "vegetables": True,
-                }
-            }
-        ]
-    }
-
-    only_c_categories = filter_data_categories(
-        copy.deepcopy(access_request_results), {"C"}, data_category_fields
-    )
-    assert only_c_categories == {
-        "postgres_example:supplies": [
-            {"foods": {"grains": {"rice": False, "wheat": True}}}
-        ]
-    }
-
-    only_d_categories = filter_data_categories(
-        copy.deepcopy(access_request_results), {"D"}, data_category_fields
-    )
-    assert only_d_categories == {}
-
-    only_e_categories = filter_data_categories(
-        copy.deepcopy(access_request_results), {"E"}, data_category_fields
-    )
-    assert only_e_categories == {
-        "postgres_example:supplies": [
-            {
-                "foods": {
-                    "fruits": {
-                        "oranges": False,
-                        "berries": {"strawberries": True},
-                    }
-                }
-            }
-        ]
-    }
-
-
-def test_filter_data_categories_limited_results():
-    """
-    Test scenario where the related data for a given identity is only a small subset of all the annotated fields.
-    """
-    jane_results = {
-        "mongo_test:customer_details": [
-            {
-                "_id": ObjectId("61f2bc8d6362fd78d72d8791"),
-                "customer_id": 3.0,
-                "gender": "female",
-                "birthday": datetime(1990, 2, 28, 0, 0),
-            }
-        ],
-        "postgres_example:order_item": [],
-        "postgres_example:report": [],
-        "postgres_example:orders": [
-            {"customer_id": 3, "id": "ord_ddd-eee", "shipping_address_id": 4}
-        ],
-        "postgres_example:employee": [],
-        "postgres_example:address": [
-            {
-                "city": "Example Mountain",
-                "house": 1111,
-                "id": 4,
-                "state": "TX",
-                "street": "Example Place",
-                "zip": "54321",
-            }
-        ],
-        "postgres_example:visit": [],
-        "postgres_example:product": [],
-        "postgres_example:customer": [
-            {
-                "address_id": 4,
-                "created": datetime(2020, 4, 1, 11, 47, 42),
-                "email": "jane@example.com",
-                "id": 3,
-                "name": "Jane Customer",
-            }
-        ],
-        "postgres_example:service_request": [],
-        "postgres_example:payment_card": [
-            {
-                "billing_address_id": 4,
-                "ccn": 373719391,
-                "code": 222,
-                "customer_id": 3,
-                "id": "pay_ccc-ccc",
-                "name": "Example Card 3",
-                "preferred": False,
-            }
-        ],
-        "mongo_test:customer_feedback": [],
-        "postgres_example:login": [
-            {"customer_id": 3, "id": 8, "time": datetime(2021, 1, 6, 1, 0)}
-        ],
-        "mongo_test:internal_customer_profile": [],
-    }
-
-    target_categories = {"user.provided.identifiable"}
-
-    data_category_fields = {
-        CollectionAddress.from_string("postgres_example:address"): {
-            "user.provided.identifiable.contact.city": [FieldPath("city")],
-            "user.provided.identifiable.contact.street": [
-                FieldPath("house"),
-                FieldPath("street"),
-            ],
-            "system.operations": [FieldPath("id")],
-            "user.provided.identifiable.contact.state": [FieldPath("state")],
-            "user.provided.identifiable.contact.postal_code": [FieldPath("zip")],
-        },
-        CollectionAddress.from_string("postgres_example:customer"): {
-            "system.operations": [FieldPath("address_id"), FieldPath("created")],
-            "user.provided.identifiable.contact.email": [FieldPath("email")],
-            "user.derived.identifiable.unique_id": [FieldPath("id")],
-            "user.provided.identifiable.name": [FieldPath("name")],
-        },
-        CollectionAddress.from_string("postgres_example:employee"): {
-            "system.operations": [FieldPath("address_id")],
-            "user.provided.identifiable.contact.email": [FieldPath("email")],
-            "user.derived.identifiable.unique_id": [FieldPath("id")],
-            "user.provided.identifiable.name": [FieldPath("name")],
-        },
-        CollectionAddress.from_string("postgres_example:login"): {
-            "user.derived.identifiable.unique_id": [FieldPath("customer_id")],
-            "system.operations": [FieldPath("id")],
-            "user.derived.nonidentifiable.sensor": [FieldPath("time")],
-        },
-        CollectionAddress.from_string("postgres_example:order_item"): {
-            "system.operations": [
-                FieldPath("order_id"),
-                FieldPath("product_id"),
-                FieldPath("quantity"),
-            ]
-        },
-        CollectionAddress.from_string("postgres_example:orders"): {
-            "user.derived.identifiable.unique_id": [FieldPath("customer_id")],
-            "system.operations": [
-                FieldPath("id"),
-                FieldPath("shipping_address_id"),
-            ],
-        },
-        CollectionAddress.from_string("postgres_example:payment_card"): {
-            "system.operations": [
-                FieldPath("billing_address_id"),
-                FieldPath("id"),
-            ],
-            "user.provided.identifiable.financial.account_number": [FieldPath("ccn")],
-            "user.provided.identifiable.financial": [
-                FieldPath("code"),
-                FieldPath("name"),
-            ],
-            "user.derived.identifiable.unique_id": [FieldPath("customer_id")],
-            "user.provided.nonidentifiable": [FieldPath("preferred")],
-        },
-        CollectionAddress.from_string("postgres_example:product"): {
-            "system.operations": [
-                FieldPath("id"),
-                FieldPath("name"),
-                FieldPath("price"),
-            ]
-        },
-        CollectionAddress.from_string("postgres_example:report"): {
-            "user.provided.identifiable.contact.email": [FieldPath("email")],
-            "system.operations": [
-                FieldPath("id"),
-                FieldPath("month"),
-                FieldPath("name"),
-                FieldPath("total_visits"),
-                FieldPath("year"),
-            ],
-        },
-        CollectionAddress.from_string("postgres_example:service_request"): {
-            "user.provided.identifiable.contact.email": [FieldPath("alt_email")],
-            "system.operations": [
-                FieldPath("closed"),
-                FieldPath("email"),
-                FieldPath("id"),
-                FieldPath("opened"),
-            ],
-            "user.derived.identifiable.unique_id": [FieldPath("employee_id")],
-        },
-        CollectionAddress.from_string("postgres_example:visit"): {
-            "user.provided.identifiable.contact.email": [FieldPath("email")],
-            "system.operations": [FieldPath("last_visit")],
-        },
-        CollectionAddress.from_string("mongo_test:customer_details"): {
-            "system.operations": [FieldPath("_id")],
-            "user.provided.identifiable.date_of_birth": [FieldPath("birthday")],
-            "user.derived.identifiable.unique_id": [FieldPath("customer_id")],
-            "user.provided.identifiable.gender": [FieldPath("gender")],
-            "user.provided.identifiable.job_title": [
-                FieldPath("workplace_info", "position")
-            ],
-        },
-        CollectionAddress.from_string("mongo_test:customer_feedback"): {
-            "system.operations": [FieldPath("_id")],
-            "user.provided.identifiable.contact.phone_number": [
-                FieldPath("customer_information", "phone")
-            ],
-            "user.provided.nonidentifiable": [
-                FieldPath("message"),
-                FieldPath("rating"),
-            ],
-        },
-        CollectionAddress.from_string("mongo_test:internal_customer_profile"): {
-            "user.derived": [FieldPath("derived_interests")]
-        },
-    }
-
-    filtered_results = filter_data_categories(
-        copy.deepcopy(jane_results), target_categories, data_category_fields
-    )
-
-    assert filtered_results == {
-        "mongo_test:customer_details": [
-            {"gender": "female", "birthday": datetime(1990, 2, 28)}
-        ],
-        "postgres_example:address": [
-            {
-                "city": "Example Mountain",
-                "house": 1111,
-                "state": "TX",
-                "street": "Example Place",
-                "zip": "54321",
-            }
-        ],
-        "postgres_example:customer": [
-            {"email": "jane@example.com", "name": "Jane Customer"}
-        ],
-        "postgres_example:payment_card": [
-            {"ccn": 373719391, "code": 222, "name": "Example Card 3"}
-        ],
-    }
["snacks", 0] + + result = refine_target_path(sample_data, ["thread", "comment"], ["com_0002"]) + assert result == [["thread", 1, "comment"], ["thread", 2, "comment"]] + + result = refine_target_path(sample_data, ["seats", "first_choice"], ["A2"]) + assert result == ["seats", "first_choice"] + + result = refine_target_path(sample_data, ["upgrades", "books"], ["SICP"]) + assert result == ["upgrades", "books", 1] + + result = refine_target_path(sample_data, ["other_flights", "CHO"], ["1 PM"]) + assert result == [ + ["other_flights", 0, "CHO", 1], + ["other_flights", 1, "CHO", 1], + ] + + result = refine_target_path(sample_data, ["bad_path"], ["bad match"]) + assert result == [] + + result = refine_target_path(sample_data, ["hello"], only=[2]) + assert result == [["hello", 1], ["hello", 4]] + + result = refine_target_path( + sample_data, ["months", "july", "crops"], ["watermelon", "grapes"] + ) + assert result == [ + ["months", "july", 0, "crops", 0], + ["months", "july", 0, "crops", 2], + ] + + result = refine_target_path(sample_data, ["weights"], [4]) + assert result == ["weights", 1, 1] + + result = refine_target_path(sample_data, ["toppings"], ["cheese"]) + assert result == [["toppings", 0, 1, 1], ["toppings", 0, 1, 2]] + + result = refine_target_path(sample_data, ["A", "C", "M"], ["n"]) + assert result == [["A", "C", 0, "M", 1], ["A", "C", 0, "M", 2]] + + result = refine_target_path(sample_data, [], ["pizza"]) + assert result == [] + + result = refine_target_path(sample_data, ["C"], ["B"]) + assert result == [["C", 0, 1], ["C", 0, 3], ["C", 1, 2], ["C", 1, 3]] + + result = refine_target_path(sample_data, ["D"], ["B"]) + assert result == [ + ["D", 0, 0, 1], + ["D", 0, 0, 3], + ["D", 0, 1, 2], + ["D", 0, 1, 3], + ["D", 1, 0, 1], + ["D", 1, 0, 3], + ["D", 1, 1, 2], + ["D", 1, 1, 3], + ] + + result = refine_target_path(sample_data, ["E"], ["B"]) + assert result == [ + ["E", 0, 0, 0], + ["E", 0, 1, 0, 1], + ["E", 0, 1, 0, 3], + ["E", 0, 1, 1, 2], + ["E", 0, 1, 1, 3], + ] + + result = refine_target_path(sample_data, ["F"], ["a"]) + assert result == [["F", 0], ["F", 1, 1], ["F", 1, 2, 0, 1], ["F", 1, 2, 0, 2]] + + +class TestBuildIncomingRefinedTargetPaths: + def test_build_refined_paths_bad_path(self): + row = {"A": [1, 2, 3], "B": "C"} + result = build_incoming_refined_target_paths( + row, {FieldPath("A", "B", "C"): ["F"]} + ) + assert result == [] + + def test_one_match_makes_list_of_lists(self): + row = {"A": [1, 2, 3], "B": "C"} + result = build_incoming_refined_target_paths(row, {FieldPath("A"): [1]}) + assert result == [["A", 0]] + + def test_two_matches_makes_list_of_lists(self): + row = {"A": [1, 2, 3], "B": "C"} + result = build_incoming_refined_target_paths(row, {FieldPath("A"): [1, 3]}) + assert result == [["A", 0], ["A", 2]] + + def test_list_of_list_of_lists(self): + row = {"A": [[[1, 2, 3]], [[4, 5, 6]]], "B": "C"} + result = build_incoming_refined_target_paths(row, {FieldPath("A"): [1, 3]}) + assert result == [["A", 0, 0, 0], ["A", 0, 0, 2]] + + def test_build_incoming_refined_path_multiple_matches_in_array(self): + row = { + "A": ["b", "c", "d", "e"], + "C": {"D": {"E": ["g", "h", "i", "j"], "G": "H"}}, + "J": [ + {"K": 1, "J": 2}, + {"K": 3, "J": 4}, + {"K": 1, "J": 6}, + {"K": 2, "J": 4}, + ], + } + + result = build_incoming_refined_target_paths( + row, {FieldPath("J", "K"): [2], FieldPath("J", "J"): [4]} + ) + assert result == [["J", 3, "K"], ["J", 1, "J"], ["J", 3, "J"]] + + def test_build_incoming_refined_target_paths_large_data(self, sample_data): + incoming_paths = { + 
diff --git a/tests/task/traversal_data.py b/tests/task/traversal_data.py
index 3a1447dd4..65d72a9ff 100644
--- a/tests/task/traversal_data.py
+++ b/tests/task/traversal_data.py
@@ -5,9 +5,17 @@
     ScalarField,
     FieldAddress,
     CollectionAddress,
-    Dataset, ObjectField,
+    Dataset,
+    ObjectField,
+)
+from fidesops.graph.data_type import (
+    DataType,
+    IntTypeConverter,
+    StringTypeConverter,
+    ObjectIdTypeConverter,
+    ObjectTypeConverter,
+    NoOpTypeConverter,
 )
-from fidesops.graph.data_type import DataType
 from fidesops.graph.graph import DatasetGraph
 from fidesops.graph.traversal import Traversal
 from fidesops.models.connectionconfig import ConnectionConfig
@@ -74,56 +82,355 @@ def combined_mongo_postgresql_graph(
         ],
     )
 
-    mongo_customer_details = Collection(
+    aircraft = Collection(
+        name="aircraft",
+        fields=[
+            ScalarField(
+                name="_id",
+                data_type_converter=ObjectIdTypeConverter(),
+                is_array=False,
+                primary_key=True,
+            ),
+            ScalarField(
+                name="model",
+                data_type_converter=StringTypeConverter(),
+                is_array=False,
+            ),
+            ScalarField(
+                name="planes",
+                data_type_converter=IntTypeConverter(),
+                is_array=True,
+                references=[(FieldAddress("mongo_test", "flights", "plane"), "from")],
+            ),
+        ],
+        after=set(),
+    )
+
+    conversations = Collection(
+        name="conversations",
+        fields=[
+            ObjectField(
+                name="thread",
+                data_type_converter=ObjectTypeConverter(),
+                is_array=False,
+                fields={
+                    "comment": ScalarField(
+                        name="comment",
+                        data_type_converter=StringTypeConverter(),
+                        is_array=False,
+                    ),
+                    "message": ScalarField(
+                        name="message",
+                        data_type_converter=StringTypeConverter(),
+                        is_array=False,
+                    ),
+                    "chat_name": ScalarField(
+                        name="chat_name",
+                        data_type_converter=StringTypeConverter(),
+                        is_array=False,
+                    ),
+                },
+            )
+        ],
+        after=set(),
+    )
+
+    customer_details = Collection(
         name="customer_details",
         fields=[
-            ScalarField(name="_id", primary_key=True),
+            ScalarField(
+                name="_id",
+                data_type_converter=NoOpTypeConverter(),
+                is_array=False,
+                primary_key=True,
+            ),
+            ScalarField(
+                name="birthday",
+                data_type_converter=StringTypeConverter(),
+                is_array=False,
+            ),
+            ScalarField(
+                name="children",
+                data_type_converter=StringTypeConverter(),
+                is_array=True,
+            ),
+            ObjectField(
+                name="comments",
+                data_type_converter=ObjectTypeConverter(),
+                is_array=True,
+                fields={
+                    "comment_id": ScalarField(
+                        name="comment_id",
+                        data_type_converter=StringTypeConverter(),
+                        is_array=False,
+                        references=[
+                            (
+                                FieldAddress(
+                                    "mongo_test", "conversations", "thread", "comment"
+                                ),
+                                "to",
+                            )
+                        ],
+                    )
+                },
+            ),
             ScalarField(
                 name="customer_id",
+                data_type_converter=NoOpTypeConverter(),
+                is_array=False,
                 references=[
-                    (FieldAddress("postgres_example", "customer", "id"), "from")
+                    (
+                        FieldAddress("postgres_example", "customer", "id"),
+                        "from",
+                    )
                 ],
             ),
-            ScalarField(name="gender", data_type_converter=str_converter),
-            ScalarField(name="birthday", data_type_converter=str_converter),
-            ObjectField(name="workplace_info", data_type_converter=obj_converter, fields={
-                "employer": ScalarField(name="employer", data_type_converter=str_converter),
-                "position": ScalarField(name="position", data_type_converter=str_converter)
-            }),
-        ]
+            ObjectField(
+                name="emergency_contacts",
+                data_type_converter=ObjectTypeConverter(),
+                is_array=True,
+                fields={
+                    "name": ScalarField(
+                        name="name",
+                        data_type_converter=StringTypeConverter(),
+                        is_array=False,
+                    ),
+                    "relationship": ScalarField(
+                        name="relationship",
+                        data_type_converter=StringTypeConverter(),
+                        is_array=False,
+                    ),
+                    "phone": ScalarField(
+                        name="phone",
+                        data_type_converter=StringTypeConverter(),
+                        is_array=False,
+                    ),
+                },
+            ),
+            ScalarField(
+                name="gender",
+                data_type_converter=StringTypeConverter(),
+                is_array=False,
+            ),
+            ScalarField(
+                name="travel_identifiers",
+                data_type_converter=StringTypeConverter(),
+                is_array=True,
+            ),
+            ObjectField(
+                name="workplace_info",
+                data_type_converter=ObjectTypeConverter(),
+                is_array=False,
+                fields={
+                    "employer": ScalarField(
+                        name="employer",
+                        data_type_converter=StringTypeConverter(),
+                        is_array=False,
+                    ),
+                    "position": ScalarField(
+                        name="position",
+                        data_type_converter=StringTypeConverter(),
+                        is_array=False,
+                    ),
+                    "direct_reports": ScalarField(
+                        name="direct_reports",
+                        data_type_converter=StringTypeConverter(),
+                        is_array=True,
+                    ),
+                },
+            ),
+        ],
+        after=set(),
     )
-
-    mongo_customer_feedback = Collection(
+    customer_feedback = Collection(
         name="customer_feedback",
         fields=[
-            ScalarField(name="_id", primary_key=True),
-            ObjectField(name="customer_information", data_type_converter=obj_converter, fields={
-                "email": ScalarField(name="email", data_type_converter=str_converter, identity="email"),
-                "phone": ScalarField(name="phone", data_type_converter=str_converter),
-                "internal_customer_id": ScalarField(name="internal_customer_id", data_type_converter=str_converter)
-            }),
-            ScalarField(name="rating", data_type_converter=int_converter),
-            ScalarField(name="date", data_type_converter=str_converter),
-            ScalarField(name="message", data_type_converter=str_converter),
-        ]
+            ScalarField(
+                name="_id",
+                data_type_converter=ObjectIdTypeConverter(),
+                is_array=False,
+                primary_key=True,
+            ),
+            ObjectField(
+                name="customer_information",
+                data_type_converter=ObjectTypeConverter(),
+                is_array=False,
+                fields={
+                    "email": ScalarField(
+                        name="email",
+                        data_type_converter=StringTypeConverter(),
+                        is_array=False,
+                        identity="email",
+                    ),
+                    "phone": ScalarField(
+                        name="phone",
+                        data_type_converter=StringTypeConverter(),
+                        is_array=False,
+                    ),
+                    "internal_customer_id": ScalarField(
+                        name="internal_customer_id",
+                        data_type_converter=StringTypeConverter(),
+                        is_array=False,
+                    ),
+                },
+            ),
+            ScalarField(
+                name="date",
+                data_type_converter=StringTypeConverter(),
+                is_array=False,
+            ),
+            ScalarField(
+                name="message",
+                data_type_converter=StringTypeConverter(),
+                is_array=False,
+            ),
+            ScalarField(
+                name="rating",
+                data_type_converter=IntTypeConverter(),
+                is_array=False,
+            ),
+        ],
+        after=set(),
     )
-
-    mongo_internal_customer_profile = Collection(
+    employee = Collection(
+        name="employee",
+        fields=[
+            ScalarField(
+                name="email",
+                data_type_converter=StringTypeConverter(),
+                is_array=False,
+                identity="email",
+            ),
+            ScalarField(
+                name="id",
+                data_type_converter=NoOpTypeConverter(),
+                is_array=False,
+                references=[(FieldAddress("mongo_test", "flights", "pilots"), "from")],
+                primary_key=True,
+            ),
+            ScalarField(
+                name="name",
+                data_type_converter=StringTypeConverter(),
+                is_array=False,
+            ),
+        ],
+        after=set(),
+    )
+    flights = Collection(
+        name="flights",
+        fields=[
+            ScalarField(
+                name="_id",
+                data_type_converter=ObjectIdTypeConverter(),
+                is_array=False,
+                primary_key=True,
+            ),
+            ScalarField(
+                name="date", data_type_converter=NoOpTypeConverter(), is_array=False
+            ),
+            ScalarField(
+                name="flight_no",
+                data_type_converter=NoOpTypeConverter(),
+                is_array=False,
+            ),
+            ObjectField(
+                name="passenger_information",
+                data_type_converter=ObjectTypeConverter(),
+                is_array=False,
+                fields={
+                    "passenger_ids": ScalarField(
+                        name="passenger_ids",
+                        data_type_converter=StringTypeConverter(),
+                        is_array=True,
+                        references=[
+                            (
+                                FieldAddress(
+                                    "mongo_test",
+                                    "customer_details",
+                                    "travel_identifiers",
+                                ),
+                                "from",
+                            )
+                        ],
+                    ),
+                    "full_name": ScalarField(
+                        name="full_name",
+                        data_type_converter=StringTypeConverter(),
+                        is_array=False,
+                    ),
+                },
+            ),
+            ScalarField(
+                name="pilots",
+                data_type_converter=StringTypeConverter(),
+                is_array=True,
+            ),
+            ScalarField(
+                name="plane", data_type_converter=IntTypeConverter(), is_array=False
+            ),
+        ],
+        after=set(),
+    )
+    internal_customer_profile = Collection(
         name="internal_customer_profile",
         fields=[
-            ScalarField(name="_id", primary_key=True),
-            ObjectField(name="customer_identifiers", data_type_converter=obj_converter, fields={
-                "internal_id": ScalarField(name="internal_id", data_type_converter=str_converter, references=[
-                    (FieldAddress("mongo_test", "customer_feedback", "customer_information", "internal_customer_id"), "from")
-                ],),
-            }),
-            ScalarField(name="derived_interests", data_type_converter=str_converter),
-        ]
+            ScalarField(
+                name="_id",
+                data_type_converter=ObjectIdTypeConverter(),
+                is_array=False,
+                primary_key=True,
+            ),
+            ObjectField(
+                name="customer_identifiers",
+                data_type_converter=ObjectTypeConverter(),
+                is_array=False,
+                fields={
+                    "internal_id": ScalarField(
+                        name="internal_id",
+                        data_type_converter=StringTypeConverter(),
+                        is_array=False,
+                        references=[
+                            (
+                                FieldAddress(
+                                    "mongo_test",
+                                    "customer_feedback",
+                                    "customer_information",
+                                    "internal_customer_id",
+                                ),
+                                "from",
+                            )
+                        ],
+                    ),
+                    "derived_emails": ScalarField(
+                        name="derived_emails",
+                        data_type_converter=StringTypeConverter(),
+                        is_array=True,
+                        identity="email",
+                    ),
+                },
+            ),
+            ScalarField(
+                name="derived_interests",
+                data_type_converter=StringTypeConverter(),
+                is_array=True,
+            ),
+        ],
+        after=set(),
     )
 
     mongo_dataset = Dataset(
         name="mongo_test",
-        collections=[mongo_addresses, mongo_orders, mongo_customer_details, mongo_customer_feedback, mongo_internal_customer_profile],
+        collections=[
+            mongo_addresses,
+            mongo_orders,
+            aircraft,
+            conversations,
+            customer_details,
+            customer_feedback,
+            employee,
+            flights,
+            internal_customer_profile,
+        ],
         connection_key=mongo_config.key,
     )
 
@@ -135,7 +442,7 @@ def integration_db_dataset(db_name: str, connection_key: FidesOpsKey) -> Dataset
     customers = Collection(
         name="customer",
         fields=[
-            ScalarField(name="id", primary_key=True),
+            ScalarField(name="id", primary_key=True, data_type_converter=int_converter),
             ScalarField(name="name", data_type_converter=str_converter),
             ScalarField(
                 name="email", identity="email", data_type_converter=str_converter
@@ -222,8 +529,10 @@ def sample_traversal() -> Traversal:
             ScalarField(name="email", identity="email"),
             ScalarField(
                 name="contact_address_id",
-                references=[(FieldAddress("mysql", "Address", "id"), "to"),
-                            (FieldAddress("mssql", "Address", "id"), "to")],
+                references=[
+                    (FieldAddress("mysql", "Address", "id"), "to"),
+                    (FieldAddress("mssql", "Address", "id"), "to"),
+                ],
             ),
         ],
     )
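One pattern worth calling out in the fixtures above: an array field can declare a reference to a scalar field in another collection, and the `"from"` direction means values flow into it during traversal. Condensed directly from the `aircraft` fixture in this diff (imports assumed to match the file's own):

```python
from fidesops.graph.config import Collection, FieldAddress, ScalarField
from fidesops.graph.data_type import IntTypeConverter

# An array of integers populated "from" another collection's scalar field
# (mongo_test.flights.plane -> mongo_test.aircraft.planes).
aircraft = Collection(
    name="aircraft",
    fields=[
        ScalarField(
            name="planes",
            data_type_converter=IntTypeConverter(),
            is_array=True,
            references=[(FieldAddress("mongo_test", "flights", "plane"), "from")],
        ),
    ],
    after=set(),
)
```

The `plane` integers matched against this `planes` array are exactly what the `test_to_dask_input_aircraft_collection` case earlier in this patch asserts on.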
diff --git a/tests/util/test_nested_utils.py b/tests/util/test_nested_utils.py
deleted file mode 100644
index c2acdf547..000000000
--- a/tests/util/test_nested_utils.py
+++ /dev/null
@@ -1,29 +0,0 @@
-import pytest
-
-from fidesops.common_exceptions import FidesopsException
-from fidesops.util.nested_utils import unflatten_dict
-
-
-def test_unflatten_dict():
-    input_data = {"A.B": 1, "A.C": 2, "A.D.E": 3}
-    assert unflatten_dict(input_data) == {"A": {"B": 1, "C": 2, "D": {"E": 3}}}
-
-    input_data = {"A": 2, "B": 3, "C": 4}
-    assert unflatten_dict(input_data) == input_data
-
-    assert unflatten_dict({}) == {}
-
-    assert unflatten_dict({'A.B': 1, 'A.B': 2}) == {"A": {"B": 2}}  # Conflicting values, second value is retained
-
-    input_data = {"A.B": 1, "A": 2, "A.C": 3}
-    # You don't want to pass in input data like this, you have conflicts here
-
-    with pytest.raises(FidesopsException):
-        unflatten_dict(input_data)
-
-    with pytest.raises(FidesopsException):
-        # Data passed in is not completely flattened
-        unflatten_dict({'A.B.C': 1, 'A': {'B.D': 2}})
-
-    with pytest.raises(IndexError):
-        # unflatten_dict shouldn't be called with a None separator
-        unflatten_dict({"": "hello"}, separator=None)
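Since this change deletes the `unflatten_dict` tests without adding replacements in this patch, here is an approximate sketch of the semantics those tests exercised, reconstructed only from the assertions above. It is not the original fidesops implementation, and its error behavior on conflicting keys is only loosely mirrored.

```python
from typing import Any, Dict

# Approximate reconstruction from the deleted tests above; the removed helper
# raised FidesopsException on conflicts and IndexError on a None separator,
# which this sketch does not reproduce exactly.
def unflatten_dict_sketch(flat: Dict[str, Any], separator: str = ".") -> Dict[str, Any]:
    """Rebuild a nested dict from separator-delimited keys."""
    nested: Dict[str, Any] = {}
    for key, value in flat.items():
        *parents, leaf = key.split(separator)
        target = nested
        for part in parents:
            target = target.setdefault(part, {})
            if not isinstance(target, dict):
                raise ValueError(f"conflicting key under {part!r}")
        target[leaf] = value
    return nested

assert unflatten_dict_sketch({"A.B": 1, "A.C": 2, "A.D.E": 3}) == {
    "A": {"B": 1, "C": 2, "D": {"E": 3}}
}
```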