Skip to content
This repository has been archived by the owner on Nov 30, 2022. It is now read-only.

Add Configuration Option for Entrypoint Array Querying [#193] #229

Merged
merged 6 commits into from
Mar 1, 2022
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
139 changes: 80 additions & 59 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,76 +55,97 @@ information for Jane across tables in both the postgres and mongo databases.

```json
{
"mongo_test:flights": [
"mongo_test:flights": [
{
"passenger_information": {
"full_name": "Jane Customer"
}
}
],
"mongo_test:customer_details": [
{
"gender": "female",
"children": [
"Erica Example"
],
"birthday": "1990-02-28T00:00:00"
}
],
"postgres_example:address": [
{
"city": "Example Mountain",
"state": "TX",
"house": 1111,
"zip": "54321",
"street": "Example Place"
}
],
"postgres_example:customer": [
{
"email": "[email protected]",
"name": "Jane Customer"
}
],
"mongo_test:rewards": [
{
"owner": [
{
"passenger_information": {
"full_name": "Jane Customer"
}
}
],
"mongo_test:payment_card": [
{
"ccn": "987654321",
"name": "Example Card 2",
"code": "123"
}
],
"postgres_example_test_dataset:address": [
{
"zip": "54321",
"street": "Example Place",
"state": "TX",
"city": "Example Mountain",
"house": 1111
}
],
"mongo_test:customer_details": [
{
"birthday": "1990-02-28T00:00:00",
"gender": "female",
"children": [
"Erica Example"
]
}
],
"postgres_example_test_dataset:customer": [
"phone": "530-486-6983"
},
{
"email": "[email protected]",
"name": "Jane Customer"
"phone": "818-695-1881"
}
],
"postgres_example_test_dataset:payment_card": [
]
},
{
"owner": [
{
"ccn": 373719391,
"name": "Example Card 3",
"code": 222
"phone": "254-344-9868"
}
],
"mongo_test:employee": [
]
}
],
"mongo_test:employee": [
{
"email": "[email protected]",
"name": "Jane Employee"
}
],
"mongo_test:conversations": [
{
"thread": [
{
"email": "[email protected]",
"name": "Jane Employee"
"ccn": "987654321",
"chat_name": "Jane C"
}
],
"mongo_test:conversations": [
]
},
{
"thread": [
{
"thread": [
{
"chat_name": "Jane C"
}
]
"ccn": "987654321",
"chat_name": "Jane C"
},
{
"thread": [
{
"chat_name": "Jane C"
},
{
"chat_name": "Jane C"
}
]
"chat_name": "Jane C"
}
]
]
}
],
"mongo_test:payment_card": [
{
"ccn": "987654321",
"code": "123",
"name": "Example Card 2"
}
],
"postgres_example:payment_card": [
{
"ccn": 373719391,
"code": 222,
"name": "Example Card 3"
}
]
}

```
Expand Down
30 changes: 30 additions & 0 deletions data/dataset/mongo_example_test_dataset.yml
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,12 @@ dataset:
fidesops_meta:
data_type: string[]
identity: email
- name: derived_phone
data_categories: [ user.derived ]
fidesops_meta:
data_type: string[]
return_all_elements: true
identity: phone_number
- name: derived_interests
data_categories: [ user.derived ]
fidesops_meta:
Expand Down Expand Up @@ -257,3 +263,27 @@ dataset:
data_categories: [ user.provided.identifiable.financial ]
- name: preferred
data_categories: [ user.provided.nonidentifiable ]
- name: rewards
fields:
- name: _id
fidesops_meta:
primary_key: True
data_type: object_id
- name: owner
fidesops_meta:
data_type: object[]
return_all_elements: true
fields:
- name: phone
data_categories: [ user.provided.identifiable.contact.phone_number ]
fidesops_meta:
data_type: string
references:
- dataset: mongo_test
field: internal_customer_profile.customer_identifiers.derived_phone
direction: from
- name: shopper_name
- name: points
fidesops_meta:
data_type: integer
- name: expiration_date
11 changes: 9 additions & 2 deletions data/nosql/mongo-init.js
Original file line number Diff line number Diff line change
Expand Up @@ -87,14 +87,16 @@ db.internal_customer_profile.insert([
},
{
"customer_identifiers": {
"internal_id": "cust_002"
"internal_id": "cust_002",
"derived_phone": ["757-499-5508"]
},
"derived_interests": ["programming", "hiking", "skateboarding"]
},
{
"customer_identifiers": {
"internal_id": "cust_003",
"derived_emails": ["[email protected]", "[email protected]"] // Identity within an array field
"derived_emails": ["[email protected]", "[email protected]"], // Identity within an array field
"derived_phone": ["530-486-6983", "254-344-9868"]
},
"derived_interests": ["interior design", "travel", "photography"]
}
Expand Down Expand Up @@ -246,6 +248,11 @@ db.customer.insert([
}
]);

// Seed the rewards collection. Each document's "owner" array holds
// {phone, shopper_name} objects; the phone values line up with the
// "derived_phone" arrays seeded on internal_customer_profile.
//
// `new Date(...)` is required here: calling Date(...) as a plain function
// ignores its argument and returns the *current* time as a string, so the
// intended expiration dates would never be stored.
//
// NOTE(review): the rewards dataset declares this field as "expiration_date",
// while these documents use "expiration" — confirm which name is intended.
db.rewards.insert([
    {"owner": [{"phone": "530-486-6983", "shopper_name": "janec"}, {"phone": "818-695-1881", "shopper_name": "janec"}], "points": 95, "expiration": new Date("2023-01-05")},
    {"owner": [{"phone": "254-344-9868", "shopper_name": "janec"}], "points": 50, "expiration": new Date("2023-02-05")},
    // Key was previously "shopper-name"; use "shopper_name" like the other documents and the dataset definition.
    {"owner": [{"phone": "304-969-7140", "shopper_name": "timc"}], "points": 3, "expiration": new Date("2022-02-05")}
]);


db.payment_card.insert([
Expand Down
5 changes: 3 additions & 2 deletions docs/fidesops/docs/guides/complex_fields.md
Original file line number Diff line number Diff line change
Expand Up @@ -267,8 +267,9 @@ arrays should be considered.
1) If an array is the entry point into a node, we will search for corresponding matches across the entire array. You cannot specify a certain index.
2) Everything is basically an "OR" query. Data returned from multiple array fields will be flattened before being passed into the next collection.
1) For example, say Collection A returned values [1, 2, 3] and Collection B returned values [4, 5, 6]. Collection C has an array field that depends on both Collection A and Collection B. We search Collection C's array field to return any record that contains one of the values [1, 2, 3, 4, 5, 6] in the array.
3. If an array field is an entry point to a node, only matching indices in that array are considered, both for access and erasures, as well as for subsequent queries on dependent collections where applicable.
1. For example, a query on Collection A only matched indices 0 and 1 in an array. Only the data located at indices 0 and 1 are used to query data on dependent collection C.
3. By default, if an array field is an entry point to a node, only matching indices in that array are considered, both for access and erasures, as well as for subsequent queries on dependent collections where applicable.
1. For example, a query on Collection A only matched indices 0 and 1 in an array. Only the data located at indices 0 and 1 will be returned, and used to query data on dependent collection C.
2. This can be overridden by specifying `return_all_elements: true` on an entrypoint array field, in which case the entire array is returned for access requests and the entire array is masked for erasures.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a great description 👍 . Are we better off using yaml notation in the example we give, since that's consistent with the example datasets? ie. return_all_elements: true

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ah good point

4. Individual array elements are masked, not the entire array, e.g. ["MASKED", "MASKED", "MASKED"]

### Can I see a more detailed example of a query traversal with complex objects?
Expand Down
15 changes: 8 additions & 7 deletions docs/fidesops/docs/guides/datasets.md
Original file line number Diff line number Diff line change
Expand Up @@ -101,11 +101,12 @@ dataset:
- `name`: The name of the field will be used to generate query and update statements. Please note that Fidesops does not do automated schema discovery. It is only aware of the fields you declare. This means that the only fields that will be addressed and retrieved by Fidesops queries are the fields you declare.
- `data_categories`: Annotating data\_categories connects fields to policy rules, and determines which actions apply to each field. For more information see [Policies](policies.md)
- `fidesops_meta`: The fidesops\_meta section specifies some additional fields that control how Fidesops manages your data:
- `references`: A declaration of relationships between collections. Where the configuration declares a reference to `mydatabase:address:id` it means Fidesops will use the values from `mydatabase.address.id` to search for related values in `customer`. Unlike the SQL declaration, this is not an enforceable relationship, but simply a statement of which values are connected. In the example above, the references from the `customer` field to `mydatabase.address.id` is analogous to a SQL statement `customer id REFERENCES address.id`, with the exception that any dataset and collection can be referenced. The relationship requires you to specify the dataset as well as the collection for relationships, because you may declare a configuration with multiple datasets, where values in one collection in the first dataset are searched using values found in the second dataset.
- `field`: The specified linked field, using the syntax `[dataset name].[collection name ].[field name]`.
- `identity`: Signifies that this field is an identity value that can be used as the root for a traversal [See graph traversal](query_execution.md)
- `direction`(_Optional_): Accepted values are `from` or `to`. This determines how Fidesops uses the relationships to discover data. If the direction is `to`, Fidesops will only use data in the _source_ collection to discover data in the _referenced_ collection. If the direction is `from`, Fidesops will only use data in the _referenced_ collection to discover data in the _source_ collection. If the direction is omitted, Fidesops will traverse the relation in whatever direction works to discover all related data.
- `primary_key` (_Optional_): A boolean value that means that Fidesops will treat this field as a unique row identifier for generating update statements. If no primary key is specified for any field on a collection, no updates will be generated against that collection. If multiple fields are marked as primary keys the combination of their values will be treated as a combined key. In SQL terms, we'd issue a query that looked like `SELECT ... FROM TABLE WHERE primary_key_name_1 = value1 AND primary_key_name_2 = value2`.
- `data_type` (_Optional_): An indication of the type of data held by this field. Data types are used to convert values to the appropriate type when those values are used in queries. This is especially necessary when using data of one type to help locate data of another type. Data types are also used to generate the appropriate masked value when running erasures, since Fidesops needs to know the type of data expected by the field in order to generate an appropriate masked value. Available data types are `string`, `integer`, `float`, `boolean`, and `object_id`. `object` types are also supported for MongoDB.
- `length` (_Optional_): An indicator of field length.
- `references`: A declaration of relationships between collections. Where the configuration declares a reference to `mydatabase.address.id`, it means Fidesops will use the values from `mydatabase.address.id` to search for related values in `customer`. Unlike the SQL declaration, this is not an enforceable relationship, but simply a statement of which values are connected. In the example above, the reference from the `customer` field to `mydatabase.address.id` is analogous to a SQL statement `customer id REFERENCES address.id`, with the exception that any dataset and collection can be referenced. The relationship requires you to specify the dataset as well as the collection for relationships, because you may declare a configuration with multiple datasets, where values in one collection in the first dataset are searched using values found in the second dataset.
- `field`: The specified linked field, using the syntax `[dataset name].[collection name].[field name]`.
- `identity`: Signifies that this field is an identity value that can be used as the root for a traversal [See graph traversal](query_execution.md)
- `direction`(_Optional_): Accepted values are `from` or `to`. This determines how Fidesops uses the relationships to discover data. If the direction is `to`, Fidesops will only use data in the _source_ collection to discover data in the _referenced_ collection. If the direction is `from`, Fidesops will only use data in the _referenced_ collection to discover data in the _source_ collection. If the direction is omitted, Fidesops will traverse the relation in whatever direction works to discover all related data.
- `primary_key` (_Optional_): A boolean value that means that Fidesops will treat this field as a unique row identifier for generating update statements. If no primary key is specified for any field on a collection, no updates will be generated against that collection. If multiple fields are marked as primary keys the combination of their values will be treated as a combined key. In SQL terms, we'd issue a query that looked like `SELECT ... FROM TABLE WHERE primary_key_name_1 = value1 AND primary_key_name_2 = value2`.
- `data_type` (_Optional_): An indication of the type of data held by this field. Data types are used to convert values to the appropriate type when those values are used in queries. This is especially necessary when using data of one type to help locate data of another type. Data types are also used to generate the appropriate masked value when running erasures, since Fidesops needs to know the type of data expected by the field in order to generate an appropriate masked value. Available data types are `string`, `integer`, `float`, `boolean`, and `object_id`. `object` types are also supported for MongoDB.
- `length` (_Optional_): An indicator of field length.
- `return_all_elements` (_Optional_): For array entrypoint fields, specifies whether the query should return/mask all elements of the array, or just the matching elements. By default, only the matching elements are returned/masked. Setting `return_all_elements: true` will return/mask the entire array.

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions src/fidesops/graph/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,8 @@ class Field(BaseModel, ABC):
"""an optional pointer to an arbitrary key in an expected json package provided as a seed value"""
data_categories: Optional[List[FidesOpsKey]]
data_type_converter: DataTypeConverter = DataType.no_op.value
return_all_elements: Optional[bool] = None
# Should field be returned by query if it is in an entrypoint array field, or just if it matches query?

"""Known type of held data"""
length: Optional[int]
Expand Down Expand Up @@ -351,6 +353,7 @@ def generate_field(
length: Optional[int],
is_array: bool,
sub_fields: List[Field],
return_all_elements: Optional[bool],
) -> Field:
"""Generate a graph field."""

Expand All @@ -361,6 +364,7 @@ def generate_field(
is_array=is_array,
fields={f.name: f for f in sub_fields},
data_type_converter=DataType.object.value,
return_all_elements=return_all_elements,
)
return ScalarField(
name=name,
Expand All @@ -371,6 +375,7 @@ def generate_field(
primary_key=is_pk,
length=length,
is_array=is_array,
return_all_elements=return_all_elements,
)


Expand Down
15 changes: 11 additions & 4 deletions src/fidesops/models/datasetconfig.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import logging
from typing import Dict, Any, Set
from typing import Dict, Any, Set, Optional

from boto3 import Session
from sqlalchemy import (
Expand Down Expand Up @@ -96,7 +96,9 @@ def get_graph(self) -> Dataset:
return dataset_graph


def to_graph_field(field: FidesopsDatasetField) -> Field:
def to_graph_field(
field: FidesopsDatasetField, return_all_elements: Optional[bool] = None
) -> Field:
"""Flattens the dataset field type into its graph representation"""

# NOTE: on the dataset field, annotations like identity & references are
Expand Down Expand Up @@ -148,9 +150,13 @@ def to_graph_field(field: FidesopsDatasetField) -> Field:

(data_type_name, is_array) = parse_data_type_string(meta_section.data_type)

if field.fields:
sub_fields = [to_graph_field(fld) for fld in field.fields]
if meta_section.return_all_elements:
# If specified on array field, lifts and passes into sub-fields, for example,
# arrays of objects
return_all_elements = True

if field.fields:
sub_fields = [to_graph_field(fld, return_all_elements) for fld in field.fields]
return generate_field(
name=field.name,
data_categories=field.data_categories,
Expand All @@ -161,6 +167,7 @@ def to_graph_field(field: FidesopsDatasetField) -> Field:
length=length,
is_array=is_array,
sub_fields=sub_fields,
return_all_elements=return_all_elements,
)


Expand Down
17 changes: 16 additions & 1 deletion src/fidesops/schemas/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,8 @@ class FidesopsMeta(BaseModel):
"""Optionally specify the data type. Fidesops will attempt to cast values to this type when querying."""
length: Optional[int]
"""Optionally specify the allowable field length. Fidesops will not generate values that exceed this size."""
return_all_elements: Optional[bool]
"""Optionally specify to query for the entire array if the array is an entrypoint into the node. Default is False."""

@validator("data_type")
def valid_data_type(cls, v: Optional[str]) -> Optional[str]:
Expand All @@ -131,6 +133,19 @@ def valid_data_categories(
"""Validate that all annotated data categories exist in the taxonomy"""
return _valid_data_categories(v)

@validator("fidesops_meta")
def valid_meta(cls, meta_values: Optional[FidesopsMeta]) -> Optional[FidesopsMeta]:
"""Validate upfront that the return_all_elements flag can only be specified on array fields"""
if not meta_values:
return meta_values

is_array: bool = bool(meta_values.data_type and "[]" in meta_values.data_type)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

small nit: I think meta_values.data_type.endswith("[]") might be slightly more performant since it will start at the end of the string and not the start

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

will change!

if not is_array and meta_values.return_all_elements is not None:
raise ValueError(
"The 'return_all_elements' attribute can only be specified on array fields."
)
return meta_values

@validator("fields")
def validate_object_fields(
cls,
Expand All @@ -143,7 +158,7 @@ def validate_object_fields(
"""
declared_data_type = None

if values["fidesops_meta"]:
if values.get("fidesops_meta"):
declared_data_type = values["fidesops_meta"].data_type

if fields and declared_data_type:
Expand Down
4 changes: 2 additions & 2 deletions src/fidesops/task/filter_element_match.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,11 @@

import pydash

from fidesops.graph.config import FieldPath
from fidesops.task.refine_target_path import (
build_refined_target_paths,
DetailedPath,
join_detailed_path,
FieldPathNodeInput,
)
from fidesops.util.collection_util import FIDESOPS_DO_NOT_MASK_INDEX, Row

Expand All @@ -19,7 +19,7 @@

def filter_element_match(
row: Row,
query_paths: Dict[FieldPath, List[Any]],
query_paths: FieldPathNodeInput,
delete_elements: bool = True,
) -> Dict[str, Any]:
"""
Expand Down
Loading