From 9179f59b33a638502c7efacec24739d5ebb7cd0d Mon Sep 17 00:00:00 2001 From: Alex Chantavy Date: Fri, 16 Dec 2022 11:39:50 -0800 Subject: [PATCH] #349: Generalize graph write queries (#1038) * Build ingest query * Linter * Save cleanup query for another PR * Implement schema * bump mypy to 0.981 for python/mypy#13398 * linter * make load_graph_data interface make more sense * fix comment * Docs and some better names * add a todo * Doc updates, rename some fields * Fix pre-commit * Code commment suggestions Co-authored-by: Ramon Petgrave <32398091+ramonpetgrave64@users.noreply.github.com> * Stackoverflow comment for clarity) * Support ingesting only parts of a schema without breaking the others * Doc comment * Linter * Support matching on one or more properties * Correctly name test * Change key_refs to TargetNodeMatcher to enforce it as a mandatory field * Remove use of hacky default_field() * Support subset of schema relationships for query generation, test multiple node labels * Docstrings * Comments in tests * Better comments * Test for exception conditions * Remove irrelevant comment Co-authored-by: Ramon Petgrave <32398091+ramonpetgrave64@users.noreply.github.com> --- cartography/client/core/tx.py | 64 +++ cartography/graph/model.py | 255 ++++++++++ cartography/graph/querybuilder.py | 438 ++++++++++++++---- cartography/intel/aws/emr.py | 119 +++-- setup.cfg | 4 + .../data/graph/__init__.py | 0 tests/data/graph/querybuilder/__init__.py | 0 .../querybuilder/sample_data/__init__.py | 0 .../sample_data/helloworld_relationships.py | 42 ++ .../sample_data/multiple_attr_match.py | 37 ++ .../querybuilder/sample_models/__init__.py | 0 .../sample_models/interesting_asset.py | 94 ++++ .../sample_models/multiple_attr_match.py | 51 ++ .../querybuilder/sample_models/simple_node.py | 49 ++ .../integration/cartography/graph/__init__.py | 0 ...st_querybuilder_labels_and_var_num_rels.py | 168 +++++++ ...st_querybuilder_match_on_multiple_attrs.py | 41 ++ 
.../graph/test_querybuilder_rel_subsets.py | 141 ++++++ tests/unit/cartography/graph/helpers.py | 10 + .../cartography/graph/test_querybuilder.py | 90 ---- ...st_querybuilder_build_attach_links_excs.py | 79 ++++ .../graph/test_querybuilder_complex.py | 55 +++ .../test_querybuilder_filter_selected_rels.py | 17 + .../graph/test_querybuilder_simple.py | 59 +++ 24 files changed, 1585 insertions(+), 228 deletions(-) create mode 100644 cartography/graph/model.py rename cartography/client/__init__py => tests/data/graph/__init__.py (100%) create mode 100644 tests/data/graph/querybuilder/__init__.py create mode 100644 tests/data/graph/querybuilder/sample_data/__init__.py create mode 100644 tests/data/graph/querybuilder/sample_data/helloworld_relationships.py create mode 100644 tests/data/graph/querybuilder/sample_data/multiple_attr_match.py create mode 100644 tests/data/graph/querybuilder/sample_models/__init__.py create mode 100644 tests/data/graph/querybuilder/sample_models/interesting_asset.py create mode 100644 tests/data/graph/querybuilder/sample_models/multiple_attr_match.py create mode 100644 tests/data/graph/querybuilder/sample_models/simple_node.py create mode 100644 tests/integration/cartography/graph/__init__.py create mode 100644 tests/integration/cartography/graph/test_querybuilder_labels_and_var_num_rels.py create mode 100644 tests/integration/cartography/graph/test_querybuilder_match_on_multiple_attrs.py create mode 100644 tests/integration/cartography/graph/test_querybuilder_rel_subsets.py create mode 100644 tests/unit/cartography/graph/helpers.py delete mode 100644 tests/unit/cartography/graph/test_querybuilder.py create mode 100644 tests/unit/cartography/graph/test_querybuilder_build_attach_links_excs.py create mode 100644 tests/unit/cartography/graph/test_querybuilder_complex.py create mode 100644 tests/unit/cartography/graph/test_querybuilder_filter_selected_rels.py create mode 100644 tests/unit/cartography/graph/test_querybuilder_simple.py diff --git 
a/cartography/client/core/tx.py b/cartography/client/core/tx.py index a9c1e25723..da643907b6 100644 --- a/cartography/client/core/tx.py +++ b/cartography/client/core/tx.py @@ -7,6 +7,8 @@ import neo4j +from cartography.util import batch + def read_list_of_values_tx(tx: neo4j.Transaction, query: str, **kwargs) -> List[Union[str, int]]: """ @@ -146,3 +148,65 @@ def read_single_dict_tx(tx: neo4j.Transaction, query: str, **kwargs) -> Dict[str result.consume() return value + + +def write_list_of_dicts_tx( + tx: neo4j.Transaction, + query: str, + **kwargs, +) -> None: + """ + Writes a list of dicts to Neo4j. + + Example usage: + import neo4j + dict_list: List[Dict[Any, Any]] = [{...}, ...] + + neo4j_driver = neo4j.driver(... args ...) + neo4j_session = neo4j_driver.Session(... args ...) + + neo4j_session.write_transaction( + write_list_of_dicts_tx, + ''' + UNWIND $DictList as data + MERGE (a:SomeNode{id: data.id}) + SET + a.other_field = $other_field, + a.yet_another_kwarg_field = $yet_another_kwarg_field + ... + ''', + DictList=dict_list, + other_field='some extra value', + yet_another_kwarg_field=1234 + ) + + :param tx: The neo4j write transaction. + :param query: The Neo4j write query to run. + :param kwargs: Keyword args to be supplied to the Neo4j query. + :return: None + """ + tx.run(query, kwargs) + + +def load_graph_data( + neo4j_session: neo4j.Session, + query: str, + dict_list: List[Dict[str, Any]], + **kwargs, +) -> None: + """ + Writes data to the graph. + :param neo4j_session: The Neo4j session + :param query: The Neo4j write query to run. This query is not meant to be handwritten, rather it should be generated + with cartography.graph.querybuilder.build_ingestion_query(). + :param dict_list: The data to load to the graph represented as a list of dicts. + :param kwargs: Allows additional keyword args to be supplied to the Neo4j query. 
+ :return: None + """ + for data_batch in batch(dict_list, size=10000): + neo4j_session.write_transaction( + write_list_of_dicts_tx, + query, + DictList=data_batch, + **kwargs, + ) diff --git a/cartography/graph/model.py b/cartography/graph/model.py new file mode 100644 index 0000000000..c222cf1c84 --- /dev/null +++ b/cartography/graph/model.py @@ -0,0 +1,255 @@ +import abc +from dataclasses import dataclass +from dataclasses import field +from dataclasses import make_dataclass +from enum import auto +from enum import Enum +from typing import Dict +from typing import List +from typing import Optional + + +class LinkDirection(Enum): + """ + Each CartographyRelSchema has a LinkDirection that determines whether the relationship points toward the original + node ("INWARD") or away from the original node ("OUTWARD"). + + For example the following code creates the path `(:EMRCluster)<-[:RESOURCE]-(:AWSAccount)`: + + class EMRCluster(CartographyNodeSchema): + label: str = "EMRCluster" + sub_resource_relationship: CartographyRelSchema = EMRClusterToAWSAccount() + # ... + + class EMRClusterToAWSAccount(CartographyRelSchema): + target_node_label: str = "AWSAccount" + rel_label: str = "RESOURCE" + direction: LinkDirection = LinkDirection.INWARD + # ... + + If `EMRClusterToAWSAccount.direction` was LinkDirection.OUTWARD, then the directionality of the relationship would + be `(:EMRCluster)-[:RESOURCE]->(:AWSAccount)` instead. + """ + INWARD = auto() + OUTWARD = auto() + + +class PropertyRef: + """ + PropertyRefs represent properties on cartography nodes and relationships. + + cartography takes lists of Python dicts and loads them to Neo4j. PropertyRefs allow our dynamically generated Neo4j + ingestion queries to set values for a given node or relationship property from (A) a field on the dict being + processed (PropertyRef.set_in_kwargs=False; default), or (B) from a single variable provided by a keyword argument + (PropertyRef.set_in_kwargs=True). 
+ """ + + def __init__(self, name: str, set_in_kwargs=False): + """ + :param name: The name of the property + :param set_in_kwargs: Optional. If True, the property is not defined on the data dict, and we expect to find the + property in the kwargs. + If False, looks for the property in the data dict. + Defaults to False. + """ + self.name = name + self.set_in_kwargs = set_in_kwargs + + def _parameterize_name(self) -> str: + return f"${self.name}" + + def __repr__(self) -> str: + """ + `querybuilder.build_ingestion_query()` generates a Neo4j batched ingestion query of the form + `UNWIND $DictList AS item [...]`. + + If set_in_kwargs is False (default), we instruct the querybuilder to get the value for this given property from + the dict being processed. To do this, this function returns "item." followed by the property name. This is used + for loading in lists of nodes. + + On the other hand if set_in_kwargs is True, this function returns "$" followed by the property name, so the value + will instead come from the keyword args supplied when the generated query is run. This is used for things like + applying the same update tag to all nodes of a given run. + """ + return f"item.{self.name}" if not self.set_in_kwargs else self._parameterize_name() + + +@dataclass(frozen=True) +class CartographyNodeProperties(abc.ABC): + """ + Abstract base dataclass that represents the properties on a CartographyNodeSchema. This class is abstract so that we + can enforce that all subclasses have an id and a lastupdated field. + """ + id: PropertyRef = field(init=False) + lastupdated: PropertyRef = field(init=False) + + def __post_init__(self): + """ + Designed to prevent direct instantiation. This workaround is needed since this is a dataclass and an abstract + class without an abstract method defined. + See https://stackoverflow.com/q/60590442. 
+ """ + if self.__class__ == CartographyNodeProperties: + raise TypeError("Cannot instantiate abstract class.") + + +@dataclass(frozen=True) +class CartographyRelProperties(abc.ABC): + """ + Abstract class that represents the properties on a CartographyRelSchema. This is intended to enforce that all + subclasses will have a lastupdated field defined on their resulting relationships. + """ + lastupdated: PropertyRef = field(init=False) + + def __post_init__(self): + """ + Designed to prevent direct instantiation. This workaround is needed since this is a dataclass and an abstract + class without an abstract method defined. + """ + if self.__class__ == CartographyRelProperties: + raise TypeError("Cannot instantiate abstract class.") + + +@dataclass(frozen=True) +class TargetNodeMatcher: + """ + Dataclass used to encapsulate the following mapping: + Keys: one or more attribute names on the target_node_label used to uniquely identify what node to connect to. + Values: The value of the target_node_key used to uniquely identify what node to connect to. This is given as a + PropertyRef. + This is used to ensure dataclass immutability when composed as part of a CartographyNodeSchema object. + See `make_target_node_matcher()`. + """ + pass + + +@dataclass(frozen=True) +class CartographyRelSchema(abc.ABC): + """ + Abstract base dataclass that represents a cartography relationship. + + The CartographyRelSchema contains properties that make it possible to connect the CartographyNodeSchema to other + existing nodes in the graph. + """ + @property + @abc.abstractmethod + def properties(self) -> CartographyRelProperties: + """ + :return: The properties of the relationship. + """ + pass + + @property + @abc.abstractmethod + def target_node_label(self) -> str: + """ + :return: The target node label that this relationship will connect to. 
+ """ + pass + + @property + @abc.abstractmethod + def target_node_matcher(self) -> TargetNodeMatcher: + """ + :return: A TargetNodeMatcher object used to find what node(s) to attach the relationship to. + """ + pass + + @property + @abc.abstractmethod + def rel_label(self) -> str: + """ + :return: The string label of the relationship. + """ + pass + + @property + @abc.abstractmethod + def direction(self) -> LinkDirection: + """ + :return: The LinkDirection of the relationship. Please see the `LinkDirection` docs for a detailed explanation. + """ + pass + + +@dataclass(frozen=True) +class OtherRelationships: + """ + Encapsulates a list of CartographyRelSchema. This is used to ensure dataclass immutability when composed as part of + a CartographyNodeSchema object. + """ + rels: List[CartographyRelSchema] + + +@dataclass(frozen=True) +class ExtraNodeLabels: + """ + Encapsulates a list of str representing additional labels for the CartographyNodeSchema that this is composed on. + This wrapping is used to ensure dataclass immutability for the CartographyNodeSchema. + """ + labels: List[str] + + +@dataclass(frozen=True) +class CartographyNodeSchema(abc.ABC): + """ + Abstract base dataclass that represents a graph node in cartography. This is used to dynamically generate graph + ingestion queries. + """ + @property + @abc.abstractmethod + def label(self) -> str: + """ + :return: The primary str label of the node. + """ + pass + + @property + @abc.abstractmethod + def properties(self) -> CartographyNodeProperties: + """ + :return: The properties of the node. + """ + pass + + @property + def sub_resource_relationship(self) -> Optional[CartographyRelSchema]: + """ + Optional. + Allows subclasses to specify a subresource relationship for the given node. 
"Sub resource" is a term we made up + best defined by examples: + - In the AWS module, the subresource is an AWSAccount + - In Azure, the subresource is a Subscription + - In GCP, the subresource is a GCPProject + - In Okta, the subresource is an OktaOrganization + ... and so on and so forth. + :return: None if not overridden. Else return the node's sub resource CartographyRelSchema. + """ + return None + + @property + def other_relationships(self) -> Optional[OtherRelationships]: + """ + Optional. + Allows subclasses to specify additional cartography relationships on the node. + :return: None if not overridden. Else return the node's OtherRelationships. + """ + return None + + @property + def extra_node_labels(self) -> Optional[ExtraNodeLabels]: + """ + Optional. + Allows specifying extra labels on the node. + :return: None if not overridden. Else return the ExtraNodeLabels specified on the node. + """ + return None + + +def make_target_node_matcher(key_ref_dict: Dict[str, PropertyRef]) -> TargetNodeMatcher: + """ + :param key_ref_dict: A Dict mapping keys present on the node to PropertyRef objects. + :return: A TargetNodeMatcher used for CartographyRelSchema to match with other nodes. 
+ """ + fields = [(key, PropertyRef, field(default=prop_ref)) for key, prop_ref in key_ref_dict.items()] + return make_dataclass(TargetNodeMatcher.__name__, fields, frozen=True)() diff --git a/cartography/graph/querybuilder.py b/cartography/graph/querybuilder.py index 8ba100816e..0e8032832e 100644 --- a/cartography/graph/querybuilder.py +++ b/cartography/graph/querybuilder.py @@ -1,118 +1,364 @@ +import logging +from dataclasses import asdict from string import Template from typing import Dict +from typing import Optional +from typing import Set +from typing import Tuple +from cartography.graph.model import CartographyNodeProperties +from cartography.graph.model import CartographyNodeSchema +from cartography.graph.model import CartographyRelSchema +from cartography.graph.model import ExtraNodeLabels +from cartography.graph.model import LinkDirection +from cartography.graph.model import OtherRelationships +from cartography.graph.model import PropertyRef +from cartography.graph.model import TargetNodeMatcher -def build_node_ingestion_query(node_label: str, node_property_map: Dict[str, str]) -> str: + +logger = logging.getLogger(__name__) + + +def _build_node_properties_statement( + node_property_map: Dict[str, PropertyRef], + extra_node_labels: Optional[ExtraNodeLabels] = None, +) -> str: """ - Generates Neo4j query string to write a list of dicts as nodes to the graph with the - given node_label, id_field, and other arbitrary fields as provided by field_list. The - resulting query looks like + Generate a Neo4j clause that sets node properties using the given mapping of attribute names to PropertyRefs. - UNWIND $DictList AS item - MERGE (i:`node_label`{id:item.`node_property_map['id']`}) - ON CREATE SET i.firstseen = timestamp() - SET i.lastupdated = $UpdateTag, - ... ... + As seen in this example, - Note that `node_property_map` **must** have an `id` key defined. 
+ node_property_map: Dict[str, PropertyRef] = { + 'id': PropertyRef("Id"), + 'node_prop_1': PropertyRef("Prop1"), + 'node_prop_2': PropertyRef("Prop2", set_in_kwargs=True), + } + set_clause: str = _build_node_properties_statement(node_property_map) - :param node_label: The label of the nodes to write, e.g. EC2Instance - :param node_property_map: A mapping of node property names to dict key names. - :return: A Neo4j query string using the UNWIND + MERGE pattern to write a list of nodes - in batch. This exposes 2 parameters: `$DictList` accepts a list of dictionaries to - write as nodes to the graph, and `$UpdateTag` is the standard cartography int update tag. + the returned set_clause will be + ``` + i.id = item.Id, + i.node_prop_1 = item.Prop1, + i.node_prop_2 = $Prop2 + ``` + where `i` is a reference to the Neo4j node. + :param node_property_map: Mapping of node attribute names as str to PropertyRef objects + :param extra_node_labels: Optional ExtraNodeLabels object to set on the node as string + :return: The resulting Neo4j SET clause to set the given attributes on the node """ - if 'id' not in node_property_map or not node_property_map['id']: - raise ValueError('node_property_map must have key `id` set.') + ingest_fields_template = Template('i.$node_property = $property_ref') - ingest_preamble_template = Template(""" - UNWIND $DictList AS item - MERGE (i:$NodeLabel{id:item.$DictIdField}) - ON CREATE SET i.firstseen = timestamp() - SET i.lastupdated = $UpdateTag""") - ingest_fields_template = Template(' i.$NodeProperty = item.$DictProperty') + set_clause = ',\n'.join([ + ingest_fields_template.safe_substitute(node_property=node_property, property_ref=property_ref) + for node_property, property_ref in node_property_map.items() + if node_property != 'id' # The `MERGE` clause will have already set `id`; let's not set it again. 
+ ]) - ingest_preamble = ingest_preamble_template.safe_substitute( - NodeLabel=node_label, DictIdField=node_property_map['id'], - ) + # Set extra labels on the node if specified + if extra_node_labels: + extra_labels = ':'.join([label for label in extra_node_labels.labels]) + set_clause += f",\n i:{extra_labels}" + return set_clause + + +def _build_rel_properties_statement(rel_var: str, rel_property_map: Optional[Dict[str, PropertyRef]] = None) -> str: + """ + Generate a Neo4j clause that sets relationship properties using the given mapping of attribute names to + PropertyRefs. + + In this code example: + + rel_property_map: Dict[str, PropertyRef] = { + 'rel_prop_1': PropertyRef("Prop1"), + 'rel_prop_2': PropertyRef("Prop2", set_in_kwargs=True), + } + set_clause: str = _build_rel_properties_statement('r', rel_property_map) - # If the node_property_map contains more than just `id`, generate a SET statement for the other fields. - if len(node_property_map.keys()) > 1: - set_clause = ',\n'.join([ - ingest_fields_template.safe_substitute(NodeProperty=node_property, DictProperty=dict_property) - for node_property, dict_property in node_property_map.items() - if not node_property == 'id' # Make sure to exclude setting the `id` again. 
+ the returned set_clause will be: + + r.rel_prop_1 = item.Prop1, + r.rel_prop_2 = $Prop2 + + :param rel_var: The variable name to use for the relationship in the Neo4j query + :param rel_property_map: Mapping of relationship attribute names as str to PropertyRef objects + :return: The resulting Neo4j SET clause to set the given attributes on the relationship + """ + set_clause = '' + ingest_fields_template = Template('$rel_var.$rel_property = $property_ref') + + if rel_property_map: + set_clause += ',\n'.join([ + ingest_fields_template.safe_substitute( + rel_var=rel_var, + rel_property=rel_property, + property_ref=property_ref, + ) + for rel_property, property_ref in rel_property_map.items() ]) - ingest_query = ingest_preamble + ",\n" + set_clause - else: - ingest_query = ingest_preamble - return ingest_query + return set_clause -def build_relationship_ingestion_query( - node_label_a: str, search_property_a: str, dict_key_a: str, - node_label_b: str, search_property_b: str, dict_key_b: str, - rel_label: str, - rel_property_map: Dict[str, str] = None, -) -> str: +def _build_match_clause(matcher: TargetNodeMatcher) -> str: + """ + Generate a Neo4j match statement on one or more keys and values for a given node. + :param matcher: A TargetNodeMatcher object + :return: a Neo4j match clause """ - Generates Neo4j query string that looks like + match = Template("$Key: $PropRef") + matcher_asdict = asdict(matcher) + return ', '.join(match.safe_substitute(Key=key, PropRef=prop_ref) for key, prop_ref in matcher_asdict.items()) - UNWIND $RelMappingList AS item - MATCH (a:`node_label_a`{`search_property_a`:item.`dict_key_a`}) - MATCH (b:`node_label_b`{`search_property_b`:item.`dict_key_b}) - MERGE (a)-[r:`rel_label`]->(b) - ON CREATE SET r.firstseen = timestamp() - SET r.lastupdated = $UpdateTag, - ... ... - - To summarize, for each dict in RelMappingList, we create the paths - `($NodeA)-[:$RELATIONSHIP_NAME]->($NodeB)`. - - :param node_label_a: The label of $NodeA. 
- :param search_property_a: the search key to used to search the graph to find node A. For - performance, this should be an indexed property. If your graph is large, querying on - non-indexed properties can cause your syncs to take **days** to run! - :param dict_key_a: The dict key on what value of `search_property_a` to search for. - :param node_label_b: The label of $NodeB. - :param search_property_b: the search key to used to search the graph to find node B. For - performance, this should be an indexed property. If your graph is large, querying on - non-indexed properties can cause your syncs to take **days** to run! - :param dict_key_b: The dict key on what value of `search_property_b` to search for. - :param rel_label: The $RELATIONSHIP_NAME from $NodeA to $NodeB. - :param rel_property_map: Optional mapping of relationship property names to set and their - corresponding keys on the input data dict. Note: relationships in cartography are not indexed - so performing searches on them is slow. Reconsider your schema design if you expect to need - to run queries using relationship fields as search keys. - :return: Neo4j query string to draw relationships between $NodeA and $NodeB. This exposes 2 - parameters: `$RelMappingList` accepts a list of dictionaries to write as relationships to the - graph, and `$UpdateTag` is the standard cartography int update tag. - """ - ingest_preamble_template = Template(""" - UNWIND $RelMappingList AS item - MATCH (a:$NodeLabelA{$SearchPropertyA:item.$DictKeyA}) - MATCH (b:$NodeLabelB{$SearchPropertyB:item.$DictKeyB}) - MERGE (a)-[r:$LabelR]->(b) + +def _asdict_with_validate_relprops(link: CartographyRelSchema) -> Dict[str, PropertyRef]: + """ + Give a helpful error message when forgetting to put `()` when instantiating a CartographyRelSchema, as this + isn't always caught by IDEs. 
+ """ + try: + rel_props_as_dict: Dict[str, PropertyRef] = asdict(link.properties) + except TypeError as e: + # e.args is a tuple; compare its first element (the message) rather than the tuple itself. + if e.args and e.args[0] == 'asdict() should be called on dataclass instances': + logger.error( + f'TypeError thrown when trying to draw relation "{link.rel_label}" to a "{link.target_node_label}" ' + f'node. Please make sure that you did not forget to write `()` when specifying `properties` in the ' + f'dataclass. ' + f'For example, do `properties: RelProp = RelProp()`; NOT `properties: RelProp = RelProp`.', + ) + raise + return rel_props_as_dict + + +def _build_attach_sub_resource_statement(sub_resource_link: Optional[CartographyRelSchema] = None) -> str: + """ + Generates a Neo4j statement to attach a sub resource to a node. A 'sub resource' is a term we made up to describe + billing units of a given resource. For example, + - In AWS, the sub resource is an AWSAccount. + - In Azure, the sub resource is a Subscription. + - In GCP, the sub resource is a GCPProject. + - etc. + This is a private function not meant to be called outside of build_ingestion_query(). + :param sub_resource_link: Optional: The CartographyRelSchema object connecting previous node(s) to the sub resource. + :return: a Neo4j clause that connects previous node(s) to a sub resource, taking into account the labels, attribute + keys, and directionality. If sub_resource_link is None, return an empty string. 
+ """ + if not sub_resource_link: + return '' + + sub_resource_attach_template = Template( + """ + OPTIONAL MATCH (j:$SubResourceLabel{$MatchClause}) + WITH i, item, j WHERE j IS NOT NULL + $RelMergeClause ON CREATE SET r.firstseen = timestamp() - SET r.lastupdated = $UpdateTag""") - ingest_fields_template = Template(' r.$RelProperty = item.$DictProperty') - - ingest_preamble = ingest_preamble_template.safe_substitute( - NodeLabelA=node_label_a, - SearchPropertyA=search_property_a, - DictKeyA=dict_key_a, - NodeLabelB=node_label_b, - SearchPropertyB=search_property_b, - DictKeyB=dict_key_b, - LabelR=rel_label, + SET + $set_rel_properties_statement + """, ) - if rel_property_map: - set_clause = ',\n'.join([ - ingest_fields_template.safe_substitute(RelProperty=rel_property, DictProperty=dict_property) - for rel_property, dict_property in rel_property_map.items() - ]) - ingest_query = ingest_preamble + ",\n" + set_clause + if sub_resource_link.direction == LinkDirection.INWARD: + rel_merge_template = Template("""MERGE (i)<-[r:$SubResourceRelLabel]-(j)""") else: - ingest_query = ingest_preamble + rel_merge_template = Template("""MERGE (i)-[r:$SubResourceRelLabel]->(j)""") + + rel_merge_clause = rel_merge_template.safe_substitute(SubResourceRelLabel=sub_resource_link.rel_label) + + rel_props_as_dict: Dict[str, PropertyRef] = _asdict_with_validate_relprops(sub_resource_link) + + attach_sub_resource_statement = sub_resource_attach_template.safe_substitute( + SubResourceLabel=sub_resource_link.target_node_label, + MatchClause=_build_match_clause(sub_resource_link.target_node_matcher), + RelMergeClause=rel_merge_clause, + SubResourceRelLabel=sub_resource_link.rel_label, + set_rel_properties_statement=_build_rel_properties_statement('r', rel_props_as_dict), + ) + return attach_sub_resource_statement + + +def _build_attach_additional_links_statement( + additional_relationships: Optional[OtherRelationships] = None, +) -> str: + """ + Generates a Neo4j statement to attach one or 
more CartographyRelSchemas to node(s) previously mentioned in the + query. + This is a private function not meant to be called outside of build_ingestion_query(). + :param additional_relationships: Optional list of CartographyRelSchema describing what other relationships should + be created from the previous node(s) in this query. + :return: A Neo4j clause that connects previous node(s) to the given additional_links., taking into account the + labels, attribute keys, and directionality. If additional_relationships is None, return an empty string. + """ + if not additional_relationships: + return '' + + additional_links_template = Template( + """ + WITH i, item + OPTIONAL MATCH ($node_var:$AddlLabel{$MatchClause}) + WITH i, item, $node_var WHERE $node_var IS NOT NULL + $RelMerge + ON CREATE SET $rel_var.firstseen = timestamp() + SET + $set_rel_properties_statement + """, + ) + links = [] + for num, link in enumerate(additional_relationships.rels): + node_var = f"n{num}" + rel_var = f"r{num}" + + if link.direction == LinkDirection.INWARD: + rel_merge_template = Template("""MERGE (i)<-[$rel_var:$AddlRelLabel]-($node_var)""") + else: + rel_merge_template = Template("""MERGE (i)-[$rel_var:$AddlRelLabel]->($node_var)""") + + rel_merge = rel_merge_template.safe_substitute( + rel_var=rel_var, + AddlRelLabel=link.rel_label, + node_var=node_var, + ) + + rel_props_as_dict = _asdict_with_validate_relprops(link) + + additional_ref = additional_links_template.safe_substitute( + AddlLabel=link.target_node_label, + MatchClause=_build_match_clause(link.target_node_matcher), + node_var=node_var, + rel_var=rel_var, + RelMerge=rel_merge, + set_rel_properties_statement=_build_rel_properties_statement(rel_var, rel_props_as_dict), + ) + links.append(additional_ref) + + return 'UNION'.join(links) + + +def _build_attach_relationships_statement( + sub_resource_relationship: Optional[CartographyRelSchema], + other_relationships: Optional[OtherRelationships], +) -> str: + """ + Use Neo4j 
subqueries to attach sub resource and/or other relationships. + Subqueries allow the query to continue to run even if we only have data for some but not all the relationships + defined by a schema. + For example, if an EC2Instance has attachments to NetworkInterfaces and AWSAccounts, but our data only includes + EC2Instance to AWSAccount information, structuring the ingestion query with subqueries allows us to build a query + that will ignore the null relationships and continue to MERGE the ones that exist. + """ + if not sub_resource_relationship and not other_relationships: + return "" + + attach_sub_resource_statement = _build_attach_sub_resource_statement(sub_resource_relationship) + attach_additional_links_statement = _build_attach_additional_links_statement(other_relationships) + + statements = [] + statements += [attach_sub_resource_statement] if attach_sub_resource_statement else [] + statements += [attach_additional_links_statement] if attach_additional_links_statement else [] + + attach_relationships_statement = 'UNION'.join(stmt for stmt in statements) + + query_template = Template( + """ + WITH i, item + CALL { + WITH i, item + $attach_relationships_statement + } + """, + ) + return query_template.safe_substitute(attach_relationships_statement=attach_relationships_statement) + + +def _filter_selected_relationships( + node_schema: CartographyNodeSchema, + selected_relationships: Set[CartographyRelSchema], +) -> Tuple[Optional[CartographyRelSchema], Optional[OtherRelationships]]: + """ + Ensures that selected relationships specified to build_ingestion_query() are actually present on + node_schema.sub_resource_relationship and node_schema.other_relationships. + :param node_schema: The node schema object to filter relationships against + :param selected_relationships: The set of relationships to check if they exist in the node schema. If empty set, + this means that no relationships have been selected. None is not an accepted value here. 
+ :return: a tuple of the (sub resource rel [if present in selected_relationships], an OtherRelationships object + containing all values of node_schema.other_relationships that are present in selected_relationships) + """ + # The empty set means no relationships are selected + if selected_relationships == set(): + return None, None + + # Collect the node's sub resource rel and OtherRelationships together in one set for easy comparison + all_rels_on_node = {node_schema.sub_resource_relationship} + if node_schema.other_relationships: + for rel in node_schema.other_relationships.rels: + all_rels_on_node.add(rel) + + # Ensure that the selected_relationships are actually present on the node_schema. + for selected_rel in selected_relationships: + if selected_rel not in all_rels_on_node: + raise ValueError( + f"build_ingestion_query() failed: CartographyRelSchema {selected_rel.__class__.__name__} is not " + f"defined on CartographyNodeSchema type {node_schema.__class__.__name__}. Please verify the " + f"value of `selected_relationships` passed to `build_ingestion_query()`.", + ) + + sub_resource_rel = node_schema.sub_resource_relationship + if sub_resource_rel not in selected_relationships: + sub_resource_rel = None + + # By this point, everything in selected_relationships is validated to be present in node_schema + filtered_other_rels = OtherRelationships([rel for rel in selected_relationships if rel != sub_resource_rel]) + + return sub_resource_rel, filtered_other_rels + + +def build_ingestion_query( + node_schema: CartographyNodeSchema, + selected_relationships: Optional[Set[CartographyRelSchema]] = None, +) -> str: + """ + Generates a Neo4j query from the given CartographyNodeSchema to ingest the specified nodes and relationships so that + cartography module authors don't need to handwrite their own queries. + :param node_schema: The CartographyNodeSchema object to build a Neo4j query from. 
+ :param selected_relationships: If specified, generates a query that attaches only the relationships in this optional + set of CartographyRelSchema. The RelSchema specified here _must_ be present in node_schema.sub_resource_relationship + or node_schema.other_relationships. + If selected_relationships is None (default), then we create a query using all RelSchema specified in + node_schema.sub_resource_relationship + node_schema.other_relationships. + If selected_relationships is the empty set, we create a query with no relationship attachments at all. + :return: An optimized Neo4j query that can be used to ingest nodes and relationships. + Important notes: + - The resulting query uses the UNWIND + MERGE pattern (see + https://neo4j.com/docs/cypher-manual/current/clauses/unwind/#unwind-creating-nodes-from-a-list-parameter) to batch + load the data for speed. + - The query assumes that a list of dicts will be passed to it through parameter $DictList. + - The query sets `firstseen` attributes on all the nodes and relationships that it creates. + - The query is intended to be supplied as input to cartography.client.core.tx.load_graph_data().
+ """ + query_template = Template( + """ + UNWIND $DictList AS item + MERGE (i:$node_label{id: $dict_id_field}) + ON CREATE SET i.firstseen = timestamp() + SET + $set_node_properties_statement + $attach_relationships_statement + """, + ) + + node_props: CartographyNodeProperties = node_schema.properties + node_props_as_dict: Dict[str, PropertyRef] = asdict(node_props) + + # Handle selected relationships + sub_resource_rel: Optional[CartographyRelSchema] = node_schema.sub_resource_relationship + other_rels: Optional[OtherRelationships] = node_schema.other_relationships + if selected_relationships or selected_relationships == set(): + sub_resource_rel, other_rels = _filter_selected_relationships(node_schema, selected_relationships) + + ingest_query = query_template.safe_substitute( + node_label=node_schema.label, + dict_id_field=node_props.id, + set_node_properties_statement=_build_node_properties_statement( + node_props_as_dict, + node_schema.extra_node_labels, + ), + attach_relationships_statement=_build_attach_relationships_statement(sub_resource_rel, other_rels), + ) return ingest_query diff --git a/cartography/intel/aws/emr.py b/cartography/intel/aws/emr.py index 33e2a46c81..4220686bc4 100644 --- a/cartography/intel/aws/emr.py +++ b/cartography/intel/aws/emr.py @@ -1,5 +1,6 @@ import logging import time +from dataclasses import dataclass from typing import Dict from typing import List @@ -7,6 +8,16 @@ import botocore.exceptions import neo4j +from cartography.client.core.tx import load_graph_data +from cartography.graph.model import CartographyNodeProperties +from cartography.graph.model import CartographyNodeSchema +from cartography.graph.model import CartographyRelProperties +from cartography.graph.model import CartographyRelSchema +from cartography.graph.model import LinkDirection +from cartography.graph.model import make_target_node_matcher +from cartography.graph.model import PropertyRef +from cartography.graph.model import TargetNodeMatcher +from 
cartography.graph.querybuilder import build_ingestion_query from cartography.intel.aws.ec2.util import get_botocore_config from cartography.util import aws_handle_regions from cartography.util import run_cleanup_job @@ -48,53 +59,77 @@ def get_emr_describe_cluster(boto3_session: boto3.session.Session, region: str, return cluster_details +@dataclass(frozen=True) +class EMRClusterNodeProperties(CartographyNodeProperties): + arn: PropertyRef = PropertyRef('ClusterArn') + auto_terminate: PropertyRef = PropertyRef('AutoTerminate') + autoscaling_role: PropertyRef = PropertyRef('AutoScalingRole') + custom_ami_id: PropertyRef = PropertyRef('CustomAmiId') + firstseen: PropertyRef = PropertyRef('firstseen') + id: PropertyRef = PropertyRef('Id') + instance_collection_type: PropertyRef = PropertyRef('InstanceCollectionType') + lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True) + log_encryption_kms_key_id: PropertyRef = PropertyRef('LogEncryptionKmsKeyId') + log_uri: PropertyRef = PropertyRef('LogUri') + master_public_dns_name: PropertyRef = PropertyRef('MasterPublicDnsName') + name: PropertyRef = PropertyRef('Name') + outpost_arn: PropertyRef = PropertyRef('OutpostArn') + region: PropertyRef = PropertyRef('Region', set_in_kwargs=True) + release_label: PropertyRef = PropertyRef('ReleaseLabel') + repo_upgrade_on_boot: PropertyRef = PropertyRef('RepoUpgradeOnBoot') + requested_ami_version: PropertyRef = PropertyRef('RequestedAmiVersion') + running_ami_version: PropertyRef = PropertyRef('RunningAmiVersion') + scale_down_behavior: PropertyRef = PropertyRef('ScaleDownBehavior') + security_configuration: PropertyRef = PropertyRef('SecurityConfiguration') + servicerole: PropertyRef = PropertyRef('ServiceRole') + termination_protected: PropertyRef = PropertyRef('TerminationProtected') + visible_to_all_users: PropertyRef = PropertyRef('VisibleToAllUsers') + + +@dataclass(frozen=True) +class EMRClusterToAwsAccountRelProperties(CartographyRelProperties): + 
lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True) + + +@dataclass(frozen=True) +# (:EMRCluster)<-[:RESOURCE]-(:AWSAccount) +class EMRClusterToAWSAccount(CartographyRelSchema): + target_node_label: str = 'AWSAccount' + target_node_matcher: TargetNodeMatcher = make_target_node_matcher( + {'id': PropertyRef('AccountId', set_in_kwargs=True)}, + ) + direction: LinkDirection = LinkDirection.INWARD + rel_label: str = "RESOURCE" + properties: EMRClusterToAwsAccountRelProperties = EMRClusterToAwsAccountRelProperties() + + +@dataclass(frozen=True) +class EMRClusterSchema(CartographyNodeSchema): + label: str = 'EMRCluster' + properties: EMRClusterNodeProperties = EMRClusterNodeProperties() + sub_resource_relationship: EMRClusterToAWSAccount = EMRClusterToAWSAccount() + + @timeit def load_emr_clusters( - neo4j_session: neo4j.Session, cluster_data: List[Dict], region: str, current_aws_account_id: str, - aws_update_tag: int, + neo4j_session: neo4j.Session, + cluster_data: List[Dict], + region: str, + current_aws_account_id: str, + aws_update_tag: int, ) -> None: - query = """ - UNWIND $Clusters as emr_cluster - MERGE (cluster:EMRCluster{id: emr_cluster.Name}) - ON CREATE SET cluster.firstseen = timestamp(), - cluster.arn = emr_cluster.ClusterArn, - cluster.id = emr_cluster.Id, - cluster.region = $Region - SET cluster.name = emr_cluster.Name, - cluster.instance_collection_type = emr_cluster.InstanceCollectionType, - cluster.log_encryption_kms_key_id = emr_cluster.LogEncryptionKmsKeyId, - cluster.requested_ami_version = emr_cluster.RequestedAmiVersion, - cluster.running_ami_version = emr_cluster.RunningAmiVersion, - cluster.release_label = emr_cluster.ReleaseLabel, - cluster.auto_terminate = emr_cluster.AutoTerminate, - cluster.termination_protected = emr_cluster.TerminationProtected, - cluster.visible_to_all_users = emr_cluster.VisibleToAllUsers, - cluster.master_public_dns_name = emr_cluster.MasterPublicDnsName, - cluster.security_configuration = 
emr_cluster.SecurityConfiguration, - cluster.autoscaling_role = emr_cluster.AutoScalingRole, - cluster.scale_down_behavior = emr_cluster.ScaleDownBehavior, - cluster.custom_ami_id = emr_cluster.CustomAmiId, - cluster.repo_upgrade_on_boot = emr_cluster.RepoUpgradeOnBoot, - cluster.outpost_arn = emr_cluster.OutpostArn, - cluster.log_uri = emr_cluster.LogUri, - cluster.servicerole = emr_cluster.ServiceRole, - cluster.lastupdated = $aws_update_tag - WITH cluster - - MATCH (owner:AWSAccount{id: $AWS_ACCOUNT_ID}) - MERGE (owner)-[r:RESOURCE]->(cluster) - ON CREATE SET r.firstseen = timestamp() - SET r.lastupdated = $aws_update_tag - """ - logger.info("Loading EMR %d clusters for region '%s' into graph.", len(cluster_data), region) - neo4j_session.run( - query, - Clusters=cluster_data, + + ingestion_query = build_ingestion_query(EMRClusterSchema()) + + load_graph_data( + neo4j_session, + ingestion_query, + cluster_data, + lastupdated=aws_update_tag, Region=region, - aws_update_tag=aws_update_tag, - AWS_ACCOUNT_ID=current_aws_account_id, - ).consume() + AccountId=current_aws_account_id, + ) @timeit diff --git a/setup.cfg b/setup.cfg index b364e22b44..312318941c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -63,5 +63,9 @@ ignore_errors = true disallow_untyped_defs = false allow_redefinition = true +# Intentional TypeErrors are here because we are testing if the code gives a helpful error message to the module author. 
+[mypy-tests.unit.cartography.graph.test_querybuilder_build_attach_links_excs] +ignore_errors = true + [coverage:report] fail_under = 30 diff --git a/cartography/client/__init__py b/tests/data/graph/__init__.py similarity index 100% rename from cartography/client/__init__py rename to tests/data/graph/__init__.py diff --git a/tests/data/graph/querybuilder/__init__.py b/tests/data/graph/querybuilder/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/data/graph/querybuilder/sample_data/__init__.py b/tests/data/graph/querybuilder/sample_data/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/data/graph/querybuilder/sample_data/helloworld_relationships.py b/tests/data/graph/querybuilder/sample_data/helloworld_relationships.py new file mode 100644 index 0000000000..4eaf902e88 --- /dev/null +++ b/tests/data/graph/querybuilder/sample_data/helloworld_relationships.py @@ -0,0 +1,42 @@ +MERGE_SUB_RESOURCE_QUERY = """ +MERGE (s:SubResource{id: "sub-resource-id"}) +ON CREATE SET s.lastupdated = 1 +""" + + +MERGE_HELLO_ASSET_QUERY = """ +MERGE (h:HelloAsset{id: "the-helloasset-id-1"}) +ON CREATE SET h.lastupdated = 1 +""" + + +MERGE_WORLD_ASSET_QUERY = """ +MERGE (w:WorldAsset{id: "the-worldasset-id-1"}) +ON CREATE SET w.lastupdated = 1 +""" + + +# This dataset shows an InterestingNode attached to a WorldAsset but no other relationships. +INTERESTING_NODE_WITH_PARTIAL_RELS = [ + { + 'Id': 'interesting-node-id', + 'property1': 'b', + 'property2': 'c', + 'AnotherField': 'd', + 'YetAnotherRelField': 'e', + 'world_asset_id': 'the-worldasset-id-1', + }, +] + +# This dataset shows an InterestingNode attached to a HelloAsset and a WorldAsset. 
+INTERESTING_NODE_WITH_ALL_RELS = [ + { + 'Id': 'interesting-node-id', + 'property1': 'b', + 'property2': 'c', + 'AnotherField': 'd', + 'YetAnotherRelField': 'e', + 'world_asset_id': 'the-worldasset-id-1', + 'hello_asset_id': 'the-helloasset_id-1', + }, +] diff --git a/tests/data/graph/querybuilder/sample_data/multiple_attr_match.py b/tests/data/graph/querybuilder/sample_data/multiple_attr_match.py new file mode 100644 index 0000000000..1d137369e5 --- /dev/null +++ b/tests/data/graph/querybuilder/sample_data/multiple_attr_match.py @@ -0,0 +1,37 @@ +MERGE_PERSONS = """ +MERGE (s1:Person{id: 1, first_name: "Homer", last_name: "Simpson", lastupdated: 1}) +MERGE (s2:Person{id: 2, first_name: "Marge", last_name: "Simpson", lastupdated: 1}) +MERGE (s3:Person{id: 3, first_name: "Bart", last_name: "Simpson", lastupdated: 1}) +MERGE (s4:Person{id: 4, first_name: "Lisa", last_name: "Simpson", lastupdated: 1}) +MERGE (s5:Person{id: 5, first_name: "Maggie", last_name: "Simpson", lastupdated: 1}) +""" + + +# This is intended to test matching on more than one attribute. +# Lisa has 1 computer, Homer has 2, everyone else has no computers. 
+TEST_COMPUTERS = [ + { + 'Id': 1234, + 'RAM_GB': 16, + 'NumCores': 4, + 'name': 'macbook-air', + 'LastName': 'Simpson', + 'FirstName': "Lisa", + }, + { + 'Id': 9876, + 'RAM_GB': 128, + 'NumCores': 32, + 'name': 'server-in-the-closet', + 'LastName': 'Simpson', + 'FirstName': "Homer", + }, + { + 'Id': 1337, + 'RAM_GB': 2048, + 'NumCores': 1024, + 'name': 'beefy-box', + 'LastName': 'Simpson', + 'FirstName': "Homer", + }, +] diff --git a/tests/data/graph/querybuilder/sample_models/__init__.py b/tests/data/graph/querybuilder/sample_models/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/data/graph/querybuilder/sample_models/interesting_asset.py b/tests/data/graph/querybuilder/sample_models/interesting_asset.py new file mode 100644 index 0000000000..93b31a4d9b --- /dev/null +++ b/tests/data/graph/querybuilder/sample_models/interesting_asset.py @@ -0,0 +1,94 @@ +from dataclasses import dataclass +from typing import Optional + +from cartography.graph.model import CartographyNodeProperties +from cartography.graph.model import CartographyNodeSchema +from cartography.graph.model import CartographyRelProperties +from cartography.graph.model import CartographyRelSchema +from cartography.graph.model import ExtraNodeLabels +from cartography.graph.model import LinkDirection +from cartography.graph.model import make_target_node_matcher +from cartography.graph.model import OtherRelationships +from cartography.graph.model import PropertyRef +from cartography.graph.model import TargetNodeMatcher +from tests.data.graph.querybuilder.sample_models.simple_node import SimpleNodeProperties + + +@dataclass(frozen=True) +class InterestingAssetProperties(CartographyNodeProperties): + id: PropertyRef = PropertyRef('Id') + lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True) + property1: PropertyRef = PropertyRef('property1') + property2: PropertyRef = PropertyRef('property2') + + +@dataclass(frozen=True) +class 
InterestingAssetToSubResourceRelProps(CartographyRelProperties): + lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True) + another_rel_field: PropertyRef = PropertyRef('AnotherField') + yet_another_rel_field: PropertyRef = PropertyRef("YetAnotherRelField") + + +@dataclass(frozen=True) +class InterestingAssetToSubResourceRel(CartographyRelSchema): + """ + Define a sub resource relationship + (:InterestingAsset)<-[:RELATIONSHIP_LABEL]-(:SubResource) + """ + target_node_label: str = 'SubResource' + target_node_matcher: TargetNodeMatcher = make_target_node_matcher( + {'id': PropertyRef('sub_resource_id', set_in_kwargs=True)}, + ) + direction: LinkDirection = LinkDirection.INWARD + rel_label: str = "RELATIONSHIP_LABEL" + properties: InterestingAssetToSubResourceRelProps = InterestingAssetToSubResourceRelProps() + + +@dataclass(frozen=True) +class InterestingAssetToHelloAssetRelProps(CartographyRelProperties): + lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True) + + +@dataclass(frozen=True) +class InterestingAssetToHelloAssetRel(CartographyRelSchema): + """ + Define an additional relationship + (:InterestingAsset)-[:ASSOCIATED_WITH]->(:HelloAsset) + """ + target_node_label: str = 'HelloAsset' + target_node_matcher: TargetNodeMatcher = make_target_node_matcher({'id': PropertyRef('hello_asset_id')}) + direction: LinkDirection = LinkDirection.OUTWARD + rel_label: str = "ASSOCIATED_WITH" + properties: InterestingAssetToHelloAssetRelProps = InterestingAssetToHelloAssetRelProps() + + +@dataclass(frozen=True) +class InterestingAssetToWorldAssetRelProps(CartographyRelProperties): + lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True) + + +@dataclass(frozen=True) +class InterestingAssetToWorldAssetRel(CartographyRelSchema): + """ + Define yet another relationship. 
+ (:InterestingAsset)<-[:CONNECTED]-(:WorldAsset) + """ + target_node_label: str = 'WorldAsset' + target_node_matcher: TargetNodeMatcher = make_target_node_matcher({'id': PropertyRef('world_asset_id')}) + direction: LinkDirection = LinkDirection.INWARD + rel_label: str = "CONNECTED" + properties: InterestingAssetToWorldAssetRelProps = InterestingAssetToWorldAssetRelProps() + + +@dataclass(frozen=True) +class InterestingAssetSchema(CartographyNodeSchema): + extra_node_labels: Optional[ExtraNodeLabels] = ExtraNodeLabels(['AnotherNodeLabel', 'YetAnotherNodeLabel']) + label: str = 'InterestingAsset' + properties: SimpleNodeProperties = SimpleNodeProperties() + sub_resource_relationship: InterestingAssetToSubResourceRel = InterestingAssetToSubResourceRel() + other_relationships: Optional[OtherRelationships] = OtherRelationships( + [ + InterestingAssetToHelloAssetRel(), + InterestingAssetToWorldAssetRel(), + ], + ) diff --git a/tests/data/graph/querybuilder/sample_models/multiple_attr_match.py b/tests/data/graph/querybuilder/sample_models/multiple_attr_match.py new file mode 100644 index 0000000000..6c9bb65ad6 --- /dev/null +++ b/tests/data/graph/querybuilder/sample_models/multiple_attr_match.py @@ -0,0 +1,51 @@ +from dataclasses import dataclass +from typing import Optional + +from cartography.graph.model import CartographyNodeProperties +from cartography.graph.model import CartographyNodeSchema +from cartography.graph.model import CartographyRelProperties +from cartography.graph.model import CartographyRelSchema +from cartography.graph.model import LinkDirection +from cartography.graph.model import make_target_node_matcher +from cartography.graph.model import OtherRelationships +from cartography.graph.model import PropertyRef +from cartography.graph.model import TargetNodeMatcher + + +@dataclass(frozen=True) +class TestComputerToPersonRelProps(CartographyRelProperties): + lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True) + + 
+@dataclass(frozen=True) +class TestComputerToPersonRel(CartographyRelSchema): + """ + (:TestComputer)<-[:OWNS]-(:Person) + """ + target_node_label: str = 'Person' + target_node_matcher: TargetNodeMatcher = make_target_node_matcher( + { + 'first_name': PropertyRef('FirstName'), + 'last_name': PropertyRef('LastName'), + }, + ) + direction: LinkDirection = LinkDirection.INWARD + rel_label: str = "OWNS" + properties: TestComputerToPersonRelProps = TestComputerToPersonRelProps() + + +# Node properties for TestComputer, which attaches to Person by matching on more than one attribute. +@dataclass(frozen=True) +class TestComputerProperties(CartographyNodeProperties): + id: PropertyRef = PropertyRef('Id') + lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True) + ram_gb: PropertyRef = PropertyRef('RAM_GB') + num_cores: PropertyRef = PropertyRef('NumCores') + name: PropertyRef = PropertyRef('name') + + +@dataclass(frozen=True) +class TestComputer(CartographyNodeSchema): + label: str = 'TestComputer' + properties: TestComputerProperties = TestComputerProperties() + other_relationships: Optional[OtherRelationships] = OtherRelationships([TestComputerToPersonRel()]) diff --git a/tests/data/graph/querybuilder/sample_models/simple_node.py b/tests/data/graph/querybuilder/sample_models/simple_node.py new file mode 100644 index 0000000000..7c8a122784 --- /dev/null +++ b/tests/data/graph/querybuilder/sample_models/simple_node.py @@ -0,0 +1,49 @@ +from dataclasses import dataclass + +from cartography.graph.model import CartographyNodeProperties +from cartography.graph.model import CartographyNodeSchema +from cartography.graph.model import CartographyRelProperties +from cartography.graph.model import CartographyRelSchema +from cartography.graph.model import LinkDirection +from cartography.graph.model import make_target_node_matcher +from cartography.graph.model import PropertyRef +from cartography.graph.model import TargetNodeMatcher + + +# Test defining a simple node with no relationships.
+@dataclass(frozen=True) +class SimpleNodeProperties(CartographyNodeProperties): + id: PropertyRef = PropertyRef('Id') + lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True) + property1: PropertyRef = PropertyRef('property1') + property2: PropertyRef = PropertyRef('property2') + + +@dataclass(frozen=True) +class SimpleNodeSchema(CartographyNodeSchema): + label: str = 'SimpleNode' + properties: SimpleNodeProperties = SimpleNodeProperties() + + +# Test defining a simple node with a sub resource rel: (:SimpleNode)<-[:RESOURCE]-(:SubResource) +@dataclass(frozen=True) +class SimpleNodeToSubResourceRelProps(CartographyRelProperties): + lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True) + + +@dataclass(frozen=True) +class SimpleNodeToSubResourceRel(CartographyRelSchema): + target_node_label: str = 'SubResource' + target_node_matcher: TargetNodeMatcher = make_target_node_matcher( + {'id': PropertyRef('sub_resource_id', set_in_kwargs=True)}, + ) + direction: LinkDirection = LinkDirection.INWARD + rel_label: str = "RELATIONSHIP_LABEL" + properties: SimpleNodeToSubResourceRelProps = SimpleNodeToSubResourceRelProps() + + +@dataclass(frozen=True) +class SimpleNodeWithSubResourceSchema(CartographyNodeSchema): + label: str = 'SimpleNode' + properties: SimpleNodeProperties = SimpleNodeProperties() + sub_resource_relationship: SimpleNodeToSubResourceRel = SimpleNodeToSubResourceRel() diff --git a/tests/integration/cartography/graph/__init__.py b/tests/integration/cartography/graph/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/integration/cartography/graph/test_querybuilder_labels_and_var_num_rels.py b/tests/integration/cartography/graph/test_querybuilder_labels_and_var_num_rels.py new file mode 100644 index 0000000000..aa9d3c3594 --- /dev/null +++ b/tests/integration/cartography/graph/test_querybuilder_labels_and_var_num_rels.py @@ -0,0 +1,168 @@ +from cartography.client.core.tx import load_graph_data 
+from cartography.graph.querybuilder import build_ingestion_query +from tests.data.graph.querybuilder.sample_data.helloworld_relationships import INTERESTING_NODE_WITH_ALL_RELS +from tests.data.graph.querybuilder.sample_data.helloworld_relationships import MERGE_HELLO_ASSET_QUERY +from tests.data.graph.querybuilder.sample_data.helloworld_relationships import MERGE_SUB_RESOURCE_QUERY +from tests.data.graph.querybuilder.sample_data.helloworld_relationships import MERGE_WORLD_ASSET_QUERY +from tests.data.graph.querybuilder.sample_models.interesting_asset import InterestingAssetSchema +from tests.data.graph.querybuilder.sample_models.interesting_asset import InterestingAssetToSubResourceRel +from tests.data.graph.querybuilder.sample_models.interesting_asset import InterestingAssetToWorldAssetRel + + +def test_load_graph_data_extra_node_labels_and_no_relationships(neo4j_session): + """ + Test that + - multiple labels defined on a CartographyNodeSchema are properly recorded to the graph. + - we are able to generate a query that includes no relationships in build_ingestion_query()'s + `selected_relationships` parameter. + """ + # Act: specify the empty set as selected_relationships to build_ingestion_query(). + query = build_ingestion_query(InterestingAssetSchema(), selected_relationships=set()) + + # Act: call `load_graph_data()` without specifying `sub_resource` or any other kwargs that were present on + # InterestingAsset's attached RelSchema. 
+ load_graph_data( + neo4j_session, + query, + INTERESTING_NODE_WITH_ALL_RELS, + lastupdated=1, + ) + + # Assert that the labels exist + expected = {'AnotherNodeLabel', 'InterestingAsset', 'YetAnotherNodeLabel'} + result = neo4j_session.run( + """ + MATCH (n1:InterestingAsset) RETURN labels(n1) AS labels; + """, + ) + actual = {label for label in result.data()[0]['labels']} + assert actual == expected + + +def test_load_graph_data_with_sub_rel_selected(neo4j_session): + """ + Test generating and running a query that includes only InterestingAssetSchema.sub_resource_relationship in + build_ingestion_query()'s selected_relationships parameter. + """ + # Arrange: add (:SubResource{id:sub-resource-id}) and (:WorldAsset{id: world-asset-id}) to the test graph + neo4j_session.run(MERGE_SUB_RESOURCE_QUERY) + neo4j_session.run(MERGE_HELLO_ASSET_QUERY) + neo4j_session.run(MERGE_WORLD_ASSET_QUERY) + + # Act + query = build_ingestion_query( + InterestingAssetSchema(), + selected_relationships={ + InterestingAssetToSubResourceRel(), + }, + ) + load_graph_data( + neo4j_session, + query, + INTERESTING_NODE_WITH_ALL_RELS, + lastupdated=1, + sub_resource_id='sub-resource-id', + ) + + # Assert that the InterestingAsset to SubResource relationship exists. + expected = {('interesting-node-id', 'sub-resource-id')} + result = neo4j_session.run( + """ + MATCH (n1:InterestingAsset)<-[:RELATIONSHIP_LABEL]-(n2:SubResource) RETURN n1.id, n2.id; + """, + ) + actual = {(r['n1.id'], r['n2.id']) for r in result} + assert actual == expected + + +def test_load_graph_data_with_worldasset_rel_selected(neo4j_session): + """ + Test generating and running a query that specifies only 1 of 2 of the rels in + InterestingAssetSchema.other_relationships to build_ingestion_query()'s selected_relationships parameter. 
+ """ + # Arrange: add (:SubResource{id:sub-resource-id}) and (:WorldAsset{id: world-asset-id}) to the test graph + neo4j_session.run(MERGE_SUB_RESOURCE_QUERY) + neo4j_session.run(MERGE_HELLO_ASSET_QUERY) + neo4j_session.run(MERGE_WORLD_ASSET_QUERY) + + # Act + query = build_ingestion_query( + InterestingAssetSchema(), + selected_relationships={ + InterestingAssetToWorldAssetRel(), + }, + ) + load_graph_data( + neo4j_session, + query, + INTERESTING_NODE_WITH_ALL_RELS, + lastupdated=1, + ) + + # Assert that the InterestingAsset to WorldAsset relationship exists + expected = {('interesting-node-id', 'the-worldasset-id-1')} + result = neo4j_session.run( + """ + MATCH (n1:InterestingAsset)<-[:CONNECTED]-(n2:WorldAsset) RETURN n1.id, n2.id; + """, + ) + actual = {(r['n1.id'], r['n2.id']) for r in result} + assert actual == expected + + +def test_load_graph_data_with_sub_resource_and_worldasset_rel_selected(neo4j_session): + """ + Test generating and running a query that includes InterestingAssetSchema.sub_resource_relationship + only 1 of 2 of + the rels in InterestingAssetSchema.other_relationships to build_ingestion_query()'s selected_relationships + parameter. 
+ """ + # Arrange: add (:SubResource{id:sub-resource-id}) and (:WorldAsset{id: world-asset-id}) to the test graph + neo4j_session.run(MERGE_SUB_RESOURCE_QUERY) + neo4j_session.run(MERGE_HELLO_ASSET_QUERY) + neo4j_session.run(MERGE_WORLD_ASSET_QUERY) + + # Act + query = build_ingestion_query( + InterestingAssetSchema(), + selected_relationships={ + InterestingAssetToSubResourceRel(), + InterestingAssetToWorldAssetRel(), + }, + ) + load_graph_data( + neo4j_session, + query, + INTERESTING_NODE_WITH_ALL_RELS, + lastupdated=1, + sub_resource_id='sub-resource-id', + ) + + # Assert that the InterestingAsset to WorldAsset relationship exists + expected = {('interesting-node-id', 'the-worldasset-id-1')} + result = neo4j_session.run( + """ + MATCH (n1:InterestingAsset)<-[:CONNECTED]-(n2:WorldAsset) RETURN n1.id, n2.id; + """, + ) + actual = {(r['n1.id'], r['n2.id']) for r in result} + assert actual == expected + + # Assert that the InterestingAsset to SubResource relationship exists. + expected = {('interesting-node-id', 'sub-resource-id')} + result = neo4j_session.run( + """ + MATCH (n1:InterestingAsset)<-[:RELATIONSHIP_LABEL]-(n2:SubResource) RETURN n1.id, n2.id; + """, + ) + actual = {(r['n1.id'], r['n2.id']) for r in result} + assert actual == expected + + # Assert that the InterestingAsset to Hello relationships does NOT exist. 
+ expected = set() + result = neo4j_session.run( + """ + MATCH (n1:InterestingAsset)-[:ASSOCIATED_WITH]->(n2:HelloAsset) RETURN n1.id, n2.id; + """, + ) + actual = {(r['n1.id'], r['n2.id']) for r in result} + assert actual == expected diff --git a/tests/integration/cartography/graph/test_querybuilder_match_on_multiple_attrs.py b/tests/integration/cartography/graph/test_querybuilder_match_on_multiple_attrs.py new file mode 100644 index 0000000000..04da068551 --- /dev/null +++ b/tests/integration/cartography/graph/test_querybuilder_match_on_multiple_attrs.py @@ -0,0 +1,41 @@ +from cartography.client.core.tx import load_graph_data +from cartography.graph.querybuilder import build_ingestion_query +from tests.data.graph.querybuilder.sample_data.multiple_attr_match import MERGE_PERSONS +from tests.data.graph.querybuilder.sample_data.multiple_attr_match import TEST_COMPUTERS +from tests.data.graph.querybuilder.sample_models.multiple_attr_match import TestComputer + + +def test_load_graph_data_match_on_multiple_attrs(neo4j_session): + """ + Test load_graph_data() if we have a relationship that matches on more than one attribute. + + In this test case, Persons can OWN TestComputers, and this assignment is made based on both first_name and + last_name. 
+ """ + # Arrange: add the (:Person) nodes defined by MERGE_PERSONS to the test graph + neo4j_session.run(MERGE_PERSONS) + + # Act + query = build_ingestion_query(TestComputer()) + load_graph_data( + neo4j_session, + query, + TEST_COMPUTERS, + lastupdated=1, + ) + + # Assert that Homer has 2 computers and Lisa has 1 computer + expected = { + ('server-in-the-closet', 'Homer'), + ('beefy-box', 'Homer'), + ('macbook-air', 'Lisa'), + } + result = neo4j_session.run( + """ + MATCH (n1:TestComputer)<-[:OWNS]-(n2:Person) RETURN n1.name, n2.first_name; + """, + ) + actual = { + (r['n1.name'], r['n2.first_name']) for r in result + } + assert actual == expected diff --git a/tests/integration/cartography/graph/test_querybuilder_rel_subsets.py b/tests/integration/cartography/graph/test_querybuilder_rel_subsets.py new file mode 100644 index 0000000000..d3dbdd6634 --- /dev/null +++ b/tests/integration/cartography/graph/test_querybuilder_rel_subsets.py @@ -0,0 +1,141 @@ +from cartography.client.core.tx import load_graph_data +from cartography.graph.querybuilder import build_ingestion_query +from tests.data.graph.querybuilder.sample_data.helloworld_relationships import INTERESTING_NODE_WITH_PARTIAL_RELS +from tests.data.graph.querybuilder.sample_data.helloworld_relationships import MERGE_SUB_RESOURCE_QUERY +from tests.data.graph.querybuilder.sample_data.helloworld_relationships import MERGE_WORLD_ASSET_QUERY +from tests.data.graph.querybuilder.sample_models.interesting_asset import InterestingAssetSchema + + +def test_load_graph_data_subset_of_relationships(neo4j_session): + """ + Test load_graph_data() if a schema defines multiple relationships but only a subset of them are possible to create + given our data.
+ + In this test case, the following relationships are possible: + (:InterestingAsset)<-[:RELATIONSHIP_LABEL]-(:SubResource) + (:InterestingAsset)-[:ASSOCIATED_WITH]->(:HelloAsset) + (:InterestingAsset)<-[:CONNECTED]-(:WorldAsset) + but our test data does not include a HelloAsset. + """ + # Arrange: add (:SubResource{id:sub-resource-id}) and (:WorldAsset{id: world-asset-id}) to the test graph + neo4j_session.run(MERGE_SUB_RESOURCE_QUERY) + neo4j_session.run(MERGE_WORLD_ASSET_QUERY) + + # Act + query = build_ingestion_query(InterestingAssetSchema()) + load_graph_data( + neo4j_session, + query, + INTERESTING_NODE_WITH_PARTIAL_RELS, + lastupdated=1, + sub_resource_id='sub-resource-id', + ) + + # Assert that the InterestingAsset to SubResource relationship exists + expected = { + ('interesting-node-id', 'sub-resource-id'), + } + result = neo4j_session.run( + """ + MATCH (n1:InterestingAsset)<-[:RELATIONSHIP_LABEL]-(n2:SubResource) RETURN n1.id, n2.id; + """, + ) + actual = { + (r['n1.id'], r['n2.id']) for r in result + } + assert actual == expected + + # Assert that the InterestingAsset to HelloAsset relationship does NOT exist + expected = { + ('interesting-node-id', None), + } + result = neo4j_session.run( + """ + MATCH (n1:InterestingAsset) + OPTIONAL MATCH (n1)--(n2:HelloAsset) + RETURN n1.id, n2.id; + """, + ) + actual = { + (r['n1.id'], r['n2.id']) for r in result + } + assert actual == expected + + # Assert that the InterestingAsset to WorldAsset relationship exists + expected = { + ('interesting-node-id', 'the-worldasset-id-1'), + } + result = neo4j_session.run( + """ + MATCH (n1:InterestingAsset)<-[:CONNECTED]-(n2:WorldAsset) RETURN n1.id, n2.id; + """, + ) + actual = { + (r['n1.id'], r['n2.id']) for r in result + } + assert actual == expected + + +def test_load_graph_data_subset_of_relationships_only_sub_resource(neo4j_session): + """ + In this test case, our test data only includes the sub resource relationship + """ + # Arrange: add 
def remove_leading_whitespace_and_empty_lines(text: str) -> str:
    """
    Helper function for tests.
    On the given text string, remove all leading whitespace on each line and remove blank lines.
    :param text: Text string
    :return: The text string but with no leading whitespace and no blank lines.
    """
    # Strip every line exactly once (lazily) instead of once for the filter and once for the output.
    stripped_lines = (line.lstrip() for line in text.split('\n'))
    # A line that was empty or whitespace-only is '' after lstrip(), which is falsy, so it is dropped here.
    return '\n'.join(line for line in stripped_lines if line)
(i:SomeNodeWithOnlyAnId{id:item.IdOnTheDictObject}) - ON CREATE SET i.firstseen = timestamp() - SET i.lastupdated = $UpdateTag""" - - -def test_build_relationship_ingestion_query(): - query = build_relationship_ingestion_query( - 'AWSAccount', 'id', 'Id', - 'EC2Instance', 'instanceid', 'InstanceId', - 'RESOURCE', - ) - assert query == """ - UNWIND $RelMappingList AS item - MATCH (a:AWSAccount{id:item.Id}) - MATCH (b:EC2Instance{instanceid:item.InstanceId}) - MERGE (a)-[r:RESOURCE]->(b) - ON CREATE SET r.firstseen = timestamp() - SET r.lastupdated = $UpdateTag""" - - -def test_build_relationship_with_attributes_query(): - query = build_relationship_ingestion_query( - 'Service', 'name', 'Name', - 'GoLibrary', 'id', 'Id', - 'REQUIRES', - { - 'libraryspecifier': 'LibrarySpecifier', - 'someotherrelfield': 'SomeOtherRelField', - }, - ) - assert query == """ - UNWIND $RelMappingList AS item - MATCH (a:Service{name:item.Name}) - MATCH (b:GoLibrary{id:item.Id}) - MERGE (a)-[r:REQUIRES]->(b) - ON CREATE SET r.firstseen = timestamp() - SET r.lastupdated = $UpdateTag, - r.libraryspecifier = item.LibrarySpecifier, - r.someotherrelfield = item.SomeOtherRelField""" diff --git a/tests/unit/cartography/graph/test_querybuilder_build_attach_links_excs.py b/tests/unit/cartography/graph/test_querybuilder_build_attach_links_excs.py new file mode 100644 index 0000000000..24c1e12ca9 --- /dev/null +++ b/tests/unit/cartography/graph/test_querybuilder_build_attach_links_excs.py @@ -0,0 +1,79 @@ +from dataclasses import dataclass + +from pytest import raises + +from cartography.graph.model import CartographyNodeProperties +from cartography.graph.model import CartographyNodeSchema +from cartography.graph.model import CartographyRelProperties +from cartography.graph.model import CartographyRelSchema +from cartography.graph.model import LinkDirection +from cartography.graph.model import make_target_node_matcher +from cartography.graph.model import OtherRelationships +from cartography.graph.model 
import PropertyRef +from cartography.graph.model import TargetNodeMatcher +from cartography.graph.querybuilder import _build_attach_additional_links_statement +from cartography.graph.querybuilder import _build_attach_sub_resource_statement + + +@dataclass(frozen=True) +class MyNodeToBillingUnitRelProps(CartographyRelProperties): + lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True) + + +@dataclass(frozen=True) +class MyNodeToBillingUnitRel(CartographyRelSchema): + target_node_label: str = 'BillingUnit' + target_node_matcher: TargetNodeMatcher = make_target_node_matcher({'id': PropertyRef('billing_unit_id')}) + direction: LinkDirection = LinkDirection.OUTWARD + rel_label: str = "BILLING_UNIT" + # This is intentionally missing "()" at the end. This will raise an exception! + properties: MyNodeToBillingUnitRelProps = MyNodeToBillingUnitRelProps + + +@dataclass(frozen=True) +class MyNodeToOtherNodeRelProps(CartographyRelProperties): + lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True) + + +@dataclass(frozen=True) +class MyNodeToOtherNodeRel(CartographyRelSchema): + target_node_label: str = 'OtherNode' + target_node_matcher: TargetNodeMatcher = make_target_node_matcher({'id': PropertyRef('other_node_id')}) + direction: LinkDirection = LinkDirection.INWARD + rel_label: str = "REL_LABEL_GOES_HERE" + # This is intentionally missing "()" at the end. This will raise an exception! 
def test_build_attach_addl_links_raises_typeerror():
    """
    _build_attach_additional_links_statement calls asdict() on each rel in node_schema.other_relationships. If the
    module author forgot to put `()` at the end of each RelSchema, Python will treat it as a "type" and not a
    dataclass, so asdict() will fail with a TypeError.
    This test ensures that we raise a helpful error message for this situation, because IDEs don't always catch this
    mistake.
    """
    # Arrange: build the schema outside of the `raises` block, so that a TypeError raised while constructing the
    # test fixture cannot be mistaken for the TypeError we expect from the function under test.
    other_relationships = MyNodeSchema().other_relationships

    # Act and assert
    with raises(TypeError):
        _build_attach_additional_links_statement(other_relationships)
+ """ + with raises(TypeError): + _ = _build_attach_sub_resource_statement(MyNodeSchema().sub_resource_relationship) diff --git a/tests/unit/cartography/graph/test_querybuilder_complex.py b/tests/unit/cartography/graph/test_querybuilder_complex.py new file mode 100644 index 0000000000..acdafe083f --- /dev/null +++ b/tests/unit/cartography/graph/test_querybuilder_complex.py @@ -0,0 +1,55 @@ +from cartography.graph.querybuilder import build_ingestion_query +from tests.data.graph.querybuilder.sample_models.interesting_asset import InterestingAssetSchema +from tests.unit.cartography.graph.helpers import remove_leading_whitespace_and_empty_lines + + +def test_build_ingestion_query_complex(): + # Act + query = build_ingestion_query(InterestingAssetSchema()) + + expected = """ + UNWIND $DictList AS item + MERGE (i:InterestingAsset{id: item.Id}) + ON CREATE SET i.firstseen = timestamp() + SET + i.lastupdated = $lastupdated, + i.property1 = item.property1, + i.property2 = item.property2, + i:AnotherNodeLabel:YetAnotherNodeLabel + + WITH i, item + CALL { + WITH i, item + OPTIONAL MATCH (j:SubResource{id: $sub_resource_id}) + WITH i, item, j WHERE j IS NOT NULL + MERGE (i)<-[r:RELATIONSHIP_LABEL]-(j) + ON CREATE SET r.firstseen = timestamp() + SET + r.lastupdated = $lastupdated, + r.another_rel_field = item.AnotherField, + r.yet_another_rel_field = item.YetAnotherRelField + + UNION + WITH i, item + OPTIONAL MATCH (n0:HelloAsset{id: item.hello_asset_id}) + WITH i, item, n0 WHERE n0 IS NOT NULL + MERGE (i)-[r0:ASSOCIATED_WITH]->(n0) + ON CREATE SET r0.firstseen = timestamp() + SET + r0.lastupdated = $lastupdated + + UNION + WITH i, item + OPTIONAL MATCH (n1:WorldAsset{id: item.world_asset_id}) + WITH i, item, n1 WHERE n1 IS NOT NULL + MERGE (i)<-[r1:CONNECTED]-(n1) + ON CREATE SET r1.firstseen = timestamp() + SET + r1.lastupdated = $lastupdated + } + """ + + # Assert: compare query outputs while ignoring leading whitespace. 
def test_filter_selected_rels_raises_value_err():
    """
    Specify a RelSchema that is not present on a given NodeSchema -> expect a ValueError.
    """
    # Arrange: the selected relationship comes from the InterestingAsset sample model, not from SimpleNodeSchema.
    node_schema = SimpleNodeSchema()
    rels_not_on_schema = {InterestingAssetToSubResourceRel()}

    # Act and assert: the return value is deliberately NOT unpacked inside the `raises` block. A failed tuple
    # unpacking raises ValueError as well, which would let this test pass for the wrong reason.
    with raises(ValueError):
        _filter_selected_relationships(
            node_schema,
            selected_relationships=rels_not_on_schema,
        )
def test_simplenode_with_subresource_sanity_checks():
    """
    Test creating a simple node schema that has a sub resource relationship and ensure that the remaining optional
    attributes are indeed None.
    """
    schema: SimpleNodeWithSubResourceSchema = SimpleNodeWithSubResourceSchema()
    # The sub resource relationship is the one optional attribute this schema is expected to implement.
    assert schema.sub_resource_relationship is not None
    # Assert that the unimplemented, non-abstract properties have None values.
    assert schema.extra_node_labels is None
    assert schema.other_relationships is None