From 9031d2ab9945ee8dccf7dfed78de23f2da088724 Mon Sep 17 00:00:00 2001 From: Alex Chantavy Date: Tue, 4 Apr 2023 17:20:04 -0700 Subject: [PATCH 1/4] Volume data model, make a failing test --- cartography/intel/aws/ec2/volumes.py | 118 ++++++------ cartography/intel/aws/util/arns.py | 18 ++ cartography/models/aws/ec2/volumes.py | 14 ++ tests/data/aws/ec2/volumes.py | 4 +- .../intel/aws/ec2/test_ec2_volumes.py | 180 ++++++++++-------- 5 files changed, 189 insertions(+), 145 deletions(-) create mode 100644 cartography/intel/aws/util/arns.py diff --git a/cartography/intel/aws/ec2/volumes.py b/cartography/intel/aws/ec2/volumes.py index 6de03c7d42..3ad50f1186 100644 --- a/cartography/intel/aws/ec2/volumes.py +++ b/cartography/intel/aws/ec2/volumes.py @@ -6,7 +6,9 @@ import boto3 import neo4j +from cartography.client.core.tx import load from cartography.graph.job import GraphJob +from cartography.intel.aws.util.arns import build_arn from cartography.models.aws.ec2.volumes import EBSVolumeSchema from cartography.util import aws_handle_regions from cartography.util import timeit @@ -16,7 +18,7 @@ @timeit @aws_handle_regions -def get_volumes(boto3_session: boto3.session.Session, region: str) -> List[Dict]: +def get_volumes(boto3_session: boto3.session.Session, region: str) -> List[Dict[str, Any]]: client = boto3_session.client('ec2', region_name=region) paginator = client.get_paginator('describe_volumes') volumes: List[Dict] = [] @@ -26,90 +28,76 @@ def get_volumes(boto3_session: boto3.session.Session, region: str) -> List[Dict] def transform_volumes(volumes: List[Dict[str, Any]], region: str, current_aws_account_id: str) -> List[Dict[str, Any]]: + result = [] for volume in volumes: - volume['VolumeArn'] = f"arn:aws:ec2:{region}:{current_aws_account_id}:volume/{volume['VolumeId']}" - volume['CreateTime'] = str(volume['CreateTime']) - return volumes + attachments = volume.get('Attachments', []) + active_attachments = [a for a in attachments if a['State'] == 'attached'] + + volume_id = volume['VolumeId'] + raw_vol = ({ + 'Arn': build_arn('ec2', current_aws_account_id, 'volume', volume_id, region), + 'AvailabilityZone': volume['AvailabilityZone'], + 'CreateTime': volume['CreateTime'], + 'Encrypted': volume['Encrypted'], + 'Size': volume['Size'], + 'State': volume['State'], + 'OutpostArn': volume['OutpostArn'], + 'SnapshotId': volume['SnapshotId'], + 'Iops': volume['Iops'], + 'FastRestored': volume['FastRestored'], + 'MultiAttachEnabled': volume['MultiAttachEnabled'], + 'VolumeType': volume['VolumeType'], + 'VolumeId': volume_id, + 'KmsKeyId': volume['KmsKeyId'], + }) + + if not active_attachments: + result.append(raw_vol) + continue + + for attachment in active_attachments: + vol_with_attachment = raw_vol.copy() + vol_with_attachment['InstanceId'] = attachment['InstanceId'] + result.append(vol_with_attachment) + + return result @timeit def load_volumes( - neo4j_session: neo4j.Session, data: List[Dict], region: str, current_aws_account_id: str, update_tag: int, + neo4j_session: neo4j.Session, + ebs_data: List[Dict[str, Any]], + region: str, + current_aws_account_id: str, + update_tag: int, ) -> None: - ingest_volumes = """ - UNWIND $volumes_list as volume - MERGE (vol:EBSVolume{id: volume.VolumeId}) - ON CREATE SET vol.firstseen = timestamp() - SET vol.arn = volume.VolumeArn, - vol.lastupdated = $update_tag, - vol.availabilityzone = volume.AvailabilityZone, - vol.createtime = volume.CreateTime, - vol.encrypted = volume.Encrypted, - vol.size = volume.Size, - vol.state = volume.State, - vol.outpostarn = volume.OutpostArn, - vol.snapshotid = volume.SnapshotId, - vol.iops = volume.Iops, - vol.fastrestored = volume.FastRestored, - vol.multiattachenabled = volume.MultiAttachEnabled, - vol.type = volume.VolumeType, - vol.kmskeyid = volume.KmsKeyId, - vol.region=$Region - WITH vol - MATCH (aa:AWSAccount{id: $AWS_ACCOUNT_ID}) - MERGE (aa)-[r:RESOURCE]->(vol) - ON CREATE SET r.firstseen = timestamp() - SET r.lastupdated = $update_tag - """ - - neo4j_session.run( - ingest_volumes, - volumes_list=data, - AWS_ACCOUNT_ID=current_aws_account_id, + load( + neo4j_session, + EBSVolumeSchema(), + ebs_data, Region=region, - update_tag=update_tag, + AWS_ID=current_aws_account_id, + lastupdated=update_tag, ) -def load_volume_relationships( - neo4j_session: neo4j.Session, - volumes: List[Dict[str, Any]], - aws_update_tag: int, -) -> None: - add_relationship_query = """ - MATCH (volume:EBSVolume{arn: $VolumeArn}) - WITH volume - MATCH (instance:EC2Instance{instanceid: $InstanceId}) - MERGE (volume)-[r:ATTACHED_TO_EC2_INSTANCE]->(instance) - ON CREATE SET r.firstseen = timestamp() - SET r.lastupdated = $aws_update_tag - """ - for volume in volumes: - for attachment in volume.get('Attachments', []): - if attachment['State'] != 'attached': - continue - neo4j_session.run( - add_relationship_query, - VolumeArn=volume['VolumeArn'], - InstanceId=attachment['InstanceId'], - aws_update_tag=aws_update_tag, - ) - - @timeit -def cleanup_volumes(neo4j_session: neo4j.Session, common_job_parameters: Dict) -> None: +def cleanup_volumes(neo4j_session: neo4j.Session, common_job_parameters: Dict[str, Any]) -> None: GraphJob.from_node_schema(EBSVolumeSchema(), common_job_parameters).run(neo4j_session) @timeit def sync_ebs_volumes( - neo4j_session: neo4j.Session, boto3_session: boto3.session.Session, regions: List[str], - current_aws_account_id: str, update_tag: int, common_job_parameters: Dict, + neo4j_session: neo4j.Session, + boto3_session: boto3.session.Session, + regions: List[str], + current_aws_account_id: str, + update_tag: int, + common_job_parameters: Dict[str, Any], ) -> None: for region in regions: logger.debug("Syncing volumes for region '%s' in account '%s'.", region, current_aws_account_id) data = get_volumes(boto3_session, region) transformed_data = transform_volumes(data, region, current_aws_account_id) load_volumes(neo4j_session, transformed_data, region, current_aws_account_id, update_tag) - load_volume_relationships(neo4j_session, transformed_data, update_tag) cleanup_volumes(neo4j_session, common_job_parameters) diff --git a/cartography/intel/aws/util/arns.py b/cartography/intel/aws/util/arns.py new file mode 100644 index 0000000000..e6108b82c3 --- /dev/null +++ b/cartography/intel/aws/util/arns.py @@ -0,0 +1,18 @@ +from typing import Optional + + +def build_arn( + resource: str, + account: str, + typename: str, + name: str, + region: Optional[str] = None, + partition: Optional[str] = None, +) -> str: + if not partition: + # TODO: support partitions from others. Please file an issue on this if needed, would love to hear from you + partition = 'aws' + if not region: + # Some resources are present in all regions, e.g. IAM policies + region = "" + return f"arn:{partition}:{resource}:{region}:{account}:{typename}/{name}" diff --git a/cartography/models/aws/ec2/volumes.py b/cartography/models/aws/ec2/volumes.py index 2140f4fcd0..4e4e336636 100644 --- a/cartography/models/aws/ec2/volumes.py +++ b/cartography/models/aws/ec2/volumes.py @@ -13,10 +13,24 @@ @dataclass(frozen=True) class EBSVolumeNodeProperties(CartographyNodeProperties): + arn: PropertyRef = PropertyRef('Arn', extra_index=True) id: PropertyRef = PropertyRef('VolumeId') + volumeid: PropertyRef = PropertyRef('VolumeId', extra_index=True) region: PropertyRef = PropertyRef('Region', set_in_kwargs=True) lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True) deleteontermination: PropertyRef = PropertyRef('DeleteOnTermination') + availabilityzone: PropertyRef = PropertyRef('AvailabilityZone') + createtime: PropertyRef = PropertyRef('CreateTime') + encrypted: PropertyRef = PropertyRef('Encrypted') + size: PropertyRef = PropertyRef('Size') + state: PropertyRef = PropertyRef('State') + outpostarn: PropertyRef = PropertyRef('OutpostArn') + snapshotid: PropertyRef = PropertyRef('SnapshotId') + iops: PropertyRef = PropertyRef('Iops') + fastrestored: PropertyRef = PropertyRef('FastRestored') + multiattachenabled: PropertyRef = PropertyRef('MultiAttachEnabled') + type: PropertyRef = PropertyRef('VolumeType') + kmskeyid: PropertyRef = PropertyRef('KmsKeyId') @dataclass(frozen=True) diff --git a/tests/data/aws/ec2/volumes.py b/tests/data/aws/ec2/volumes.py index e3a29c03fa..6e08003e73 100644 --- a/tests/data/aws/ec2/volumes.py +++ b/tests/data/aws/ec2/volumes.py @@ -14,7 +14,7 @@ 'Size': 123, 'SnapshotId': 'sn-01', 'State': 'available', - 'VolumeId': 'v-01', + 'VolumeId': 'vol-0df', 'Iops': 123, 'VolumeType': 'standard', 'FastRestored': True, @@ -33,7 +33,7 @@ 'OutpostArn': 'arn1', 'Size': 123, 'State': 'available', - 'VolumeId': 'v-02', + 'VolumeId': 'vol-03', 'Iops': 123, 'SnapshotId': 'sn-02', 'VolumeType': 'standard', diff --git a/tests/integration/cartography/intel/aws/ec2/test_ec2_volumes.py b/tests/integration/cartography/intel/aws/ec2/test_ec2_volumes.py index 705d2db3ec..186fe97275 100644 --- a/tests/integration/cartography/intel/aws/ec2/test_ec2_volumes.py +++ b/tests/integration/cartography/intel/aws/ec2/test_ec2_volumes.py @@ -3,94 +3,100 @@ import cartography.intel.aws.ec2.instances import cartography.intel.aws.ec2.volumes -import tests.data.aws.ec2.instances -import tests.data.aws.ec2.volumes from cartography.intel.aws.ec2.instances import sync_ec2_instances +from cartography.intel.aws.ec2.volumes import sync_ebs_volumes from tests.data.aws.ec2.instances import DESCRIBE_INSTANCES - +from tests.data.aws.ec2.volumes import DESCRIBE_VOLUMES +from tests.integration.cartography.intel.aws.common import create_test_account +from tests.integration.util import check_nodes +from tests.integration.util import check_rels TEST_ACCOUNT_ID = '000000000000' TEST_REGION = 'eu-west-1' TEST_UPDATE_TAG = 123456789 -def test_load_volumes(neo4j_session): +@patch.object(cartography.intel.aws.ec2.volumes, 'get_volumes', return_value=DESCRIBE_VOLUMES) +def test_sync_ebs_volumes(mock_get_vols, neo4j_session): # Arrange - data = tests.data.aws.ec2.volumes.DESCRIBE_VOLUMES - transformed_data = cartography.intel.aws.ec2.volumes.transform_volumes(data, TEST_REGION, TEST_ACCOUNT_ID) + boto3_session = MagicMock() + create_test_account(neo4j_session, TEST_ACCOUNT_ID, TEST_UPDATE_TAG) # Act - cartography.intel.aws.ec2.volumes.load_volumes( + sync_ebs_volumes( neo4j_session, - transformed_data, - TEST_REGION, + boto3_session, + [TEST_REGION], TEST_ACCOUNT_ID, TEST_UPDATE_TAG, + {'UPDATE_TAG': TEST_UPDATE_TAG, 'AWS_ID': TEST_ACCOUNT_ID}, ) # Assert - expected_nodes = { - "v-01", "v-02", + assert check_nodes(neo4j_session, 'EBSVolume', ['arn']) == { + ('arn:aws:ec2:eu-west-1:000000000000:volume/vol-03',), + ('arn:aws:ec2:eu-west-1:000000000000:volume/vol-0df',), } - nodes = neo4j_session.run( - """ - MATCH (r:EBSVolume) RETURN r.id; - """, - ) - actual_nodes = {n['r.id'] for n in nodes} - - assert actual_nodes == expected_nodes - - -def test_load_volume_to_account_rels(neo4j_session): + # Assert + assert check_rels( + neo4j_session, + 'AWSAccount', + 'id', + 'EBSVolume', + 'volumeid', + 'RESOURCE', + rel_direction_right=True, + ) == { + (TEST_ACCOUNT_ID, 'vol-03'), + (TEST_ACCOUNT_ID, 'vol-0df'), + } - # Arrange: Create Test AWSAccount - neo4j_session.run( - """ - MERGE (aws:AWSAccount{id: $aws_account_id}) - ON CREATE SET aws.firstseen = timestamp() - SET aws.lastupdated = $aws_update_tag - """, - aws_account_id=TEST_ACCOUNT_ID, - aws_update_tag=TEST_UPDATE_TAG, - ) - # Act: Load Test Volumes - data = tests.data.aws.ec2.volumes.DESCRIBE_VOLUMES - transformed_data = cartography.intel.aws.ec2.volumes.transform_volumes(data, TEST_REGION, TEST_ACCOUNT_ID) +@patch.object(cartography.intel.aws.ec2.instances, 'get_ec2_instances', return_value=DESCRIBE_INSTANCES['Reservations']) +@patch.object(cartography.intel.aws.ec2.volumes, 'get_volumes', return_value=DESCRIBE_VOLUMES) +def test_sync_ebs_volumes_e2e(mock_get_vols, mock_get_instances, neo4j_session): + # Arrange + neo4j_session.run('MATCH (n) DETACH DELETE n;') + boto3_session = MagicMock() + create_test_account(neo4j_session, TEST_ACCOUNT_ID, TEST_UPDATE_TAG) - cartography.intel.aws.ec2.volumes.load_volumes( + # Act: sync_ec2_instances() loads attached ebs volumes + sync_ec2_instances( neo4j_session, - transformed_data, - TEST_REGION, + boto3_session, + [TEST_REGION], TEST_ACCOUNT_ID, TEST_UPDATE_TAG, + {'UPDATE_TAG': TEST_UPDATE_TAG, 'AWS_ID': TEST_ACCOUNT_ID}, ) - # Assert - expected = { - (TEST_ACCOUNT_ID, 'v-01'), - (TEST_ACCOUNT_ID, 'v-02'), + # Assert that deleteontermination is set by sync_ec2_instances + assert check_nodes(neo4j_session, 'EBSVolume', ['id', 'deleteontermination', 'encrypted']) == { + ('vol-03', True, None), + ('vol-04', True, None), + ('vol-09', True, None), + ('vol-0df', True, None), } - result = neo4j_session.run( - """ - MATCH (n1:AWSAccount)-[:RESOURCE]->(n2:EBSVolume) RETURN n1.id, n2.id; - """, - ) - actual = { - (r['n1.id'], r['n2.id']) for r in result + # Assert that they are attached to the instances + assert check_rels( + neo4j_session, + 'EC2Instance', + 'instanceid', + 'EBSVolume', + 'volumeid', + 'ATTACHED_TO', + rel_direction_right=False, + ) == { + ('i-01', 'vol-0df'), + ('i-02', 'vol-03'), + ('i-03', 'vol-09'), + ('i-04', 'vol-04'), } - assert actual == expected - - -@patch.object(cartography.intel.aws.ec2.instances, 'get_ec2_instances', return_value=DESCRIBE_INSTANCES['Reservations']) -def test_load_volume_to_instance_rels(mock_get_instances, neo4j_session): - # Arrange: Load in ec2 instances first - boto3_session = MagicMock() - sync_ec2_instances( + # Act + sync_ebs_volumes( neo4j_session, boto3_session, [TEST_REGION], @@ -98,28 +104,46 @@ def test_load_volume_to_instance_rels(mock_get_instances, neo4j_session): TEST_UPDATE_TAG, {'UPDATE_TAG': TEST_UPDATE_TAG, 'AWS_ID': TEST_ACCOUNT_ID}, ) - # Prep the volume data - raw_volumes = tests.data.aws.ec2.volumes.DESCRIBE_VOLUMES - transformed_volumes = cartography.intel.aws.ec2.volumes.transform_volumes(raw_volumes, TEST_REGION, TEST_ACCOUNT_ID) - # Act - cartography.intel.aws.ec2.volumes.load_volume_relationships( - neo4j_session, - transformed_volumes, - TEST_UPDATE_TAG, - ) + # Assert that additional fields such as `encrypted` have been added by sync_ebs_volumes(), while + # deleteontermination has not been overwritten with None by sync_ebs_volumes() + assert check_nodes(neo4j_session, 'EBSVolume', ['id', 'deleteontermination', 'encrypted']) == { + # Attached to the instances initially + ('vol-04', True, None), + ('vol-09', True, None), + # Added by ebs sync + ('vol-03', True, True), + ('vol-0df', True, True), + } - # Assert - result = neo4j_session.run( - """ - MATCH (n1:EC2Instance)<-[:ATTACHED_TO_EC2_INSTANCE]-(n2:EBSVolume) RETURN n1.id, n2.id; - """, - ) - expected = { - ('i-01', 'v-01'), - ('i-02', 'v-02'), + # Assert that they are still attached to the instances + assert check_rels( + neo4j_session, + 'EC2Instance', + 'instanceid', + 'EBSVolume', + 'volumeid', + 'ATTACHED_TO', + rel_direction_right=False, + ) == { + ('i-01', 'vol-0df'), + ('i-02', 'vol-03'), + ('i-03', 'vol-09'), + ('i-04', 'vol-04'), } - actual = { - (r['n1.id'], r['n2.id']) for r in result + + # Assert that the account to volume rels exist + assert check_rels( + neo4j_session, + 'AWSAccount', + 'id', + 'EBSVolume', + 'volumeid', + 'RESOURCE', + rel_direction_right=True, + ) == { + ('000000000000', 'vol-03'), + ('000000000000', 'vol-04'), + ('000000000000', 'vol-09'), + ('000000000000', 'vol-0df'), } - assert actual == expected From 2d50fb9bef5f2c5571983aa1fdc4c9c36042ad94 Mon Sep 17 00:00:00 2001 From: Alex Chantavy Date: Wed, 5 Apr 2023 22:47:34 -0700 Subject: [PATCH 2/4] Handle multiple modules updating the same nodes, documents --- cartography/graph/querybuilder.py | 3 +- cartography/models/core/common.py | 91 +++++++++++++------ .../graph/test_querybuilder_complex.py | 12 +-- .../graph/test_querybuilder_simple.py | 9 +- 4 files changed, 76 insertions(+), 39 deletions(-) diff --git a/cartography/graph/querybuilder.py b/cartography/graph/querybuilder.py index e33d444265..90b5f8aa32 100644 --- a/cartography/graph/querybuilder.py +++ b/cartography/graph/querybuilder.py @@ -41,7 +41,7 @@ def _build_node_properties_statement( i.node_prop_1 = item.Prop1, i.node_prop_2 = $Prop2 ``` - where `i` is a reference to the Neo4j node. + where `i` is a reference to the Neo4j node and `item` is a reference to the current dict being processed. :param node_property_map: Mapping of node attribute names as str to PropertyRef objects :param extra_node_labels: Optional ExtraNodeLabels object to set on the node as string :return: The resulting Neo4j SET clause to set the given attributes on the node @@ -79,6 +79,7 @@ def _build_rel_properties_statement(rel_var: str, rel_property_map: Optional[Dic r.rel_prop_1 = item.Prop1, r.rel_prop_2 = $Prop2 + where `item` is a reference to the current dict being processed. :param rel_var: The variable name to use for the relationship in the Neo4j query :param rel_property_map: Mapping of relationship attribute names as str to PropertyRef objects :return: The resulting Neo4j SET clause to set the given attributes on the relationship diff --git a/cartography/models/core/common.py b/cartography/models/core/common.py index 6ee5d664c6..817eac87c1 100644 --- a/cartography/models/core/common.py +++ b/cartography/models/core/common.py @@ -1,30 +1,41 @@ class PropertyRef: """ - PropertyRefs represent properties on cartography nodes and relationships. + PropertyRefs represent properties on cartography nodes and relationships. Here is how they are used: - cartography takes lists of Python dicts and loads them to Neo4j. PropertyRefs allow our dynamically generated Neo4j - ingestion queries to set values for a given node or relationship property from (A) a field on the dict being - processed (PropertyRef.set_in_kwargs=False; default), or (B) from a single variable provided by a keyword argument - (PropertyRef.set_in_kwargs=True). + (1) cartography takes lists of Python dicts and loads them to Neo4j. PropertyRefs allow our dynamically generated + Neo4j ingestion queries to set values for a given node or relationship property from + (A) a field on the dict being processed (PropertyRef.set_in_kwargs=False; default), or + (B) from a single variable provided by a keyword argument (PropertyRef.set_in_kwargs=True). + + (2) PropertyRefs defined on CartographyNodeProperties objects can specify the `extra_index=True` field to ensure + that an index is created for that given node on the given property. + + (3) When we need to create a relationship from node A to node B based on the value of one or more of node B's + properties, cartography uses PropertyRefs on TargetNodeMatcher objects to generate an appropriate Neo4j `MATCH` + clause. This clause may need special handling, such as when we need to match on one of node B's properties in a + case-insensitive way - see the constructor docs for more details. """ def __init__(self, name: str, set_in_kwargs=False, extra_index=False, ignore_case=False): """ :param name: The name of the property - :param set_in_kwargs: Optional. If True, the property is not defined on the data dict, and we expect to find the - property in the kwargs. - If False, looks for the property in the data dict. - Defaults to False. - :param extra_index: If True, make sure that we create an index for this property name. + :param set_in_kwargs: Optional. This param only has effect if the PropertyRef is part of + CartographyNodeProperties or CartographyRelProperties objects. + If True, instructs the querybuilder to find the value for this property name in the kwargs passed to it. + This is used for things like applying the same update tag to all nodes of a given run. + If False (default), we instruct the querybuilder to get the value for this given property from + the current dict being processed. + :param extra_index: This param only has effect if the PropertyRef is part of a CartographyNodeProperties object. + If True, make sure that we create an index for this property name. Notes: - extra_index is available for the case where you anticipate a property will be queried frequently. - The `id` and `lastupdated` properties will always have indexes created for them automatically by `ensure_indexes()`. - All properties included in target node matchers will always have indexes created for them. Defaults to False. - :param ignore_case: If True, performs a case-insensitive match when comparing the value of this property during - relationship creation. Defaults to False. This only has effect as part of a TargetNodeMatcher, and this is not - supported for the sub resource relationship. + :param ignore_case: This param only has effect as part of a TargetNodeMatcher, and this is not + supported for the sub resource relationship. If True, performs a case-insensitive match when comparing the value + of this property during relationship creation. Defaults to False. Example on why you would set this to True: GitHub usernames can have both uppercase and lowercase characters, but GitHub itself treats usernames as case-insensitive. Suppose your company's internal personnel database stores GitHub usernames all as @@ -34,25 +45,51 @@ def __init__(self, name: str, set_in_kwargs=False, extra_index=False, ignore_cas that points to the GitHubUser node's name field, otherwise if one of your employees' GitHub usernames contains capital letters, you would not be able to map them properly to a GitHubUser node in your graph. """ - self.name = name - self.set_in_kwargs = set_in_kwargs - self.extra_index = extra_index - self.ignore_case = ignore_case + self.name: str = name + self.set_in_kwargs: bool = set_in_kwargs + self.extra_index: bool = extra_index + self.ignore_case: bool = ignore_case def _parameterize_name(self) -> str: + """ + Private function. Turns self.name into a Neo4j query parameter so that the Neo4j ingestion query can access the + value on the dictionary being processed. This is used when parameters are supplied to the querybuilder via + kwargs. + Relevant Neo4j references: https://neo4j.com/docs/python-manual/current/query-simple/#_write_to_the_database and + https://neo4j.com/docs/cypher-manual/current/syntax/parameters/. + """ return f"${self.name}" def __repr__(self) -> str: """ - `querybuilder.build_ingestion_query()`, generates a Neo4j batched ingestion query of the form - `UNWIND $DictList AS item [...]`. + Returns string representation of the property so that it can be rendered in the querybuilder based on the + various inputs passed to the constructor. + """ + if self.set_in_kwargs: + return self._parameterize_name() - If set_in_kwargs is False (default), we instruct the querybuilder to get the value for this given property from - the dict being processed. To do this, this function returns "item.". This is used for loading - in lists of nodes. + if self.name.lower() == 'id' or self.ignore_case: + # Don't do coalesce() on caseinsensitive attr match. + return f"item.{self.name}" - On the other hand if set_in_kwargs is True, then the value will instead come from kwargs passed to - querybuilder.build_ingestion_query(). This is used for things like applying the same update tag to all nodes of - a given run. - """ - return f"item.{self.name}" if not self.set_in_kwargs else self._parameterize_name() + # Attention: This implementation detail is quirky and deserves the following essay to explain. + # This function ensures that the Neo4j query sets the value of this node/relationship property by checking the + # following sources in order: + # 1. the current dict being processed. That is, we check if dict `item` contains a key called `self.name` with + # a non-None value. If so, we are done. Else continue and check the next source. + # 2. An existing value on the node itself. That is, we convert self.name to lowercase and check if that is non + # null on the Neo4j node `i` already. If None, then the property is not set (this a Neo4j driver behavior). + # This means we make an ASSUMPTION that the property name set on the node is the lowercase version of self.name. + # + # We do this because not all fields defined on a given CartographyNodeSchema may be present on the dict being + # processed (this is by design because we encourage multiple intel modules to update the same nodes as they + # may enrich the node with different properties), and when a dict is processed by the Neo4j driver, fields that + # are not defined on the dict are treated as None values. + # + # As an example, the EC2 instance sync loads in :EC2Instances with :EBSVolumes attached based on the output of + # `describe-ec2-instances`. This data includes a field called `deleteontermination` on the :EBSVolume. + # `deleteontermination` is only set during the EC2 instance sync. When the EBS Volume sync runs, it enriches + # existing :EBSVolumes with additional fields from data retrieved with `describe-ebs-volumes` but because this + # API call does not include `deleteontermination`, we will overwrite the previous values of + # `deleteontermination` with `None`, thus removing the property from all :EBSVolumes. + return f"COALESCE(item.{self.name}, i.{self.name.lower()})" diff --git a/tests/unit/cartography/graph/test_querybuilder_complex.py b/tests/unit/cartography/graph/test_querybuilder_complex.py index 464650401f..6de7d5330b 100644 --- a/tests/unit/cartography/graph/test_querybuilder_complex.py +++ b/tests/unit/cartography/graph/test_querybuilder_complex.py @@ -13,8 +13,8 @@ def test_build_ingestion_query_complex(): ON CREATE SET i.firstseen = timestamp() SET i.lastupdated = $lastupdated, - i.property1 = item.property1, - i.property2 = item.property2, + i.property1 = COALESCE(item.property1, i.property1), + i.property2 = COALESCE(item.property2, i.property2), i:AnotherNodeLabel:YetAnotherNodeLabel WITH i, item @@ -26,14 +26,14 @@ def test_build_ingestion_query_complex(): ON CREATE SET r.firstseen = timestamp() SET r.lastupdated = $lastupdated, - r.another_rel_field = item.AnotherField, - r.yet_another_rel_field = item.YetAnotherRelField + r.another_rel_field = COALESCE(item.AnotherField, i.anotherfield), + r.yet_another_rel_field = COALESCE(item.YetAnotherRelField, i.yetanotherrelfield) UNION WITH i, item OPTIONAL MATCH (n0:HelloAsset) WHERE - n0.id = item.hello_asset_id + n0.id = COALESCE(item.hello_asset_id, i.hello_asset_id) WITH i, item, n0 WHERE n0 IS NOT NULL MERGE (i)-[r0:ASSOCIATED_WITH]->(n0) ON CREATE SET r0.firstseen = timestamp() @@ -44,7 +44,7 @@ def test_build_ingestion_query_complex(): WITH i, item OPTIONAL MATCH (n1:WorldAsset) WHERE - n1.id = item.world_asset_id + n1.id = COALESCE(item.world_asset_id, i.world_asset_id) WITH i, item, n1 WHERE n1 IS NOT NULL MERGE (i)<-[r1:CONNECTED]-(n1) ON CREATE SET r1.firstseen = timestamp() diff --git a/tests/unit/cartography/graph/test_querybuilder_simple.py b/tests/unit/cartography/graph/test_querybuilder_simple.py index fcc5d4f191..536101a29a 100644 --- a/tests/unit/cartography/graph/test_querybuilder_simple.py +++ b/tests/unit/cartography/graph/test_querybuilder_simple.py @@ -32,15 +32,14 @@ def test_build_ingestion_query_with_sub_resource(): """ # Act query = build_ingestion_query(SimpleNodeWithSubResourceSchema()) - expected = """ UNWIND $DictList AS item MERGE (i:SimpleNode{id: item.Id}) ON CREATE SET i.firstseen = timestamp() SET i.lastupdated = $lastupdated, - i.property1 = item.property1, - i.property2 = item.property2 + i.property1 = COALESCE(item.property1, i.property1), + i.property2 = COALESCE(item.property2, i.property2) WITH i, item CALL { @@ -69,8 +68,8 @@ def test_build_ingestion_query_case_insensitive_match(): ON CREATE SET i.firstseen = timestamp() SET i.lastupdated = $lastupdated, - i.email = item.email, - i.github_username = item.github_username + i.email = COALESCE(item.email, i.email), + i.github_username = COALESCE(item.github_username, i.github_username) WITH i, item CALL { From 4c63ea31cadf52171a9fb22dc9c8366d7a155012 Mon Sep 17 00:00:00 2001 From: Alex Chantavy Date: Fri, 7 Apr 2023 17:46:34 -0700 Subject: [PATCH 3/4] Fix comment wording Co-authored-by: Ramon Petgrave <32398091+ramonpetgrave64@users.noreply.github.com> --- cartography/models/core/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cartography/models/core/common.py b/cartography/models/core/common.py index 817eac87c1..528a28185b 100644 --- a/cartography/models/core/common.py +++ b/cartography/models/core/common.py @@ -76,7 +76,7 @@ def __repr__(self) -> str: # This function ensures that the Neo4j query sets the value of this node/relationship property by checking the # following sources in order: # 1. the current dict being processed. That is, we check if dict `item` contains a key called `self.name` with - # a non-None value. If so, we are done. Else continue and check the next source. + # a non-None value. If so, we use that value. Else continue and check the next source. # 2. An existing value on the node itself. That is, we convert self.name to lowercase and check if that is non # null on the Neo4j node `i` already. If None, then the property is not set (this a Neo4j driver behavior). # This means we make an ASSUMPTION that the property name set on the node is the lowercase version of self.name. From 47f7e27bbe5d4e9b1f216dc89ccd4eb0a94dc8cb Mon Sep 17 00:00:00 2001 From: Alex Chantavy Date: Tue, 11 Apr 2023 16:08:28 -0700 Subject: [PATCH 4/4] remove unnecessary parens Co-authored-by: Ramon Petgrave <32398091+ramonpetgrave64@users.noreply.github.com> --- cartography/intel/aws/ec2/volumes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cartography/intel/aws/ec2/volumes.py b/cartography/intel/aws/ec2/volumes.py index 3ad50f1186..ad744c5f90 100644 --- a/cartography/intel/aws/ec2/volumes.py +++ b/cartography/intel/aws/ec2/volumes.py @@ -34,7 +34,7 @@ def transform_volumes(volumes: List[Dict[str, Any]], region: str, current_aws_ac active_attachments = [a for a in attachments if a['State'] == 'attached'] volume_id = volume['VolumeId'] - raw_vol = ({ + raw_vol = { 'Arn': build_arn('ec2', current_aws_account_id, 'volume', volume_id, region), 'AvailabilityZone': volume['AvailabilityZone'], 'CreateTime': volume['CreateTime'], @@ -49,7 +49,7 @@ def transform_volumes(volumes: List[Dict[str, Any]], region: str, current_aws_ac 'VolumeType': volume['VolumeType'], 'VolumeId': volume_id, 'KmsKeyId': volume['KmsKeyId'], - }) + } if not active_attachments: result.append(raw_vol)