From 9031d2ab9945ee8dccf7dfed78de23f2da088724 Mon Sep 17 00:00:00 2001
From: Alex Chantavy <achantavy@lyft.com>
Date: Tue, 4 Apr 2023 17:20:04 -0700
Subject: [PATCH 1/4] Volume data model, make a failing test

---
 cartography/intel/aws/ec2/volumes.py          | 118 ++++++------
 cartography/intel/aws/util/arns.py            |  18 ++
 cartography/models/aws/ec2/volumes.py         |  14 ++
 tests/data/aws/ec2/volumes.py                 |   4 +-
 .../intel/aws/ec2/test_ec2_volumes.py         | 180 ++++++++++--------
 5 files changed, 189 insertions(+), 145 deletions(-)
 create mode 100644 cartography/intel/aws/util/arns.py

diff --git a/cartography/intel/aws/ec2/volumes.py b/cartography/intel/aws/ec2/volumes.py
index 6de03c7d42..3ad50f1186 100644
--- a/cartography/intel/aws/ec2/volumes.py
+++ b/cartography/intel/aws/ec2/volumes.py
@@ -6,7 +6,9 @@
 import boto3
 import neo4j
 
+from cartography.client.core.tx import load
 from cartography.graph.job import GraphJob
+from cartography.intel.aws.util.arns import build_arn
 from cartography.models.aws.ec2.volumes import EBSVolumeSchema
 from cartography.util import aws_handle_regions
 from cartography.util import timeit
@@ -16,7 +18,7 @@
 
 @timeit
 @aws_handle_regions
-def get_volumes(boto3_session: boto3.session.Session, region: str) -> List[Dict]:
+def get_volumes(boto3_session: boto3.session.Session, region: str) -> List[Dict[str, Any]]:
     client = boto3_session.client('ec2', region_name=region)
     paginator = client.get_paginator('describe_volumes')
     volumes: List[Dict] = []
@@ -26,90 +28,76 @@ def get_volumes(boto3_session: boto3.session.Session, region: str) -> List[Dict]
 
 
 def transform_volumes(volumes: List[Dict[str, Any]], region: str, current_aws_account_id: str) -> List[Dict[str, Any]]:
+    result = []
     for volume in volumes:
-        volume['VolumeArn'] = f"arn:aws:ec2:{region}:{current_aws_account_id}:volume/{volume['VolumeId']}"
-        volume['CreateTime'] = str(volume['CreateTime'])
-    return volumes
+        attachments = volume.get('Attachments', [])
+        active_attachments = [a for a in attachments if a['State'] == 'attached']
+
+        volume_id = volume['VolumeId']
+        raw_vol = ({
+            'Arn': build_arn('ec2', current_aws_account_id, 'volume', volume_id, region),
+            'AvailabilityZone': volume['AvailabilityZone'],
+            'CreateTime': volume['CreateTime'],
+            'Encrypted': volume['Encrypted'],
+            'Size': volume['Size'],
+            'State': volume['State'],
+            'OutpostArn': volume['OutpostArn'],
+            'SnapshotId': volume['SnapshotId'],
+            'Iops': volume['Iops'],
+            'FastRestored': volume['FastRestored'],
+            'MultiAttachEnabled': volume['MultiAttachEnabled'],
+            'VolumeType': volume['VolumeType'],
+            'VolumeId': volume_id,
+            'KmsKeyId': volume['KmsKeyId'],
+        })
+
+        if not active_attachments:
+            result.append(raw_vol)
+            continue
+
+        for attachment in active_attachments:
+            vol_with_attachment = raw_vol.copy()
+            vol_with_attachment['InstanceId'] = attachment['InstanceId']
+            result.append(vol_with_attachment)
+
+    return result
 
 
 @timeit
 def load_volumes(
-        neo4j_session: neo4j.Session, data: List[Dict], region: str, current_aws_account_id: str, update_tag: int,
+        neo4j_session: neo4j.Session,
+        ebs_data: List[Dict[str, Any]],
+        region: str,
+        current_aws_account_id: str,
+        update_tag: int,
 ) -> None:
-    ingest_volumes = """
-    UNWIND $volumes_list as volume
-        MERGE (vol:EBSVolume{id: volume.VolumeId})
-        ON CREATE SET vol.firstseen = timestamp()
-        SET vol.arn = volume.VolumeArn,
-            vol.lastupdated = $update_tag,
-            vol.availabilityzone = volume.AvailabilityZone,
-            vol.createtime = volume.CreateTime,
-            vol.encrypted = volume.Encrypted,
-            vol.size = volume.Size,
-            vol.state = volume.State,
-            vol.outpostarn = volume.OutpostArn,
-            vol.snapshotid = volume.SnapshotId,
-            vol.iops = volume.Iops,
-            vol.fastrestored = volume.FastRestored,
-            vol.multiattachenabled = volume.MultiAttachEnabled,
-            vol.type = volume.VolumeType,
-            vol.kmskeyid = volume.KmsKeyId,
-            vol.region=$Region
-        WITH vol
-        MATCH (aa:AWSAccount{id: $AWS_ACCOUNT_ID})
-        MERGE (aa)-[r:RESOURCE]->(vol)
-        ON CREATE SET r.firstseen = timestamp()
-        SET r.lastupdated = $update_tag
-    """
-
-    neo4j_session.run(
-        ingest_volumes,
-        volumes_list=data,
-        AWS_ACCOUNT_ID=current_aws_account_id,
+    load(
+        neo4j_session,
+        EBSVolumeSchema(),
+        ebs_data,
         Region=region,
-        update_tag=update_tag,
+        AWS_ID=current_aws_account_id,
+        lastupdated=update_tag,
     )
 
 
-def load_volume_relationships(
-        neo4j_session: neo4j.Session,
-        volumes: List[Dict[str, Any]],
-        aws_update_tag: int,
-) -> None:
-    add_relationship_query = """
-        MATCH (volume:EBSVolume{arn: $VolumeArn})
-        WITH volume
-        MATCH (instance:EC2Instance{instanceid: $InstanceId})
-        MERGE (volume)-[r:ATTACHED_TO_EC2_INSTANCE]->(instance)
-        ON CREATE SET r.firstseen = timestamp()
-        SET r.lastupdated = $aws_update_tag
-    """
-    for volume in volumes:
-        for attachment in volume.get('Attachments', []):
-            if attachment['State'] != 'attached':
-                continue
-            neo4j_session.run(
-                add_relationship_query,
-                VolumeArn=volume['VolumeArn'],
-                InstanceId=attachment['InstanceId'],
-                aws_update_tag=aws_update_tag,
-            )
-
-
 @timeit
-def cleanup_volumes(neo4j_session: neo4j.Session, common_job_parameters: Dict) -> None:
+def cleanup_volumes(neo4j_session: neo4j.Session, common_job_parameters: Dict[str, Any]) -> None:
     GraphJob.from_node_schema(EBSVolumeSchema(), common_job_parameters).run(neo4j_session)
 
 
 @timeit
 def sync_ebs_volumes(
-        neo4j_session: neo4j.Session, boto3_session: boto3.session.Session, regions: List[str],
-        current_aws_account_id: str, update_tag: int, common_job_parameters: Dict,
+        neo4j_session: neo4j.Session,
+        boto3_session: boto3.session.Session,
+        regions: List[str],
+        current_aws_account_id: str,
+        update_tag: int,
+        common_job_parameters: Dict[str, Any],
 ) -> None:
     for region in regions:
         logger.debug("Syncing volumes for region '%s' in account '%s'.", region, current_aws_account_id)
         data = get_volumes(boto3_session, region)
         transformed_data = transform_volumes(data, region, current_aws_account_id)
         load_volumes(neo4j_session, transformed_data, region, current_aws_account_id, update_tag)
-        load_volume_relationships(neo4j_session, transformed_data, update_tag)
     cleanup_volumes(neo4j_session, common_job_parameters)
diff --git a/cartography/intel/aws/util/arns.py b/cartography/intel/aws/util/arns.py
new file mode 100644
index 0000000000..e6108b82c3
--- /dev/null
+++ b/cartography/intel/aws/util/arns.py
@@ -0,0 +1,18 @@
+from typing import Optional
+
+
+def build_arn(
+        resource: str,
+        account: str,
+        typename: str,
+        name: str,
+        region: Optional[str] = None,
+        partition: Optional[str] = None,
+) -> str:
+    if not partition:
+        # TODO: support partitions from others. Please file an issue on this if needed, would love to hear from you
+        partition = 'aws'
+    if not region:
+        # Some resources are present in all regions, e.g. IAM policies
+        region = ""
+    return f"arn:{partition}:{resource}:{region}:{account}:{typename}/{name}"
diff --git a/cartography/models/aws/ec2/volumes.py b/cartography/models/aws/ec2/volumes.py
index 2140f4fcd0..4e4e336636 100644
--- a/cartography/models/aws/ec2/volumes.py
+++ b/cartography/models/aws/ec2/volumes.py
@@ -13,10 +13,24 @@
 
 @dataclass(frozen=True)
 class EBSVolumeNodeProperties(CartographyNodeProperties):
+    arn: PropertyRef = PropertyRef('Arn', extra_index=True)
     id: PropertyRef = PropertyRef('VolumeId')
+    volumeid: PropertyRef = PropertyRef('VolumeId', extra_index=True)
     region: PropertyRef = PropertyRef('Region', set_in_kwargs=True)
     lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True)
     deleteontermination: PropertyRef = PropertyRef('DeleteOnTermination')
+    availabilityzone: PropertyRef = PropertyRef('AvailabilityZone')
+    createtime: PropertyRef = PropertyRef('CreateTime')
+    encrypted: PropertyRef = PropertyRef('Encrypted')
+    size: PropertyRef = PropertyRef('Size')
+    state: PropertyRef = PropertyRef('State')
+    outpostarn: PropertyRef = PropertyRef('OutpostArn')
+    snapshotid: PropertyRef = PropertyRef('SnapshotId')
+    iops: PropertyRef = PropertyRef('Iops')
+    fastrestored: PropertyRef = PropertyRef('FastRestored')
+    multiattachenabled: PropertyRef = PropertyRef('MultiAttachEnabled')
+    type: PropertyRef = PropertyRef('VolumeType')
+    kmskeyid: PropertyRef = PropertyRef('KmsKeyId')
 
 
 @dataclass(frozen=True)
diff --git a/tests/data/aws/ec2/volumes.py b/tests/data/aws/ec2/volumes.py
index e3a29c03fa..6e08003e73 100644
--- a/tests/data/aws/ec2/volumes.py
+++ b/tests/data/aws/ec2/volumes.py
@@ -14,7 +14,7 @@
         'Size': 123,
         'SnapshotId': 'sn-01',
         'State': 'available',
-        'VolumeId': 'v-01',
+        'VolumeId': 'vol-0df',
         'Iops': 123,
         'VolumeType': 'standard',
         'FastRestored': True,
@@ -33,7 +33,7 @@
         'OutpostArn': 'arn1',
         'Size': 123,
         'State': 'available',
-        'VolumeId': 'v-02',
+        'VolumeId': 'vol-03',
         'Iops': 123,
         'SnapshotId': 'sn-02',
         'VolumeType': 'standard',
diff --git a/tests/integration/cartography/intel/aws/ec2/test_ec2_volumes.py b/tests/integration/cartography/intel/aws/ec2/test_ec2_volumes.py
index 705d2db3ec..186fe97275 100644
--- a/tests/integration/cartography/intel/aws/ec2/test_ec2_volumes.py
+++ b/tests/integration/cartography/intel/aws/ec2/test_ec2_volumes.py
@@ -3,94 +3,100 @@
 
 import cartography.intel.aws.ec2.instances
 import cartography.intel.aws.ec2.volumes
-import tests.data.aws.ec2.instances
-import tests.data.aws.ec2.volumes
 from cartography.intel.aws.ec2.instances import sync_ec2_instances
+from cartography.intel.aws.ec2.volumes import sync_ebs_volumes
 from tests.data.aws.ec2.instances import DESCRIBE_INSTANCES
-
+from tests.data.aws.ec2.volumes import DESCRIBE_VOLUMES
+from tests.integration.cartography.intel.aws.common import create_test_account
+from tests.integration.util import check_nodes
+from tests.integration.util import check_rels
 
 TEST_ACCOUNT_ID = '000000000000'
 TEST_REGION = 'eu-west-1'
 TEST_UPDATE_TAG = 123456789
 
 
-def test_load_volumes(neo4j_session):
+@patch.object(cartography.intel.aws.ec2.volumes, 'get_volumes', return_value=DESCRIBE_VOLUMES)
+def test_sync_ebs_volumes(mock_get_vols, neo4j_session):
     # Arrange
-    data = tests.data.aws.ec2.volumes.DESCRIBE_VOLUMES
-    transformed_data = cartography.intel.aws.ec2.volumes.transform_volumes(data, TEST_REGION, TEST_ACCOUNT_ID)
+    boto3_session = MagicMock()
+    create_test_account(neo4j_session, TEST_ACCOUNT_ID, TEST_UPDATE_TAG)
 
     # Act
-    cartography.intel.aws.ec2.volumes.load_volumes(
+    sync_ebs_volumes(
         neo4j_session,
-        transformed_data,
-        TEST_REGION,
+        boto3_session,
+        [TEST_REGION],
         TEST_ACCOUNT_ID,
         TEST_UPDATE_TAG,
+        {'UPDATE_TAG': TEST_UPDATE_TAG, 'AWS_ID': TEST_ACCOUNT_ID},
     )
 
     # Assert
-    expected_nodes = {
-        "v-01", "v-02",
+    assert check_nodes(neo4j_session, 'EBSVolume', ['arn']) == {
+        ('arn:aws:ec2:eu-west-1:000000000000:volume/vol-03',),
+        ('arn:aws:ec2:eu-west-1:000000000000:volume/vol-0df',),
     }
 
-    nodes = neo4j_session.run(
-        """
-        MATCH (r:EBSVolume) RETURN r.id;
-        """,
-    )
-    actual_nodes = {n['r.id'] for n in nodes}
-
-    assert actual_nodes == expected_nodes
-
-
-def test_load_volume_to_account_rels(neo4j_session):
+    # Assert
+    assert check_rels(
+        neo4j_session,
+        'AWSAccount',
+        'id',
+        'EBSVolume',
+        'volumeid',
+        'RESOURCE',
+        rel_direction_right=True,
+    ) == {
+        (TEST_ACCOUNT_ID, 'vol-03'),
+        (TEST_ACCOUNT_ID, 'vol-0df'),
+    }
 
-    # Arrange: Create Test AWSAccount
-    neo4j_session.run(
-        """
-        MERGE (aws:AWSAccount{id: $aws_account_id})
-        ON CREATE SET aws.firstseen = timestamp()
-        SET aws.lastupdated = $aws_update_tag
-        """,
-        aws_account_id=TEST_ACCOUNT_ID,
-        aws_update_tag=TEST_UPDATE_TAG,
-    )
 
-    # Act: Load Test Volumes
-    data = tests.data.aws.ec2.volumes.DESCRIBE_VOLUMES
-    transformed_data = cartography.intel.aws.ec2.volumes.transform_volumes(data, TEST_REGION, TEST_ACCOUNT_ID)
+@patch.object(cartography.intel.aws.ec2.instances, 'get_ec2_instances', return_value=DESCRIBE_INSTANCES['Reservations'])
+@patch.object(cartography.intel.aws.ec2.volumes, 'get_volumes', return_value=DESCRIBE_VOLUMES)
+def test_sync_ebs_volumes_e2e(mock_get_vols, mock_get_instances, neo4j_session):
+    # Arrange
+    neo4j_session.run('MATCH (n) DETACH DELETE n;')
+    boto3_session = MagicMock()
+    create_test_account(neo4j_session, TEST_ACCOUNT_ID, TEST_UPDATE_TAG)
 
-    cartography.intel.aws.ec2.volumes.load_volumes(
+    # Act: sync_ec2_instances() loads attached ebs volumes
+    sync_ec2_instances(
         neo4j_session,
-        transformed_data,
-        TEST_REGION,
+        boto3_session,
+        [TEST_REGION],
         TEST_ACCOUNT_ID,
         TEST_UPDATE_TAG,
+        {'UPDATE_TAG': TEST_UPDATE_TAG, 'AWS_ID': TEST_ACCOUNT_ID},
     )
 
-    # Assert
-    expected = {
-        (TEST_ACCOUNT_ID, 'v-01'),
-        (TEST_ACCOUNT_ID, 'v-02'),
+    # Assert that deleteontermination is set by sync_ec2_instances
+    assert check_nodes(neo4j_session, 'EBSVolume', ['id', 'deleteontermination', 'encrypted']) == {
+        ('vol-03', True, None),
+        ('vol-04', True, None),
+        ('vol-09', True, None),
+        ('vol-0df', True, None),
     }
 
-    result = neo4j_session.run(
-        """
-        MATCH (n1:AWSAccount)-[:RESOURCE]->(n2:EBSVolume) RETURN n1.id, n2.id;
-        """,
-    )
-    actual = {
-        (r['n1.id'], r['n2.id']) for r in result
+    # Assert that they are attached to the instances
+    assert check_rels(
+        neo4j_session,
+        'EC2Instance',
+        'instanceid',
+        'EBSVolume',
+        'volumeid',
+        'ATTACHED_TO',
+        rel_direction_right=False,
+    ) == {
+        ('i-01', 'vol-0df'),
+        ('i-02', 'vol-03'),
+        ('i-03', 'vol-09'),
+        ('i-04', 'vol-04'),
     }
 
-    assert actual == expected
-
-
-@patch.object(cartography.intel.aws.ec2.instances, 'get_ec2_instances', return_value=DESCRIBE_INSTANCES['Reservations'])
-def test_load_volume_to_instance_rels(mock_get_instances, neo4j_session):
-    # Arrange: Load in ec2 instances first
-    boto3_session = MagicMock()
-    sync_ec2_instances(
+    # Act
+    sync_ebs_volumes(
         neo4j_session,
         boto3_session,
         [TEST_REGION],
@@ -98,28 +104,46 @@ def test_load_volume_to_instance_rels(mock_get_instances, neo4j_session):
         TEST_UPDATE_TAG,
         {'UPDATE_TAG': TEST_UPDATE_TAG, 'AWS_ID': TEST_ACCOUNT_ID},
     )
-    # Prep the volume data
-    raw_volumes = tests.data.aws.ec2.volumes.DESCRIBE_VOLUMES
-    transformed_volumes = cartography.intel.aws.ec2.volumes.transform_volumes(raw_volumes, TEST_REGION, TEST_ACCOUNT_ID)
 
-    # Act
-    cartography.intel.aws.ec2.volumes.load_volume_relationships(
-        neo4j_session,
-        transformed_volumes,
-        TEST_UPDATE_TAG,
-    )
+    # Assert that additional fields such as `encrypted` have been added by sync_ebs_volumes(), while
+    # deleteontermination has not been overwritten with None by sync_ebs_volumes()
+    assert check_nodes(neo4j_session, 'EBSVolume', ['id', 'deleteontermination', 'encrypted']) == {
+        # Attached to the instances initially
+        ('vol-04', True, None),
+        ('vol-09', True, None),
+        # Added by ebs sync
+        ('vol-03', True, True),
+        ('vol-0df', True, True),
+    }
 
-    # Assert
-    result = neo4j_session.run(
-        """
-        MATCH (n1:EC2Instance)<-[:ATTACHED_TO_EC2_INSTANCE]-(n2:EBSVolume) RETURN n1.id, n2.id;
-        """,
-    )
-    expected = {
-        ('i-01', 'v-01'),
-        ('i-02', 'v-02'),
+    # Assert that they are still attached to the instances
+    assert check_rels(
+        neo4j_session,
+        'EC2Instance',
+        'instanceid',
+        'EBSVolume',
+        'volumeid',
+        'ATTACHED_TO',
+        rel_direction_right=False,
+    ) == {
+        ('i-01', 'vol-0df'),
+        ('i-02', 'vol-03'),
+        ('i-03', 'vol-09'),
+        ('i-04', 'vol-04'),
     }
-    actual = {
-        (r['n1.id'], r['n2.id']) for r in result
+
+    # Assert that the account to volume rels exist
+    assert check_rels(
+        neo4j_session,
+        'AWSAccount',
+        'id',
+        'EBSVolume',
+        'volumeid',
+        'RESOURCE',
+        rel_direction_right=True,
+    ) == {
+        ('000000000000', 'vol-03'),
+        ('000000000000', 'vol-04'),
+        ('000000000000', 'vol-09'),
+        ('000000000000', 'vol-0df'),
     }
-    assert actual == expected

From 2d50fb9bef5f2c5571983aa1fdc4c9c36042ad94 Mon Sep 17 00:00:00 2001
From: Alex Chantavy <achantavy@lyft.com>
Date: Wed, 5 Apr 2023 22:47:34 -0700
Subject: [PATCH 2/4] Handle multiple modules updating the same nodes,
 documents

---
 cartography/graph/querybuilder.py             |  3 +-
 cartography/models/core/common.py             | 91 +++++++++++++------
 .../graph/test_querybuilder_complex.py        | 12 +--
 .../graph/test_querybuilder_simple.py         |  9 +-
 4 files changed, 76 insertions(+), 39 deletions(-)

diff --git a/cartography/graph/querybuilder.py b/cartography/graph/querybuilder.py
index e33d444265..90b5f8aa32 100644
--- a/cartography/graph/querybuilder.py
+++ b/cartography/graph/querybuilder.py
@@ -41,7 +41,7 @@ def _build_node_properties_statement(
         i.node_prop_1 = item.Prop1,
         i.node_prop_2 = $Prop2
         ```
-    where `i` is a reference to the Neo4j node.
+    where `i` is a reference to the Neo4j node and `item` is a reference to the current dict being processed.
     :param node_property_map: Mapping of node attribute names as str to PropertyRef objects
     :param extra_node_labels: Optional ExtraNodeLabels object to set on the node as string
     :return: The resulting Neo4j SET clause to set the given attributes on the node
@@ -79,6 +79,7 @@ def _build_rel_properties_statement(rel_var: str, rel_property_map: Optional[Dic
         r.rel_prop_1 = item.Prop1,
         r.rel_prop_2 = $Prop2
 
+    where `item` is a reference to the current dict being processed.
     :param rel_var: The variable name to use for the relationship in the Neo4j query
     :param rel_property_map: Mapping of relationship attribute names as str to PropertyRef objects
     :return: The resulting Neo4j SET clause to set the given attributes on the relationship
diff --git a/cartography/models/core/common.py b/cartography/models/core/common.py
index 6ee5d664c6..817eac87c1 100644
--- a/cartography/models/core/common.py
+++ b/cartography/models/core/common.py
@@ -1,30 +1,41 @@
 class PropertyRef:
     """
-    PropertyRefs represent properties on cartography nodes and relationships.
+    PropertyRefs represent properties on cartography nodes and relationships. Here is how they are used:
 
-    cartography takes lists of Python dicts and loads them to Neo4j. PropertyRefs allow our dynamically generated Neo4j
-    ingestion queries to set values for a given node or relationship property from (A) a field on the dict being
-    processed (PropertyRef.set_in_kwargs=False; default), or (B) from a single variable provided by a keyword argument
-    (PropertyRef.set_in_kwargs=True).
+    (1) cartography takes lists of Python dicts and loads them to Neo4j. PropertyRefs allow our dynamically generated
+    Neo4j ingestion queries to set values for a given node or relationship property from
+        (A) a field on the dict being processed (PropertyRef.set_in_kwargs=False; default), or
+        (B) from a single variable provided by a keyword argument (PropertyRef.set_in_kwargs=True).
+
+    (2) PropertyRefs defined on CartographyNodeProperties objects can specify the `extra_index=True` field to ensure
+    that an index is created for that given node on the given property.
+
+    (3) When we need to create a relationship from node A to node B based on the value of one or more of node B's
+    properties, cartography uses PropertyRefs on TargetNodeMatcher objects to generate an appropriate Neo4j `MATCH`
+    clause. This clause may need special handling, such as when we need to match on one of node B's properties in a
+    case-insensitive way - see the constructor docs for more details.
     """
 
     def __init__(self, name: str, set_in_kwargs=False, extra_index=False, ignore_case=False):
         """
         :param name: The name of the property
-        :param set_in_kwargs: Optional. If True, the property is not defined on the data dict, and we expect to find the
-        property in the kwargs.
-        If False, looks for the property in the data dict.
-        Defaults to False.
-        :param extra_index: If True, make sure that we create an index for this property name.
+        :param set_in_kwargs: Optional. This param only has effect if the PropertyRef is part of
+        CartographyNodeProperties or CartographyRelProperties objects.
+        If True, instructs the querybuilder to find the value for this property name in the kwargs passed to it.
+        This is used for things like applying the same update tag to all nodes of a given run.
+        If False (default), we instruct the querybuilder to get the value for this given property from
+        the current dict being processed.
+        :param extra_index: This param only has effect if the PropertyRef is part of a CartographyNodeProperties object.
+        If True, make sure that we create an index for this property name.
           Notes:
           - extra_index is available for the case where you anticipate a property will be queried frequently.
           - The `id` and `lastupdated` properties will always have indexes created for them automatically by
             `ensure_indexes()`.
           - All properties included in target node matchers will always have indexes created for them.
             Defaults to False.
-        :param ignore_case: If True, performs a case-insensitive match when comparing the value of this property during
-        relationship creation. Defaults to False. This only has effect as part of a TargetNodeMatcher, and this is not
-        supported for the sub resource relationship.
+        :param ignore_case: This param only has effect as part of a TargetNodeMatcher, and this is not
+        supported for the sub resource relationship. If True, performs a case-insensitive match when comparing the value
+        of this property during relationship creation. Defaults to False.
             Example on why you would set this to True:
             GitHub usernames can have both uppercase and lowercase characters, but GitHub itself treats usernames as
             case-insensitive. Suppose your company's internal personnel database stores GitHub usernames all as
@@ -34,25 +45,51 @@ def __init__(self, name: str, set_in_kwargs=False, extra_index=False, ignore_cas
             that points to the GitHubUser node's name field, otherwise if one of your employees' GitHub usernames
             contains capital letters, you would not be able to map them properly to a GitHubUser node in your graph.
         """
-        self.name = name
-        self.set_in_kwargs = set_in_kwargs
-        self.extra_index = extra_index
-        self.ignore_case = ignore_case
+        self.name: str = name
+        self.set_in_kwargs: bool = set_in_kwargs
+        self.extra_index: bool = extra_index
+        self.ignore_case: bool = ignore_case
 
     def _parameterize_name(self) -> str:
+        """
+        Private function. Turns self.name into a Neo4j query parameter so that the Neo4j ingestion query can access the
+        value on the dictionary being processed. This is used when parameters are supplied to the querybuilder via
+        kwargs.
+        Relevant Neo4j references: https://neo4j.com/docs/python-manual/current/query-simple/#_write_to_the_database and
+        https://neo4j.com/docs/cypher-manual/current/syntax/parameters/.
+        """
         return f"${self.name}"
 
     def __repr__(self) -> str:
         """
-        `querybuilder.build_ingestion_query()`, generates a Neo4j batched ingestion query of the form
-        `UNWIND $DictList AS item [...]`.
+        Returns string representation of the property so that it can be rendered in the querybuilder based on the
+        various inputs passed to the constructor.
+        """
+        if self.set_in_kwargs:
+            return self._parameterize_name()
 
-        If set_in_kwargs is False (default), we instruct the querybuilder to get the value for this given property from
-        the dict being processed. To do this, this function returns "item.<key on the dict>". This is used for loading
-        in lists of nodes.
+        if self.name.lower() == 'id' or self.ignore_case:
+            # Don't do coalesce() on caseinsensitive attr match.
+            return f"item.{self.name}"
 
-        On the other hand if set_in_kwargs is True, then the value will instead come from kwargs passed to
-        querybuilder.build_ingestion_query(). This is used for things like applying the same update tag to all nodes of
-        a given run.
-        """
-        return f"item.{self.name}" if not self.set_in_kwargs else self._parameterize_name()
+        # Attention: This implementation detail is quirky and deserves the following essay to explain.
+        # This function ensures that the Neo4j query sets the value of this node/relationship property by checking the
+        # following sources in order:
+        # 1. the current dict being processed. That is, we check if dict `item` contains a key called `self.name` with
+        # a non-None value. If so, we are done. Else continue and check the next source.
+        # 2. An existing value on the node itself. That is, we convert self.name to lowercase and check if that is non
+        # null on the Neo4j node `i` already. If None, then the property is not set (this a Neo4j driver behavior).
+        # This means we make an ASSUMPTION that the property name set on the node is the lowercase version of self.name.
+        #
+        # We do this because not all fields defined on a given CartographyNodeSchema may be present on the dict being
+        # processed (this is by design because we encourage multiple intel modules to update the same nodes as they
+        # may enrich the node with different properties), and when a dict is processed by the Neo4j driver, fields that
+        # are not defined on the dict are treated as None values.
+        #
+        # As an example, the EC2 instance sync loads in :EC2Instances with :EBSVolumes attached based on the output of
+        # `describe-ec2-instances`. This data includes a field called `deleteontermination` on the :EBSVolume.
+        # `deleteontermination` is only set during the EC2 instance sync. When the EBS Volume sync runs, it enriches
+        # existing :EBSVolumes with additional fields from data retrieved with `describe-ebs-volumes` but because this
+        # API call does not include `deleteontermination`, we will overwrite the previous values of
+        # `deleteontermination` with `None`, thus removing the property from all :EBSVolumes.
+        return f"COALESCE(item.{self.name}, i.{self.name.lower()})"
diff --git a/tests/unit/cartography/graph/test_querybuilder_complex.py b/tests/unit/cartography/graph/test_querybuilder_complex.py
index 464650401f..6de7d5330b 100644
--- a/tests/unit/cartography/graph/test_querybuilder_complex.py
+++ b/tests/unit/cartography/graph/test_querybuilder_complex.py
@@ -13,8 +13,8 @@ def test_build_ingestion_query_complex():
             ON CREATE SET i.firstseen = timestamp()
             SET
                 i.lastupdated = $lastupdated,
-                i.property1 = item.property1,
-                i.property2 = item.property2,
+                i.property1 = COALESCE(item.property1, i.property1),
+                i.property2 = COALESCE(item.property2, i.property2),
                 i:AnotherNodeLabel:YetAnotherNodeLabel
 
             WITH i, item
@@ -26,14 +26,14 @@ def test_build_ingestion_query_complex():
                 ON CREATE SET r.firstseen = timestamp()
                 SET
                     r.lastupdated = $lastupdated,
-                    r.another_rel_field = item.AnotherField,
-                    r.yet_another_rel_field = item.YetAnotherRelField
+                    r.another_rel_field = COALESCE(item.AnotherField, i.anotherfield),
+                    r.yet_another_rel_field = COALESCE(item.YetAnotherRelField, i.yetanotherrelfield)
 
                 UNION
                 WITH i, item
                 OPTIONAL MATCH (n0:HelloAsset)
                 WHERE
-                    n0.id = item.hello_asset_id
+                    n0.id = COALESCE(item.hello_asset_id, i.hello_asset_id)
                 WITH i, item, n0 WHERE n0 IS NOT NULL
                 MERGE (i)-[r0:ASSOCIATED_WITH]->(n0)
                 ON CREATE SET r0.firstseen = timestamp()
@@ -44,7 +44,7 @@ def test_build_ingestion_query_complex():
                 WITH i, item
                 OPTIONAL MATCH (n1:WorldAsset)
                 WHERE
-                    n1.id = item.world_asset_id
+                    n1.id = COALESCE(item.world_asset_id, i.world_asset_id)
                 WITH i, item, n1 WHERE n1 IS NOT NULL
                 MERGE (i)<-[r1:CONNECTED]-(n1)
                 ON CREATE SET r1.firstseen = timestamp()
diff --git a/tests/unit/cartography/graph/test_querybuilder_simple.py b/tests/unit/cartography/graph/test_querybuilder_simple.py
index fcc5d4f191..536101a29a 100644
--- a/tests/unit/cartography/graph/test_querybuilder_simple.py
+++ b/tests/unit/cartography/graph/test_querybuilder_simple.py
@@ -32,15 +32,14 @@ def test_build_ingestion_query_with_sub_resource():
     """
     # Act
     query = build_ingestion_query(SimpleNodeWithSubResourceSchema())
-
     expected = """
         UNWIND $DictList AS item
             MERGE (i:SimpleNode{id: item.Id})
             ON CREATE SET i.firstseen = timestamp()
             SET
                 i.lastupdated = $lastupdated,
-                i.property1 = item.property1,
-                i.property2 = item.property2
+                i.property1 = COALESCE(item.property1, i.property1),
+                i.property2 = COALESCE(item.property2, i.property2)
 
             WITH i, item
             CALL {
@@ -69,8 +68,8 @@ def test_build_ingestion_query_case_insensitive_match():
             ON CREATE SET i.firstseen = timestamp()
             SET
                 i.lastupdated = $lastupdated,
-                i.email = item.email,
-                i.github_username = item.github_username
+                i.email = COALESCE(item.email, i.email),
+                i.github_username = COALESCE(item.github_username, i.github_username)
 
             WITH i, item
             CALL {

From 4c63ea31cadf52171a9fb22dc9c8366d7a155012 Mon Sep 17 00:00:00 2001
From: Alex Chantavy <achantavy@lyft.com>
Date: Fri, 7 Apr 2023 17:46:34 -0700
Subject: [PATCH 3/4] Fix comment wording

Co-authored-by: Ramon Petgrave <32398091+ramonpetgrave64@users.noreply.github.com>
---
 cartography/models/core/common.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cartography/models/core/common.py b/cartography/models/core/common.py
index 817eac87c1..528a28185b 100644
--- a/cartography/models/core/common.py
+++ b/cartography/models/core/common.py
@@ -76,7 +76,7 @@ def __repr__(self) -> str:
         # This function ensures that the Neo4j query sets the value of this node/relationship property by checking the
         # following sources in order:
         # 1. the current dict being processed. That is, we check if dict `item` contains a key called `self.name` with
-        # a non-None value. If so, we are done. Else continue and check the next source.
+        # a non-None value. If so, we use that value. Else continue and check the next source.
         # 2. An existing value on the node itself. That is, we convert self.name to lowercase and check if that is non
         # null on the Neo4j node `i` already. If None, then the property is not set (this a Neo4j driver behavior).
         # This means we make an ASSUMPTION that the property name set on the node is the lowercase version of self.name.

From 47f7e27bbe5d4e9b1f216dc89ccd4eb0a94dc8cb Mon Sep 17 00:00:00 2001
From: Alex Chantavy <achantavy@lyft.com>
Date: Tue, 11 Apr 2023 16:08:28 -0700
Subject: [PATCH 4/4] remove unnecessary parens

Co-authored-by: Ramon Petgrave <32398091+ramonpetgrave64@users.noreply.github.com>
---
 cartography/intel/aws/ec2/volumes.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cartography/intel/aws/ec2/volumes.py b/cartography/intel/aws/ec2/volumes.py
index 3ad50f1186..ad744c5f90 100644
--- a/cartography/intel/aws/ec2/volumes.py
+++ b/cartography/intel/aws/ec2/volumes.py
@@ -34,7 +34,7 @@ def transform_volumes(volumes: List[Dict[str, Any]], region: str, current_aws_ac
         active_attachments = [a for a in attachments if a['State'] == 'attached']
 
         volume_id = volume['VolumeId']
-        raw_vol = ({
+        raw_vol = {
             'Arn': build_arn('ec2', current_aws_account_id, 'volume', volume_id, region),
             'AvailabilityZone': volume['AvailabilityZone'],
             'CreateTime': volume['CreateTime'],
@@ -49,7 +49,7 @@ def transform_volumes(volumes: List[Dict[str, Any]], region: str, current_aws_ac
             'VolumeType': volume['VolumeType'],
             'VolumeId': volume_id,
             'KmsKeyId': volume['KmsKeyId'],
-        })
+        }
 
         if not active_attachments:
             result.append(raw_vol)