From a1e6c6adc7c66446864033835a1f5e22c744aeb6 Mon Sep 17 00:00:00 2001 From: Alex Chantavy Date: Fri, 21 Apr 2023 23:13:29 -0700 Subject: [PATCH] Trivy AWS ECR scan support --- README.md | 1 + cartography/client/aws/__init__.py | 19 + cartography/client/aws/ecr.py | 35 ++ cartography/config.py | 12 + .../cleanup/trivy_scan_findings_cleanup.json | 41 ++ cartography/intel/trivy/__init__.py | 119 +++++ cartography/intel/trivy/scanner.py | 431 ++++++++++++++++++ cartography/sync.py | 2 + 8 files changed, 660 insertions(+) create mode 100644 cartography/client/aws/ecr.py create mode 100644 cartography/data/jobs/cleanup/trivy_scan_findings_cleanup.json create mode 100644 cartography/intel/trivy/__init__.py create mode 100644 cartography/intel/trivy/scanner.py diff --git a/README.md b/README.md index 36ecc6c14..45c2d7766 100644 --- a/README.md +++ b/README.md @@ -33,6 +33,7 @@ Start [here](https://lyft.github.io/cartography/install.html). - [NIST CVE](https://lyft.github.io/cartography/modules/cve/index.html) - Common Vulnerabilities and Exposures (CVE) data from NIST database - [Lastpass](https://lyft.github.io/cartography/modules/lastpass/index.html) - users - [BigFix](https://lyft.github.io/cartography/modules/bigfix/index.html) - Computers +- Trivy Scanner - AWS ECR Images (TODO documentation) ## Usage Start with our [tutorial](https://lyft.github.io/cartography/usage/tutorial.html). Our [data schema](https://lyft.github.io/cartography/usage/schema.html) is a helpful reference when you get stuck. 
diff --git a/cartography/client/aws/__init__.py b/cartography/client/aws/__init__.py index e69de29bb..9274d9fb5 100644 --- a/cartography/client/aws/__init__.py +++ b/cartography/client/aws/__init__.py @@ -0,0 +1,19 @@ +from typing import List + +import neo4j + +from cartography.client.core.tx import read_list_of_values_tx +from cartography.util import timeit + + +@timeit +def list_accounts(neo4j_session: neo4j.Session) -> List[str]: + """ + :param neo4j_session: The neo4j session object. + :return: A list of all AWS account IDs in the graph + """ + # See https://community.neo4j.com/t/extract-list-of-nodes-and-labels-from-path/13665/4 + query = """ + MATCH (a:AWSAccount) RETURN a.id + """ + return neo4j_session.read_transaction(read_list_of_values_tx, query) diff --git a/cartography/client/aws/ecr.py b/cartography/client/aws/ecr.py new file mode 100644 index 000000000..f3389046a --- /dev/null +++ b/cartography/client/aws/ecr.py @@ -0,0 +1,35 @@ +from typing import Set +from typing import Tuple + +import neo4j + +from cartography.client.core.tx import read_list_of_tuples_tx +from cartography.util import timeit + + +@timeit +def get_ecr_images(neo4j_session: neo4j.Session, aws_account_id: str) -> Set[Tuple[str, str, str, str, str]]: + """ + Queries the graph for all ECR images and their parent images. + Returns 5-tuples of ECR repository regions, tags, URIs, names, and binary digests. This is used to identify which + images to scan. + :param neo4j_session: The neo4j session object. + :param aws_account_id: The AWS account ID to get ECR repo data for. + :return: 5-tuples of repo region, image tag, image URI, repo_name, and image_digest. 
+ """ + # See https://community.neo4j.com/t/extract-list-of-nodes-and-labels-from-path/13665/4 + query = """ + MATCH (e1:ECRRepositoryImage)<-[:REPO_IMAGE]-(repo:ECRRepository) + MATCH (repo)<-[:RESOURCE]-(:AWSAccount{id:$AWS_ID}) + MATCH path = (e1)-[:PARENT*..]->(e2:ECRRepositoryImage) // TODO are there generic OSS ways to infer parent rels + WITH reduce(output=[], n in nodes(path) | output + n) as repo_img_collection + UNWIND repo_img_collection as repo_img + MATCH (er:ECRRepository)-[:REPO_IMAGE]->(repo_img:ECRRepositoryImage)-[:IMAGE]->(img:ECRImage) + RETURN DISTINCT + er.region as region, + repo_img.tag as tag, + repo_img.id as uri, + er.name as repo_name, + img.digest as digest + """ + return neo4j_session.read_transaction(read_list_of_tuples_tx, query, AWS_ID=aws_account_id) diff --git a/cartography/config.py b/cartography/config.py index e8449376a..22a633166 100644 --- a/cartography/config.py +++ b/cartography/config.py @@ -97,6 +97,12 @@ class Config: :param bigfix_password: The password to authenticate to BigFix. Optional. :type bigfix_root_url: str :param bigfix_root_url: The API URL to use for BigFix, e.g. "https://example.com:52311". Optional. + :type trivy_path: str + :param trivy_path: The path to the Trivy binary. + :type trivy_opa_policy_file_path: str + :param trivy_opa_policy_file_path: The path to the OPA policy file to use with Trivy. Optional. + :type trivy_resource_type: str + :param trivy_resource_type: The resource type to scan with Trivy e.g. 'aws.ecr'. 
""" def __init__( @@ -148,6 +154,9 @@ def __init__( bigfix_username=None, bigfix_password=None, bigfix_root_url=None, + trivy_path=None, + trivy_opa_policy_file_path=None, + trivy_resource_type=None, ): self.neo4j_uri = neo4j_uri self.neo4j_user = neo4j_user @@ -196,3 +205,6 @@ def __init__( self.bigfix_username = bigfix_username self.bigfix_password = bigfix_password self.bigfix_root_url = bigfix_root_url + self.trivy_path = trivy_path + self.trivy_opa_policy_file_path = trivy_opa_policy_file_path + self.trivy_resource_type = trivy_resource_type diff --git a/cartography/data/jobs/cleanup/trivy_scan_findings_cleanup.json b/cartography/data/jobs/cleanup/trivy_scan_findings_cleanup.json new file mode 100644 index 000000000..72a28badb --- /dev/null +++ b/cartography/data/jobs/cleanup/trivy_scan_findings_cleanup.json @@ -0,0 +1,41 @@ +{ + "statements": [ + { + "query": "MATCH (:TrivyImageFinding)-[r:AFFECTS]->(:TrivyPackage)-[:DEPLOYED]->(:ECRImage)<-[:IMAGE]-(:ECRRepositoryImage)<-[:REPO_IMAGE]-(:ECRRepository)<-[:RESOURCE]-(:AWSAccount{id: $AWS_ID}) WHERE r.lastupdated <> $UPDATE_TAG WITH r LIMIT $LIMIT_SIZE DELETE (r)", + "iterative": true, + "iterationsize": 100 + }, + { + "query": "MATCH (n:TrivyPackage)-[:DEPLOYED]->(:ECRImage)<-[:IMAGE]-(:ECRRepositoryImage)<-[:REPO_IMAGE]-(:ECRRepository)<-[:RESOURCE]-(:AWSAccount{id: $AWS_ID}) WHERE n.lastupdated <> $UPDATE_TAG WITH n LIMIT $LIMIT_SIZE DETACH DELETE (n)", + "iterative": true, + "iterationsize": 100 + }, + { + "query": "MATCH (:TrivyPackage)-[r:DEPLOYED]->(:ECRImage)<-[:IMAGE]-(:ECRRepositoryImage)<-[:REPO_IMAGE]-(:ECRRepository)<-[:RESOURCE]-(:AWSAccount{id: $AWS_ID}) WHERE r.lastupdated <> $UPDATE_TAG WITH r LIMIT $LIMIT_SIZE DELETE (r)", + "iterative": true, + "iterationsize": 1000, + "__comment__": "In testing, setting this to 1000 made this job 10x faster. There may have been other problems too but this was worth playing with." 
+ }, + { + "query": "MATCH (:TrivyImageFinding)-[r:AFFECTS]->(:ECRImage)<-[:IMAGE]-(:ECRRepositoryImage)<-[:REPO_IMAGE]-(:ECRRepository)<-[:RESOURCE]-(:AWSAccount{id: $AWS_ID}) WHERE r.lastupdated <> $UPDATE_TAG WITH r LIMIT $LIMIT_SIZE DELETE (r)", + "iterative": true, + "iterationsize": 100 + }, + { + "query": "MATCH (:TrivyFix)-[r:APPLIES_TO]->(:TrivyImageFinding) WHERE r.lastupdated <> $UPDATE_TAG WITH r LIMIT $LIMIT_SIZE DELETE (r)", + "iterative": true, + "iterationsize": 100 + }, + { + "query": "MATCH (:TrivyPackage)-[r:SHOULD_UPDATE_TO]->(:TrivyFix) WHERE r.lastupdated <> $UPDATE_TAG WITH r LIMIT $LIMIT_SIZE DELETE (r)", + "iterative": true, + "iterationsize": 100 + }, + { + "query": "MATCH (n:TrivyImageFinding) WHERE n.lastupdated <> $UPDATE_TAG WITH n LIMIT $LIMIT_SIZE DELETE (n)", + "iterative": true, + "iterationsize": 100 + } + ], + "name": "cleanup Trivy image scan findings" +} diff --git a/cartography/intel/trivy/__init__.py b/cartography/intel/trivy/__init__.py new file mode 100644 index 000000000..c93162cfe --- /dev/null +++ b/cartography/intel/trivy/__init__.py @@ -0,0 +1,119 @@ +import logging +import subprocess +from typing import Any +from typing import Dict +from typing import List +from typing import Tuple + +import neo4j + +import cartography.config +from cartography.client.aws import list_accounts +from cartography.client.aws.ecr import get_ecr_images +from cartography.intel.trivy.scanner import _call_trivy_update_db +from cartography.intel.trivy.scanner import cleanup +from cartography.intel.trivy.scanner import sync_single_image +from cartography.stats import get_stats_client +from cartography.util import timeit + + +logger = logging.getLogger(__name__) +stat_handler = get_stats_client('trivy.scanner') + + +# If we have >= this percentage of Trivy fatal failures, crash the sync. 10 == 10%, 20 == 20%, etc. 
+TRIVY_SCAN_FATAL_CIRCUIT_BREAKER_PERCENT = 10 + + +@timeit +def get_scan_targets(neo4j_session: neo4j.Session) -> List[Tuple[str, str, str, str, str]]: + aws_accounts = list_accounts(neo4j_session) + ecr_images: List[Tuple[str, str, str, str, str]] = [] + for account_id in aws_accounts: + ecr_images.extend(get_ecr_images(neo4j_session, account_id)) + return ecr_images + + +@timeit +def sync_trivy_aws_ecr( + neo4j_session: neo4j.Session, + trivy_path: str, + trivy_opa_policy_file_path: str, + update_tag: int, + common_job_parameters: Dict[str, Any], +) -> None: + trivy_scan_failure_count = 0 + + ecr_images = get_scan_targets(neo4j_session) + num_images = len(ecr_images) + logger.info(f"Scanning {num_images} ECR images with Trivy") + + for region, image_tag, image_uri, repo_name, image_digest in ecr_images: + try: + sync_single_image( + neo4j_session, + image_tag, + image_uri, + repo_name, + image_digest, + update_tag, + True, + trivy_path, + trivy_opa_policy_file_path, + ) + except subprocess.CalledProcessError as exc: + trivy_error_msg = exc.output.decode('utf-8') if type(exc.output) == bytes else exc.output + if 'rego_parse_error' in trivy_error_msg: + logger.error( + 'Trivy image scan failed due to rego_parse_error - please check rego syntax! ' + f"image_uri = {image_uri}, " + f"trivy_error_msg = {trivy_error_msg}.", + ) + raise + else: + trivy_scan_failure_count += 1 + logger.warning( + "Trivy image scan failed - please investigate. trivy_scan_failure_count++." + f"image_uri = {image_uri}" + f"trivy_error_msg = {trivy_error_msg}.", + ) + if (trivy_scan_failure_count / num_images) * 100 >= TRIVY_SCAN_FATAL_CIRCUIT_BREAKER_PERCENT: + logger.error('Trivy scan fatal failure circuit breaker hit, crashing.') + raise + # Else if circuit breaker is not hit, then keep going. + except KeyError: + trivy_scan_failure_count += 1 + logger.warning( + 'Trivy image scan failed because it returned unexpectedly incomplete data. ' + 'Please repro locally. 
trivy_scan_failure_count++. ' + f"image_uri = {image_uri}.", + ) + if (trivy_scan_failure_count / num_images) * 100 >= TRIVY_SCAN_FATAL_CIRCUIT_BREAKER_PERCENT: + logger.error('Trivy scan fatal failure circuit breaker hit, crashing.') + raise + # Else if circuit breaker is not hit, then keep going. + cleanup(neo4j_session, common_job_parameters) + + +@timeit +def start_trivy_scans(neo4j_session: neo4j.Session, config: cartography.config.Config) -> None: + if not config.trivy_path: + logger.info("Trivy module not configured. Skipping.") + return + + common_job_parameters = { + "UPDATE_TAG": config.update_tag, + # TODO we will need to infer the sub resource id based on what resource is being processed + "AWS_ID": 'id goes here', + } + _call_trivy_update_db(config.trivy_path) + if config.trivy_resource_type == 'aws.ecr': + sync_trivy_aws_ecr( + neo4j_session, + config.trivy_path, + config.trivy_opa_policy_file_path, + config.update_tag, + common_job_parameters, + ) + + # Support other Trivy resource types here e.g. if Google Cloud has images. diff --git a/cartography/intel/trivy/scanner.py b/cartography/intel/trivy/scanner.py new file mode 100644 index 000000000..67141d353 --- /dev/null +++ b/cartography/intel/trivy/scanner.py @@ -0,0 +1,431 @@ +import json +import logging +import subprocess +from typing import Any +from typing import Dict +from typing import List +from typing import Optional + +import neo4j +from neo4j import Session + +from cartography.graph.job import GraphJob +from cartography.stats import get_stats_client +from cartography.util import timeit + + +logger = logging.getLogger(__name__) +stat_handler = get_stats_client(__name__) + + +@timeit +def _call_trivy_binary(ecr_image_uri: str, trivy_path: str, image_cmd_args: Optional[List[str]] = None) -> bytes: + """ + Calls `trivy --quiet image $image_cmd_args $ecr_image_uri` and returns text output as raw string. 
+ """ + if image_cmd_args is None: + image_cmd_args = [] + + # Build the command with global args + command: List[str] = [trivy_path, '--quiet'] + + # Add the image subcommand with its own args + command.append('image') + command.extend(image_cmd_args) + command.append(ecr_image_uri) + + try: + trivy_output_as_str: bytes = subprocess.check_output(command, stderr=subprocess.STDOUT) + except subprocess.CalledProcessError: + stat_handler.incr('image_scan_fatal_count') + raise + stat_handler.incr('image_scan_success_count') + return trivy_output_as_str + + +@timeit +def _call_trivy_update_db(trivy_path: str) -> None: + """ + Calls `trivy --quiet image --download-db-only`. + """ + command: List[str] = [trivy_path, '--quiet', 'image', '--download-db-only'] + + # Run the command but discard the output. We expect none anyway since --quiet mode is on. + try: + subprocess.check_output(command, stderr=subprocess.STDOUT) + except subprocess.CalledProcessError as exc: + logger.error( + f"`trivy image --download-db-only` failed. Error msg: " + f"{exc.output.decode('utf-8') if type(exc.output) == bytes else exc.output}", + ) + raise + + +def _build_image_subcommand( + skip_update: bool, + ignore_unfixed: bool = True, + triage_filter_policy_file_path: Optional[str] = None, + os_findings_only: bool = False, + list_all_pkgs: bool = False, + security_checks: Optional[str] = None, +) -> List[str]: + image_subcmd_args: List[str] = [ + '--format', 'json', + + # Default = 5 minutes. Some images are humongous and need 15 mins. 
+ '--timeout', '15m', + ] + + if skip_update: + image_subcmd_args.append('--skip-update') + + if ignore_unfixed: + image_subcmd_args.append('--ignore-unfixed') + + if triage_filter_policy_file_path: + image_subcmd_args.extend( + [ + '--ignore-policy', triage_filter_policy_file_path, + ], + ) + + # Trivy default = '--vuln-type=os,library' - https://aquasecurity.github.io/trivy/v0.19.2/getting-started/cli/image/ + if os_findings_only: + image_subcmd_args.extend( + ['--vuln-type', 'os'], + ) + + if list_all_pkgs: + image_subcmd_args.extend( + ['--list-all-pkgs'], + ) + + if security_checks: + image_subcmd_args.extend( + ['--security-checks', security_checks], + ) + + return image_subcmd_args + + +@timeit +def get_scan_results_for_single_image(ecr_image_uri: str, image_subcmd_args: List[str], trivy_path: str) -> List[Dict]: + """ + Runs trivy scanner on the given ecr_image_uri and returns vuln data results. + """ + # Get + trivy_output_as_str: bytes = _call_trivy_binary(ecr_image_uri, trivy_path, image_subcmd_args) + + # Transform + trivy_data: Dict = json.loads(trivy_output_as_str) + # See https://github.com/aquasecurity/trivy/discussions/1050 for schema v2 shape + if 'Results' in trivy_data and trivy_data['Results']: + return trivy_data['Results'] + else: + stat_handler.incr('image_scan_no_results_count') + logger.warning(f"trivy scan did not return a `results` key for URI = {ecr_image_uri}; continuing.") + return [] + + +def transform_scan_results(results: List[Dict]) -> List[Dict]: + """ + Trivy results produce a nested dictionary, so we pull out some info + from this to be added to TrivyImageFinding nodes + """ + for scan_class in results: + # Sometimes a scan class will have no vulns and Trivy will leave the key undefined instead of showing []. 
+ if 'Vulnerabilities' in scan_class and scan_class['Vulnerabilities']: + parsed_vuln_results: List[Dict] = [] + for result in scan_class['Vulnerabilities']: + # If VulnerabilityID, Severity, InstalledVersion, or PkgName do not exist, fail loudly. + # For all other fields, continue + parsed_result = { + "NodeId": f'TIF|{result["VulnerabilityID"]}', + "VulnerabilityID": result["VulnerabilityID"], + "Description": result.get("Description"), + "LastModifiedDate": result.get("LastModifiedDate"), + "PrimaryURL": result.get("PrimaryURL"), + "PublishedDate": result.get("PublishedDate"), + "Severity": result["Severity"], + "SeveritySource": result.get("SeveritySource"), + "Title": result.get("Title"), + "InstalledVersion": result["InstalledVersion"], + "PkgName": result["PkgName"], + "FixedVersion": result.get("FixedVersion"), + "nvd_v2_score": None, + "nvd_v2_vector": None, + "nvd_v3_score": None, + "nvd_v3_vector": None, + "redhat_v3_score": None, + "redhat_v3_vector": None, + "ubuntu_v3_score": None, + "ubuntu_v3_vector": None, + } + + if "CVSS" in result: + if "nvd" in result["CVSS"]: + nvd = result["CVSS"]["nvd"] + parsed_result["nvd_v2_score"] = nvd.get("V2Score") + parsed_result["nvd_v2_vector"] = nvd.get("V2Vector") + parsed_result["nvd_v3_score"] = nvd.get("V3Score") + parsed_result["nvd_v3_vector"] = nvd.get("V3Vector") + if "redhat" in result["CVSS"]: + redhat = result["CVSS"]["redhat"] + parsed_result["redhat_v3_score"] = redhat.get("V3Score") + parsed_result["redhat_v3_vector"] = redhat.get("V3Vector") + if "ubuntu" in result["CVSS"]: + redhat = result["CVSS"]["ubuntu"] + parsed_result["ubuntu_v3_score"] = redhat.get("V3Score") + parsed_result["ubuntu_v3_vector"] = redhat.get("V3Vector") + + parsed_vuln_results.append(parsed_result) + + scan_class['Vulnerabilities'] = parsed_vuln_results + + return results + + +@timeit +def cleanup(neo4j_session: Session, common_job_parameters: Dict[str, Any]) -> None: + GraphJob.run_from_json_file( + 'trivy_scan_findings_cleanup.json', 
neo4j_session, common_job_parameters, + ) + + +@timeit +def load_scan_packages( + neo4j_session: neo4j.Session, + scan_results: List[Dict], + ecr_image_digest: str, + ecr_image_tag: str, + ecr_repo_name: str, + update_tag: int, +) -> None: + for scan_class in scan_results: + if 'Packages' in scan_class and scan_class['Packages']: + validated_packages = _validate_packages(scan_class['Packages'], ecr_image_tag, ecr_repo_name) + neo4j_session.write_transaction( + _load_packages_in_single_class_tx, + ecr_image_digest, + ecr_image_tag, + ecr_repo_name, + validated_packages, + scan_class['Class'], + scan_class['Type'], + update_tag, + ) + + +@timeit +def _load_scan_results_in_single_class_tx( + tx: neo4j.Transaction, + ecr_image_digest: str, + ecr_image_tag: str, + ecr_repo_name: str, + vulns_of_single_class: List[Dict], + trivy_class: str, + trivy_type: str, + update_tag: int, +) -> None: + ingest_results = """ + MATCH (image:ECRImage{id: $ImageDigest}) + + UNWIND $Findings as finding + MERGE (t:TrivyImageFinding{id: finding.NodeId}) + ON CREATE SET t.firstseen = timestamp() + SET t:Risk, + t:CVE, + t.name = finding.VulnerabilityID, + t.cve_id = finding.VulnerabilityID, + t.lastupdated = $UpdateTag, + t.description = finding.Description, + t.last_modified_date = finding.LastModifiedDate, + t.primary_url = finding.PrimaryURL, + t.published_date = finding.PublishedDate, + t.severity = finding.Severity, + t.severity_source = finding.SeveritySource, + t.title = finding.Title, + t.cvss_nvd_v2_score = finding.nvd_v2_score, + t.cvss_nvd_v2_vector = finding.nvd_v2_vector, + t.cvss_nvd_v3_score = finding.nvd_v3_score, + t.cvss_nvd_v3_vector = finding.nvd_v3_vector, + t.cvss_redhat_v3_score = finding.redhat_v3_score, + t.cvss_redhat_v3_vector = finding.redhat_v3_vector, + t.cvss_ubuntu_v3_score = finding.ubuntu_v3_score, + t.cvss_ubuntu_v3_vector = finding.ubuntu_v3_vector, + t.class = $Class, + t.type = $Type + + MERGE (p:Package{id: finding.InstalledVersion + '|' + 
finding.PkgName}) + ON CREATE SET p.installed_version = finding.InstalledVersion, + p.name = finding.PkgName, + p.firstseen = timestamp() + SET p:TrivyPackage, + p.lastupdated = $UpdateTag, + p.version = finding.InstalledVersion, + p.class = $Class, + p.type = $Type + + MERGE (fix:TrivyFix{id: finding.FixedVersion + '|' + finding.PkgName}) + ON CREATE SET fix.firstseen = timestamp() + SET fix:Fix, + fix.version = finding.FixedVersion, + fix.lastupdated = $UpdateTag + + MERGE (p)-[should:SHOULD_UPDATE_TO]->(fix) + ON CREATE SET should.firstseen = timestamp() + SET should.version = finding.FixedVersion, + should.lastupdated = $UpdateTag + + MERGE (fix)-[applies:APPLIES_TO]->(t) + ON CREATE SET applies.firstseen = timestamp() + SET applies.lastupdated = $UpdateTag + + MERGE (p)-[r1:DEPLOYED]->(image) + ON CREATE SET r1.firstseen = timestamp() + SET r1.lastupdated = $UpdateTag + + MERGE (t)-[a:AFFECTS]->(p) + ON CREATE SET a.firstseen = timestamp() + SET a.lastupdated = $UpdateTag + + MERGE (t)-[a2:AFFECTS]->(image) + ON CREATE SET a2.firstseen = timestamp() + SET a2.lastupdated = $UpdateTag + """ + num_findings = len(vulns_of_single_class) + logger.info( + "Ingesting Trivy scan results: " + f"repo_name = {ecr_repo_name}, " + f"image_tag = {ecr_image_tag}, " + f"num_findings = {num_findings}, " + f"class = {trivy_class}, " + f"type = {trivy_type}, " + f"update_tag = {update_tag}.", + ) + stat_handler.incr('image_scan_cve_count', num_findings) + tx.run( + ingest_results, + Findings=vulns_of_single_class, + Class=trivy_class, + Type=trivy_type, + ImageDigest=ecr_image_digest, + ImageTag=ecr_image_tag, + RepoName=ecr_repo_name, + UpdateTag=update_tag, + ) + + +@timeit +def load_scan_vulns( + neo4j_session: neo4j.Session, + scan_results: List[Dict[str, Any]], + ecr_image_digest: str, + ecr_image_tag: str, + ecr_repo_name: str, + update_tag: int, +) -> None: + for scan_class in scan_results: + # Sometimes a scan class will have no vulns and Trivy will leave the key 
undefined instead of showing []. + if 'Vulnerabilities' in scan_class and scan_class['Vulnerabilities']: + neo4j_session.write_transaction( + _load_scan_results_in_single_class_tx, + ecr_image_digest, + ecr_image_tag, + ecr_repo_name, + scan_class['Vulnerabilities'], + scan_class['Class'], + scan_class['Type'], + update_tag, + ) + + +@timeit +def _load_packages_in_single_class_tx( + tx: neo4j.Transaction, + ecr_image_digest: str, + ecr_image_tag: str, + ecr_repo_name: str, + packages_of_single_class: List[Dict], + trivy_class: str, + trivy_type: str, + update_tag: int, +) -> None: + ingest_results = """ + MATCH (image:ECRImage{id: $ImageDigest}) + + UNWIND $Packages as pkg + MERGE (p:Package{id: pkg.Version + '|' + pkg.Name}) + ON CREATE SET p.installed_version = pkg.Version, + p.name = pkg.Name, + p.firstseen = timestamp() + SET p:TrivyPackage, + p.lastupdated = $UpdateTag, + p.version = pkg.Version, + p.class = $Class, + p.type = $Type + + MERGE (p)-[r1:DEPLOYED]->(image) + ON CREATE SET r1.firstseen = timestamp() + SET r1.lastupdated = $UpdateTag + """ + num_packages = len(packages_of_single_class) + logger.info( + f"Ingesting Trivy package: " + f"repo_name = {ecr_repo_name}, " + f"image_tag = {ecr_image_tag}, " + f"num_packages = {num_packages}, " + f"class = {trivy_class}, " + f"type = {trivy_type}.", + ) + tx.run( + ingest_results, + Packages=packages_of_single_class, + Class=trivy_class, + Type=trivy_type, + ImageDigest=ecr_image_digest, + ImageTag=ecr_image_tag, + RepoName=ecr_repo_name, + UpdateTag=update_tag, + ) + + +def _validate_packages(package_list: List[Dict], ecr_image_tag: str, ecr_repo_name: str) -> List[Dict]: + validated_packages: List[Dict] = [] + for pkg in package_list: + if 'Version' in pkg and pkg['Version'] and 'Name' in pkg and pkg['Name']: + validated_packages.append(pkg) + else: + logger.warning( + f"Package object does not have a `Name` or `Value` - skipping. Please check why." 
+ f"ecr_image_tag = {ecr_image_tag}, " + f"ecr_repo_name = {ecr_repo_name}.", + ) + return validated_packages + + +@timeit +def sync_single_image( + neo4j_session: neo4j.Session, + image_tag: str, + image_uri: str, + repo_name: str, + image_digest: str, + update_tag: int, + skip_db_update: bool, + trivy_path: str, + trivy_opa_policy_file_path: str, +) -> None: + image_subcmd_args: List[str] = _build_image_subcommand( + skip_db_update, + ignore_unfixed=True, + triage_filter_policy_file_path=trivy_opa_policy_file_path, + os_findings_only=False, # scan for both os and library vuln classes. + list_all_pkgs=True, + security_checks='vuln', # trivy 0.30.1 says "If your scanning is slow, please try '--security-checks vuln'" + ) + results: List[Dict] = get_scan_results_for_single_image(image_uri, image_subcmd_args, trivy_path) + parsed_results: List[Dict] = transform_scan_results(results) + load_scan_vulns(neo4j_session, parsed_results, image_digest, image_tag, repo_name, update_tag) + load_scan_packages(neo4j_session, parsed_results, image_digest, image_tag, repo_name, update_tag) + stat_handler.incr('images_processed_count') diff --git a/cartography/sync.py b/cartography/sync.py index 185929861..5041334e9 100644 --- a/cartography/sync.py +++ b/cartography/sync.py @@ -27,6 +27,7 @@ import cartography.intel.lastpass import cartography.intel.oci import cartography.intel.okta +import cartography.intel.trivy from cartography.config import Config from cartography.stats import set_stats_client from cartography.util import STATUS_FAILURE @@ -51,6 +52,7 @@ 'kubernetes': cartography.intel.kubernetes.start_k8s_ingestion, 'lastpass': cartography.intel.lastpass.start_lastpass_ingestion, 'bigfix': cartography.intel.bigfix.start_bigfix_ingestion, + 'trivy': cartography.intel.trivy.start_trivy_scans, 'analysis': cartography.intel.analysis.run, })