diff --git a/cartography/intel/semgrep/__init__.py b/cartography/intel/semgrep/__init__.py index dbd72f1dd..d7e80971d 100644 --- a/cartography/intel/semgrep/__init__.py +++ b/cartography/intel/semgrep/__init__.py @@ -3,7 +3,9 @@ import neo4j from cartography.config import Config -from cartography.intel.semgrep.findings import sync +from cartography.intel.semgrep.dependencies import sync_dependencies +from cartography.intel.semgrep.deployment import sync_deployment +from cartography.intel.semgrep.findings import sync_findings from cartography.util import timeit @@ -20,4 +22,9 @@ def start_semgrep_ingestion( if not config.semgrep_app_token: logger.info('Semgrep import is not configured - skipping this module. See docs to configure.') return - sync(neo4j_session, config.semgrep_app_token, config.update_tag, common_job_parameters) + + # sync_deployment must be called first since it populates common_job_parameters + # with the deployment ID and slug, which are required by the other sync functions + sync_deployment(neo4j_session, config.semgrep_app_token, config.update_tag, common_job_parameters) + sync_dependencies(neo4j_session, config.semgrep_app_token, config.update_tag, common_job_parameters) + sync_findings(neo4j_session, config.semgrep_app_token, config.update_tag, common_job_parameters) diff --git a/cartography/intel/semgrep/dependencies.py b/cartography/intel/semgrep/dependencies.py new file mode 100644 index 000000000..536202296 --- /dev/null +++ b/cartography/intel/semgrep/dependencies.py @@ -0,0 +1,201 @@ +import logging +from typing import Any +from typing import Callable +from typing import Dict +from typing import List + +import neo4j +import requests +from requests.exceptions import HTTPError +from requests.exceptions import ReadTimeout + +from cartography.client.core.tx import load +from cartography.graph.job import GraphJob +from cartography.models.semgrep.dependencies import SemgrepGoLibrarySchema +from cartography.stats import get_stats_client +from 
@timeit
def get_dependencies(semgrep_app_token: str, deployment_id: str, ecosystems: List[str]) -> List[Dict[str, Any]]:
    """
    Gets all dependencies for the given ecosystems within the given Semgrep deployment ID.

    Pages through the Semgrep dependencies endpoint with POST requests, passing the cursor
    returned with each page into the next request, until the response no longer reports
    `hasMore`. A page that fails with ReadTimeout or HTTPError is requested again with the
    same payload; once the same page has failed _MAX_RETRIES times in a row the exception
    is re-raised.

    param: semgrep_app_token: The Semgrep App token to use for authentication.
    param: deployment_id: The Semgrep deployment ID to use for retrieving dependencies.
    param: ecosystems: One or more ecosystems to import dependencies from, e.g. "gomod" or "pypi".
    The list of supported ecosystems is defined here:
    https://semgrep.dev/api/v1/docs/#tag/SupplyChainService/operation/semgrep_app.products.sca.handlers.dependency.list_dependencies_conexxion
    returns: The raw dependency dicts from every page, in the order they were received.
    raises: ReadTimeout or HTTPError if one page fails _MAX_RETRIES consecutive times.
    """
    all_deps: List[Dict[str, Any]] = []
    deps_url = f"https://semgrep.dev/api/v1/deployments/{deployment_id}/dependencies"
    has_more = True
    page = 0
    retries = 0
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {semgrep_app_token}",
    }

    request_data: Dict[str, Any] = {
        "pageSize": _PAGE_SIZE,
        "dependencyFilter": {
            "ecosystem": ecosystems,
        },
    }

    logger.info(f"Retrieving Semgrep dependencies for deployment '{deployment_id}'.")
    while has_more:
        try:
            response = requests.post(deps_url, json=request_data, headers=headers, timeout=_TIMEOUT)
            response.raise_for_status()
            data = response.json()
        except (ReadTimeout, HTTPError):
            retries += 1
            if retries >= _MAX_RETRIES:
                # Out of attempts: do not claim a retry that will never happen.
                logger.error(
                    f"Failed to retrieve Semgrep dependencies for page {page} after {retries} attempts.",
                )
                raise
            logger.warning(f"Failed to retrieve Semgrep dependencies for page {page}. Retrying...")
            continue
        deps = data.get("dependencies", [])
        has_more = data.get("hasMore", False)
        logger.info(f"Processed page {page} of Semgrep dependencies.")
        all_deps.extend(deps)
        # The retry budget is per page, so a successful page resets it.
        retries = 0
        page += 1
        # The next request continues from the position the API handed back.
        request_data["cursor"] = data.get("cursor")

    logger.info(f"Retrieved {len(all_deps)} Semgrep dependencies in {page} pages.")
    return all_deps
@timeit
def sync_dependencies(
    neo4j_session: neo4j.Session,
    semgrep_app_token: str,
    update_tag: int,
    common_job_parameters: Dict[str, Any],
) -> None:
    """
    Fetches Go ("gomod") dependencies from Semgrep and loads them into the graph.

    Reads the deployment ID from common_job_parameters["DEPLOYMENT_ID"]; when it is missing
    or empty, a warning is logged and nothing is synced. Otherwise the dependencies are
    retrieved, transformed, loaded with SemgrepGoLibrarySchema, the cleanup job is run and
    the module sync metadata is merged.

    param: neo4j_session: The Neo4j session used for loading and cleanup.
    param: semgrep_app_token: The Semgrep App token to use for authentication.
    param: update_tag: The update tag written as `lastupdated` on loaded objects.
    param: common_job_parameters: Job parameters; must already contain DEPLOYMENT_ID.
    """

    deployment_id = common_job_parameters.get("DEPLOYMENT_ID")
    if not deployment_id:
        # Adjacent string literals are concatenated verbatim, so the separating
        # space has to be part of the first literal.
        logger.warning(
            "Missing Semgrep deployment ID, ensure that sync_deployment() has been called. "
            "Skipping Semgrep dependencies sync job.",
        )
        return

    logger.info("Running Semgrep dependencies sync job.")

    # fetch and load dependencies for the Go ecosystem
    raw_go_deps = get_dependencies(semgrep_app_token, deployment_id, ecosystems=["gomod"])
    go_deps = transform_dependencies(raw_go_deps)
    load_dependencies(neo4j_session, SemgrepGoLibrarySchema, go_deps, deployment_id, update_tag)

    cleanup(neo4j_session, common_job_parameters)

    merge_module_sync_metadata(
        neo4j_session=neo4j_session,
        group_type='Semgrep',
        group_id=deployment_id,
        synced_type='SemgrepDependency',
        update_tag=update_tag,
        stat_handler=stat_handler,
    )
@timeit
def sync_deployment(
    neo4j_session: neo4j.Session,
    semgrep_app_token: str,
    update_tag: int,
    common_job_parameters: Dict[str, Any],
) -> None:
    """
    Loads the Semgrep deployment for the given token and publishes its ID and slug.

    The deployment returned by get_deployment() is loaded into the graph, then its "id"
    and "slug" are stored in common_job_parameters under DEPLOYMENT_ID and DEPLOYMENT_SLUG.
    """
    deployment = get_deployment(semgrep_app_token)
    # Both values are read before loading, so a malformed deployment fails before any write.
    identifiers = {
        "DEPLOYMENT_ID": deployment["id"],
        "DEPLOYMENT_SLUG": deployment["slug"],
    }
    load_semgrep_deployment(neo4j_session, deployment, update_tag)
    common_job_parameters.update(identifiers)
Dict[str, Any]: - """ - Gets the deployment associated with the passed Semgrep App token. - param: semgrep_app_token: The Semgrep App token to use for authentication. - """ - deployment = {} - deployment_url = "https://semgrep.dev/api/v1/deployments" - headers = { - "Content-Type": "application/json", - "Authorization": f"Bearer {semgrep_app_token}", - } - response = requests.get(deployment_url, headers=headers, timeout=_TIMEOUT) - response.raise_for_status() - - data = response.json() - deployment["id"] = data["deployments"][0]["id"] - deployment["name"] = data["deployments"][0]["name"] - deployment["slug"] = data["deployments"][0]["slug"] - - return deployment - - @timeit def get_sca_vulns(semgrep_app_token: str, deployment_slug: str) -> List[Dict[str, Any]]: """ @@ -81,11 +57,11 @@ def get_sca_vulns(semgrep_app_token: str, deployment_slug: str) -> List[Dict[str response = requests.get(sca_url, params=request_data, headers=headers, timeout=_TIMEOUT) response.raise_for_status() data = response.json() - except (ReadTimeout, HTTPError) as e: + except (ReadTimeout, HTTPError): logger.warning(f"Failed to retrieve Semgrep SCA vulns for page {page}. 
Retrying...") retries += 1 if retries >= _MAX_RETRIES: - raise e + raise continue vulns = data["findings"] has_more = len(vulns) > 0 @@ -201,19 +177,6 @@ def transform_sca_vulns(raw_vulns: List[Dict[str, Any]]) -> Tuple[List[Dict[str, return vulns, usages -@timeit -def load_semgrep_deployment( - neo4j_session: neo4j.Session, deployment: Dict[str, Any], update_tag: int, -) -> None: - logger.info(f"Loading Semgrep deployment info {deployment} into the graph...") - load( - neo4j_session, - SemgrepDeploymentSchema(), - [deployment], - lastupdated=update_tag, - ) - - @timeit def load_semgrep_sca_vulns( neo4j_session: neo4j.Session, @@ -221,7 +184,7 @@ def load_semgrep_sca_vulns( deployment_id: str, update_tag: int, ) -> None: - logger.info(f"Loading {len(vulns)} Semgrep SCA vulns info into the graph.") + logger.info(f"Loading {len(vulns)} SemgrepSCAFinding objects into the graph.") load( neo4j_session, SemgrepSCAFindingSchema(), @@ -238,7 +201,7 @@ def load_semgrep_sca_usages( deployment_id: str, update_tag: int, ) -> None: - logger.info(f"Loading {len(usages)} Semgrep SCA usages info into the graph.") + logger.info(f"Loading {len(usages)} SemgrepSCALocation objects into the graph.") load( neo4j_session, SemgrepSCALocationSchema(), @@ -265,26 +228,32 @@ def cleanup( @timeit -def sync( - neo4j_sesion: neo4j.Session, +def sync_findings( + neo4j_session: neo4j.Session, semgrep_app_token: str, update_tag: int, common_job_parameters: Dict[str, Any], ) -> None: + + deployment_id = common_job_parameters.get("DEPLOYMENT_ID") + deployment_slug = common_job_parameters.get("DEPLOYMENT_SLUG") + if not deployment_id or not deployment_slug: + logger.warning( + "Missing Semgrep deployment ID or slug, ensure that sync_deployment() has been called." 
+ "Skipping SCA findings sync job.", + ) + return + logger.info("Running Semgrep SCA findings sync job.") - semgrep_deployment = get_deployment(semgrep_app_token) - deployment_id = semgrep_deployment["id"] - deployment_slug = semgrep_deployment["slug"] - load_semgrep_deployment(neo4j_sesion, semgrep_deployment, update_tag) - common_job_parameters["DEPLOYMENT_ID"] = deployment_id raw_vulns = get_sca_vulns(semgrep_app_token, deployment_slug) vulns, usages = transform_sca_vulns(raw_vulns) - load_semgrep_sca_vulns(neo4j_sesion, vulns, deployment_id, update_tag) - load_semgrep_sca_usages(neo4j_sesion, usages, deployment_id, update_tag) - run_scoped_analysis_job('semgrep_sca_risk_analysis.json', neo4j_sesion, common_job_parameters) - cleanup(neo4j_sesion, common_job_parameters) + load_semgrep_sca_vulns(neo4j_session, vulns, deployment_id, update_tag) + load_semgrep_sca_usages(neo4j_session, usages, deployment_id, update_tag) + run_scoped_analysis_job('semgrep_sca_risk_analysis.json', neo4j_session, common_job_parameters) + + cleanup(neo4j_session, common_job_parameters) merge_module_sync_metadata( - neo4j_session=neo4j_sesion, + neo4j_session=neo4j_session, group_type='Semgrep', group_id=deployment_id, synced_type='SCA', diff --git a/cartography/models/semgrep/dependencies.py b/cartography/models/semgrep/dependencies.py new file mode 100644 index 000000000..f758ab692 --- /dev/null +++ b/cartography/models/semgrep/dependencies.py @@ -0,0 +1,77 @@ +from dataclasses import dataclass +from typing import Optional + +from cartography.models.core.common import PropertyRef +from cartography.models.core.nodes import CartographyNodeProperties +from cartography.models.core.nodes import CartographyNodeSchema +from cartography.models.core.nodes import ExtraNodeLabels +from cartography.models.core.relationships import CartographyRelProperties +from cartography.models.core.relationships import CartographyRelSchema +from cartography.models.core.relationships import LinkDirection +from 
@dataclass(frozen=True)
# (:SemgrepDependency)<-[:REQUIRES]-(:GitHubRepository)
# Relationship schema linking a Semgrep dependency node to the GitHub repository that requires it.
class SemgrepDependencyToGithubRepoRel(CartographyRelSchema):
    # Label of the node on the other end of the relationship.
    target_node_label: str = 'GitHubRepository'
    # The repository is matched by comparing its `id` with the dependency's `repo_url` value.
    target_node_matcher: TargetNodeMatcher = make_target_node_matcher(
        {'id': PropertyRef('repo_url')},
    )
    # INWARD: the relationship points from the GitHubRepository to the dependency.
    direction: LinkDirection = LinkDirection.INWARD
    rel_label: str = "REQUIRES"
    # Relationship properties: lastupdated, specifier, transitivity and url.
    properties: SemgrepDependencyToGithubRepoRelProperties = SemgrepDependencyToGithubRepoRelProperties()
Before ingesting this node, make sure you have run Semgrep CI and that it's connected to Semgrep Cloud Platform [Running Semgrep CI with Semgrep Cloud Platform](https://semgrep.dev/docs/semgrep-ci/running-semgrep-ci-with-semgrep-cloud-platform/). The API called to retrieve this information is documented at https://semgrep.dev/api/v1/docs/#tag/SupplyChainService. +Represents a [Semgrep Supply Chain](https://semgrep.dev/docs/semgrep-supply-chain/overview/) finding. That is, a vulnerability in a dependency of a project discovered by Semgrep performing software composition analysis (SCA) and code reachability analysis. Before ingesting this node, make sure you have run Semgrep CI and that it's connected to Semgrep Cloud Platform [Running Semgrep CI with Semgrep Cloud Platform](https://semgrep.dev/docs/semgrep-ci/running-semgrep-ci-with-semgrep-cloud-platform/). The API called to retrieve this information is documented at https://semgrep.dev/api/v1/docs/#tag/SupplyChainService. | Field | Description | |-------|--------------| @@ -78,7 +81,7 @@ Represents a [Semgre Supply Chain](https://semgrep.dev/docs/semgrep-supply-chain (SemgrepSCAFinding)-[USAGE_AT]->(SemgrepSCALocation) ``` -- A SemgrepSCAFinding affects a Python Dependency (optional) +- A SemgrepSCAFinding affects a Dependency (optional) ``` (:SemgrepSCAFinding)-[:AFFECTS]->(:Dependency) @@ -90,7 +93,6 @@ Represents a [Semgre Supply Chain](https://semgrep.dev/docs/semgrep-supply-chain (:SemgrepSCAFinding)<-[:LINKED_TO]-(:CVE) ``` - ### SemgrepSCALocation Represents the location in a repository where a vulnerable dependency is used in a way that can trigger the vulnerability. 
@@ -106,3 +108,39 @@ Represents the location in a repository where a vulnerable dependency is used in | end_line | Line where the usage ends | | end_col | Column where the usage ends | | url | URL of the file where the usage was discovered | + + +### SemgrepDependency + +Represents a dependency of a repository as returned by the Semgrep +[List dependencies API](https://semgrep.dev/api/v1/docs/#tag/SupplyChainService/operation/semgrep_app.products.sca.handlers.dependency.list_dependencies_conexxion). + +| Field | Description | +|-------|--------------| +| firstseen | Timestamp of when a sync job first discovered this node | +| lastupdated | Timestamp of the last time the node was updated | +| **id** | Unique id formed by the name and version of the dependency | +| name | Name of the dependency | +| version | Version of the dependency | +| ecosystem | Ecosystem of the dependency, e.g. "gomod" for dependencies defined in go.mod files. (see [API docs](https://semgrep.dev/api/v1/docs/#tag/SupplyChainService/operation/semgrep_app.products.sca.handlers.dependency.list_dependencies_conexxion) for full list of options) | + + +### GoLibrary + +Represents a Go library dependency as listed in a go.mod file. +All GoLibrary nodes are also SemgrepDependency nodes. +See [SemgrepDependency](#semgrepdependency) for details. + + +#### Relationships + +- A SemgrepDependency is required by a GitHubRepository (optional) + + ``` + (:SemgrepDependency)<-[:REQUIRES]-(:GitHubRepository) + ``` + + Properties on REQUIRES relationship: + - specifier: A string describing the library version required by the repo (e.g. "==1.0.2") + - transitivity: A string describing whether the dependency is direct or [transitive](https://en.wikipedia.org/wiki/Transitive_dependency) (e.g. direct, transitive) + - url: The URL where the dependency is defined (e.g. 
https://github.com/org/repo/blob/00000000000000000000000000000000/go.mod#L6) diff --git a/tests/data/semgrep/dependencies.py b/tests/data/semgrep/dependencies.py new file mode 100644 index 000000000..0ee7646c3 --- /dev/null +++ b/tests/data/semgrep/dependencies.py @@ -0,0 +1,54 @@ +REPO_ID = "123456" + +DEPENDENCIES_RESPONSE = { + "dependencies": [ + { + "repositoryId": REPO_ID, + "definedAt": { + "path": "go.mod", + "startLine": "6", + "endLine": "6", + "url": "https://github.com/org/repository/blob/00000000000000000000000000000000/go.mod#L6", + "committedAt": "1970-01-01T00:00:00Z", + "startCol": "0", + "endCol": "0", + }, + "transitivity": "DIRECT", + "package": { + "name": "github.com/foo/baz", + "versionSpecifier": "1.2.3", + }, + "ecosystem": "gomod", + "licenses": [ + "MIT", + ], + "pathToTransitivity": [], + }, + { + "repositoryId": REPO_ID, + "definedAt": { + "path": "go.mod", + "startLine": "7", + "endLine": "7", + "url": "https://github.com/org/repository/blob/00000000000000000000000000000000/go.mod#L7", + "committedAt": "1970-01-01T00:00:00Z", + "startCol": "0", + "endCol": "0", + }, + "transitivity": "TRANSITIVE", + "package": { + "name": "github.com/foo/buzz", + "versionSpecifier": "4.5.0", + }, + "ecosystem": "gomod", + "licenses": [ + "MIT", + ], + "pathToTransitivity": [], + }, + ], + "hasMore": True, + "cursor": "123456789", +} + +RAW_DEPS = DEPENDENCIES_RESPONSE["dependencies"] diff --git a/tests/data/semgrep/deployment.py b/tests/data/semgrep/deployment.py new file mode 100644 index 000000000..c932a1be9 --- /dev/null +++ b/tests/data/semgrep/deployment.py @@ -0,0 +1,5 @@ +DEPLOYMENTS = { + "id": "123456", + "name": "Org", + "slug": "org", +} diff --git a/tests/data/semgrep/sca.py b/tests/data/semgrep/sca.py index 4c625da87..9b75b2392 100644 --- a/tests/data/semgrep/sca.py +++ b/tests/data/semgrep/sca.py @@ -1,8 +1,3 @@ -DEPLOYMENTS = { - "id": "123456", - "name": "Org", - "slug": "org", -} VULN_ID = 73537136 USAGE_ID = hash( 
def check_nodes_as_list(
    neo4j_session: "neo4j.Session", node_label: str, attrs: List[str],
) -> list:
    """
    Like `tests.integration.util.check_nodes()` but returns a list instead of a set.

    Runs `MATCH (n:<node_label>) RETURN n.<attr>, ...` and flattens the values of every
    returned row into one list, keeping row order and the attribute order within a row.

    param: neo4j_session: The session the query is run on.
    param: node_label: Label of the nodes to read.
    param: attrs: Names of the node attributes to return; must not be empty.
    returns: A flat list of attribute values.
    raises: ValueError if `attrs` is empty.
    """
    if not attrs:
        raise ValueError(
            "`attrs` passed to check_nodes_as_list() must have at least one element.",
        )

    # Kept under a separate name so the `attrs` parameter is not rebound to a str.
    returned_attrs = ", ".join(f"n.{attr}" for attr in attrs)
    query_template = Template("MATCH (n:$NodeLabel) RETURN $Attrs")
    result = neo4j_session.run(
        query_template.safe_substitute(NodeLabel=node_label, Attrs=returned_attrs),
    )
    # One linear pass; sum(lists, []) would re-copy the accumulated list for every row.
    return [value for row in result for value in row.values()]
@patch.object(
    cartography.intel.semgrep.deployment,
    "get_deployment",
    return_value=tests.data.semgrep.deployment.DEPLOYMENTS,
)
@patch.object(
    cartography.intel.semgrep.dependencies,
    "get_dependencies",
    return_value=tests.data.semgrep.dependencies.RAW_DEPS,
)
def test_sync_dependencies(mock_get_dependencies, mock_get_deployment, neo4j_session):
    """
    Syncs the deployment and then the dependencies against patched API calls, and checks
    the resulting nodes plus their RESOURCE and REQUIRES relationships.
    """
    # Arrange
    create_github_repos(neo4j_session)
    token = "your_semgrep_app_token"
    job_parameters = {"UPDATE_TAG": TEST_UPDATE_TAG}
    baz_id = "github.com/foo/baz|1.2.3"
    buzz_id = "github.com/foo/buzz|4.5.0"

    # Act: the deployment sync runs first because it fills in the deployment ID.
    sync_deployment(neo4j_session, token, TEST_UPDATE_TAG, job_parameters)
    sync_dependencies(neo4j_session, token, TEST_UPDATE_TAG, job_parameters)

    # Assert
    deployment_nodes = check_nodes(
        neo4j_session,
        "SemgrepDeployment",
        ["id", "name", "slug"],
    )
    assert deployment_nodes == {("123456", "Org", "org")}

    dependency_nodes = check_nodes(
        neo4j_session,
        "SemgrepDependency",
        ["id", "lastupdated", "name", "version", "ecosystem"],
    )
    assert dependency_nodes == {
        (baz_id, TEST_UPDATE_TAG, "github.com/foo/baz", "1.2.3", "gomod"),
        (buzz_id, TEST_UPDATE_TAG, "github.com/foo/buzz", "4.5.0", "gomod"),
    }

    deployment_rels = check_rels(
        neo4j_session,
        "SemgrepDeployment",
        "id",
        "SemgrepDependency",
        "id",
        "RESOURCE",
    )
    assert deployment_rels == {("123456", baz_id), ("123456", buzz_id)}

    repo_rels = check_rels(
        neo4j_session,
        "GitHubRepository",
        "fullname",
        "SemgrepDependency",
        "id",
        "REQUIRES",
    )
    assert repo_rels == {("org/repository", baz_id), ("org/repository", buzz_id)}
import TEST_UPDATE_TAG from tests.integration.util import check_nodes from tests.integration.util import check_rels -TEST_REPO_ID = "https: //github.com/org/repository" -TEST_REPO_FULL_NAME = "org/repository" -TEST_REPO_NAME = "repository" -TEST_UPDATE_TAG = 123456789 - - -def _check_nodes_as_list( - neo4j_session: neo4j.Session, node_label: str, attrs: List[str], -): - """ - Like tests.integration.util.check_nodes()` but returns a list instead of a set. - """ - if not attrs: - raise ValueError( - "`attrs` passed to check_nodes() must have at least one element.", - ) - - attrs = ", ".join(f"n.{attr}" for attr in attrs) - query_template = Template("MATCH (n:$NodeLabel) RETURN $Attrs") - result = neo4j_session.run( - query_template.safe_substitute(NodeLabel=node_label, Attrs=attrs), - ) - return sum([row.values() for row in result], []) - - -def _create_github_repos(neo4j_session): - # Creates a set of GitHub repositories in the graph - neo4j_session.run( - """ - MERGE (repo:GitHubRepository{id: $repo_id, fullname: $repo_fullname, name: $repo_name}) - ON CREATE SET repo.firstseen = timestamp() - SET repo.lastupdated = $update_tag - SET repo.archived = false - """, - repo_id=TEST_REPO_ID, - repo_fullname=TEST_REPO_FULL_NAME, - update_tag=TEST_UPDATE_TAG, - repo_name=TEST_REPO_NAME, - ) - - -def _create_dependency_nodes(neo4j_session): - # Creates a set of dependency nodes in the graph - neo4j_session.run( - """ - MERGE (dep:Dependency{id: $dep_id}) - ON CREATE SET dep.firstseen = timestamp() - SET dep.lastupdated = $update_tag - """, - dep_id="moment|2.29.2", - update_tag=TEST_UPDATE_TAG, - ) - - -def _create_cve_nodes(neo4j_session): - # Creates a set of CVE nodes in the graph - neo4j_session.run( - """ - MERGE (cve:CVE{id: $cve_id}) - ON CREATE SET cve.firstseen = timestamp() - SET cve.lastupdated = $update_tag - """, - cve_id="CVE-2022-31129", - update_tag=TEST_UPDATE_TAG, - ) - @patch.object( - cartography.intel.semgrep.findings, + 
cartography.intel.semgrep.deployment, "get_deployment", - return_value=tests.data.semgrep.sca.DEPLOYMENTS, + return_value=tests.data.semgrep.deployment.DEPLOYMENTS, ) @patch.object( cartography.intel.semgrep.findings, "get_sca_vulns", return_value=tests.data.semgrep.sca.RAW_VULNS, ) -def test_sync(mock_get_sca_vulns, mock_get_deployment, neo4j_session): +def test_sync_findings(mock_get_sca_vulns, mock_get_deployment, neo4j_session): # Arrange - _create_github_repos(neo4j_session) - _create_dependency_nodes(neo4j_session) - _create_cve_nodes(neo4j_session) + create_github_repos(neo4j_session) + create_dependency_nodes(neo4j_session) + create_cve_nodes(neo4j_session) semgrep_app_token = "your_semgrep_app_token" common_job_parameters = { "UPDATE_TAG": TEST_UPDATE_TAG, } # Act - sync(neo4j_session, semgrep_app_token, TEST_UPDATE_TAG, common_job_parameters) + sync_deployment(neo4j_session, semgrep_app_token, TEST_UPDATE_TAG, common_job_parameters) + sync_findings(neo4j_session, semgrep_app_token, TEST_UPDATE_TAG, common_job_parameters) # Assert - assert check_nodes( neo4j_session, "SemgrepDeployment", ["id", "name", "slug"], ) == {("123456", "Org", "org")} - assert _check_nodes_as_list( + assert check_nodes_as_list( neo4j_session, "SemgrepSCAFinding", [