Adds support to query Semgrep API to ingest SCA vulns (#1224)

Adds a new schema and intel job to query the Semgrep Enterprise API and ingest Semgrep Supply Chain (SSC) findings. The schema connects a Semgrep Deployment (identified by an id specific to a customer in Semgrep Enterprise) to its SCA findings and their locations through sub-resource relationships. It also connects each finding to a GitHub repository so that findings can be matched against where they were found. Each finding can have a location with the specific lines of code where the vulnerable dependency is being used. ![SemgrepCartographyfinal](https://github.com/lyft/cartography/assets/9236431/9a99ecdd-b40f-430e-bff5-fe950f4c713e) --------- Co-authored-by: Alex Chantavy <[email protected]>
cartography-cncf · Aug 3, 2023 · b0a58a5 · b0a58a5
1 parent 361fc5d
commit b0a58a5
Show file tree

Hide file tree

Showing 16 changed files with 901 additions and 0 deletions.
diff --git a/cartography/cli.py b/cartography/cli.py
@@ -500,6 +500,15 @@ def _build_parser(self):
                 'The Duo api hostname'
             ),
         )
+        parser.add_argument(
+            '--semgrep-app-token-env-var',
+            type=str,
+            default=None,
+            help=(
+                'The name of environment variable containing the Semgrep app token key. '
+                'Required if you are using the Semgrep intel module. Ignored otherwise.'
+            ),
+        )
         return parser
 
     def main(self, argv: str) -> int:
@@ -669,6 +678,13 @@ def main(self, argv: str) -> int:
             config.duo_api_key = None
             config.duo_api_secret = None
 
+        # Semgrep config
+        if config.semgrep_app_token_env_var:
+            logger.debug(f"Reading Semgrep App Token from environment variable {config.semgrep_app_token_env_var}")
+            config.semgrep_app_token = os.environ.get(config.semgrep_app_token_env_var)
+        else:
+            config.semgrep_app_token = None
+
         # Run cartography
         try:
             return cartography.sync.run_with_config(self.sync, config)

diff --git a/cartography/config.py b/cartography/config.py
@@ -103,6 +103,8 @@ class Config:
     :param duo_api_key: The Duo api secret. Optional.
     :type duo_api_hostname: str
     :param duo_api_hostname: The Duo api hostname, e.g. "api-abc123.duosecurity.com". Optional.
+    :param semgrep_app_token: The Semgrep api token. Optional.
+    :type semgrep_app_token: str
     """
 
     def __init__(
@@ -157,6 +159,7 @@ def __init__(
         duo_api_key=None,
         duo_api_secret=None,
         duo_api_hostname=None,
+        semgrep_app_token=None,
     ):
         self.neo4j_uri = neo4j_uri
         self.neo4j_user = neo4j_user
@@ -208,3 +211,4 @@ def __init__(
         self.duo_api_key = duo_api_key
         self.duo_api_secret = duo_api_secret
         self.duo_api_hostname = duo_api_hostname
+        self.semgrep_app_token = semgrep_app_token
diff --git a/cartography/intel/semgrep/__init__.py b/cartography/intel/semgrep/__init__.py
@@ -0,0 +1,23 @@
+import logging
+
+import neo4j
+
+from cartography.config import Config
+from cartography.intel.semgrep.findings import sync
+from cartography.util import timeit
+
+
+logger = logging.getLogger(__name__)
+
+
+@timeit
+def start_semgrep_ingestion(
+    neo4j_session: neo4j.Session, config: Config,
+) -> None:
+    """
+    Entry point of the Semgrep intel module.
+    Runs the Semgrep sync when `config.semgrep_app_token` is set; otherwise logs that the module is not
+    configured and does nothing.
+    param: neo4j_session: The Neo4j session handed through to the sync job.
+    param: config: The cartography Config; `update_tag` and `semgrep_app_token` are read from it.
+    """
+    job_parameters = {"UPDATE_TAG": config.update_tag}
+    app_token = config.semgrep_app_token
+    if app_token:
+        sync(neo4j_session, app_token, config.update_tag, job_parameters)
+    else:
+        logger.info('Semgrep import is not configured - skipping this module. See docs to configure.')
diff --git a/cartography/intel/semgrep/findings.py b/cartography/intel/semgrep/findings.py
@@ -0,0 +1,217 @@
+import logging
+from typing import Any
+from typing import Dict
+from typing import List
+from typing import Tuple
+
+import neo4j
+import requests
+
+from cartography.client.core.tx import load
+from cartography.graph.job import GraphJob
+from cartography.models.semgrep.deployment import SemgrepDeploymentSchema
+from cartography.models.semgrep.findings import SemgrepSCAFindingSchema
+from cartography.models.semgrep.locations import SemgrepSCALocationSchema
+from cartography.stats import get_stats_client
+from cartography.util import merge_module_sync_metadata
+from cartography.util import timeit
+
+logger = logging.getLogger(__name__)
+stat_handler = get_stats_client(__name__)
+_TIMEOUT = (60, 60)
+
+
+@timeit
+def get_deployment(semgrep_app_token: str) -> Dict[str, Any]:
+    """
+    Fetches the deployment that the passed Semgrep App token belongs to.
+    Only the first entry of the `deployments` list in the API response is used.
+    param: semgrep_app_token: The Semgrep App token to use for authentication.
+    return: A dict with the `id`, `name` and `slug` of that deployment.
+    """
+    auth_headers = {
+        "Content-Type": "application/json",
+        "Authorization": f"Bearer {semgrep_app_token}",
+    }
+    response = requests.get(
+        "https://semgrep.dev/api/v1/deployments",
+        headers=auth_headers,
+        timeout=_TIMEOUT,
+    )
+    response.raise_for_status()
+
+    first_deployment = response.json()["deployments"][0]
+    return {field: first_deployment[field] for field in ("id", "name", "slug")}
+
+
+@timeit
+def get_sca_vulns(semgrep_app_token: str, deployment_id: str) -> List[Dict[str, Any]]:
+    """
+    Gets the SCA vulns associated with the passed Semgrep App token and deployment id.
+
+    Pages are requested until a response reports `hasMore` as false (or omits it). The `cursor` value of each
+    response is sent as the `cursor` query parameter of the next request.
+    param: semgrep_app_token: The Semgrep App token to use for authentication.
+    param: deployment_id: The Semgrep deployment id to use for retrieving SCA vulns.
+    return: The concatenated `vulns` lists of every page, in the order received.
+    raises: requests.HTTPError if any page request returns an error status.
+    """
+    all_vulns: List[Dict[str, Any]] = []
+    sca_url = f"https://semgrep.dev/api/sca/deployments/{deployment_id}/vulns"
+    has_more = True
+    cursor = ""
+    headers = {
+        "Content-Type": "application/json",
+        "Authorization": f"Bearer {semgrep_app_token}",
+    }
+
+    while has_more:
+        # The first request carries no cursor; later ones resume from the previous page's cursor.
+        params = {"cursor": cursor} if cursor else {}
+
+        response = requests.get(sca_url, params=params, headers=headers, timeout=_TIMEOUT)
+        response.raise_for_status()
+        data = response.json()
+        all_vulns.extend(data["vulns"])
+        cursor = data.get("cursor")
+        has_more = data.get("hasMore", False)
+        if has_more and not cursor:
+            # Without a cursor the next request would fetch the first page again, so the loop would
+            # never terminate and would keep appending duplicates. Stop with what has been collected.
+            logger.warning("Semgrep SCA vulns response reported more pages but returned no cursor; stopping.")
+            break
+
+    return all_vulns
+
+
+def transform_sca_vulns(raw_vulns: List[Dict[str, Any]]) -> Tuple[List[Dict[str, Any]], List[Dict[str, str]]]:
+    """
+    Transforms the raw SCA vulns response from Semgrep API into a list of dicts
+    that can be used to create the SemgrepSCAFinding nodes.
+
+    param: raw_vulns: The vuln dicts as returned by the Semgrep API.
+    return: A tuple `(vulns, usages)`. `vulns` holds one flattened dict per raw vuln; `usages` holds one dict
+        per entry of each vuln's `usages` list, linked back to its vuln through the `SCA_ID` key.
+    raises: KeyError if a mandatory field is missing from a raw vuln or usage.
+    """
+    vulns = []
+    usages = []
+    for vuln in raw_vulns:
+        advisory = vuln["advisory"]
+        # `references` is optional and may be absent or null; normalise it to a dict once.
+        references = advisory.get("references") or {}
+
+        sca_vuln: Dict[str, Any] = {}
+        # Mandatory fields
+        unique_id = f"{vuln['repositoryName']}|{advisory['ruleId']}"
+        sca_vuln["id"] = unique_id
+        sca_vuln["repositoryName"] = vuln["repositoryName"]
+        sca_vuln["ruleId"] = advisory["ruleId"]
+        sca_vuln["title"] = advisory["title"]
+        sca_vuln["description"] = advisory["description"]
+        sca_vuln["ecosystem"] = advisory["ecosystem"]
+        sca_vuln["severity"] = advisory["severity"]
+        sca_vuln["reachability"] = advisory["reachability"]
+        sca_vuln["reachableIf"] = advisory["reachableIf"]
+        sca_vuln["exposureType"] = vuln["exposureType"]
+        dependency = f"{vuln['matchedDependency']['name']}|{vuln['matchedDependency']['versionSpecifier']}"
+        sca_vuln["matchedDependency"] = dependency
+        sca_vuln["dependencyFileLocation_path"] = vuln["dependencyFileLocation"]["path"]
+        sca_vuln["dependencyFileLocation_url"] = vuln["dependencyFileLocation"]["url"]
+        # Optional fields
+        sca_vuln["transitivity"] = vuln.get("transitivity", None)
+        # A missing `cveIds` key must be treated like an empty list instead of crashing on len(None).
+        cve_ids = references.get("cveIds") or []
+        if cve_ids:
+            # Take the first CVE
+            sca_vuln["cveId"] = cve_ids[0]
+        if vuln.get('closestSafeDependency'):
+            dep_fix = f"{vuln['closestSafeDependency']['name']}|{vuln['closestSafeDependency']['versionSpecifier']}"
+            sca_vuln["closestSafeDependency"] = dep_fix
+        ref_urls = references.get("urls")
+        if ref_urls:
+            sca_vuln["ref_urls"] = ref_urls
+        sca_vuln["openedAt"] = vuln.get("openedAt", None)
+        for usage in vuln.get("usages") or []:
+            location = usage["location"]
+            usages.append({
+                "SCA_ID": unique_id,
+                "findingId": usage["findingId"],
+                "path": location["path"],
+                "startLine": location["startLine"],
+                "startCol": location["startCol"],
+                "endLine": location["endLine"],
+                "endCol": location["endCol"],
+                "url": location["url"],
+            })
+        vulns.append(sca_vuln)
+    return vulns, usages
+
+
+@timeit
+def load_semgrep_deployment(
+    neo4j_session: neo4j.Session, deployment: Dict[str, Any], update_tag: int,
+) -> None:
+    """
+    Writes a single Semgrep deployment to the graph using SemgrepDeploymentSchema.
+    param: neo4j_session: The Neo4j session to write with.
+    param: deployment: The deployment dict to load; it is also included in the log message.
+    param: update_tag: Passed to the loader as `lastupdated`.
+    """
+    logger.info(f"Loading Semgrep deployment info {deployment} into the graph...")
+    deployment_schema = SemgrepDeploymentSchema()
+    load(neo4j_session, deployment_schema, [deployment], lastupdated=update_tag)
+
+
+@timeit
+def load_semgrep_sca_vulns(
+    neo4j_session: neo4j.Session,
+    vulns: List[Dict[str, Any]],
+    deployment_id: str,
+    update_tag: int,
+) -> None:
+    """
+    Writes the transformed SCA findings to the graph using SemgrepSCAFindingSchema.
+    param: neo4j_session: The Neo4j session to write with.
+    param: vulns: The finding dicts to load.
+    param: deployment_id: Passed to the loader as `DEPLOYMENT_ID`.
+    param: update_tag: Passed to the loader as `lastupdated`.
+    """
+    logger.info(f"Loading {len(vulns)} Semgrep SCA vulns info into the graph.")
+    finding_schema = SemgrepSCAFindingSchema()
+    load(neo4j_session, finding_schema, vulns, lastupdated=update_tag, DEPLOYMENT_ID=deployment_id)
+
+
+@timeit
+def load_semgrep_sca_usages(
+    neo4j_session: neo4j.Session,
+    usages: List[Dict[str, Any]],
+    deployment_id: str,
+    update_tag: int,
+) -> None:
+    """
+    Writes the transformed SCA usage locations to the graph using SemgrepSCALocationSchema.
+    param: neo4j_session: The Neo4j session to write with.
+    param: usages: The usage dicts to load.
+    param: deployment_id: Passed to the loader as `DEPLOYMENT_ID`.
+    param: update_tag: Passed to the loader as `lastupdated`.
+    """
+    logger.info(f"Loading {len(usages)} Semgrep SCA usages info into the graph.")
+    location_schema = SemgrepSCALocationSchema()
+    load(neo4j_session, location_schema, usages, lastupdated=update_tag, DEPLOYMENT_ID=deployment_id)
+
+
+@timeit
+def cleanup(
+    neo4j_session: neo4j.Session, common_job_parameters: Dict[str, Any],
+) -> None:
+    """
+    Runs the schema-derived cleanup jobs for Semgrep SCA findings and then for SCA locations.
+    param: neo4j_session: The Neo4j session to run the jobs with.
+    param: common_job_parameters: Parameters handed to each cleanup job.
+    """
+    # Findings are cleaned up first, locations second.
+    cleanup_steps = (
+        ("Running Semgrep SCA findings cleanup job.", SemgrepSCAFindingSchema),
+        ("Running Semgrep SCA Locations cleanup job.", SemgrepSCALocationSchema),
+    )
+    for log_message, schema_cls in cleanup_steps:
+        logger.info(log_message)
+        GraphJob.from_node_schema(schema_cls(), common_job_parameters).run(neo4j_session)
+
+
+@timeit
+def sync(
+    neo4j_sesion: neo4j.Session,
+    semgrep_app_token: str,
+    update_tag: int,
+    common_job_parameters: Dict[str, Any],
+) -> None:
+    """
+    Runs the full Semgrep SCA sync: fetch and load the deployment, fetch, transform and load the SCA
+    findings and their usages, run cleanup, and record the module sync metadata.
+    param: neo4j_sesion: The Neo4j session to use (the parameter name is spelled this way in the signature).
+    param: semgrep_app_token: The Semgrep App token to use for authentication.
+    param: update_tag: The update tag applied to everything loaded in this run.
+    param: common_job_parameters: Job parameters; `DEPLOYMENT_ID` is added to this dict in place.
+    """
+    logger.info("Running Semgrep SCA findings sync job.")
+
+    # Deployment first: its id scopes every later step.
+    deployment = get_deployment(semgrep_app_token)
+    load_semgrep_deployment(neo4j_sesion, deployment, update_tag)
+    deployment_id = deployment["id"]
+    common_job_parameters["DEPLOYMENT_ID"] = deployment_id
+
+    findings, usage_locations = transform_sca_vulns(
+        get_sca_vulns(semgrep_app_token, deployment_id),
+    )
+    load_semgrep_sca_vulns(neo4j_sesion, findings, deployment_id, update_tag)
+    load_semgrep_sca_usages(neo4j_sesion, usage_locations, deployment_id, update_tag)
+
+    cleanup(neo4j_sesion, common_job_parameters)
+    merge_module_sync_metadata(
+        neo4j_session=neo4j_sesion,
+        group_type='Semgrep',
+        group_id=deployment_id,
+        synced_type='SCA',
+        update_tag=update_tag,
+        stat_handler=stat_handler,
+    )
diff --git a/cartography/models/semgrep/__init__.py b/cartography/models/semgrep/__init__.py
diff --git a/cartography/models/semgrep/deployment.py b/cartography/models/semgrep/deployment.py
@@ -0,0 +1,19 @@
+from dataclasses import dataclass
+
+from cartography.models.core.common import PropertyRef
+from cartography.models.core.nodes import CartographyNodeProperties
+from cartography.models.core.nodes import CartographyNodeSchema
+
+
+@dataclass(frozen=True)
+class SemgrepDeploymentProperties(CartographyNodeProperties):
+    """
+    Properties of a SemgrepDeployment node.
+    Each attribute name is the node property; the PropertyRef argument names the input field it is read from.
+    """
+    id: PropertyRef = PropertyRef('id')
+    # set_in_kwargs=True: the value is passed to the loader as a keyword argument rather than read per record.
+    lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True)
+    # NOTE(review): extra_index=True presumably requests an additional index on the property - confirm.
+    name: PropertyRef = PropertyRef('name', extra_index=True)
+    slug: PropertyRef = PropertyRef('slug', extra_index=True)
+
+
+@dataclass(frozen=True)
+class SemgrepDeploymentSchema(CartographyNodeSchema):
+    """
+    Node schema for the `SemgrepDeployment` label; declares no relationships of its own.
+    """
+    label: str = 'SemgrepDeployment'
+    properties: SemgrepDeploymentProperties = SemgrepDeploymentProperties()
diff --git a/cartography/models/semgrep/findings.py b/cartography/models/semgrep/findings.py
@@ -0,0 +1,80 @@
+from dataclasses import dataclass
+
+from cartography.models.core.common import PropertyRef
+from cartography.models.core.nodes import CartographyNodeProperties
+from cartography.models.core.nodes import CartographyNodeSchema
+from cartography.models.core.relationships import CartographyRelProperties
+from cartography.models.core.relationships import CartographyRelSchema
+from cartography.models.core.relationships import LinkDirection
+from cartography.models.core.relationships import make_target_node_matcher
+from cartography.models.core.relationships import OtherRelationships
+from cartography.models.core.relationships import TargetNodeMatcher
+
+
+@dataclass(frozen=True)
+class SemgrepSCAFindingNodeProperties(CartographyNodeProperties):
+    """
+    Properties of a SemgrepSCAFinding node.
+    Each attribute name is the node property; the PropertyRef argument names the input field it is read from.
+    Several node properties are deliberately named differently from their input field.
+    """
+    id: PropertyRef = PropertyRef('id')
+    # set_in_kwargs=True: the value is passed to the loader as a keyword argument rather than read per record.
+    lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True)
+    # NOTE(review): extra_index=True presumably requests an additional index on the property - confirm.
+    rule_id: PropertyRef = PropertyRef('ruleId', extra_index=True)
+    repository: PropertyRef = PropertyRef('repositoryName', extra_index=True)
+    summary: PropertyRef = PropertyRef('title', extra_index=True)
+    description: PropertyRef = PropertyRef('description')
+    package_manager: PropertyRef = PropertyRef('ecosystem')
+    severity: PropertyRef = PropertyRef('severity')
+    cve_id: PropertyRef = PropertyRef('cveId', extra_index=True)
+    # Careful with the naming: the input field 'reachability' becomes `reachability_check`,
+    # while the node property `reachability` is read from the input field 'exposureType'.
+    reachability_check: PropertyRef = PropertyRef('reachability')
+    reachability_condition: PropertyRef = PropertyRef('reachableIf')
+    reachability: PropertyRef = PropertyRef('exposureType')
+    transitivity: PropertyRef = PropertyRef('transitivity')
+    dependency: PropertyRef = PropertyRef('matchedDependency')
+    dependency_fix: PropertyRef = PropertyRef('closestSafeDependency')
+    ref_urls: PropertyRef = PropertyRef('ref_urls')
+    dependency_file: PropertyRef = PropertyRef('dependencyFileLocation_path', extra_index=True)
+    dependency_file_url: PropertyRef = PropertyRef('dependencyFileLocation_url', extra_index=True)
+    # The node property is called scan_time but is read from the input field 'openedAt'.
+    scan_time: PropertyRef = PropertyRef('openedAt')
+
+
+@dataclass(frozen=True)
+class SemgrepSCAFindingToSemgrepDeploymentRelProperties(CartographyRelProperties):
+    """
+    Properties of the finding-to-deployment relationship; only `lastupdated` is recorded.
+    """
+    # set_in_kwargs=True: the value is passed to the loader as a keyword argument rather than read per record.
+    lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True)
+
+
+@dataclass(frozen=True)
+# (:SemgrepSCAFinding)<-[:RESOURCE]-(:SemgrepDeployment)
+class SemgrepSCAFindingToSemgrepDeploymentSchema(CartographyRelSchema):
+    """
+    Relationship schema linking a SemgrepSCAFinding to the SemgrepDeployment it belongs to.
+    """
+    target_node_label: str = 'SemgrepDeployment'
+    # The target deployment is matched on its `id`, taken from the DEPLOYMENT_ID keyword argument
+    # given to the loader (set_in_kwargs=True) rather than from each finding record.
+    target_node_matcher: TargetNodeMatcher = make_target_node_matcher(
+        {'id': PropertyRef('DEPLOYMENT_ID', set_in_kwargs=True)},
+    )
+    # INWARD: the relationship points from the deployment to the finding, as drawn in the pattern above.
+    direction: LinkDirection = LinkDirection.INWARD
+    rel_label: str = "RESOURCE"
+    properties: SemgrepSCAFindingToSemgrepDeploymentRelProperties = SemgrepSCAFindingToSemgrepDeploymentRelProperties()
+
+
+@dataclass(frozen=True)
+class SemgrepSCAFindingToGithubRepoRelProperties(CartographyRelProperties):
+    """
+    Properties of the finding-to-GitHub-repository relationship; only `lastupdated` is recorded.
+    """
+    # set_in_kwargs=True: the value is passed to the loader as a keyword argument rather than read per record.
+    lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True)
+
+
+@dataclass(frozen=True)
+# (:SemgrepSCAFinding)-[:FOUND_IN]->(:GitHubRepository)
+class SemgrepSCAFindingToGithubRepoRel(CartographyRelSchema):
+    """
+    Relationship schema linking a SemgrepSCAFinding to the GitHubRepository it was found in.
+    """
+    target_node_label: str = 'GitHubRepository'
+    # The repository is matched by comparing its `fullname` with the finding's 'repositoryName' field.
+    target_node_matcher: TargetNodeMatcher = make_target_node_matcher(
+        {'fullname': PropertyRef('repositoryName')},
+    )
+    # OUTWARD: the relationship points from the finding to the repository, as drawn in the pattern above.
+    direction: LinkDirection = LinkDirection.OUTWARD
+    rel_label: str = "FOUND_IN"
+    properties: SemgrepSCAFindingToGithubRepoRelProperties = SemgrepSCAFindingToGithubRepoRelProperties()
+
+
+@dataclass(frozen=True)
+class SemgrepSCAFindingSchema(CartographyNodeSchema):
+    """
+    Node schema for the `SemgrepSCAFinding` label.
+    The finding is a sub-resource of a SemgrepDeployment and is additionally linked to a GitHubRepository.
+    """
+    label: str = 'SemgrepSCAFinding'
+    properties: SemgrepSCAFindingNodeProperties = SemgrepSCAFindingNodeProperties()
+    # Owning deployment, matched through the DEPLOYMENT_ID keyword argument given to the loader.
+    sub_resource_relationship: SemgrepSCAFindingToSemgrepDeploymentSchema = SemgrepSCAFindingToSemgrepDeploymentSchema()
+    other_relationships: OtherRelationships = OtherRelationships(
+        [
+            SemgrepSCAFindingToGithubRepoRel(),
+        ],
+    )