From 49399792c69503dc466dd97ab9673bf7b89b6e4b Mon Sep 17 00:00:00 2001 From: Hector Eryx Paredes Camacho Date: Fri, 19 Jan 2024 15:21:41 -0600 Subject: [PATCH] Semgrep SCA - add findings published date and fix status (#1281) The triage information from response https://semgrep.dev/api/docs#/SupplyChainService/SupplyChainService_ListVulns2 contains either "NEW", "CLOSED" or "IGNORED" status. This field can be used to determine if a finding has been fixed or not. Also, adding the "announcedAt" date to get the CVE/GHSA published date information. --- cartography/intel/semgrep/findings.py | 22 +++++++++++++++++----- cartography/models/semgrep/findings.py | 2 ++ 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/cartography/intel/semgrep/findings.py b/cartography/intel/semgrep/findings.py index 49b36695e1..fce81b7cb3 100644 --- a/cartography/intel/semgrep/findings.py +++ b/cartography/intel/semgrep/findings.py @@ -3,6 +3,7 @@ from typing import Dict from typing import List from typing import Tuple +from urllib.error import HTTPError import neo4j import requests @@ -20,6 +21,7 @@ logger = logging.getLogger(__name__) stat_handler = get_stats_client(__name__) _TIMEOUT = (60, 60) +_MAX_RETRIES = 3 @timeit @@ -57,6 +59,7 @@ def get_sca_vulns(semgrep_app_token: str, deployment_id: str) -> List[Dict[str, has_more = True cursor: Dict[str, str] = {} page = 1 + retries = 0 headers = { "Content-Type": "application/json", "Authorization": f"Bearer {semgrep_app_token}", @@ -78,16 +81,23 @@ def get_sca_vulns(semgrep_app_token: str, deployment_id: str) -> List[Dict[str, "issueOffset": cursor["issueOffset"], }, }) - - response = requests.post(sca_url, json=request_data, headers=headers, timeout=_TIMEOUT) - response.raise_for_status() - data = response.json() + try: + response = requests.post(sca_url, json=request_data, headers=headers, timeout=_TIMEOUT) + response.raise_for_status() + data = response.json() + except HTTPError as e: + logger.warning(f"Failed to retrieve Semgrep SCA vulns for page {page}. Retrying...") + retries += 1 + if retries >= _MAX_RETRIES: + raise e + continue vulns = data["vulns"] cursor = data.get("cursor") has_more = data.get("hasMore", False) - all_vulns.extend(vulns) if page % 10 == 0: logger.info(f"Processed {page} pages of Semgrep SCA vulnerabilities so far.") + all_vulns.extend(vulns) + retries = 0 return all_vulns @@ -128,6 +138,8 @@ def transform_sca_vulns(raw_vulns: List[Dict[str, Any]]) -> Tuple[List[Dict[str, if vuln["advisory"].get("references", {}).get("urls", []): sca_vuln["ref_urls"] = vuln["advisory"].get("references", {}).get("urls", []) sca_vuln["openedAt"] = vuln.get("openedAt", None) + sca_vuln["announcedAt"] = vuln.get("announcedAt", None) + sca_vuln["fixStatus"] = vuln["triage"]["status"] for usage in vuln.get("usages", []): usage_dict = {} usage_dict["SCA_ID"] = sca_vuln["id"] diff --git a/cartography/models/semgrep/findings.py b/cartography/models/semgrep/findings.py index 0506f82ed6..1d80281089 100644 --- a/cartography/models/semgrep/findings.py +++ b/cartography/models/semgrep/findings.py @@ -32,6 +32,8 @@ class SemgrepSCAFindingNodeProperties(CartographyNodeProperties): dependency_file: PropertyRef = PropertyRef('dependencyFileLocation_path', extra_index=True) dependency_file_url: PropertyRef = PropertyRef('dependencyFileLocation_url', extra_index=True) scan_time: PropertyRef = PropertyRef('openedAt') + published_time: PropertyRef = PropertyRef('announcedAt') + fix_status: PropertyRef = PropertyRef('fixStatus') @dataclass(frozen=True)