From 49399792c69503dc466dd97ab9673bf7b89b6e4b Mon Sep 17 00:00:00 2001
From: Hector Eryx Paredes Camacho <heryxpc@users.noreply.github.com>
Date: Fri, 19 Jan 2024 15:21:41 -0600
Subject: [PATCH] Semgrep SCA - add findings published date and fix status
 (#1281)

The triage information in the response from
https://semgrep.dev/api/docs#/SupplyChainService/SupplyChainService_ListVulns2
contains a status of either "NEW", "CLOSED" or "IGNORED". This field can be
used to determine whether a finding has been fixed.
Also add the "announcedAt" date to record when the CVE/GHSA was
published.
---
 cartography/intel/semgrep/findings.py  | 22 +++++++++++++++++-----
 cartography/models/semgrep/findings.py |  2 ++
 2 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/cartography/intel/semgrep/findings.py b/cartography/intel/semgrep/findings.py
index 49b36695e1..fce81b7cb3 100644
--- a/cartography/intel/semgrep/findings.py
+++ b/cartography/intel/semgrep/findings.py
@@ -3,6 +3,7 @@
 from typing import Dict
 from typing import List
 from typing import Tuple
+from urllib.error import HTTPError
 
 import neo4j
 import requests
@@ -20,6 +21,7 @@
 logger = logging.getLogger(__name__)
 stat_handler = get_stats_client(__name__)
 _TIMEOUT = (60, 60)
+_MAX_RETRIES = 3
 
 
 @timeit
@@ -57,6 +59,7 @@ def get_sca_vulns(semgrep_app_token: str, deployment_id: str) -> List[Dict[str,
     has_more = True
     cursor: Dict[str, str] = {}
     page = 1
+    retries = 0
     headers = {
         "Content-Type": "application/json",
         "Authorization": f"Bearer {semgrep_app_token}",
@@ -78,16 +81,23 @@ def get_sca_vulns(semgrep_app_token: str, deployment_id: str) -> List[Dict[str,
                     "issueOffset": cursor["issueOffset"],
                 },
             })
-
-        response = requests.post(sca_url, json=request_data, headers=headers, timeout=_TIMEOUT)
-        response.raise_for_status()
-        data = response.json()
+        try:
+            response = requests.post(sca_url, json=request_data, headers=headers, timeout=_TIMEOUT)
+            response.raise_for_status()
+            data = response.json()
+        except HTTPError as e:
+            logger.warning(f"Failed to retrieve Semgrep SCA vulns for page {page}. Retrying...")
+            retries += 1
+            if retries >= _MAX_RETRIES:
+                raise e
+            continue
         vulns = data["vulns"]
         cursor = data.get("cursor")
         has_more = data.get("hasMore", False)
-        all_vulns.extend(vulns)
         if page % 10 == 0:
             logger.info(f"Processed {page} pages of Semgrep SCA vulnerabilities so far.")
+        all_vulns.extend(vulns)
+        retries = 0
 
     return all_vulns
 
@@ -128,6 +138,8 @@ def transform_sca_vulns(raw_vulns: List[Dict[str, Any]]) -> Tuple[List[Dict[str,
         if vuln["advisory"].get("references", {}).get("urls", []):
             sca_vuln["ref_urls"] = vuln["advisory"].get("references", {}).get("urls", [])
         sca_vuln["openedAt"] = vuln.get("openedAt", None)
+        sca_vuln["announcedAt"] = vuln.get("announcedAt", None)
+        sca_vuln["fixStatus"] = vuln["triage"]["status"]
         for usage in vuln.get("usages", []):
             usage_dict = {}
             usage_dict["SCA_ID"] = sca_vuln["id"]
diff --git a/cartography/models/semgrep/findings.py b/cartography/models/semgrep/findings.py
index 0506f82ed6..1d80281089 100644
--- a/cartography/models/semgrep/findings.py
+++ b/cartography/models/semgrep/findings.py
@@ -32,6 +32,8 @@ class SemgrepSCAFindingNodeProperties(CartographyNodeProperties):
     dependency_file: PropertyRef = PropertyRef('dependencyFileLocation_path', extra_index=True)
     dependency_file_url: PropertyRef = PropertyRef('dependencyFileLocation_url', extra_index=True)
     scan_time: PropertyRef = PropertyRef('openedAt')
+    published_time: PropertyRef = PropertyRef('announcedAt')
+    fix_status: PropertyRef = PropertyRef('fixStatus')
 
 
 @dataclass(frozen=True)