GitHub intel updates to error handling (cartography-cncf#1303)

## Summary We've found constant errors in the GitHub module when the response data does not have the expected schema. These updates, while not always prevent the crash, they will provide more insight into what happened. Signed-off-by: chandanchowdhury <[email protected]>
chandanchowdhury · Nov 27, 2024 · 603520c · 603520c
1 parent 1206082
commit 603520c
Show file tree

Hide file tree

Showing 4 changed files with 40 additions and 15 deletions.
diff --git a/cartography/intel/aws/ec2/snapshots.py b/cartography/intel/aws/ec2/snapshots.py
@@ -42,8 +42,10 @@ def get_snapshots(boto3_session: boto3.session.Session, region: str, in_use_snap
                 snapshots.extend(page['Snapshots'])
         except ClientError as e:
             if e.response['Error']['Code'] == 'InvalidSnapshot.NotFound':
-                logger.warning(f"Failed to retrieve page of in-use, \
-                    not owned snapshots. Continuing anyway. Error - {e}")
+                logger.warning(
+                    f"Failed to retrieve page of in-use, \
+                    not owned snapshots. Continuing anyway. Error - {e}",
+                )
             else:
                 raise
 

diff --git a/cartography/intel/github/teams.py b/cartography/intel/github/teams.py
@@ -57,10 +57,13 @@ def _get_team_repos_for_multiple_teams(
 
         team_repos = _get_team_repos(org, api_url, token, team_name) if repo_count > 0 else None
 
-        # Shape = [(repo_url, 'WRITE'), ...]]
-        repo_urls = [t['url'] for t in team_repos.nodes] if team_repos else []
-        repo_permissions = [t['permission'] for t in team_repos.edges] if team_repos else []
+        repo_urls = []
+        repo_permissions = []
+        if team_repos:
+            repo_urls = [t['url'] for t in team_repos.nodes] if team_repos.nodes else []
+            repo_permissions = [t['permission'] for t in team_repos.edges] if team_repos.edges else []
 
+        # Shape = [(repo_url, 'WRITE'), ...]]
         result[team_name] = list(zip(repo_urls, repo_permissions))
     return result
 

diff --git a/cartography/intel/github/util.py b/cartography/intel/github/util.py
@@ -81,12 +81,12 @@ def call_github_api(query: str, variables: str, token: str, api_url: str) -> Dic
 
 
 def fetch_page(
-        token: str,
-        api_url: str,
-        organization: str,
-        query: str,
-        cursor: Optional[str] = None,
-        **kwargs: Any,
+    token: str,
+    api_url: str,
+    organization: str,
+    query: str,
+    cursor: Optional[str] = None,
+    **kwargs: Any,
 ) -> Dict[str, Any]:
     """
     Return a single page of max size 100 elements from the Github api_url using the given `query` and `cursor` params.
@@ -139,6 +139,7 @@ def fetch_all(
     """
     cursor = None
     has_next_page = True
+    org_data: Dict[str, Any] = {}
     data: PaginatedGraphqlData = PaginatedGraphqlData(nodes=[], edges=[])
     retry = 0
 
@@ -170,6 +171,15 @@ def fetch_all(
             time.sleep(2 ** retry)
             continue
 
+        if 'data' not in resp:
+            logger.warning(
+                f'Got no "data" attribute in response: {resp}. '
+                f'Stopping requests for organization: {organization} and '
+                f'resource_type: {resource_type}',
+            )
+            has_next_page = False
+            continue
+
         resource = resp['data']['organization'][resource_type]
         if resource_inner_type:
             resource = resp['data']['organization'][resource_type][resource_inner_type]
@@ -180,6 +190,14 @@ def fetch_all(
 
         cursor = resource['pageInfo']['endCursor']
         has_next_page = resource['pageInfo']['hasNextPage']
-
-    org_data = {'url': resp['data']['organization']['url'], 'login': resp['data']['organization']['login']}
+        if not org_data:
+            org_data = {
+                'url': resp['data']['organization']['url'],
+                'login': resp['data']['organization']['login'],
+            }
+
+    if not org_data:
+        raise ValueError(
+            f"Didn't get any organization data for organization: {organization} and resource_type: {resource_type}",
+        )
     return data, org_data
diff --git a/tests/integration/cartography/test_util.py b/tests/integration/cartography/test_util.py
@@ -35,15 +35,17 @@ def test_merge_module_sync_metadata(mock_stat_incr, neo4j_session):
         stat_handler=stat_handler,
     )
     # Assert
-    nodes = neo4j_session.run(f"""
+    nodes = neo4j_session.run(
+        f"""
         MATCH (m:ModuleSyncMetadata{{id:'AWSAccount_{TEST_ACCOUNT_ID}_S3Bucket'}})
         RETURN
             m.id,
             m.syncedtype,
             m.grouptype,
             m.groupid,
             m.lastupdated
-    """)
+    """,
+    )
     # Assert
     actual_nodes = {
         (