From 603520ca6e054d189d323a5c605b6c3d832a16d0 Mon Sep 17 00:00:00 2001 From: Sergio Franco <58365614+serge-wq@users.noreply.github.com> Date: Mon, 13 May 2024 12:01:02 -0600 Subject: [PATCH] GitHub intel updates to error handling (#1303) ## Summary We've found constant errors in the GitHub module when the response data does not have the expected schema. These updates, while not always prevent the crash, they will provide more insight into what happened. Signed-off-by: chandanchowdhury --- cartography/intel/aws/ec2/snapshots.py | 6 ++-- cartography/intel/github/teams.py | 9 ++++-- cartography/intel/github/util.py | 34 +++++++++++++++++----- tests/integration/cartography/test_util.py | 6 ++-- 4 files changed, 40 insertions(+), 15 deletions(-) diff --git a/cartography/intel/aws/ec2/snapshots.py b/cartography/intel/aws/ec2/snapshots.py index ab2f3de794..a235d69edf 100644 --- a/cartography/intel/aws/ec2/snapshots.py +++ b/cartography/intel/aws/ec2/snapshots.py @@ -42,8 +42,10 @@ def get_snapshots(boto3_session: boto3.session.Session, region: str, in_use_snap snapshots.extend(page['Snapshots']) except ClientError as e: if e.response['Error']['Code'] == 'InvalidSnapshot.NotFound': - logger.warning(f"Failed to retrieve page of in-use, \ - not owned snapshots. Continuing anyway. Error - {e}") + logger.warning( + f"Failed to retrieve page of in-use, \ + not owned snapshots. Continuing anyway. Error - {e}", + ) else: raise diff --git a/cartography/intel/github/teams.py b/cartography/intel/github/teams.py index 1415abd50d..6afebeb824 100644 --- a/cartography/intel/github/teams.py +++ b/cartography/intel/github/teams.py @@ -57,10 +57,13 @@ def _get_team_repos_for_multiple_teams( team_repos = _get_team_repos(org, api_url, token, team_name) if repo_count > 0 else None - # Shape = [(repo_url, 'WRITE'), ...]] - repo_urls = [t['url'] for t in team_repos.nodes] if team_repos else [] - repo_permissions = [t['permission'] for t in team_repos.edges] if team_repos else [] + repo_urls = [] + repo_permissions = [] + if team_repos: + repo_urls = [t['url'] for t in team_repos.nodes] if team_repos.nodes else [] + repo_permissions = [t['permission'] for t in team_repos.edges] if team_repos.edges else [] + # Shape = [(repo_url, 'WRITE'), ...]] result[team_name] = list(zip(repo_urls, repo_permissions)) return result diff --git a/cartography/intel/github/util.py b/cartography/intel/github/util.py index f4e2ac6360..bd73ef2f4f 100644 --- a/cartography/intel/github/util.py +++ b/cartography/intel/github/util.py @@ -81,12 +81,12 @@ def call_github_api(query: str, variables: str, token: str, api_url: str) -> Dic def fetch_page( - token: str, - api_url: str, - organization: str, - query: str, - cursor: Optional[str] = None, - **kwargs: Any, + token: str, + api_url: str, + organization: str, + query: str, + cursor: Optional[str] = None, + **kwargs: Any, ) -> Dict[str, Any]: """ Return a single page of max size 100 elements from the Github api_url using the given `query` and `cursor` params. @@ -139,6 +139,7 @@ def fetch_all( """ cursor = None has_next_page = True + org_data: Dict[str, Any] = {} data: PaginatedGraphqlData = PaginatedGraphqlData(nodes=[], edges=[]) retry = 0 @@ -170,6 +171,15 @@ def fetch_all( time.sleep(2 ** retry) continue + if 'data' not in resp: + logger.warning( + f'Got no "data" attribute in response: {resp}. ' + f'Stopping requests for organization: {organization} and ' + f'resource_type: {resource_type}', + ) + has_next_page = False + continue + resource = resp['data']['organization'][resource_type] if resource_inner_type: resource = resp['data']['organization'][resource_type][resource_inner_type] @@ -180,6 +190,14 @@ def fetch_all( cursor = resource['pageInfo']['endCursor'] has_next_page = resource['pageInfo']['hasNextPage'] - - org_data = {'url': resp['data']['organization']['url'], 'login': resp['data']['organization']['login']} + if not org_data: + org_data = { + 'url': resp['data']['organization']['url'], + 'login': resp['data']['organization']['login'], + } + + if not org_data: + raise ValueError( + f"Didn't get any organization data for organization: {organization} and resource_type: {resource_type}", + ) return data, org_data diff --git a/tests/integration/cartography/test_util.py b/tests/integration/cartography/test_util.py index b3603b6e4e..35d0bc9985 100644 --- a/tests/integration/cartography/test_util.py +++ b/tests/integration/cartography/test_util.py @@ -35,7 +35,8 @@ def test_merge_module_sync_metadata(mock_stat_incr, neo4j_session): stat_handler=stat_handler, ) # Assert - nodes = neo4j_session.run(f""" + nodes = neo4j_session.run( + f""" MATCH (m:ModuleSyncMetadata{{id:'AWSAccount_{TEST_ACCOUNT_ID}_S3Bucket'}}) RETURN m.id, @@ -43,7 +44,8 @@ def test_merge_module_sync_metadata(mock_stat_incr, neo4j_session): m.grouptype, m.groupid, m.lastupdated - """) + """, + ) # Assert actual_nodes = { (