From ab0b0a2eb907f327e267ce0b61e33eff2ea06470 Mon Sep 17 00:00:00 2001 From: Jonny Dixon <45681293+acrylJonny@users.noreply.github.com> Date: Thu, 14 Nov 2024 09:17:50 +0000 Subject: [PATCH] fix(ingest/dremio): fix SQL query error in Dremio software jobs retrieval (#11817) Co-authored-by: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> --- .../ingestion/source/dremio/dremio_api.py | 2 +- .../source/dremio/dremio_sql_queries.py | 19 ++++++++++++++++--- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_api.py b/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_api.py index e28eb08e492ee..db83dde7cf613 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_api.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_api.py @@ -249,7 +249,7 @@ def post(self, url: str, data: str) -> Dict: ) return response.json() - def execute_query(self, query: str, timeout: int = 300) -> List[Dict[str, Any]]: + def execute_query(self, query: str, timeout: int = 3600) -> List[Dict[str, Any]]: """Execute SQL query with timeout and error handling""" try: response = self.post(url="/sql", data=json.dumps({"sql": query})) diff --git a/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_sql_queries.py b/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_sql_queries.py index 1c247c7d1f7bc..161e8141c8852 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_sql_queries.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_sql_queries.py @@ -235,21 +235,34 @@ class DremioSQLQueries: TABLE_NAME ASC """ + # Dremio Documentation: https://docs.dremio.com/current/reference/sql/system-tables/jobs_recent/ + # queried_datasets incorrectly documented as [varchar]. Observed as varchar. 
+ # LENGTH is used instead of ARRAY_SIZE because the column is observed to be a varchar, not an array QUERY_ALL_JOBS = """ SELECT - * + job_id, + user_name, + submitted_ts, + query, + queried_datasets FROM SYS.JOBS_RECENT WHERE STATUS = 'COMPLETED' - AND ARRAY_SIZE(queried_datasets)>0 + AND LENGTH(queried_datasets)>0 AND user_name != '$dremio$' AND query_type not like '%INTERNAL%' """ + # Dremio Documentation: https://docs.dremio.com/cloud/reference/sql/system-tables/jobs-historical + # queried_datasets correctly documented as [varchar] QUERY_ALL_JOBS_CLOUD = """ SELECT - * + job_id, + user_name, + submitted_ts, + query, + CONCAT('[', ARRAY_TO_STRING(queried_datasets, ','), ']') as queried_datasets FROM sys.project.history.jobs WHERE