diff --git a/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_api.py b/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_api.py index e28eb08e492ee..db83dde7cf613 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_api.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_api.py @@ -249,7 +249,7 @@ def post(self, url: str, data: str) -> Dict: ) return response.json() - def execute_query(self, query: str, timeout: int = 300) -> List[Dict[str, Any]]: + def execute_query(self, query: str, timeout: int = 3600) -> List[Dict[str, Any]]: """Execute SQL query with timeout and error handling""" try: response = self.post(url="/sql", data=json.dumps({"sql": query})) diff --git a/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_sql_queries.py b/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_sql_queries.py index 1c247c7d1f7bc..161e8141c8852 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_sql_queries.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_sql_queries.py @@ -235,21 +235,34 @@ class DremioSQLQueries: TABLE_NAME ASC """ + # Dremio Documentation: https://docs.dremio.com/current/reference/sql/system-tables/jobs_recent/ + # The documentation lists queried_datasets as [varchar] (an array), but it is observed to be a plain varchar.
+ # LENGTH is used instead of ARRAY_SIZE because queried_datasets is a varchar here, not an array. QUERY_ALL_JOBS = """ SELECT - * + job_id, + user_name, + submitted_ts, + query, + queried_datasets FROM SYS.JOBS_RECENT WHERE STATUS = 'COMPLETED' - AND ARRAY_SIZE(queried_datasets)>0 + AND LENGTH(queried_datasets)>0 AND user_name != '$dremio$' AND query_type not like '%INTERNAL%' """ + # Dremio Documentation: https://docs.dremio.com/cloud/reference/sql/system-tables/jobs-historical + # Here queried_datasets is correctly documented as [varchar] (an array of varchar). QUERY_ALL_JOBS_CLOUD = """ SELECT - * + job_id, + user_name, + submitted_ts, + query, + CONCAT('[', ARRAY_TO_STRING(queried_datasets, ','), ']') as queried_datasets FROM sys.project.history.jobs WHERE