apache · kzosabe · Feb 5, 2022 · Feb 13, 2022 · Feb 13, 2022 · Feb 13, 2022
diff --git a/superset/db_engine_specs/base.py b/superset/db_engine_specs/base.py
@@ -1025,12 +1025,15 @@ def select_star(  # pylint: disable=too-many-arguments,too-many-locals
         return sql
 
     @classmethod
-    def estimate_statement_cost(cls, statement: str, cursor: Any) -> Dict[str, Any]:
+    def estimate_statement_cost(
+        cls, statement: str, cursor: Any, engine: Engine
+    ) -> Dict[str, Any]:
         """
         Generate a SQL query that estimates the cost of a given statement.
 
         :param statement: A single SQL statement
         :param cursor: Cursor instance
+        :param engine: SQLAlchemy engine, for specs that need more than a cursor
         :return: Dictionary with different costs
         """
         raise Exception("Database does not support cost estimation")
@@ -1095,7 +1098,9 @@ def estimate_query_cost(
                 processed_statement = cls.process_statement(
                     statement, database, user_name
                 )
-                costs.append(cls.estimate_statement_cost(processed_statement, cursor))
+                costs.append(
+                    cls.estimate_statement_cost(processed_statement, cursor, engine)
+                )
         return costs
 
     @classmethod

diff --git a/superset/db_engine_specs/bigquery.py b/superset/db_engine_specs/bigquery.py
@@ -185,6 +185,90 @@ class BigQueryEngineSpec(BaseEngineSpec):
         ),
     }
 
+    @classmethod
+    def get_allow_cost_estimate(cls, extra: Dict[str, Any]) -> bool:
+        """
+        Report that cost estimation is available for BigQuery.
+
+        :param extra: Database ``extra`` settings; not consulted here
+        :return: Always ``True``
+        """
+        return True
+
+    @classmethod
+    def estimate_statement_cost(
+        cls, statement: str, cursor: Any, engine: Engine
+    ) -> Dict[str, Any]:
+        """
+        Estimate the cost of a statement by submitting it as a dry-run query.
+
+        :param statement: A single SQL statement
+        :param cursor: Cursor instance; unused, the request goes through a
+            ``google.cloud.bigquery`` client instead
+        :param engine: Engine instance whose dialect carries the credentials
+        :return: Dictionary with the bytes the dry run reports as processed
+        :raises Exception: If the Google client libraries cannot be imported
+        """
+        try:
+            # pylint: disable=import-outside-toplevel
+            from google.cloud import bigquery
+            from google.oauth2 import service_account
+        except ImportError as ex:
+            raise Exception(
+                "Could not import libraries `google.cloud` or `google.oauth2`, "
+                "which are required to be installed in your environment in order "
+                "to estimate cost"
+            ) from ex
+
+        # NOTE(review): assumes the dialect was configured with service account
+        # info; presumably ``credentials_info`` can be unset for other auth
+        # setups, which would fail in the next call - confirm.
+        creds = engine.dialect.credentials_info
+        credentials = service_account.Credentials.from_service_account_info(creds)
+        client = bigquery.Client(credentials=credentials)
+        dry_run_result = client.query(
+            statement, bigquery.job.QueryJobConfig(dry_run=True)
+        )
+
+        return {
+            "Total bytes processed": dry_run_result.total_bytes_processed,
+        }
+
+    @classmethod
+    def query_cost_formatter(
+        cls, raw_cost: List[Dict[str, Any]]
+    ) -> List[Dict[str, str]]:
+        """
+        Convert raw cost values to strings.
+
+        :param raw_cost: List of raw cost dictionaries
+        :return: The same rows with ``Total bytes processed`` expressed in
+            binary units (B through PiB) and every other value passed to ``str``
+        """
+
+        def format_bytes_str(raw_bytes: int) -> str:
+            # Anything that is not an integer is rendered with ``str`` as-is
+            if not isinstance(raw_bytes, int):
+                return str(raw_bytes)
+            units = ["B", "KiB", "MiB", "GiB", "TiB", "PiB"]
+            index = 0
+            # Named ``size`` rather than ``bytes`` so the builtin is not shadowed
+            size = float(raw_bytes)
+            # Stop at the last unit so that larger values stay expressed in PiB
+            while size >= 1024 and index < len(units) - 1:
+                size /= 1024
+                index += 1
+
+            return f"{size:.1f} {units[index]}"
+
+        return [
+            {
+                k: format_bytes_str(v) if k == "Total bytes processed" else str(v)
+                for k, v in row.items()
+            }
+            for row in raw_cost
+        ]
+
     @classmethod
     def convert_dttm(
         cls, target_type: str, dttm: datetime, db_extra: Optional[Dict[str, Any]] = None

diff --git a/superset/db_engine_specs/postgres.py b/superset/db_engine_specs/postgres.py
@@ -23,6 +23,7 @@
 from flask_babel import gettext as __
 from sqlalchemy.dialects.postgresql import ARRAY, DOUBLE_PRECISION, ENUM, JSON
 from sqlalchemy.dialects.postgresql.base import PGInspector
+from sqlalchemy.engine.base import Engine
 from sqlalchemy.types import String
 
 from superset.db_engine_specs.base import (
@@ -197,7 +198,9 @@ def get_allow_cost_estimate(cls, extra: Dict[str, Any]) -> bool:
         return True
 
     @classmethod
-    def estimate_statement_cost(cls, statement: str, cursor: Any) -> Dict[str, Any]:
+    def estimate_statement_cost(
+        cls, statement: str, cursor: Any, engine: Engine
+    ) -> Dict[str, Any]:
         sql = f"EXPLAIN {statement}"
         cursor.execute(sql)
 

diff --git a/superset/db_engine_specs/presto.py b/superset/db_engine_specs/presto.py
@@ -637,7 +637,9 @@ def select_star(  # pylint: disable=too-many-arguments
         )
 
     @classmethod
-    def estimate_statement_cost(cls, statement: str, cursor: Any) -> Dict[str, Any]:
+    def estimate_statement_cost(
+        cls, statement: str, cursor: Any, engine: Engine
+    ) -> Dict[str, Any]:
         """
         Run a SQL query that estimates the cost of a given statement.
 

diff --git a/superset/db_engine_specs/trino.py b/superset/db_engine_specs/trino.py
@@ -21,6 +21,7 @@
 
 import simplejson as json
 from flask import current_app
+from sqlalchemy.engine.base import Engine
 from sqlalchemy.engine.url import make_url, URL
 
 from superset.db_engine_specs.base import BaseEngineSpec
@@ -118,7 +119,9 @@ def get_allow_cost_estimate(cls, extra: Dict[str, Any]) -> bool:
         return True
 
     @classmethod
-    def estimate_statement_cost(cls, statement: str, cursor: Any) -> Dict[str, Any]:
+    def estimate_statement_cost(
+        cls, statement: str, cursor: Any, engine: Engine
+    ) -> Dict[str, Any]:
         """
         Run a SQL query that estimates the cost of a given statement.
 

diff --git a/tests/integration_tests/db_engine_specs/bigquery_tests.py b/tests/integration_tests/db_engine_specs/bigquery_tests.py
@@ -366,3 +366,84 @@ def test_calculated_column_in_order_by(self):
         }
         sql = table.get_query_str(query_obj)
         assert "ORDER BY gender_cc ASC" in sql
+
+    @mock.patch("google.cloud.bigquery.Client")
+    @mock.patch(
+        "google.oauth2.service_account.Credentials.from_service_account_info",
+        mock.Mock(),
+    )
+    def test_estimate_statement_cost_select_star(self, mocked_client_class):
+        """
+        The statement is sent as a dry-run query and its byte count returned
+        """
+        statement = "SELECT * FROM `some-project.database.table`"
+        client = mocked_client_class.return_value
+        client.query.return_value = mock.Mock(total_bytes_processed=123)
+
+        estimate = BigQueryEngineSpec.estimate_statement_cost(
+            statement, mock.Mock(), mock.Mock()
+        )
+
+        client.query.assert_called_once()
+        positional = client.query.call_args.args
+        self.assertEqual(positional[0], statement)
+        self.assertEqual(positional[1].dry_run, True)
+        self.assertEqual(estimate, {"Total bytes processed": 123})
+
+    @mock.patch("google.cloud.bigquery.Client")
+    @mock.patch(
+        "google.oauth2.service_account.Credentials.from_service_account_info",
+        mock.Mock(),
+    )
+    def test_estimate_statement_invalid_syntax(self, mocked_client_class):
+        """
+        An error raised by the BigQuery client propagates to the caller
+        """
+        from google.api_core.exceptions import BadRequest
+
+        error_message = """
+            POST https://bigquery.googleapis.com/bigquery/v2/projects/xxx/jobs?
+            prettyPrint=false: Table name "birth_names" missing dataset while no def
+            ault dataset is set in the request.
+
+            (job ID: xxx)
+
+            -----Query Job SQL Follows-----
+
+                |    .    |    .    |
+               1:DROP TABLE birth_names
+                |    .    |    .    |
+            """
+        mocked_client_class.return_value.query.side_effect = BadRequest(error_message)
+
+        with self.assertRaises(BadRequest):
+            BigQueryEngineSpec.estimate_statement_cost(
+                "DROP TABLE birth_names", mock.Mock(), mock.Mock()
+            )
+
+    def test_query_cost_formatter_example_costs(self):
+        """
+        Byte counts are scaled to binary units; other columns are stringified
+        """
+        key = "Total bytes processed"
+        raw_cost = [
+            {key: 123, "Some other column": 123},
+            {key: 1024, "Some other column": "abcde"},
+            {key: 1024 * 1024 + 1024 * 512},
+            {key: 1024 ** 3},
+            {key: 1024 ** 4},
+            {key: 1024 ** 5},
+            {key: 1024 ** 6},
+        ]
+        expected = [
+            {key: "123.0 B", "Some other column": "123"},
+            {key: "1.0 KiB", "Some other column": "abcde"},
+            {key: "1.5 MiB"},
+            {key: "1.0 GiB"},
+            {key: "1.0 TiB"},
+            {key: "1.0 PiB"},
+            # PiB is the largest unit, so larger values are still shown in PiB
+            {key: "1024.0 PiB"},
+        ]
+
+        self.assertEqual(BigQueryEngineSpec.query_cost_formatter(raw_cost), expected)
diff --git a/tests/integration_tests/db_engine_specs/postgres_tests.py b/tests/integration_tests/db_engine_specs/postgres_tests.py
@@ -176,8 +176,9 @@ def test_estimate_statement_cost_select_star(self):
         cursor.fetchone.return_value = (
             "Seq Scan on birth_names  (cost=0.00..1537.91 rows=75691 width=46)",
         )
+        engine = mock.Mock()
         sql = "SELECT * FROM birth_names"
-        results = PostgresEngineSpec.estimate_statement_cost(sql, cursor)
+        results = PostgresEngineSpec.estimate_statement_cost(sql, cursor, engine)
         self.assertEqual(
             results, {"Start-up cost": 0.00, "Total cost": 1537.91,},
         )
@@ -196,9 +197,10 @@ def test_estimate_statement_invalid_syntax(self):
                             ^
             """
         )
+        engine = mock.Mock()
         sql = "DROP TABLE birth_names"
         with self.assertRaises(errors.SyntaxError):
-            PostgresEngineSpec.estimate_statement_cost(sql, cursor)
+            PostgresEngineSpec.estimate_statement_cost(sql, cursor, engine)
 
     def test_query_cost_formatter_example_costs(self):
         """

diff --git a/tests/integration_tests/db_engine_specs/presto_tests.py b/tests/integration_tests/db_engine_specs/presto_tests.py
@@ -795,17 +795,19 @@ def test_estimate_statement_cost(self):
         mock_cursor.fetchone.return_value = [
             '{"a": "b"}',
         ]
+        mock_engine = mock.Mock()
         result = PrestoEngineSpec.estimate_statement_cost(
-            "SELECT * FROM brth_names", mock_cursor
+            "SELECT * FROM brth_names", mock_cursor, mock_engine
         )
         assert result == estimate_json
 
     def test_estimate_statement_cost_invalid_syntax(self):
         mock_cursor = mock.MagicMock()
         mock_cursor.execute.side_effect = Exception()
+        mock_engine = mock.Mock()  # stand-in for the new `engine` argument
         with self.assertRaises(Exception):
             PrestoEngineSpec.estimate_statement_cost(
-                "DROP TABLE brth_names", mock_cursor
+                "DROP TABLE brth_names", mock_cursor, mock_engine
             )
 
     def test_get_all_datasource_names(self):