From 6ae463838407b804c14887b8f0cfc7a39ca10001 Mon Sep 17 00:00:00 2001
From: Kazuhito Osabe <kzosabe@wip.place>
Date: Sun, 6 Feb 2022 08:25:30 +0900
Subject: [PATCH 1/8] Add engine param for each estimate_statement_cost call

---
 superset/db_engine_specs/base.py                          | 5 +++--
 superset/db_engine_specs/postgres.py                      | 3 ++-
 superset/db_engine_specs/presto.py                        | 2 +-
 superset/db_engine_specs/trino.py                         | 3 ++-
 tests/integration_tests/db_engine_specs/postgres_tests.py | 6 ++++--
 tests/integration_tests/db_engine_specs/presto_tests.py   | 6 ++++--
 6 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/superset/db_engine_specs/base.py b/superset/db_engine_specs/base.py
index 764f3fde70580..2c8f252d35dbe 100644
--- a/superset/db_engine_specs/base.py
+++ b/superset/db_engine_specs/base.py
@@ -1025,12 +1025,13 @@ def select_star(  # pylint: disable=too-many-arguments,too-many-locals
         return sql
 
     @classmethod
-    def estimate_statement_cost(cls, statement: str, cursor: Any) -> Dict[str, Any]:
+    def estimate_statement_cost(cls, statement: str, cursor: Any, engine: Engine) -> Dict[str, Any]:
         """
         Generate a SQL query that estimates the cost of a given statement.
 
         :param statement: A single SQL statement
         :param cursor: Cursor instance
+        :param engine: Engine instance
         :return: Dictionary with different costs
         """
         raise Exception("Database does not support cost estimation")
@@ -1095,7 +1096,7 @@ def estimate_query_cost(
                 processed_statement = cls.process_statement(
                     statement, database, user_name
                 )
-                costs.append(cls.estimate_statement_cost(processed_statement, cursor))
+                costs.append(cls.estimate_statement_cost(processed_statement, cursor, engine))
         return costs
 
     @classmethod
diff --git a/superset/db_engine_specs/postgres.py b/superset/db_engine_specs/postgres.py
index f6c6888ee97bb..b60058535c075 100644
--- a/superset/db_engine_specs/postgres.py
+++ b/superset/db_engine_specs/postgres.py
@@ -23,6 +23,7 @@
 from flask_babel import gettext as __
 from sqlalchemy.dialects.postgresql import ARRAY, DOUBLE_PRECISION, ENUM, JSON
 from sqlalchemy.dialects.postgresql.base import PGInspector
+from sqlalchemy.engine.base import Engine
 from sqlalchemy.types import String
 
 from superset.db_engine_specs.base import (
@@ -197,7 +198,7 @@ def get_allow_cost_estimate(cls, extra: Dict[str, Any]) -> bool:
         return True
 
     @classmethod
-    def estimate_statement_cost(cls, statement: str, cursor: Any) -> Dict[str, Any]:
+    def estimate_statement_cost(cls, statement: str, cursor: Any, engine: Engine) -> Dict[str, Any]:
         sql = f"EXPLAIN {statement}"
         cursor.execute(sql)
 
diff --git a/superset/db_engine_specs/presto.py b/superset/db_engine_specs/presto.py
index 376151587cdee..3e0a83507e543 100644
--- a/superset/db_engine_specs/presto.py
+++ b/superset/db_engine_specs/presto.py
@@ -637,7 +637,7 @@ def select_star(  # pylint: disable=too-many-arguments
         )
 
     @classmethod
-    def estimate_statement_cost(cls, statement: str, cursor: Any) -> Dict[str, Any]:
+    def estimate_statement_cost(cls, statement: str, cursor: Any, engine: Engine) -> Dict[str, Any]:
         """
         Run a SQL query that estimates the cost of a given statement.
 
diff --git a/superset/db_engine_specs/trino.py b/superset/db_engine_specs/trino.py
index 4e5f153ad2ab2..4f85c11c20de0 100644
--- a/superset/db_engine_specs/trino.py
+++ b/superset/db_engine_specs/trino.py
@@ -21,6 +21,7 @@
 
 import simplejson as json
 from flask import current_app
+from sqlalchemy.engine.base import Engine
 from sqlalchemy.engine.url import make_url, URL
 
 from superset.db_engine_specs.base import BaseEngineSpec
@@ -118,7 +119,7 @@ def get_allow_cost_estimate(cls, extra: Dict[str, Any]) -> bool:
         return True
 
     @classmethod
-    def estimate_statement_cost(cls, statement: str, cursor: Any) -> Dict[str, Any]:
+    def estimate_statement_cost(cls, statement: str, cursor: Any, engine: Engine) -> Dict[str, Any]:
         """
         Run a SQL query that estimates the cost of a given statement.
 
diff --git a/tests/integration_tests/db_engine_specs/postgres_tests.py b/tests/integration_tests/db_engine_specs/postgres_tests.py
index dcf5310fecac5..c600d2b435eab 100644
--- a/tests/integration_tests/db_engine_specs/postgres_tests.py
+++ b/tests/integration_tests/db_engine_specs/postgres_tests.py
@@ -176,8 +176,9 @@ def test_estimate_statement_cost_select_star(self):
         cursor.fetchone.return_value = (
             "Seq Scan on birth_names  (cost=0.00..1537.91 rows=75691 width=46)",
         )
+        engine = mock.Mock()
         sql = "SELECT * FROM birth_names"
-        results = PostgresEngineSpec.estimate_statement_cost(sql, cursor)
+        results = PostgresEngineSpec.estimate_statement_cost(sql, cursor, engine)
         self.assertEqual(
             results, {"Start-up cost": 0.00, "Total cost": 1537.91,},
         )
@@ -196,9 +197,10 @@ def test_estimate_statement_invalid_syntax(self):
                             ^
             """
         )
+        engine = mock.Mock()
         sql = "DROP TABLE birth_names"
         with self.assertRaises(errors.SyntaxError):
-            PostgresEngineSpec.estimate_statement_cost(sql, cursor)
+            PostgresEngineSpec.estimate_statement_cost(sql, cursor, engine)
 
     def test_query_cost_formatter_example_costs(self):
         """
diff --git a/tests/integration_tests/db_engine_specs/presto_tests.py b/tests/integration_tests/db_engine_specs/presto_tests.py
index 5833c6bdcbfcb..cb7e4018f9ff2 100644
--- a/tests/integration_tests/db_engine_specs/presto_tests.py
+++ b/tests/integration_tests/db_engine_specs/presto_tests.py
@@ -795,17 +795,19 @@ def test_estimate_statement_cost(self):
         mock_cursor.fetchone.return_value = [
             '{"a": "b"}',
         ]
+        mock_engine = mock.Mock()
         result = PrestoEngineSpec.estimate_statement_cost(
-            "SELECT * FROM brth_names", mock_cursor
+            "SELECT * FROM brth_names", mock_cursor, mock_engine
         )
         assert result == estimate_json
 
     def test_estimate_statement_cost_invalid_syntax(self):
         mock_cursor = mock.MagicMock()
         mock_cursor.execute.side_effect = Exception()
+        mock_engine = mock.Mock()
         with self.assertRaises(Exception):
             PrestoEngineSpec.estimate_statement_cost(
-                "DROP TABLE brth_names", mock_cursor
+                "DROP TABLE brth_names", mock_cursor, mock_engine
             )
 
     def test_get_all_datasource_names(self):

From 6e676f65e19734ed87b9d7ff8bb5c8d408167d81 Mon Sep 17 00:00:00 2001
From: Kazuhito Osabe <kzosabe@wip.place>
Date: Sun, 13 Feb 2022 18:17:58 +0900
Subject: [PATCH 2/8] Add bigquery cost estimation implementation

---
 superset/db_engine_specs/base.py     |  8 +++--
 superset/db_engine_specs/bigquery.py | 45 ++++++++++++++++++++++++++++
 superset/db_engine_specs/postgres.py |  4 ++-
 superset/db_engine_specs/presto.py   |  4 ++-
 superset/db_engine_specs/trino.py    |  4 ++-
 5 files changed, 60 insertions(+), 5 deletions(-)

diff --git a/superset/db_engine_specs/base.py b/superset/db_engine_specs/base.py
index 2c8f252d35dbe..75baf2a15670d 100644
--- a/superset/db_engine_specs/base.py
+++ b/superset/db_engine_specs/base.py
@@ -1025,7 +1025,9 @@ def select_star(  # pylint: disable=too-many-arguments,too-many-locals
         return sql
 
     @classmethod
-    def estimate_statement_cost(cls, statement: str, cursor: Any, engine: Engine) -> Dict[str, Any]:
+    def estimate_statement_cost(
+        cls, statement: str, cursor: Any, engine: Engine
+    ) -> Dict[str, Any]:
         """
         Generate a SQL query that estimates the cost of a given statement.
 
@@ -1096,7 +1098,9 @@ def estimate_query_cost(
                 processed_statement = cls.process_statement(
                     statement, database, user_name
                 )
-                costs.append(cls.estimate_statement_cost(processed_statement, cursor, engine))
+                costs.append(
+                    cls.estimate_statement_cost(processed_statement, cursor, engine)
+                )
         return costs
 
     @classmethod
diff --git a/superset/db_engine_specs/bigquery.py b/superset/db_engine_specs/bigquery.py
index 30e04c4f2fe9b..cf889f236ee5d 100644
--- a/superset/db_engine_specs/bigquery.py
+++ b/superset/db_engine_specs/bigquery.py
@@ -24,6 +24,8 @@
 from apispec import APISpec
 from apispec.ext.marshmallow import MarshmallowPlugin
 from flask_babel import gettext as __
+from google.cloud import bigquery
+from google.oauth2 import service_account
 from marshmallow import fields, Schema
 from marshmallow.exceptions import ValidationError
 from sqlalchemy import column
@@ -185,6 +187,49 @@ class BigQueryEngineSpec(BaseEngineSpec):
         ),
     }
 
+    @classmethod
+    def get_allow_cost_estimate(cls, extra: Dict[str, Any]) -> bool:
+        return True
+
+    @classmethod
+    def estimate_statement_cost(
+        cls, statement: str, cursor: Any, engine: Engine
+    ) -> Dict[str, Any]:
+        creds = engine.dialect.credentials_info
+        credentials = service_account.Credentials.from_service_account_info(creds)
+        client = bigquery.Client(credentials=credentials)
+        dry_run_result = client.query(
+            statement, bigquery.job.QueryJobConfig(dry_run=True)
+        )
+
+        return {
+            "Total bytes processed": dry_run_result.total_bytes_processed,
+        }
+
+    @classmethod
+    def query_cost_formatter(
+        cls, raw_cost: List[Dict[str, Any]]
+    ) -> List[Dict[str, str]]:
+        def format_bytes_str(raw_bytes: int) -> str:
+            if not isinstance(raw_bytes, int):
+                return str(raw_bytes)
+            units = ["B", "KiB", "MiB", "GiB", "TiB", "PiB"]
+            index = 0
+            bytes = float(raw_bytes)
+            while bytes >= 1024 and index < len(units) - 1:
+                bytes /= 1024
+                index += 1
+
+            return "{:.1f}".format(bytes) + f" {units[index]}"
+
+        return [
+            {
+                k: format_bytes_str(v) if k == "Total bytes processed" else str(v)
+                for k, v in row.items()
+            }
+            for row in raw_cost
+        ]
+
     @classmethod
     def convert_dttm(
         cls, target_type: str, dttm: datetime, db_extra: Optional[Dict[str, Any]] = None
diff --git a/superset/db_engine_specs/postgres.py b/superset/db_engine_specs/postgres.py
index b60058535c075..bbf58a364b270 100644
--- a/superset/db_engine_specs/postgres.py
+++ b/superset/db_engine_specs/postgres.py
@@ -198,7 +198,9 @@ def get_allow_cost_estimate(cls, extra: Dict[str, Any]) -> bool:
         return True
 
     @classmethod
-    def estimate_statement_cost(cls, statement: str, cursor: Any, engine: Engine) -> Dict[str, Any]:
+    def estimate_statement_cost(
+        cls, statement: str, cursor: Any, engine: Engine
+    ) -> Dict[str, Any]:
         sql = f"EXPLAIN {statement}"
         cursor.execute(sql)
 
diff --git a/superset/db_engine_specs/presto.py b/superset/db_engine_specs/presto.py
index 3e0a83507e543..1006c6f0f1447 100644
--- a/superset/db_engine_specs/presto.py
+++ b/superset/db_engine_specs/presto.py
@@ -637,7 +637,9 @@ def select_star(  # pylint: disable=too-many-arguments
         )
 
     @classmethod
-    def estimate_statement_cost(cls, statement: str, cursor: Any, engine: Engine) -> Dict[str, Any]:
+    def estimate_statement_cost(
+        cls, statement: str, cursor: Any, engine: Engine
+    ) -> Dict[str, Any]:
         """
         Run a SQL query that estimates the cost of a given statement.
 
diff --git a/superset/db_engine_specs/trino.py b/superset/db_engine_specs/trino.py
index 4f85c11c20de0..ba3e6ac9592d0 100644
--- a/superset/db_engine_specs/trino.py
+++ b/superset/db_engine_specs/trino.py
@@ -119,7 +119,9 @@ def get_allow_cost_estimate(cls, extra: Dict[str, Any]) -> bool:
         return True
 
     @classmethod
-    def estimate_statement_cost(cls, statement: str, cursor: Any, engine: Engine) -> Dict[str, Any]:
+    def estimate_statement_cost(
+        cls, statement: str, cursor: Any, engine: Engine
+    ) -> Dict[str, Any]:
         """
         Run a SQL query that estimates the cost of a given statement.
 

From 4e638432d8cab94c75790fe516d6782ead794c8b Mon Sep 17 00:00:00 2001
From: Kazuhito Osabe <kzosabe@wip.place>
Date: Sun, 13 Feb 2022 18:25:46 +0900
Subject: [PATCH 3/8] Add tests

---
 .../db_engine_specs/bigquery_tests.py         | 76 +++++++++++++++++++
 1 file changed, 76 insertions(+)

diff --git a/tests/integration_tests/db_engine_specs/bigquery_tests.py b/tests/integration_tests/db_engine_specs/bigquery_tests.py
index b7405092c5446..7e3fe2ecf78d9 100644
--- a/tests/integration_tests/db_engine_specs/bigquery_tests.py
+++ b/tests/integration_tests/db_engine_specs/bigquery_tests.py
@@ -366,3 +366,79 @@ def test_calculated_column_in_order_by(self):
         }
         sql = table.get_query_str(query_obj)
         assert "ORDER BY gender_cc ASC" in sql
+
+    @mock.patch("google.cloud.bigquery.Client")
+    @mock.patch(
+        "google.oauth2.service_account.Credentials.from_service_account_info",
+        mock.Mock(),
+    )
+    def test_estimate_statement_cost_select_star(self, mocked_client_class):
+        mocked_client = mocked_client_class.return_value
+        mocked_client.query.return_value = mock.Mock()
+        mocked_client.query.return_value.total_bytes_processed = 123
+        cursor = mock.Mock()
+        engine = mock.Mock()
+        sql = "SELECT * FROM `some-project.database.table`"
+        results = BigQueryEngineSpec.estimate_statement_cost(sql, cursor, engine)
+        mocked_client.query.assert_called_once()
+        args = mocked_client.query.call_args.args
+        self.assertEqual(args[0], sql)
+        self.assertEqual(args[1].dry_run, True)
+        self.assertEqual(
+            results, {"Total bytes processed": 123},
+        )
+
+    @mock.patch("google.cloud.bigquery.Client")
+    @mock.patch(
+        "google.oauth2.service_account.Credentials.from_service_account_info",
+        mock.Mock(),
+    )
+    def test_estimate_statement_invalid_syntax(self, mocked_client_class):
+        from google.api_core.exceptions import BadRequest
+
+        cursor = mock.Mock()
+        mocked_client = mocked_client_class.return_value
+        mocked_client.query.side_effect = BadRequest(
+            """
+            POST https://bigquery.googleapis.com/bigquery/v2/projects/xxx/jobs?
+            prettyPrint=false: Table name "birth_names" missing dataset while no def
+            ault dataset is set in the request.
+
+            (job ID: xxx)
+
+            -----Query Job SQL Follows-----
+
+                |    .    |    .    |
+               1:DROP TABLE birth_names
+                |    .    |    .    |
+            """
+        )
+        engine = mock.Mock()
+        sql = "DROP TABLE birth_names"
+        with self.assertRaises(BadRequest):
+            BigQueryEngineSpec.estimate_statement_cost(sql, cursor, engine)
+
+    def test_query_cost_formatter_example_costs(self):
+        raw_cost = [
+            {"Total bytes processed": 123, "Some other column": 123,},
+            {"Total bytes processed": 1024, "Some other column": "abcde",},
+            {"Total bytes processed": 1024 * 1024 + 1024 * 512,},
+            {"Total bytes processed": 1024 ** 3,},
+            {"Total bytes processed": 1024 ** 4,},
+            {"Total bytes processed": 1024 ** 5,},
+            {"Total bytes processed": 1024 ** 6,},
+        ]
+        result = BigQueryEngineSpec.query_cost_formatter(raw_cost)
+        self.assertEqual(
+            result,
+            [
+                {"Total bytes processed": "123.0 B", "Some other column": "123",},
+                {"Total bytes processed": "1.0 KiB", "Some other column": "abcde",},
+                {"Total bytes processed": "1.5 MiB",},
+                {"Total bytes processed": "1.0 GiB",},
+                {"Total bytes processed": "1.0 TiB",},
+                {"Total bytes processed": "1.0 PiB",},
+                # Petabyte is the largest unit, but larger values can be handled
+                {"Total bytes processed": "1024.0 PiB",},
+            ],
+        )

From 6c06b051b2b41c7dd27eb0d3ab5bb1de81bea868 Mon Sep 17 00:00:00 2001
From: Kazuhito Osabe <kzosabe@wip.place>
Date: Sun, 13 Feb 2022 21:19:03 +0900
Subject: [PATCH 4/8] Fix import issue

---
 superset/db_engine_specs/bigquery.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/superset/db_engine_specs/bigquery.py b/superset/db_engine_specs/bigquery.py
index cf889f236ee5d..00490eedaa512 100644
--- a/superset/db_engine_specs/bigquery.py
+++ b/superset/db_engine_specs/bigquery.py
@@ -24,8 +24,6 @@
 from apispec import APISpec
 from apispec.ext.marshmallow import MarshmallowPlugin
 from flask_babel import gettext as __
-from google.cloud import bigquery
-from google.oauth2 import service_account
 from marshmallow import fields, Schema
 from marshmallow.exceptions import ValidationError
 from sqlalchemy import column
@@ -195,6 +193,17 @@ def get_allow_cost_estimate(cls, extra: Dict[str, Any]) -> bool:
     def estimate_statement_cost(
         cls, statement: str, cursor: Any, engine: Engine
     ) -> Dict[str, Any]:
+        try:
+            # pylint: disable=import-outside-toplevel
+            from google.cloud import bigquery
+            from google.oauth2 import service_account
+        except ImportError as ex:
+            raise Exception(
+                "Could not import libraries `google.cloud` or `google.oauth2`, "
+                "which are required to be installed in your environment in order "
+                "to estimate cost"
+            ) from ex
+
         creds = engine.dialect.credentials_info
         credentials = service_account.Credentials.from_service_account_info(creds)
         client = bigquery.Client(credentials=credentials)

From dba135c94d068a5744caade30f45fdadfa842989 Mon Sep 17 00:00:00 2001
From: Kazuhito Osabe <kzosabe@wip.place>
Date: Mon, 14 Feb 2022 20:39:12 +0900
Subject: [PATCH 5/8] Fix lint

---
 superset/db_engine_specs/bigquery.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/superset/db_engine_specs/bigquery.py b/superset/db_engine_specs/bigquery.py
index 00490eedaa512..beeb439c0f3d0 100644
--- a/superset/db_engine_specs/bigquery.py
+++ b/superset/db_engine_specs/bigquery.py
@@ -224,12 +224,12 @@ def format_bytes_str(raw_bytes: int) -> str:
                 return str(raw_bytes)
             units = ["B", "KiB", "MiB", "GiB", "TiB", "PiB"]
             index = 0
-            bytes = float(raw_bytes)
-            while bytes >= 1024 and index < len(units) - 1:
-                bytes /= 1024
+            bytes_float = float(raw_bytes)
+            while bytes_float >= 1024 and index < len(units) - 1:
+                bytes_float /= 1024
                 index += 1
 
-            return "{:.1f}".format(bytes) + f" {units[index]}"
+            return "{:.1f}".format(bytes_float) + f" {units[index]}"
 
         return [
             {

From 1fd1488ec7050cabd7d6512dc14b31271cd49181 Mon Sep 17 00:00:00 2001
From: Kazuhito Osabe <kzosabe@wip.place>
Date: Thu, 17 Feb 2022 19:09:32 +0900
Subject: [PATCH 6/8] Remove unnecessary try-except

---
 superset/db_engine_specs/bigquery.py | 26 ++++++--------------------
 1 file changed, 6 insertions(+), 20 deletions(-)

diff --git a/superset/db_engine_specs/bigquery.py b/superset/db_engine_specs/bigquery.py
index beeb439c0f3d0..bf148169f8196 100644
--- a/superset/db_engine_specs/bigquery.py
+++ b/superset/db_engine_specs/bigquery.py
@@ -193,16 +193,9 @@ def get_allow_cost_estimate(cls, extra: Dict[str, Any]) -> bool:
     def estimate_statement_cost(
         cls, statement: str, cursor: Any, engine: Engine
     ) -> Dict[str, Any]:
-        try:
-            # pylint: disable=import-outside-toplevel
-            from google.cloud import bigquery
-            from google.oauth2 import service_account
-        except ImportError as ex:
-            raise Exception(
-                "Could not import libraries `google.cloud` or `google.oauth2`, "
-                "which are required to be installed in your environment in order "
-                "to estimate cost"
-            ) from ex
+        # pylint: disable=import-outside-toplevel
+        from google.cloud import bigquery
+        from google.oauth2 import service_account
 
         creds = engine.dialect.credentials_info
         credentials = service_account.Credentials.from_service_account_info(creds)
@@ -370,16 +363,9 @@ def df_to_sql(
         :param to_sql_kwargs: The kwargs to be passed to pandas.DataFrame.to_sql` method
         """
 
-        try:
-            # pylint: disable=import-outside-toplevel
-            import pandas_gbq
-            from google.oauth2 import service_account
-        except ImportError as ex:
-            raise Exception(
-                "Could not import libraries `pandas_gbq` or `google.oauth2`, which are "
-                "required to be installed in your environment in order "
-                "to upload data to BigQuery"
-            ) from ex
+        # pylint: disable=import-outside-toplevel
+        import pandas_gbq
+        from google.oauth2 import service_account
 
         if not table.schema:
             raise Exception("The table schema must be defined")

From f5908e59651e7dc76aa877af53ce0aaf90d6df4b Mon Sep 17 00:00:00 2001
From: Kazuhito Osabe <kzosabe@wip.place>
Date: Thu, 17 Feb 2022 20:30:13 +0900
Subject: [PATCH 7/8] Add common implementation of humanize numbers

---
 superset/db_engine_specs/base.py   | 16 ++++++++++++++++
 superset/db_engine_specs/presto.py | 17 +----------------
 superset/db_engine_specs/trino.py  | 17 +----------------
 3 files changed, 18 insertions(+), 32 deletions(-)

diff --git a/superset/db_engine_specs/base.py b/superset/db_engine_specs/base.py
index 75baf2a15670d..a116a7d2de06c 100644
--- a/superset/db_engine_specs/base.py
+++ b/superset/db_engine_specs/base.py
@@ -1430,6 +1430,22 @@ def cancel_query(  # pylint: disable=unused-argument
     def parse_sql(cls, sql: str) -> List[str]:
         return [str(s).strip(" ;") for s in sqlparse.parse(sql)]
 
+    @classmethod
+    def _humanize(cls, value: Any, suffix: str) -> str:
+        try:
+            value = int(value)
+        except ValueError:
+            return str(value)
+
+        prefixes = ["K", "M", "G", "T", "P", "E", "Z", "Y"]
+        prefix = ""
+        to_next_prefix = 1000
+        while value > to_next_prefix and prefixes:
+            prefix = prefixes.pop(0)
+            value //= to_next_prefix
+
+        return f"{value} {prefix}{suffix}"
+
 
 # schema for adding a database by providing parameters instead of the
 # full SQLAlchemy URI
diff --git a/superset/db_engine_specs/presto.py b/superset/db_engine_specs/presto.py
index 1006c6f0f1447..99f65d44312db 100644
--- a/superset/db_engine_specs/presto.py
+++ b/superset/db_engine_specs/presto.py
@@ -677,21 +677,6 @@ def query_cost_formatter(
         :return: Human readable cost estimate
         """
 
-        def humanize(value: Any, suffix: str) -> str:
-            try:
-                value = int(value)
-            except ValueError:
-                return str(value)
-
-            prefixes = ["K", "M", "G", "T", "P", "E", "Z", "Y"]
-            prefix = ""
-            to_next_prefix = 1000
-            while value > to_next_prefix and prefixes:
-                prefix = prefixes.pop(0)
-                value //= to_next_prefix
-
-            return f"{value} {prefix}{suffix}"
-
         cost = []
         columns = [
             ("outputRowCount", "Output count", " rows"),
@@ -705,7 +690,7 @@ def humanize(value: Any, suffix: str) -> str:
             statement_cost = {}
             for key, label, suffix in columns:
                 if key in estimate:
-                    statement_cost[label] = humanize(estimate[key], suffix).strip()
+                    statement_cost[label] = cls._humanize(estimate[key], suffix).strip()
             cost.append(statement_cost)
 
         return cost
diff --git a/superset/db_engine_specs/trino.py b/superset/db_engine_specs/trino.py
index ba3e6ac9592d0..2f54f59b3a384 100644
--- a/superset/db_engine_specs/trino.py
+++ b/superset/db_engine_specs/trino.py
@@ -159,21 +159,6 @@ def query_cost_formatter(
         :return: Human readable cost estimate
         """
 
-        def humanize(value: Any, suffix: str) -> str:
-            try:
-                value = int(value)
-            except ValueError:
-                return str(value)
-
-            prefixes = ["K", "M", "G", "T", "P", "E", "Z", "Y"]
-            prefix = ""
-            to_next_prefix = 1000
-            while value > to_next_prefix and prefixes:
-                prefix = prefixes.pop(0)
-                value //= to_next_prefix
-
-            return f"{value} {prefix}{suffix}"
-
         cost = []
         columns = [
             ("outputRowCount", "Output count", " rows"),
@@ -187,7 +172,7 @@ def humanize(value: Any, suffix: str) -> str:
             statement_cost = {}
             for key, label, suffix in columns:
                 if key in estimate:
-                    statement_cost[label] = humanize(estimate[key], suffix).strip()
+                    statement_cost[label] = cls._humanize(estimate[key], suffix).strip()
             cost.append(statement_cost)
 
         return cost

From 9e36f0f45fe92b1c6081e4d39ad9488b00bcee61 Mon Sep 17 00:00:00 2001
From: Kazuhito Osabe <kzosabe@wip.place>
Date: Sat, 19 Feb 2022 10:23:13 +0900
Subject: [PATCH 8/8] Add the ability to handle byte count to humanize

---
 superset/db_engine_specs/base.py              | 23 ++++++++---
 superset/db_engine_specs/bigquery.py          | 30 ++++++--------
 superset/db_engine_specs/presto.py            | 16 ++++----
 superset/db_engine_specs/trino.py             | 16 ++++----
 .../db_engine_specs/bigquery_tests.py         | 25 +++++-------
 .../db_engine_specs/presto_tests.py           |  2 +-
 tests/unit_tests/db_engine_specs/test_base.py | 39 +++++++++++++++++++
 7 files changed, 97 insertions(+), 54 deletions(-)

diff --git a/superset/db_engine_specs/base.py b/superset/db_engine_specs/base.py
index a116a7d2de06c..96eecd358708b 100644
--- a/superset/db_engine_specs/base.py
+++ b/superset/db_engine_specs/base.py
@@ -1431,20 +1431,31 @@ def parse_sql(cls, sql: str) -> List[str]:
         return [str(s).strip(" ;") for s in sqlparse.parse(sql)]
 
     @classmethod
-    def _humanize(cls, value: Any, suffix: str) -> str:
+    def _humanize(cls, value: Any, suffix: str, category: Optional[str] = None) -> str:
         try:
             value = int(value)
         except ValueError:
             return str(value)
+        if category not in ["bytes", None]:
+            raise Exception(f"Unsupported value category: {category}")
 
-        prefixes = ["K", "M", "G", "T", "P", "E", "Z", "Y"]
-        prefix = ""
         to_next_prefix = 1000
-        while value > to_next_prefix and prefixes:
-            prefix = prefixes.pop(0)
+        prefixes = ["", "K", "M", "G", "T", "P", "E", "Z", "Y"]
+        suffixes = [p + suffix for p in prefixes]
+
+        if category == "bytes":
+            to_next_prefix = 1024
+            suffixes = ["B" if p == "" else p + "iB" for p in prefixes]
+
+        suffix = suffixes.pop(0)
+        while value >= to_next_prefix and suffixes:
+            suffix = suffixes.pop(0)
             value //= to_next_prefix
 
-        return f"{value} {prefix}{suffix}"
+        if not suffix.startswith(" "):
+            suffix = " " + suffix
+
+        return "{}{}".format(value, suffix).strip()
 
 
 # schema for adding a database by providing parameters instead of the
diff --git a/superset/db_engine_specs/bigquery.py b/superset/db_engine_specs/bigquery.py
index bf148169f8196..fb9e04474d212 100644
--- a/superset/db_engine_specs/bigquery.py
+++ b/superset/db_engine_specs/bigquery.py
@@ -212,26 +212,20 @@ def estimate_statement_cost(
     def query_cost_formatter(
         cls, raw_cost: List[Dict[str, Any]]
     ) -> List[Dict[str, str]]:
-        def format_bytes_str(raw_bytes: int) -> str:
-            if not isinstance(raw_bytes, int):
-                return str(raw_bytes)
-            units = ["B", "KiB", "MiB", "GiB", "TiB", "PiB"]
-            index = 0
-            bytes_float = float(raw_bytes)
-            while bytes_float >= 1024 and index < len(units) - 1:
-                bytes_float /= 1024
-                index += 1
-
-            return "{:.1f}".format(bytes_float) + f" {units[index]}"
-
-        return [
-            {
-                k: format_bytes_str(v) if k == "Total bytes processed" else str(v)
-                for k, v in row.items()
-            }
-            for row in raw_cost
+        cost = []
+        columns = [
+            ("Total bytes processed", "", "bytes"),
         ]
 
+        for row in raw_cost:
+            statement_cost = {}
+            for key, suffix, category in columns:
+                if key in row:
+                    statement_cost[key] = cls._humanize(row[key], suffix, category)
+            cost.append(statement_cost)
+
+        return cost
+
     @classmethod
     def convert_dttm(
         cls, target_type: str, dttm: datetime, db_extra: Optional[Dict[str, Any]] = None
diff --git a/superset/db_engine_specs/presto.py b/superset/db_engine_specs/presto.py
index 99f65d44312db..c49b506f446b0 100644
--- a/superset/db_engine_specs/presto.py
+++ b/superset/db_engine_specs/presto.py
@@ -679,18 +679,20 @@ def query_cost_formatter(
 
         cost = []
         columns = [
-            ("outputRowCount", "Output count", " rows"),
-            ("outputSizeInBytes", "Output size", "B"),
-            ("cpuCost", "CPU cost", ""),
-            ("maxMemory", "Max memory", "B"),
-            ("networkCost", "Network cost", ""),
+            ("outputRowCount", "Output count", " rows", None),
+            ("outputSizeInBytes", "Output size", "", "bytes"),
+            ("cpuCost", "CPU cost", "", None),
+            ("maxMemory", "Max memory", "", "bytes"),
+            ("networkCost", "Network cost", "", None),
         ]
         for row in raw_cost:
             estimate: Dict[str, float] = row.get("estimate", {})
             statement_cost = {}
-            for key, label, suffix in columns:
+            for key, label, suffix, category in columns:
                 if key in estimate:
-                    statement_cost[label] = cls._humanize(estimate[key], suffix).strip()
+                    statement_cost[label] = cls._humanize(
+                        estimate[key], suffix, category
+                    ).strip()
             cost.append(statement_cost)
 
         return cost
diff --git a/superset/db_engine_specs/trino.py b/superset/db_engine_specs/trino.py
index 2f54f59b3a384..b04e31e08472a 100644
--- a/superset/db_engine_specs/trino.py
+++ b/superset/db_engine_specs/trino.py
@@ -161,18 +161,20 @@ def query_cost_formatter(
 
         cost = []
         columns = [
-            ("outputRowCount", "Output count", " rows"),
-            ("outputSizeInBytes", "Output size", "B"),
-            ("cpuCost", "CPU cost", ""),
-            ("maxMemory", "Max memory", "B"),
-            ("networkCost", "Network cost", ""),
+            ("outputRowCount", "Output count", " rows", None),
+            ("outputSizeInBytes", "Output size", "", "bytes"),
+            ("cpuCost", "CPU cost", "", None),
+            ("maxMemory", "Max memory", "", "bytes"),
+            ("networkCost", "Network cost", "", None),
         ]
         for row in raw_cost:
             estimate: Dict[str, float] = row.get("estimate", {})
             statement_cost = {}
-            for key, label, suffix in columns:
+            for key, label, suffix, category in columns:
                 if key in estimate:
-                    statement_cost[label] = cls._humanize(estimate[key], suffix).strip()
+                    statement_cost[label] = cls._humanize(
+                        estimate[key], suffix, category
+                    ).strip()
             cost.append(statement_cost)
 
         return cost
diff --git a/tests/integration_tests/db_engine_specs/bigquery_tests.py b/tests/integration_tests/db_engine_specs/bigquery_tests.py
index 7e3fe2ecf78d9..ae6d48a644541 100644
--- a/tests/integration_tests/db_engine_specs/bigquery_tests.py
+++ b/tests/integration_tests/db_engine_specs/bigquery_tests.py
@@ -420,25 +420,20 @@ def test_estimate_statement_invalid_syntax(self, mocked_client_class):
 
     def test_query_cost_formatter_example_costs(self):
         raw_cost = [
-            {"Total bytes processed": 123, "Some other column": 123,},
-            {"Total bytes processed": 1024, "Some other column": "abcde",},
-            {"Total bytes processed": 1024 * 1024 + 1024 * 512,},
-            {"Total bytes processed": 1024 ** 3,},
-            {"Total bytes processed": 1024 ** 4,},
-            {"Total bytes processed": 1024 ** 5,},
-            {"Total bytes processed": 1024 ** 6,},
+            {"Total bytes processed": 123},
+            {"Total bytes processed": 1024},
+            {"Total bytes processed": 1024 ** 2 + 1024 * 512,},
+            {"Total bytes processed": 1024 ** 3 * 100,},
+            {"Total bytes processed": 1024 ** 4 * 1000,},
         ]
         result = BigQueryEngineSpec.query_cost_formatter(raw_cost)
         self.assertEqual(
             result,
             [
-                {"Total bytes processed": "123.0 B", "Some other column": "123",},
-                {"Total bytes processed": "1.0 KiB", "Some other column": "abcde",},
-                {"Total bytes processed": "1.5 MiB",},
-                {"Total bytes processed": "1.0 GiB",},
-                {"Total bytes processed": "1.0 TiB",},
-                {"Total bytes processed": "1.0 PiB",},
-                # Petabyte is the largest unit, but larger values can be handled
-                {"Total bytes processed": "1024.0 PiB",},
+                {"Total bytes processed": "123 B"},
+                {"Total bytes processed": "1 KiB"},
+                {"Total bytes processed": "1 MiB",},
+                {"Total bytes processed": "100 GiB",},
+                {"Total bytes processed": "1000 TiB",},
             ],
         )
diff --git a/tests/integration_tests/db_engine_specs/presto_tests.py b/tests/integration_tests/db_engine_specs/presto_tests.py
index cb7e4018f9ff2..3579244215b7b 100644
--- a/tests/integration_tests/db_engine_specs/presto_tests.py
+++ b/tests/integration_tests/db_engine_specs/presto_tests.py
@@ -524,7 +524,7 @@ def test_query_cost_formatter(self):
         expected = [
             {
                 "Output count": "904 M rows",
-                "Output size": "354 GB",
+                "Output size": "329 GiB",
                 "CPU cost": "354 G",
                 "Max memory": "0 B",
                 "Network cost": "354 G",
diff --git a/tests/unit_tests/db_engine_specs/test_base.py b/tests/unit_tests/db_engine_specs/test_base.py
index 4dc27c0928f99..93fbaa2eb1512 100644
--- a/tests/unit_tests/db_engine_specs/test_base.py
+++ b/tests/unit_tests/db_engine_specs/test_base.py
@@ -17,6 +17,7 @@
 # pylint: disable=unused-argument, import-outside-toplevel, protected-access
 
 from textwrap import dedent
+from typing import Any, Optional
 
 import pytest
 from flask.ctx import AppContext
@@ -99,3 +100,41 @@ def test_cte_query_parsing(
 
     actual = BaseEngineSpec.get_cte_query(original)
     assert actual == expected
+
+
+@pytest.mark.parametrize(
+    "value,suffix,category,expected",
+    [
+        ("str", "", None, "str"),
+        (0, "", None, "0"),
+        (100, "", None, "100"),
+        (1000, "", None, "1 K"),
+        (10000, "", None, "10 K"),
+        (123, " rows", None, "123 rows"),
+        (1234, " rows", None, "1 K rows"),
+        (1999, " rows", None, "1 K rows"),
+        (2000, " rows", None, "2 K rows"),
+        (123, "", "bytes", "123 B"),
+        (1024, "", "bytes", "1 KiB"),
+        (1024 ** 2, "", "bytes", "1 MiB"),
+        (1000 ** 2, "J", None, "1 MJ"),
+        (1024 ** 3, "", "bytes", "1 GiB"),
+        (1000 ** 3, "W", None, "1 GW"),
+        (1024 ** 8, "", "bytes", "1 YiB"),
+        (1000 ** 8, "m", None, "1 Ym"),
+        # Yottabyte is the largest unit, but larger values can be handled
+        (1024 ** 9, "", "bytes", "1024 YiB"),
+        (1000 ** 9, "m", None, "1000 Ym"),
+    ],
+)
+def test_humanize(
+    app_context: AppContext,
+    value: Any,
+    suffix: str,
+    category: Optional[str],
+    expected: str,
+) -> None:
+    from superset.db_engine_specs.base import BaseEngineSpec
+
+    actual = BaseEngineSpec._humanize(value, suffix, category)
+    assert actual == expected