From 6ae463838407b804c14887b8f0cfc7a39ca10001 Mon Sep 17 00:00:00 2001 From: Kazuhito Osabe Date: Sun, 6 Feb 2022 08:25:30 +0900 Subject: [PATCH 1/8] Add engine param for each estimate_statement_cost call --- superset/db_engine_specs/base.py | 5 +++-- superset/db_engine_specs/postgres.py | 3 ++- superset/db_engine_specs/presto.py | 2 +- superset/db_engine_specs/trino.py | 3 ++- tests/integration_tests/db_engine_specs/postgres_tests.py | 6 ++++-- tests/integration_tests/db_engine_specs/presto_tests.py | 6 ++++-- 6 files changed, 16 insertions(+), 9 deletions(-) diff --git a/superset/db_engine_specs/base.py b/superset/db_engine_specs/base.py index 764f3fde70580..2c8f252d35dbe 100644 --- a/superset/db_engine_specs/base.py +++ b/superset/db_engine_specs/base.py @@ -1025,12 +1025,13 @@ def select_star( # pylint: disable=too-many-arguments,too-many-locals return sql @classmethod - def estimate_statement_cost(cls, statement: str, cursor: Any) -> Dict[str, Any]: + def estimate_statement_cost(cls, statement: str, cursor: Any, engine: Engine) -> Dict[str, Any]: """ Generate a SQL query that estimates the cost of a given statement. :param statement: A single SQL statement :param cursor: Cursor instance + :param engine: Engine instance :return: Dictionary with different costs """ raise Exception("Database does not support cost estimation") @@ -1095,7 +1096,7 @@ def estimate_query_cost( processed_statement = cls.process_statement( statement, database, user_name ) - costs.append(cls.estimate_statement_cost(processed_statement, cursor)) + costs.append(cls.estimate_statement_cost(processed_statement, cursor, engine)) return costs @classmethod diff --git a/superset/db_engine_specs/postgres.py b/superset/db_engine_specs/postgres.py index f6c6888ee97bb..b60058535c075 100644 --- a/superset/db_engine_specs/postgres.py +++ b/superset/db_engine_specs/postgres.py @@ -23,6 +23,7 @@ from flask_babel import gettext as __ from sqlalchemy.dialects.postgresql import ARRAY, DOUBLE_PRECISION, ENUM, JSON from sqlalchemy.dialects.postgresql.base import PGInspector +from sqlalchemy.engine.base import Engine from sqlalchemy.types import String from superset.db_engine_specs.base import ( @@ -197,7 +198,7 @@ def get_allow_cost_estimate(cls, extra: Dict[str, Any]) -> bool: return True @classmethod - def estimate_statement_cost(cls, statement: str, cursor: Any) -> Dict[str, Any]: + def estimate_statement_cost(cls, statement: str, cursor: Any, engine: Engine) -> Dict[str, Any]: sql = f"EXPLAIN {statement}" cursor.execute(sql) diff --git a/superset/db_engine_specs/presto.py b/superset/db_engine_specs/presto.py index 376151587cdee..3e0a83507e543 100644 --- a/superset/db_engine_specs/presto.py +++ b/superset/db_engine_specs/presto.py @@ -637,7 +637,7 @@ def select_star( # pylint: disable=too-many-arguments ) @classmethod - def estimate_statement_cost(cls, statement: str, cursor: Any) -> Dict[str, Any]: + def estimate_statement_cost(cls, statement: str, cursor: Any, engine: Engine) -> Dict[str, Any]: """ Run a SQL query that estimates the cost of a given statement. diff --git a/superset/db_engine_specs/trino.py b/superset/db_engine_specs/trino.py index 4e5f153ad2ab2..4f85c11c20de0 100644 --- a/superset/db_engine_specs/trino.py +++ b/superset/db_engine_specs/trino.py @@ -21,6 +21,7 @@ import simplejson as json from flask import current_app +from sqlalchemy.engine.base import Engine from sqlalchemy.engine.url import make_url, URL from superset.db_engine_specs.base import BaseEngineSpec @@ -118,7 +119,7 @@ def get_allow_cost_estimate(cls, extra: Dict[str, Any]) -> bool: return True @classmethod - def estimate_statement_cost(cls, statement: str, cursor: Any) -> Dict[str, Any]: + def estimate_statement_cost(cls, statement: str, cursor: Any, engine: Engine) -> Dict[str, Any]: """ Run a SQL query that estimates the cost of a given statement. diff --git a/tests/integration_tests/db_engine_specs/postgres_tests.py b/tests/integration_tests/db_engine_specs/postgres_tests.py index dcf5310fecac5..c600d2b435eab 100644 --- a/tests/integration_tests/db_engine_specs/postgres_tests.py +++ b/tests/integration_tests/db_engine_specs/postgres_tests.py @@ -176,8 +176,9 @@ def test_estimate_statement_cost_select_star(self): cursor.fetchone.return_value = ( "Seq Scan on birth_names (cost=0.00..1537.91 rows=75691 width=46)", ) + engine = mock.Mock() sql = "SELECT * FROM birth_names" - results = PostgresEngineSpec.estimate_statement_cost(sql, cursor) + results = PostgresEngineSpec.estimate_statement_cost(sql, cursor, engine) self.assertEqual( results, {"Start-up cost": 0.00, "Total cost": 1537.91,}, ) @@ -196,9 +197,10 @@ def test_estimate_statement_invalid_syntax(self): ^ """ ) + engine = mock.Mock() sql = "DROP TABLE birth_names" with self.assertRaises(errors.SyntaxError): - PostgresEngineSpec.estimate_statement_cost(sql, cursor) + PostgresEngineSpec.estimate_statement_cost(sql, cursor, engine) def test_query_cost_formatter_example_costs(self): """ diff --git a/tests/integration_tests/db_engine_specs/presto_tests.py b/tests/integration_tests/db_engine_specs/presto_tests.py index 5833c6bdcbfcb..cb7e4018f9ff2 100644 --- a/tests/integration_tests/db_engine_specs/presto_tests.py +++ b/tests/integration_tests/db_engine_specs/presto_tests.py @@ -795,17 +795,19 @@ def test_estimate_statement_cost(self): mock_cursor.fetchone.return_value = [ '{"a": "b"}', ] + mock_engine = mock.Mock() result = PrestoEngineSpec.estimate_statement_cost( - "SELECT * FROM brth_names", mock_cursor + "SELECT * FROM brth_names", mock_cursor, mock_engine ) assert result == estimate_json def test_estimate_statement_cost_invalid_syntax(self): mock_cursor = mock.MagicMock() mock_cursor.execute.side_effect = Exception() + mock_engine = mock.Mock() with self.assertRaises(Exception): PrestoEngineSpec.estimate_statement_cost( - "DROP TABLE brth_names", mock_cursor + "DROP TABLE brth_names", mock_cursor, mock_engine ) def test_get_all_datasource_names(self): From 6e676f65e19734ed87b9d7ff8bb5c8d408167d81 Mon Sep 17 00:00:00 2001 From: Kazuhito Osabe Date: Sun, 13 Feb 2022 18:17:58 +0900 Subject: [PATCH 2/8] Add bigquery cost estimation implementation --- superset/db_engine_specs/base.py | 8 +++-- superset/db_engine_specs/bigquery.py | 45 ++++++++++++++++++++++++++++ superset/db_engine_specs/postgres.py | 4 ++- superset/db_engine_specs/presto.py | 4 ++- superset/db_engine_specs/trino.py | 4 ++- 5 files changed, 60 insertions(+), 5 deletions(-) diff --git a/superset/db_engine_specs/base.py b/superset/db_engine_specs/base.py index 2c8f252d35dbe..75baf2a15670d 100644 --- a/superset/db_engine_specs/base.py +++ b/superset/db_engine_specs/base.py @@ -1025,7 +1025,9 @@ def select_star( # pylint: disable=too-many-arguments,too-many-locals return sql @classmethod - def estimate_statement_cost(cls, statement: str, cursor: Any, engine: Engine) -> Dict[str, Any]: + def estimate_statement_cost( + cls, statement: str, cursor: Any, engine: Engine + ) -> Dict[str, Any]: """ Generate a SQL query that estimates the cost of a given statement. @@ -1096,7 +1098,9 @@ def estimate_query_cost( processed_statement = cls.process_statement( statement, database, user_name ) - costs.append(cls.estimate_statement_cost(processed_statement, cursor, engine)) + costs.append( + cls.estimate_statement_cost(processed_statement, cursor, engine) + ) return costs @classmethod diff --git a/superset/db_engine_specs/bigquery.py b/superset/db_engine_specs/bigquery.py index 30e04c4f2fe9b..cf889f236ee5d 100644 --- a/superset/db_engine_specs/bigquery.py +++ b/superset/db_engine_specs/bigquery.py @@ -24,6 +24,8 @@ from apispec import APISpec from apispec.ext.marshmallow import MarshmallowPlugin from flask_babel import gettext as __ +from google.cloud import bigquery +from google.oauth2 import service_account from marshmallow import fields, Schema from marshmallow.exceptions import ValidationError from sqlalchemy import column @@ -185,6 +187,49 @@ class BigQueryEngineSpec(BaseEngineSpec): ), } + @classmethod + def get_allow_cost_estimate(cls, extra: Dict[str, Any]) -> bool: + return True + + @classmethod + def estimate_statement_cost( + cls, statement: str, cursor: Any, engine: Engine + ) -> Dict[str, Any]: + creds = engine.dialect.credentials_info + credentials = service_account.Credentials.from_service_account_info(creds) + client = bigquery.Client(credentials=credentials) + dry_run_result = client.query( + statement, bigquery.job.QueryJobConfig(dry_run=True) + ) + + return { + "Total bytes processed": dry_run_result.total_bytes_processed, + } + + @classmethod + def query_cost_formatter( + cls, raw_cost: List[Dict[str, Any]] + ) -> List[Dict[str, str]]: + def format_bytes_str(raw_bytes: int) -> str: + if not isinstance(raw_bytes, int): + return str(raw_bytes) + units = ["B", "KiB", "MiB", "GiB", "TiB", "PiB"] + index = 0 + bytes = float(raw_bytes) + while bytes >= 1024 and index < len(units) - 1: + bytes /= 1024 + index += 1 + + return "{:.1f}".format(bytes) + f" {units[index]}" + + return [ + { + k: format_bytes_str(v) if k == "Total bytes processed" else str(v) + for k, v in row.items() + } + for row in raw_cost + ] + @classmethod def convert_dttm( cls, target_type: str, dttm: datetime, db_extra: Optional[Dict[str, Any]] = None diff --git a/superset/db_engine_specs/postgres.py b/superset/db_engine_specs/postgres.py index b60058535c075..bbf58a364b270 100644 --- a/superset/db_engine_specs/postgres.py +++ b/superset/db_engine_specs/postgres.py @@ -198,7 +198,9 @@ def get_allow_cost_estimate(cls, extra: Dict[str, Any]) -> bool: return True @classmethod - def estimate_statement_cost(cls, statement: str, cursor: Any, engine: Engine) -> Dict[str, Any]: + def estimate_statement_cost( + cls, statement: str, cursor: Any, engine: Engine + ) -> Dict[str, Any]: sql = f"EXPLAIN {statement}" cursor.execute(sql) diff --git a/superset/db_engine_specs/presto.py b/superset/db_engine_specs/presto.py index 3e0a83507e543..1006c6f0f1447 100644 --- a/superset/db_engine_specs/presto.py +++ b/superset/db_engine_specs/presto.py @@ -637,7 +637,9 @@ def select_star( # pylint: disable=too-many-arguments ) @classmethod - def estimate_statement_cost(cls, statement: str, cursor: Any, engine: Engine) -> Dict[str, Any]: + def estimate_statement_cost( + cls, statement: str, cursor: Any, engine: Engine + ) -> Dict[str, Any]: """ Run a SQL query that estimates the cost of a given statement. diff --git a/superset/db_engine_specs/trino.py b/superset/db_engine_specs/trino.py index 4f85c11c20de0..ba3e6ac9592d0 100644 --- a/superset/db_engine_specs/trino.py +++ b/superset/db_engine_specs/trino.py @@ -119,7 +119,9 @@ def get_allow_cost_estimate(cls, extra: Dict[str, Any]) -> bool: return True @classmethod - def estimate_statement_cost(cls, statement: str, cursor: Any, engine: Engine) -> Dict[str, Any]: + def estimate_statement_cost( + cls, statement: str, cursor: Any, engine: Engine + ) -> Dict[str, Any]: """ Run a SQL query that estimates the cost of a given statement. From 4e638432d8cab94c75790fe516d6782ead794c8b Mon Sep 17 00:00:00 2001 From: Kazuhito Osabe Date: Sun, 13 Feb 2022 18:25:46 +0900 Subject: [PATCH 3/8] Add tests --- .../db_engine_specs/bigquery_tests.py | 76 +++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/tests/integration_tests/db_engine_specs/bigquery_tests.py b/tests/integration_tests/db_engine_specs/bigquery_tests.py index b7405092c5446..7e3fe2ecf78d9 100644 --- a/tests/integration_tests/db_engine_specs/bigquery_tests.py +++ b/tests/integration_tests/db_engine_specs/bigquery_tests.py @@ -366,3 +366,79 @@ def test_calculated_column_in_order_by(self): } sql = table.get_query_str(query_obj) assert "ORDER BY gender_cc ASC" in sql + + @mock.patch("google.cloud.bigquery.Client") + @mock.patch( + "google.oauth2.service_account.Credentials.from_service_account_info", + mock.Mock(), + ) + def test_estimate_statement_cost_select_star(self, mocked_client_class): + mocked_client = mocked_client_class.return_value + mocked_client.query.return_value = mock.Mock() + mocked_client.query.return_value.total_bytes_processed = 123 + cursor = mock.Mock() + engine = mock.Mock() + sql = "SELECT * FROM `some-project.database.table`" + results = BigQueryEngineSpec.estimate_statement_cost(sql, cursor, engine) + mocked_client.query.assert_called_once() + args = mocked_client.query.call_args.args + self.assertEqual(args[0], sql) + self.assertEqual(args[1].dry_run, True) + self.assertEqual( + results, {"Total bytes processed": 123}, + ) + + @mock.patch("google.cloud.bigquery.Client") + @mock.patch( + "google.oauth2.service_account.Credentials.from_service_account_info", + mock.Mock(), + ) + def test_estimate_statement_invalid_syntax(self, mocked_client_class): + from google.api_core.exceptions import BadRequest + + cursor = mock.Mock() + mocked_client = mocked_client_class.return_value + mocked_client.query.side_effect = BadRequest( + """ + POST https://bigquery.googleapis.com/bigquery/v2/projects/xxx/jobs? + prettyPrint=false: Table name "birth_names" missing dataset while no def + ault dataset is set in the request. + + (job ID: xxx) + + -----Query Job SQL Follows----- + + | . | . | + 1:DROP TABLE birth_names + | . | . | + """ + ) + engine = mock.Mock() + sql = "DROP TABLE birth_names" + with self.assertRaises(BadRequest): + BigQueryEngineSpec.estimate_statement_cost(sql, cursor, engine) + + def test_query_cost_formatter_example_costs(self): + raw_cost = [ + {"Total bytes processed": 123, "Some other column": 123,}, + {"Total bytes processed": 1024, "Some other column": "abcde",}, + {"Total bytes processed": 1024 * 1024 + 1024 * 512,}, + {"Total bytes processed": 1024 ** 3,}, + {"Total bytes processed": 1024 ** 4,}, + {"Total bytes processed": 1024 ** 5,}, + {"Total bytes processed": 1024 ** 6,}, + ] + result = BigQueryEngineSpec.query_cost_formatter(raw_cost) + self.assertEqual( + result, + [ + {"Total bytes processed": "123.0 B", "Some other column": "123",}, + {"Total bytes processed": "1.0 KiB", "Some other column": "abcde",}, + {"Total bytes processed": "1.5 MiB",}, + {"Total bytes processed": "1.0 GiB",}, + {"Total bytes processed": "1.0 TiB",}, + {"Total bytes processed": "1.0 PiB",}, + # Petabyte is the largest unit, but larger values can be handled + {"Total bytes processed": "1024.0 PiB",}, + ], + ) From 6c06b051b2b41c7dd27eb0d3ab5bb1de81bea868 Mon Sep 17 00:00:00 2001 From: Kazuhito Osabe Date: Sun, 13 Feb 2022 21:19:03 +0900 Subject: [PATCH 4/8] Fix import issue --- superset/db_engine_specs/bigquery.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/superset/db_engine_specs/bigquery.py b/superset/db_engine_specs/bigquery.py index cf889f236ee5d..00490eedaa512 100644 --- a/superset/db_engine_specs/bigquery.py +++ b/superset/db_engine_specs/bigquery.py @@ -24,8 +24,6 @@ from apispec import APISpec from apispec.ext.marshmallow import MarshmallowPlugin from flask_babel import gettext as __ -from google.cloud import bigquery -from google.oauth2 import service_account from marshmallow import fields, Schema from marshmallow.exceptions import ValidationError from sqlalchemy import column @@ -195,6 +193,17 @@ def get_allow_cost_estimate(cls, extra: Dict[str, Any]) -> bool: def estimate_statement_cost( cls, statement: str, cursor: Any, engine: Engine ) -> Dict[str, Any]: + try: + # pylint: disable=import-outside-toplevel + from google.cloud import bigquery + from google.oauth2 import service_account + except ImportError as ex: + raise Exception( + "Could not import libraries `google.cloud` or `google.oauth2`, " + "which are required to be installed in your environment in order " + "to estimate cost" + ) from ex + creds = engine.dialect.credentials_info credentials = service_account.Credentials.from_service_account_info(creds) client = bigquery.Client(credentials=credentials) From dba135c94d068a5744caade30f45fdadfa842989 Mon Sep 17 00:00:00 2001 From: Kazuhito Osabe Date: Mon, 14 Feb 2022 20:39:12 +0900 Subject: [PATCH 5/8] Fix lint --- superset/db_engine_specs/bigquery.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/superset/db_engine_specs/bigquery.py b/superset/db_engine_specs/bigquery.py index 00490eedaa512..beeb439c0f3d0 100644 --- a/superset/db_engine_specs/bigquery.py +++ b/superset/db_engine_specs/bigquery.py @@ -224,12 +224,12 @@ def format_bytes_str(raw_bytes: int) -> str: return str(raw_bytes) units = ["B", "KiB", "MiB", "GiB", "TiB", "PiB"] index = 0 - bytes = float(raw_bytes) - while bytes >= 1024 and index < len(units) - 1: - bytes /= 1024 + bytes_float = float(raw_bytes) + while bytes_float >= 1024 and index < len(units) - 1: + bytes_float /= 1024 index += 1 - return "{:.1f}".format(bytes) + f" {units[index]}" + return "{:.1f}".format(bytes_float) + f" {units[index]}" return [ { From 1fd1488ec7050cabd7d6512dc14b31271cd49181 Mon Sep 17 00:00:00 2001 From: Kazuhito Osabe Date: Thu, 17 Feb 2022 19:09:32 +0900 Subject: [PATCH 6/8] Remove unnecessary try-except --- superset/db_engine_specs/bigquery.py | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/superset/db_engine_specs/bigquery.py b/superset/db_engine_specs/bigquery.py index beeb439c0f3d0..bf148169f8196 100644 --- a/superset/db_engine_specs/bigquery.py +++ b/superset/db_engine_specs/bigquery.py @@ -193,16 +193,9 @@ def get_allow_cost_estimate(cls, extra: Dict[str, Any]) -> bool: def estimate_statement_cost( cls, statement: str, cursor: Any, engine: Engine ) -> Dict[str, Any]: - try: - # pylint: disable=import-outside-toplevel - from google.cloud import bigquery - from google.oauth2 import service_account - except ImportError as ex: - raise Exception( - "Could not import libraries `google.cloud` or `google.oauth2`, " - "which are required to be installed in your environment in order " - "to estimate cost" - ) from ex + # pylint: disable=import-outside-toplevel + from google.cloud import bigquery + from google.oauth2 import service_account creds = engine.dialect.credentials_info credentials = service_account.Credentials.from_service_account_info(creds) @@ -370,16 +363,9 @@ def df_to_sql( :param to_sql_kwargs: The kwargs to be passed to pandas.DataFrame.to_sql` method """ - try: - # pylint: disable=import-outside-toplevel - import pandas_gbq - from google.oauth2 import service_account - except ImportError as ex: - raise Exception( - "Could not import libraries `pandas_gbq` or `google.oauth2`, which are " - "required to be installed in your environment in order " - "to upload data to BigQuery" - ) from ex + # pylint: disable=import-outside-toplevel + import pandas_gbq + from google.oauth2 import service_account if not table.schema: raise Exception("The table schema must be defined") From f5908e59651e7dc76aa877af53ce0aaf90d6df4b Mon Sep 17 00:00:00 2001 From: Kazuhito Osabe Date: Thu, 17 Feb 2022 20:30:13 +0900 Subject: [PATCH 7/8] Add common implementation of humanize numbers --- superset/db_engine_specs/base.py | 16 ++++++++++++++++ superset/db_engine_specs/presto.py | 17 +---------------- superset/db_engine_specs/trino.py | 17 +---------------- 3 files changed, 18 insertions(+), 32 deletions(-) diff --git a/superset/db_engine_specs/base.py b/superset/db_engine_specs/base.py index 75baf2a15670d..a116a7d2de06c 100644 --- a/superset/db_engine_specs/base.py +++ b/superset/db_engine_specs/base.py @@ -1430,6 +1430,22 @@ def cancel_query( # pylint: disable=unused-argument def parse_sql(cls, sql: str) -> List[str]: return [str(s).strip(" ;") for s in sqlparse.parse(sql)] + @classmethod + def _humanize(cls, value: Any, suffix: str) -> str: + try: + value = int(value) + except ValueError: + return str(value) + + prefixes = ["K", "M", "G", "T", "P", "E", "Z", "Y"] + prefix = "" + to_next_prefix = 1000 + while value > to_next_prefix and prefixes: + prefix = prefixes.pop(0) + value //= to_next_prefix + + return f"{value} {prefix}{suffix}" + # schema for adding a database by providing parameters instead of the # full SQLAlchemy URI diff --git a/superset/db_engine_specs/presto.py b/superset/db_engine_specs/presto.py index 1006c6f0f1447..99f65d44312db 100644 --- a/superset/db_engine_specs/presto.py +++ b/superset/db_engine_specs/presto.py @@ -677,21 +677,6 @@ def query_cost_formatter( :return: Human readable cost estimate """ - def humanize(value: Any, suffix: str) -> str: - try: - value = int(value) - except ValueError: - return str(value) - - prefixes = ["K", "M", "G", "T", "P", "E", "Z", "Y"] - prefix = "" - to_next_prefix = 1000 - while value > to_next_prefix and prefixes: - prefix = prefixes.pop(0) - value //= to_next_prefix - - return f"{value} {prefix}{suffix}" - cost = [] columns = [ ("outputRowCount", "Output count", " rows"), @@ -705,7 +690,7 @@ def humanize(value: Any, suffix: str) -> str: statement_cost = {} for key, label, suffix in columns: if key in estimate: - statement_cost[label] = humanize(estimate[key], suffix).strip() + statement_cost[label] = cls._humanize(estimate[key], suffix).strip() cost.append(statement_cost) return cost diff --git a/superset/db_engine_specs/trino.py b/superset/db_engine_specs/trino.py index ba3e6ac9592d0..2f54f59b3a384 100644 --- a/superset/db_engine_specs/trino.py +++ b/superset/db_engine_specs/trino.py @@ -159,21 +159,6 @@ def query_cost_formatter( :return: Human readable cost estimate """ - def humanize(value: Any, suffix: str) -> str: - try: - value = int(value) - except ValueError: - return str(value) - - prefixes = ["K", "M", "G", "T", "P", "E", "Z", "Y"] - prefix = "" - to_next_prefix = 1000 - while value > to_next_prefix and prefixes: - prefix = prefixes.pop(0) - value //= to_next_prefix - - return f"{value} {prefix}{suffix}" - cost = [] columns = [ ("outputRowCount", "Output count", " rows"), @@ -187,7 +172,7 @@ def humanize(value: Any, suffix: str) -> str: statement_cost = {} for key, label, suffix in columns: if key in estimate: - statement_cost[label] = humanize(estimate[key], suffix).strip() + statement_cost[label] = cls._humanize(estimate[key], suffix).strip() cost.append(statement_cost) return cost From 9e36f0f45fe92b1c6081e4d39ad9488b00bcee61 Mon Sep 17 00:00:00 2001 From: Kazuhito Osabe Date: Sat, 19 Feb 2022 10:23:13 +0900 Subject: [PATCH 8/8] Add the ability to handle byte count to humanize --- superset/db_engine_specs/base.py | 23 ++++++++--- superset/db_engine_specs/bigquery.py | 30 ++++++-------- superset/db_engine_specs/presto.py | 16 ++++---- superset/db_engine_specs/trino.py | 16 ++++---- .../db_engine_specs/bigquery_tests.py | 25 +++++------- .../db_engine_specs/presto_tests.py | 2 +- tests/unit_tests/db_engine_specs/test_base.py | 39 +++++++++++++++++++ 7 files changed, 97 insertions(+), 54 deletions(-) diff --git a/superset/db_engine_specs/base.py b/superset/db_engine_specs/base.py index a116a7d2de06c..96eecd358708b 100644 --- a/superset/db_engine_specs/base.py +++ b/superset/db_engine_specs/base.py @@ -1431,20 +1431,31 @@ def parse_sql(cls, sql: str) -> List[str]: return [str(s).strip(" ;") for s in sqlparse.parse(sql)] @classmethod - def _humanize(cls, value: Any, suffix: str) -> str: + def _humanize(cls, value: Any, suffix: str, category: Optional[str] = None) -> str: try: value = int(value) except ValueError: return str(value) + if category not in ["bytes", None]: + raise Exception(f"Unsupported value category: {category}") - prefixes = ["K", "M", "G", "T", "P", "E", "Z", "Y"] - prefix = "" to_next_prefix = 1000 - while value > to_next_prefix and prefixes: - prefix = prefixes.pop(0) + prefixes = ["", "K", "M", "G", "T", "P", "E", "Z", "Y"] + suffixes = [p + suffix for p in prefixes] + + if category == "bytes": + to_next_prefix = 1024 + suffixes = ["B" if p == "" else p + "iB" for p in prefixes] + + suffix = suffixes.pop(0) + while value >= to_next_prefix and suffixes: + suffix = suffixes.pop(0) value //= to_next_prefix - return f"{value} {prefix}{suffix}" + if not suffix.startswith(" "): + suffix = " " + suffix + + return "{}{}".format(value, suffix).strip() # schema for adding a database by providing parameters instead of the diff --git a/superset/db_engine_specs/bigquery.py b/superset/db_engine_specs/bigquery.py index bf148169f8196..fb9e04474d212 100644 --- a/superset/db_engine_specs/bigquery.py +++ b/superset/db_engine_specs/bigquery.py @@ -212,26 +212,20 @@ def estimate_statement_cost( def query_cost_formatter( cls, raw_cost: List[Dict[str, Any]] ) -> List[Dict[str, str]]: - def format_bytes_str(raw_bytes: int) -> str: - if not isinstance(raw_bytes, int): - return str(raw_bytes) - units = ["B", "KiB", "MiB", "GiB", "TiB", "PiB"] - index = 0 - bytes_float = float(raw_bytes) - while bytes_float >= 1024 and index < len(units) - 1: - bytes_float /= 1024 - index += 1 - - return "{:.1f}".format(bytes_float) + f" {units[index]}" - - return [ - { - k: format_bytes_str(v) if k == "Total bytes processed" else str(v) - for k, v in row.items() - } - for row in raw_cost + cost = [] + columns = [ + ("Total bytes processed", "", "bytes"), ] + for row in raw_cost: + statement_cost = {} + for key, suffix, category in columns: + if key in row: + statement_cost[key] = cls._humanize(row[key], suffix, category) + cost.append(statement_cost) + + return cost + @classmethod def convert_dttm( cls, target_type: str, dttm: datetime, db_extra: Optional[Dict[str, Any]] = None diff --git a/superset/db_engine_specs/presto.py b/superset/db_engine_specs/presto.py index 99f65d44312db..c49b506f446b0 100644 --- a/superset/db_engine_specs/presto.py +++ b/superset/db_engine_specs/presto.py @@ -679,18 +679,20 @@ def query_cost_formatter( cost = [] columns = [ - ("outputRowCount", "Output count", " rows"), - ("outputSizeInBytes", "Output size", "B"), - ("cpuCost", "CPU cost", ""), - ("maxMemory", "Max memory", "B"), - ("networkCost", "Network cost", ""), + ("outputRowCount", "Output count", " rows", None), + ("outputSizeInBytes", "Output size", "", "bytes"), + ("cpuCost", "CPU cost", "", None), + ("maxMemory", "Max memory", "", "bytes"), + ("networkCost", "Network cost", "", None), ] for row in raw_cost: estimate: Dict[str, float] = row.get("estimate", {}) statement_cost = {} - for key, label, suffix in columns: + for key, label, suffix, category in columns: if key in estimate: - statement_cost[label] = cls._humanize(estimate[key], suffix).strip() + statement_cost[label] = cls._humanize( + estimate[key], suffix, category + ).strip() cost.append(statement_cost) return cost diff --git a/superset/db_engine_specs/trino.py b/superset/db_engine_specs/trino.py index 2f54f59b3a384..b04e31e08472a 100644 --- a/superset/db_engine_specs/trino.py +++ b/superset/db_engine_specs/trino.py @@ -161,18 +161,20 @@ def query_cost_formatter( cost = [] columns = [ - ("outputRowCount", "Output count", " rows"), - ("outputSizeInBytes", "Output size", "B"), - ("cpuCost", "CPU cost", ""), - ("maxMemory", "Max memory", "B"), - ("networkCost", "Network cost", ""), + ("outputRowCount", "Output count", " rows", None), + ("outputSizeInBytes", "Output size", "", "bytes"), + ("cpuCost", "CPU cost", "", None), + ("maxMemory", "Max memory", "", "bytes"), + ("networkCost", "Network cost", "", None), ] for row in raw_cost: estimate: Dict[str, float] = row.get("estimate", {}) statement_cost = {} - for key, label, suffix in columns: + for key, label, suffix, category in columns: if key in estimate: - statement_cost[label] = cls._humanize(estimate[key], suffix).strip() + statement_cost[label] = cls._humanize( + estimate[key], suffix, category + ).strip() cost.append(statement_cost) return cost diff --git a/tests/integration_tests/db_engine_specs/bigquery_tests.py b/tests/integration_tests/db_engine_specs/bigquery_tests.py index 7e3fe2ecf78d9..ae6d48a644541 100644 --- a/tests/integration_tests/db_engine_specs/bigquery_tests.py +++ b/tests/integration_tests/db_engine_specs/bigquery_tests.py @@ -420,25 +420,20 @@ def test_estimate_statement_invalid_syntax(self, mocked_client_class): def test_query_cost_formatter_example_costs(self): raw_cost = [ - {"Total bytes processed": 123, "Some other column": 123,}, - {"Total bytes processed": 1024, "Some other column": "abcde",}, - {"Total bytes processed": 1024 * 1024 + 1024 * 512,}, - {"Total bytes processed": 1024 ** 3,}, - {"Total bytes processed": 1024 ** 4,}, - {"Total bytes processed": 1024 ** 5,}, - {"Total bytes processed": 1024 ** 6,}, + {"Total bytes processed": 123}, + {"Total bytes processed": 1024}, + {"Total bytes processed": 1024 ** 2 + 1024 * 512,}, + {"Total bytes processed": 1024 ** 3 * 100,}, + {"Total bytes processed": 1024 ** 4 * 1000,}, ] result = BigQueryEngineSpec.query_cost_formatter(raw_cost) self.assertEqual( result, [ - {"Total bytes processed": "123.0 B", "Some other column": "123",}, - {"Total bytes processed": "1.0 KiB", "Some other column": "abcde",}, - {"Total bytes processed": "1.5 MiB",}, - {"Total bytes processed": "1.0 GiB",}, - {"Total bytes processed": "1.0 TiB",}, - {"Total bytes processed": "1.0 PiB",}, - # Petabyte is the largest unit, but larger values can be handled - {"Total bytes processed": "1024.0 PiB",}, + {"Total bytes processed": "123 B"}, + {"Total bytes processed": "1 KiB"}, + {"Total bytes processed": "1 MiB",}, + {"Total bytes processed": "100 GiB",}, + {"Total bytes processed": "1000 TiB",}, ], ) diff --git a/tests/integration_tests/db_engine_specs/presto_tests.py b/tests/integration_tests/db_engine_specs/presto_tests.py index cb7e4018f9ff2..3579244215b7b 100644 --- a/tests/integration_tests/db_engine_specs/presto_tests.py +++ b/tests/integration_tests/db_engine_specs/presto_tests.py @@ -524,7 +524,7 @@ def test_query_cost_formatter(self): expected = [ { "Output count": "904 M rows", - "Output size": "354 GB", + "Output size": "329 GiB", "CPU cost": "354 G", "Max memory": "0 B", "Network cost": "354 G", diff --git a/tests/unit_tests/db_engine_specs/test_base.py b/tests/unit_tests/db_engine_specs/test_base.py index 4dc27c0928f99..93fbaa2eb1512 100644 --- a/tests/unit_tests/db_engine_specs/test_base.py +++ b/tests/unit_tests/db_engine_specs/test_base.py @@ -17,6 +17,7 @@ # pylint: disable=unused-argument, import-outside-toplevel, protected-access from textwrap import dedent +from typing import Any, Optional import pytest from flask.ctx import AppContext @@ -99,3 +100,41 @@ def test_cte_query_parsing( actual = BaseEngineSpec.get_cte_query(original) assert actual == expected + + +@pytest.mark.parametrize( + "value,suffix,category,expected", + [ + ("str", "", None, "str"), + (0, "", None, "0"), + (100, "", None, "100"), + (1000, "", None, "1 K"), + (10000, "", None, "10 K"), + (123, " rows", None, "123 rows"), + (1234, " rows", None, "1 K rows"), + (1999, " rows", None, "1 K rows"), + (2000, " rows", None, "2 K rows"), + (123, "", "bytes", "123 B"), + (1024, "", "bytes", "1 KiB"), + (1024 ** 2, "", "bytes", "1 MiB"), + (1000 ** 2, "J", None, "1 MJ"), + (1024 ** 3, "", "bytes", "1 GiB"), + (1000 ** 3, "W", None, "1 GW"), + (1024 ** 8, "", "bytes", "1 YiB"), + (1000 ** 8, "m", None, "1 Ym"), + # Yottabyte is the largest unit, but larger values can be handled + (1024 ** 9, "", "bytes", "1024 YiB"), + (1000 ** 9, "m", None, "1000 Ym"), + ], +) +def test_humanize( + app_context: AppContext, + value: Any, + suffix: str, + category: Optional[str], + expected: str, +) -> None: + from superset.db_engine_specs.base import BaseEngineSpec + + actual = BaseEngineSpec._humanize(value, suffix, category) + assert actual == expected