From d6d3bbbdc60504f40bd64ba401d932fd70a49b68 Mon Sep 17 00:00:00 2001 From: Daniel Gaspar Date: Thu, 13 Aug 2020 16:10:11 +0100 Subject: [PATCH 1/7] feat: dataset REST API for distinct values --- superset/constants.py | 1 + superset/datasets/api.py | 2 + superset/views/base_api.py | 155 +++++++++++++++++++++++++++++------- tests/datasets/api_tests.py | 46 ++++++++++- 4 files changed, 175 insertions(+), 29 deletions(-) diff --git a/superset/constants.py b/superset/constants.py index dbe6fd12086bd..ea14a38a0be2a 100644 --- a/superset/constants.py +++ b/superset/constants.py @@ -55,6 +55,7 @@ class RouteMethod: # pylint: disable=too-few-public-methods POST = "post" PUT = "put" RELATED = "related" + DISTINCT = "distinct" # Commonly used sets API_SET = {API_CREATE, API_DELETE, API_GET, API_READ, API_UPDATE} diff --git a/superset/datasets/api.py b/superset/datasets/api.py index 0d7973b4a79d2..b04cef8184454 100644 --- a/superset/datasets/api.py +++ b/superset/datasets/api.py @@ -67,6 +67,7 @@ class DatasetRestApi(BaseSupersetModelRestApi): include_route_methods = RouteMethod.REST_MODEL_VIEW_CRUD_SET | { RouteMethod.EXPORT, RouteMethod.RELATED, + RouteMethod.DISTINCT, "refresh", "related_objects", } @@ -151,6 +152,7 @@ class DatasetRestApi(BaseSupersetModelRestApi): } filter_rel_fields = {"database": [["id", DatabaseFilter, lambda: []]]} allowed_rel_fields = {"database", "owners"} + allowed_distinct_fields = {"schema"} openapi_spec_component_schemas = (DatasetRelatedObjectsResponse,) diff --git a/superset/views/base_api.py b/superset/views/base_api.py index 56fdba0197b9d..9f4ea2fadb6f3 100644 --- a/superset/views/base_api.py +++ b/superset/views/base_api.py @@ -19,12 +19,16 @@ from typing import Any, Callable, cast, Dict, List, Optional, Set, Tuple, Type, Union from apispec import APISpec +from apispec.exceptions import DuplicateComponentNameError from flask import Blueprint, Response -from flask_appbuilder import AppBuilder, Model, ModelRestApi +from flask_appbuilder import AppBuilder, ModelRestApi from flask_appbuilder.api import expose, protect, rison, safe from flask_appbuilder.models.filters import BaseFilter, Filters from flask_appbuilder.models.sqla.filters import FilterStartsWith -from marshmallow import Schema +from flask_appbuilder.models.sqla.interface import SQLAInterface +from marshmallow import Schema, fields +from sqlalchemy import distinct +from sqlalchemy import func from superset.stats_logger import BaseStatsLogger from superset.typing import FlaskResponse @@ -41,6 +45,25 @@ } +class RelatedResultResponseSchema(Schema): + value = fields.Integer(description="The related item identifier") + text = fields.String(description="The related item string representation") + + +class RelatedResponseSchema(Schema): + count = fields.Integer(description="The total number of related values") + result = fields.List(fields.Nested(RelatedResultResponseSchema)) + + +class DistinctResultResponseSchema(Schema): + text = fields.String(description="The distinct item") + + +class DistincResponseSchema(Schema): + count = fields.Integer(description="The total number of distinct values") + result = fields.List(fields.Nested(DistinctResultResponseSchema)) + + def statsd_metrics(f: Callable[..., Any]) -> Callable[..., Any]: """ Handle sending all statsd metrics from the REST API @@ -78,6 +101,7 @@ class BaseSupersetModelRestApi(ModelRestApi): "bulk_delete": "delete", "info": "list", "related": "list", + "distinct": "list", "thumbnail": "list", "refresh": "edit", "data": "list", @@ -112,6 +136,8 @@ class BaseSupersetModelRestApi(ModelRestApi): """ # pylint: disable=pointless-string-statement allowed_rel_fields: Set[str] = set() + allowed_distinct_fields: Set[str] = set() + openapi_spec_component_schemas: Tuple[Type[Schema], ...] = tuple() """ Add extra schemas to the OpenAPI component schemas section @@ -123,15 +149,29 @@ class BaseSupersetModelRestApi(ModelRestApi): show_columns: List[str] def __init__(self) -> None: - super().__init__() + # Setup statsd self.stats_logger = BaseStatsLogger() + # Add base API spec base query parameter schemas + if self.apispec_parameter_schemas is None: + self.apispec_parameter_schemas = {} + self.apispec_parameter_schemas["get_related_schema"] = get_related_schema + if self.openapi_spec_component_schemas is None: + self.openapi_spec_component_schemas = () + self.openapi_spec_component_schemas = self.openapi_spec_component_schemas + ( + RelatedResponseSchema, + DistincResponseSchema, + ) + super().__init__() def add_apispec_components(self, api_spec: APISpec) -> None: for schema in self.openapi_spec_component_schemas: - api_spec.components.schema( - schema.__name__, schema=schema, - ) + try: + api_spec.components.schema( + schema.__name__, schema=schema, + ) + except DuplicateComponentNameError: + pass super().add_apispec_components(api_spec) def create_blueprint( @@ -153,7 +193,7 @@ def _init_properties(self) -> None: super()._init_properties() def _get_related_filter( - self, datamodel: Model, column_name: str, value: str + self, datamodel: SQLAInterface, column_name: str, value: str ) -> Filters: filter_field = self.related_field_filters.get(column_name) if isinstance(filter_field, str): @@ -170,6 +210,18 @@ def _get_related_filter( ) return filters + def _get_distinct_filter(self, column_name: str, value: str) -> Filters: + filter_field = RelatedFieldFilter(column_name, FilterStartsWith) + filter_field = cast(RelatedFieldFilter, filter_field) + search_columns = [filter_field.field_name] if filter_field else None + filters = self.datamodel.get_filters(search_columns) + filters.add_filter_list(self.base_filters) + if value and filter_field: + filters.add_filter( + filter_field.field_name, filter_field.filter_class, value + ) + return filters + def incr_stats(self, action: str, func_name: str) -> None: """ Proxy function for statsd.incr to impose a key structure for REST API's @@ -251,39 +303,21 @@ def related(self, column_name: str, **kwargs: Any) -> FlaskResponse: content: application/json: schema: - type: object - properties: - page_size: - type: integer - page: - type: integer - filter: - type: string + $ref: '#/components/schemas/get_related_schema' responses: 200: description: Related column data content: application/json: schema: - type: object - properties: - count: - type: integer - result: - type: object - properties: - value: - type: integer - text: - type: string + schema: + $ref: "#/components/schemas/RelatedResponseSchema" 400: $ref: '#/components/responses/400' 401: $ref: '#/components/responses/401' 404: $ref: '#/components/responses/404' - 422: - $ref: '#/components/responses/422' 500: $ref: '#/components/responses/500' """ @@ -316,3 +350,68 @@ def related(self, column_name: str, **kwargs: Any) -> FlaskResponse: for value in values ] return self.response(200, count=count, result=result) + + @expose("/distinct/", methods=["GET"]) + @protect() + @safe + @statsd_metrics + @rison(get_related_schema) + def distinct(self, column_name: str, **kwargs: Any) -> FlaskResponse: + """Get related fields data + --- + get: + parameters: + - in: path + schema: + type: string + name: column_name + - in: query + name: q + content: + application/json: + schema: + $ref: '#/components/schemas/get_related_schema' + responses: + 200: + description: Related column data + content: + application/json: + schema: + schema: + $ref: "#/components/schemas/DistincResponseSchema" + 400: + $ref: '#/components/responses/400' + 401: + $ref: '#/components/responses/401' + 404: + $ref: '#/components/responses/404' + 500: + $ref: '#/components/responses/500' + """ + if column_name not in self.allowed_distinct_fields: + self.incr_stats("error", self.related.__name__) + return self.response_404() + args = kwargs.get("rison", {}) + # handle pagination + page, page_size = self._sanitize_page_args(*self._handle_page_args(args)) + # Create generic base filters with added request filter + filters = self._get_distinct_filter(column_name, args.get("filter")) + # Make the query + query_count = self.appbuilder.get_session.query( + func.count(distinct(getattr(self.datamodel.obj, column_name))) + ) + count = self.datamodel.apply_filters(query_count, filters).scalar() + if count == 0: + return self.response(200, count=count, result=[]) + query = self.appbuilder.get_session.query( + distinct(getattr(self.datamodel.obj, column_name)) + ) + # Apply generic base filters with added request filter + query = self.datamodel.apply_filters(query, filters) + # Apply sort + query = self.datamodel.apply_order_by(query, column_name, "asc") + # Apply pagination + result = self.datamodel.apply_pagination(query, page, page_size).all() + # produce response + result = [{"text": item[0]} for item in result if item[0] is not None] + return self.response(200, count=count, result=result) diff --git a/tests/datasets/api_tests.py b/tests/datasets/api_tests.py index a7795f962af31..35132630f4ae6 100644 --- a/tests/datasets/api_tests.py +++ b/tests/datasets/api_tests.py @@ -16,7 +16,7 @@ # under the License. """Unit tests for Superset""" import json -from typing import List +from typing import Any, Dict, List, Tuple, Union from unittest.mock import patch import prison @@ -170,6 +170,49 @@ def test_get_dataset_item(self): self.assertEqual(len(response["result"]["columns"]), 3) self.assertEqual(len(response["result"]["metrics"]), 2) + def test_get_dataset_distinct_schema(self): + """ + Dataset API: Test get dataset distinct schema + """ + example_db = get_example_database() + datasets = [] + if example_db.backend == "postgresql": + datasets.append( + self.insert_dataset("ab_permission", "public", [], get_main_database()) + ) + datasets.append( + self.insert_dataset( + "columns", "information_schema", [], get_main_database() + ) + ) + expected_response = { + "count": 2, + "result": [{"text": "information_schema"}, {"text": "public"}], + } + else: + datasets.append(self.insert_default_dataset()) + expected_response = {"count": 1, "result": [{"text": "",}]} + self.login(username="admin") + uri = "api/v1/dataset/distinct/schema" + rv = self.client.get(uri) + response = json.loads(rv.data.decode("utf-8")) + self.assertEqual(rv.status_code, 200) + self.assertEqual(response, expected_response) + + for dataset in datasets: + db.session.delete(dataset) + db.session.commit() + + def test_get_dataset_distinct_not_allowed(self): + """ + Dataset API: Test get dataset distinct not allowed + """ + self.login(username="admin") + uri = "api/v1/dataset/distinct/table_name" + rv = self.client.get(uri) + response = json.loads(rv.data.decode("utf-8")) + self.assertEqual(rv.status_code, 404) + def test_get_dataset_info(self): """ Dataset API: Test get dataset info @@ -358,6 +401,7 @@ def test_update_dataset_item(self): self.assertEqual(rv.status_code, 200) model = db.session.query(SqlaTable).get(dataset.id) self.assertEqual(model.description, dataset_data["description"]) + db.session.delete(dataset) db.session.commit() From 16b414bef5b066440fcfb1987291ca07a8fcae67 Mon Sep 17 00:00:00 2001 From: Daniel Gaspar Date: Thu, 13 Aug 2020 16:46:06 +0100 Subject: [PATCH 2/7] add tests and fix lint --- superset/views/base_api.py | 2 +- tests/datasets/api_tests.py | 48 ++++++++++++++++++++++++++++++++++--- 2 files changed, 46 insertions(+), 4 deletions(-) diff --git a/superset/views/base_api.py b/superset/views/base_api.py index 9f4ea2fadb6f3..18636e052cd6b 100644 --- a/superset/views/base_api.py +++ b/superset/views/base_api.py @@ -152,7 +152,7 @@ def __init__(self) -> None: # Setup statsd self.stats_logger = BaseStatsLogger() # Add base API spec base query parameter schemas - if self.apispec_parameter_schemas is None: + if self.apispec_parameter_schemas is None: # type: ignore self.apispec_parameter_schemas = {} self.apispec_parameter_schemas["get_related_schema"] = get_related_schema if self.openapi_spec_component_schemas is None: diff --git a/tests/datasets/api_tests.py b/tests/datasets/api_tests.py index 35132630f4ae6..1a9b117b891f4 100644 --- a/tests/datasets/api_tests.py +++ b/tests/datasets/api_tests.py @@ -129,7 +129,6 @@ def test_get_dataset_related_database_gamma(self): """ Dataset API: Test get dataset related databases gamma """ - example_db = get_example_database() self.login(username="gamma") uri = "api/v1/dataset/related/database" rv = self.client.get(uri) @@ -174,6 +173,14 @@ def test_get_dataset_distinct_schema(self): """ Dataset API: Test get dataset distinct schema """ + + def pg_test_query_parameter(query_parameter, expected_response): + uri = f"api/v1/dataset/distinct/schema?q={prison.dumps(query_parameter)}" + rv = self.client.get(uri) + response = json.loads(rv.data.decode("utf-8")) + self.assertEqual(rv.status_code, 200) + self.assertEqual(response, expected_response) + example_db = get_example_database() datasets = [] if example_db.backend == "postgresql": @@ -191,7 +198,7 @@ def test_get_dataset_distinct_schema(self): } else: datasets.append(self.insert_default_dataset()) - expected_response = {"count": 1, "result": [{"text": "",}]} + expected_response = {"count": 1, "result": [{"text": ""}]} self.login(username="admin") uri = "api/v1/dataset/distinct/schema" rv = self.client.get(uri) @@ -199,6 +206,25 @@ def test_get_dataset_distinct_schema(self): self.assertEqual(rv.status_code, 200) self.assertEqual(response, expected_response) + if example_db.backend == "postgresql": + # Test filter + query_parameter = {"filter": "in"} + pg_test_query_parameter( + query_parameter, + {"count": 1, "result": [{"text": "information_schema"}]}, + ) + + query_parameter = {"page": 0, "page_size": 1} + pg_test_query_parameter( + query_parameter, + {"count": 2, "result": [{"text": "information_schema"}]}, + ) + + query_parameter = {"page": 1, "page_size": 1} + pg_test_query_parameter( + query_parameter, {"count": 2, "result": [{"text": "public"}]} + ) + for dataset in datasets: db.session.delete(dataset) db.session.commit() @@ -210,9 +236,25 @@ def test_get_dataset_distinct_not_allowed(self): self.login(username="admin") uri = "api/v1/dataset/distinct/table_name" rv = self.client.get(uri) - response = json.loads(rv.data.decode("utf-8")) self.assertEqual(rv.status_code, 404) + def test_get_dataset_distinct_gamma(self): + """ + Dataset API: Test get dataset distinct with gamma + """ + dataset = self.insert_default_dataset() + + self.login(username="gamma") + uri = "api/v1/dataset/distinct/schema" + rv = self.client.get(uri) + self.assertEqual(rv.status_code, 200) + response = json.loads(rv.data.decode("utf-8")) + self.assertEqual(response["count"], 0) + self.assertEqual(response["result"], []) + + db.session.delete(dataset) + db.session.commit() + def test_get_dataset_info(self): """ Dataset API: Test get dataset info From a6f6ce6344f6f46d56e21e9dae981afcb461568e Mon Sep 17 00:00:00 2001 From: Daniel Gaspar Date: Thu, 13 Aug 2020 18:09:17 +0100 Subject: [PATCH 3/7] fix mypy, and tests --- superset/views/base_api.py | 5 ++--- tests/datasets/api_tests.py | 21 +++++++++++++++++---- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/superset/views/base_api.py b/superset/views/base_api.py index 18636e052cd6b..c2b86464fef9a 100644 --- a/superset/views/base_api.py +++ b/superset/views/base_api.py @@ -26,9 +26,8 @@ from flask_appbuilder.models.filters import BaseFilter, Filters from flask_appbuilder.models.sqla.filters import FilterStartsWith from flask_appbuilder.models.sqla.interface import SQLAInterface -from marshmallow import Schema, fields -from sqlalchemy import distinct -from sqlalchemy import func +from marshmallow import fields, Schema +from sqlalchemy import distinct, func from superset.stats_logger import BaseStatsLogger from superset.typing import FlaskResponse diff --git a/tests/datasets/api_tests.py b/tests/datasets/api_tests.py index 1a9b117b891f4..f94c7f40bcf66 100644 --- a/tests/datasets/api_tests.py +++ b/tests/datasets/api_tests.py @@ -173,6 +173,7 @@ def test_get_dataset_distinct_schema(self): """ Dataset API: Test get dataset distinct schema """ + self.maxDiff = None def pg_test_query_parameter(query_parameter, expected_response): uri = f"api/v1/dataset/distinct/schema?q={prison.dumps(query_parameter)}" @@ -193,12 +194,24 @@ def pg_test_query_parameter(query_parameter, expected_response): ) ) expected_response = { - "count": 2, - "result": [{"text": "information_schema"}, {"text": "public"}], + "count": 5, + "result": [ + {"text": ""}, + {"text": "admin_database"}, + {"text": "information_schema"}, + {"text": "public"}, + {"text": "superset"}, + ], } else: - datasets.append(self.insert_default_dataset()) - expected_response = {"count": 1, "result": [{"text": ""}]} + expected_response = { + "count": 5, + "result": [ + {"text": ""}, + {"text": "admin_database"}, + {"text": "superset"}, + ], + } self.login(username="admin") uri = "api/v1/dataset/distinct/schema" rv = self.client.get(uri) From 0b9cebdec69861f0671bcddbfaf44ed7218219c8 Mon Sep 17 00:00:00 2001 From: Daniel Gaspar Date: Thu, 13 Aug 2020 18:24:36 +0100 Subject: [PATCH 4/7] fix docs --- superset/views/base_api.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/superset/views/base_api.py b/superset/views/base_api.py index c2b86464fef9a..4d7fe7d267c95 100644 --- a/superset/views/base_api.py +++ b/superset/views/base_api.py @@ -356,7 +356,7 @@ def related(self, column_name: str, **kwargs: Any) -> FlaskResponse: @statsd_metrics @rison(get_related_schema) def distinct(self, column_name: str, **kwargs: Any) -> FlaskResponse: - """Get related fields data + """Get distinct values from field data --- get: parameters: @@ -372,7 +372,7 @@ def distinct(self, column_name: str, **kwargs: Any) -> FlaskResponse: $ref: '#/components/schemas/get_related_schema' responses: 200: - description: Related column data + description: Distinct field data content: application/json: schema: From 98d74db400b8e4cad2120521a969b92c468b7f59 Mon Sep 17 00:00:00 2001 From: Daniel Gaspar Date: Thu, 13 Aug 2020 18:41:10 +0100 Subject: [PATCH 5/7] fix test --- tests/datasets/api_tests.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/datasets/api_tests.py b/tests/datasets/api_tests.py index f94c7f40bcf66..9692a3603f46e 100644 --- a/tests/datasets/api_tests.py +++ b/tests/datasets/api_tests.py @@ -221,7 +221,7 @@ def pg_test_query_parameter(query_parameter, expected_response): if example_db.backend == "postgresql": # Test filter - query_parameter = {"filter": "in"} + query_parameter = {"filter": "inf"} pg_test_query_parameter( query_parameter, {"count": 1, "result": [{"text": "information_schema"}]}, @@ -230,12 +230,12 @@ def pg_test_query_parameter(query_parameter, expected_response): query_parameter = {"page": 0, "page_size": 1} pg_test_query_parameter( query_parameter, - {"count": 2, "result": [{"text": "information_schema"}]}, + {"count": 5, "result": [{"text": ""}]}, ) query_parameter = {"page": 1, "page_size": 1} pg_test_query_parameter( - query_parameter, {"count": 2, "result": [{"text": "public"}]} + query_parameter, {"count": 5, "result": [{"text": "admin_database"}]} ) for dataset in datasets: From bbaa5e4ba4a082e9cc027d51377d4e153666d113 Mon Sep 17 00:00:00 2001 From: Daniel Gaspar Date: Thu, 13 Aug 2020 18:49:28 +0100 Subject: [PATCH 6/7] lint --- tests/datasets/api_tests.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/datasets/api_tests.py b/tests/datasets/api_tests.py index 9692a3603f46e..638ab01044b87 100644 --- a/tests/datasets/api_tests.py +++ b/tests/datasets/api_tests.py @@ -229,8 +229,7 @@ def pg_test_query_parameter(query_parameter, expected_response): query_parameter = {"page": 0, "page_size": 1} pg_test_query_parameter( - query_parameter, - {"count": 5, "result": [{"text": ""}]}, + query_parameter, {"count": 5, "result": [{"text": ""}]}, ) query_parameter = {"page": 1, "page_size": 1} From 04baa3406dbb6d5d03bdf9b82c251e603067a58f Mon Sep 17 00:00:00 2001 From: Daniel Gaspar Date: Thu, 13 Aug 2020 19:06:45 +0100 Subject: [PATCH 7/7] fix test --- tests/datasets/api_tests.py | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/tests/datasets/api_tests.py b/tests/datasets/api_tests.py index 638ab01044b87..798e0dde8ab52 100644 --- a/tests/datasets/api_tests.py +++ b/tests/datasets/api_tests.py @@ -173,7 +173,6 @@ def test_get_dataset_distinct_schema(self): """ Dataset API: Test get dataset distinct schema """ - self.maxDiff = None def pg_test_query_parameter(query_parameter, expected_response): uri = f"api/v1/dataset/distinct/schema?q={prison.dumps(query_parameter)}" @@ -203,23 +202,13 @@ def pg_test_query_parameter(query_parameter, expected_response): {"text": "superset"}, ], } - else: - expected_response = { - "count": 5, - "result": [ - {"text": ""}, - {"text": "admin_database"}, - {"text": "superset"}, - ], - } - self.login(username="admin") - uri = "api/v1/dataset/distinct/schema" - rv = self.client.get(uri) - response = json.loads(rv.data.decode("utf-8")) - self.assertEqual(rv.status_code, 200) - self.assertEqual(response, expected_response) + self.login(username="admin") + uri = "api/v1/dataset/distinct/schema" + rv = self.client.get(uri) + response = json.loads(rv.data.decode("utf-8")) + self.assertEqual(rv.status_code, 200) + self.assertEqual(response, expected_response) - if example_db.backend == "postgresql": # Test filter query_parameter = {"filter": "inf"} pg_test_query_parameter(