From 0a5a032b2db4132bd7ce06563f247c15a22984e5 Mon Sep 17 00:00:00 2001
From: David Butenhof
Date: Wed, 3 May 2023 15:24:29 -0400
Subject: [PATCH] Return "raw" API parameters in pagination (#3412)

* Return "raw" API parameters in pagination

PBENCH-1133

The `GET /datasets` response is optimized for sequential pagination,
providing a convenient "next_url" string that can be used directly. However,
if a client wants to support "random access" pagination, it must parse the
URL string in order to modify the `offset` parameter. This change makes that
easier by supplementing the response payload with a `parameters` field
containing the query parameters as a JSON object, so a client can simply
update the `offset` value and resubmit the query. (Making the unit tests
work against the normalized parameter list proved challenging, and I ended
up saving the original "raw" client parameters in the API `context` so they
can be used directly.)
---
 lib/pbench/server/api/resources/__init__.py  | 13 ++--
 .../server/api/resources/datasets_list.py    | 34 +++++++---
 .../test/unit/server/test_datasets_list.py   | 62 +++++++++++++++----
 3 files changed, 82 insertions(+), 27 deletions(-)

diff --git a/lib/pbench/server/api/resources/__init__.py b/lib/pbench/server/api/resources/__init__.py
index 4032b192f3..4b6cf4f73d 100644
--- a/lib/pbench/server/api/resources/__init__.py
+++ b/lib/pbench/server/api/resources/__init__.py
@@ -1700,11 +1700,8 @@ def _dispatch(
         try:
             if schema.query_schema:
                 query_params = self._gather_query_params(request, schema.query_schema)
-
-            params = self.schemas.validate(
-                method,
-                ApiParams(body=body_params, query=query_params, uri=uri_params),
-            )
+            raw_params = ApiParams(body=body_params, query=query_params, uri=uri_params)
+            params = self.schemas.validate(method, raw_params)
         except APIInternalError as e:
             current_app.logger.exception("{} {}", api_name, e.details)
             abort(e.http_status, message=str(e))
@@ -1772,7 +1769,11 @@ def _dispatch(
                 "attributes": None,
             }
 
-        context = {"auditing": auditing, "attributes": schema.attributes}
+        context = {
+            "auditing": auditing,
+            "attributes": schema.attributes,
+            "raw_params": raw_params,
+        }
         try:
             response = execute(params, request, context)
         except APIInternalError as e:
diff --git a/lib/pbench/server/api/resources/datasets_list.py b/lib/pbench/server/api/resources/datasets_list.py
index af49172bba..376ff5b5e1 100644
--- a/lib/pbench/server/api/resources/datasets_list.py
+++ b/lib/pbench/server/api/resources/datasets_list.py
@@ -296,7 +296,7 @@ def __init__(self, config: PbenchServerConfig):
         )
 
     def get_paginated_obj(
-        self, query: Query, json: JSON, url: str
+        self, query: Query, json: JSON, raw_params: ApiParams, url: str
     ) -> tuple[list[JSONOBJECT], dict[str, str]]:
         """Helper function to return a slice of datasets (constructed according
         to the user specified limit and an offset number) and a paginated object
@@ -309,10 +309,15 @@ def get_paginated_obj(
             "limit": 10 -> dataset[0: 10]
             "offset": 20 -> dataset[20: total_items_count]
 
-        TODO: We may need to optimize the pagination
-        e.g Use of unique pointers to record the last returned row and then
-        use this pointer in subsequent page request instead of an initial
-        start to narrow down the result.
+        Args:
+            query: A SQLAlchemy query object
+            json: The query parameters in normalized JSON form
+            raw_params: The original API parameters for reference
+            url: The API URL
+
+        Returns:
+            The list of Dataset objects matched by the query and a pagination
+            framework object.
""" paginated_result = {} query = query.distinct() @@ -333,14 +338,19 @@ def get_paginated_obj( Database.dump_query(query, current_app.logger) items = query.all() + raw = raw_params.query.copy() next_offset = offset + len(items) if next_offset < total_count: - json["offset"] = next_offset + json["offset"] = str(next_offset) + raw["offset"] = str(next_offset) parsed_url = urlparse(url) next_url = parsed_url._replace(query=urlencode_json(json)).geturl() else: + if limit: + raw["offset"] = str(total_count) next_url = "" + paginated_result["parameters"] = raw paginated_result["next_url"] = next_url paginated_result["total"] = total_count return items, paginated_result @@ -600,7 +610,12 @@ def daterange(self, query: Query) -> JSONOBJECT: return {} def datasets( - self, request: Request, aliases: dict[str, Any], json: JSONOBJECT, query: Query + self, + request: Request, + aliases: dict[str, Any], + json: JSONOBJECT, + raw_params: ApiParams, + query: Query, ) -> JSONOBJECT: """Gather and paginate the selected datasets @@ -611,6 +626,7 @@ def datasets( request: The HTTP Request object aliases: Map of join column aliases for each Metadata namespace json: The JSON query parameters + raw_params: The original API parameters (used for pagination) query: The basic filtered SQLAlchemy query object Returns: @@ -666,7 +682,7 @@ def datasets( try: datasets, paginated_result = self.get_paginated_obj( - query=query, json=json, url=request.url + query=query, json=json, raw_params=raw_params, url=request.url ) except (AttributeError, ProgrammingError, StatementError) as e: raise APIInternalError( @@ -812,5 +828,5 @@ def _get( result.update(self.daterange(query)) done = True if not done: - result = self.datasets(request, aliases, json, query) + result = self.datasets(request, aliases, json, context["raw_params"], query) return jsonify(result) diff --git a/lib/pbench/test/unit/server/test_datasets_list.py b/lib/pbench/test/unit/server/test_datasets_list.py index 6e15a162ba..300c3395bd 100644 --- a/lib/pbench/test/unit/server/test_datasets_list.py +++ b/lib/pbench/test/unit/server/test_datasets_list.py @@ -1,7 +1,7 @@ import datetime from http import HTTPStatus import re -from typing import Optional +from typing import Any, Optional import pytest import requests @@ -10,7 +10,7 @@ from sqlalchemy.orm import aliased, Query from pbench.server import JSON, JSONARRAY, JSONOBJECT -from pbench.server.api.resources import APIAbort +from pbench.server.api.resources import APIAbort, ApiParams from pbench.server.api.resources.datasets_list import DatasetsList, urlencode_json from pbench.server.database.database import Database from pbench.server.database.models.datasets import Dataset, Metadata @@ -129,17 +129,27 @@ def get_results(self, name_list: list[str], query: JSON, server_config) -> JSON: Returns: Paginated JSON object containing list of dataset values """ + + def convert(k: str, v: Any) -> Any: + if isinstance(v, str) and k in ("filter", "sort", "metadata"): + return [v] + elif isinstance(v, int): + return str(v) + else: + return v + results: list[JSON] = [] - offset = query.get("offset", 0) + offset = int(query.get("offset", "0")) limit = query.get("limit") if limit: - next_offset = offset + limit + next_offset = offset + int(limit) paginated_name_list = name_list[offset:next_offset] if next_offset >= len(name_list): + query["offset"] = str(len(name_list)) next_url = "" else: - query["offset"] = next_offset + query["offset"] = str(next_offset) next_url = ( f"http://localhost{server_config.rest_uri}/datasets?" 
                    + urlencode_json(query)
@@ -161,7 +171,15 @@ def get_results(self, name_list: list[str], query: JSON, server_config) -> JSON:
                 },
             }
         )
-        return {"next_url": next_url, "results": results, "total": len(name_list)}
+        q1 = {k: convert(k, v) for k, v in query.items()}
+        if "metadata" not in q1:
+            q1["metadata"] = ["dataset.uploaded"]
+        return {
+            "parameters": q1,
+            "next_url": next_url,
+            "results": results,
+            "total": len(name_list),
+        }
 
     def compare_results(
         self, result: JSONOBJECT, name_list: list[str], query: JSON, server_config
@@ -190,8 +208,8 @@ def compare_results(
             (None, {}, ["fio_1", "fio_2"]),
             (None, {"access": "public"}, ["fio_1", "fio_2"]),
             ("drb", {"name": "fio"}, ["fio_1", "fio_2"]),
-            ("drb", {"name": "fio", "limit": 1}, ["fio_1", "fio_2"]),
-            ("drb", {"name": "fio", "limit": 1, "offset": 2}, ["fio_1", "fio_2"]),
+            ("drb", {"name": "fio", "limit": "1"}, ["fio_1", "fio_2"]),
+            ("drb", {"name": "fio", "limit": 1, "offset": "2"}, ["fio_1", "fio_2"]),
             ("drb", {"name": "fio", "offset": 1}, ["fio_1", "fio_2"]),
             ("drb", {"name": "fio", "offset": 2}, ["fio_1", "fio_2"]),
             ("drb", {"owner": "drb"}, ["drb", "fio_1"]),
@@ -256,6 +274,17 @@ def compare_results(
     def test_dataset_list(self, server_config, query_as, login, query, results):
         """Test `datasets/list` filters
 
+        NOTE: Several of these queries use the "limit" and/or "offset" options
+        to test how the result set is segmented during pagination. These are
+        represented in the parametrization above interchangeably as integers or
+        strings. This is because (1) the actual input to the Pbench Server API
+        is always in string form as a URI query parameter, but (2) the requests
+        package understands this and stringifies integer parameters, while (3)
+        the Pbench Server API framework recognizes these are integer values and
+        presents them to the API code as integers. Mixing integer and string
+        representations here must have no impact on the operation of the API,
+        so it's worth testing.
+
         Args:
             server_config: The PbenchServerConfig object
             query_as: A fixture to provide a helper that executes the API call
@@ -311,7 +340,9 @@ def test_mine_novalue(self, server_config, client, more_datasets, get_token_func
             headers=headers,
         )
         assert response.status_code == HTTPStatus.OK
-        self.compare_results(response.json, ["drb", "fio_1"], {}, server_config)
+        self.compare_results(
+            response.json, ["drb", "fio_1"], {"mine": ""}, server_config
+        )
 
     @pytest.mark.parametrize(
         "login,query,results",
@@ -336,7 +367,7 @@ def test_dataset_paged_list(self, query_as, login, query, results, server_config
             results: A list of the dataset names we expect to be returned
             server_config: The PbenchServerConfig object
         """
-        query.update({"metadata": ["dataset.uploaded"], "limit": 5})
+        query.update({"metadata": ["dataset.uploaded"], "limit": "5"})
         result = query_as(query, login, HTTPStatus.OK)
         self.compare_results(result.json, results, query, server_config)
 
@@ -384,6 +415,7 @@ def test_get_key_errors(self, query_as):
         )
         assert response.json == {
             "next_url": "",
+            "parameters": {"metadata": ["global.test.foo"]},
             "results": [
                 {
                     "metadata": {"global.test.foo": None},
@@ -444,6 +476,12 @@ def test_use_funk_metalog_keys(self, query_as):
         )
         assert response.json == {
             "next_url": "",
+            "parameters": {
+                "filter": [
+                    "dataset.metalog.iterations/fooBar=10-what_else@weird.iteration_name:~10"
+                ],
+                "metadata": ["dataset.metalog.iterations/fooBar=10-what_else@weird"],
+            },
             "results": [
                 {
                     "metadata": {
@@ -725,7 +763,7 @@ def test_mismatched_json_cast(self, query_as, server_config, query, results):
             "drb",
             HTTPStatus.OK,
         )
-        self.compare_results(response.json, results, {}, server_config)
+        self.compare_results(response.json, results, {"filter": query}, server_config)
 
     @pytest.mark.parametrize(
         "query,message",
@@ -769,7 +807,7 @@ def test_pagination_error(self, caplog, monkeypatch, query_as, exception, error)
         """
 
         def do_error(
-            self, query: Query, json: JSONOBJECT, url: str
+            self, query: Query, json: JSONOBJECT, raw_params: ApiParams, url: str
        ) -> tuple[JSONARRAY, JSONOBJECT]:
             raise exception
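
To illustrate the "random access" pagination this change enables: rather than
parsing "next_url", a client can copy the echoed "parameters" object, rewrite
its "offset" value, and resubmit the query. Below is a minimal client-side
sketch, not part of the patch itself; the base URL, the "/api/v1" prefix, and
the get_page helper are assumptions for illustration, while the "parameters",
"next_url", and "total" response fields come from the change above.

    import requests

    BASE_URL = "http://localhost/api/v1"  # assumed server location and API prefix

    def get_page(session: requests.Session, params: dict, offset: int) -> dict:
        """Fetch one page of GET /datasets, overriding the offset.

        'params' should be the "parameters" object echoed by a previous
        response; copy it and rewrite "offset" instead of parsing "next_url".
        """
        query = dict(params)
        query["offset"] = str(offset)  # the server reports offsets as strings
        response = session.get(f"{BASE_URL}/datasets", params=query)
        response.raise_for_status()
        return response.json()

    with requests.Session() as session:
        first = get_page(session, {"limit": "10"}, 0)
        # Jump directly to the fourth page of 10 without walking "next_url":
        page4 = get_page(session, first["parameters"], 30)
        print(page4["total"], len(page4["results"]))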
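
Point (2) of the NOTE added to test_dataset_list can be checked directly
against the requests package: integer and string query parameter values
encode to the same URL, so the mixed parametrizations above reach the server
identically. A quick demonstration using the public PreparedRequest API (the
URL here is arbitrary):

    from requests.models import PreparedRequest

    int_req, str_req = PreparedRequest(), PreparedRequest()
    int_req.prepare_url("http://localhost/api/v1/datasets", {"limit": 1, "offset": 2})
    str_req.prepare_url("http://localhost/api/v1/datasets", {"limit": "1", "offset": "2"})
    assert int_req.url == str_req.url  # both encode as ...?limit=1&offset=2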