From 0a5a032b2db4132bd7ce06563f247c15a22984e5 Mon Sep 17 00:00:00 2001
From: David Butenhof
Date: Wed, 3 May 2023 15:24:29 -0400
Subject: [PATCH] Return "raw" API parameters in pagination (#3412)

* Return "raw" API parameters in pagination

PBENCH-1133

The `GET /datasets` response is optimized for sequential pagination,
providing a convenient "next_url" string that can be used directly. However,
if a client wants to support "random access" pagination, it must parse the
URL string in order to modify the `offset` parameter. This change makes that
easier by supplementing the response payload with a `parameters` field
containing the query parameters as a JSON object, so a client can simply
update the `offset` value and resubmit the query. (Making the unit tests
work against the normalized parameter list proved challenging, and I ended
up saving the original "raw" client parameters in the API `context` so they
can be used directly.)
---
 lib/pbench/server/api/resources/__init__.py  | 13 ++--
 .../server/api/resources/datasets_list.py    | 34 +++++++---
 .../test/unit/server/test_datasets_list.py   | 62 +++++++++++++++----
 3 files changed, 82 insertions(+), 27 deletions(-)

diff --git a/lib/pbench/server/api/resources/__init__.py b/lib/pbench/server/api/resources/__init__.py
index 4032b192f3..4b6cf4f73d 100644
--- a/lib/pbench/server/api/resources/__init__.py
+++ b/lib/pbench/server/api/resources/__init__.py
@@ -1700,11 +1700,8 @@ def _dispatch(
         try:
             if schema.query_schema:
                 query_params = self._gather_query_params(request, schema.query_schema)
-
-            params = self.schemas.validate(
-                method,
-                ApiParams(body=body_params, query=query_params, uri=uri_params),
-            )
+            raw_params = ApiParams(body=body_params, query=query_params, uri=uri_params)
+            params = self.schemas.validate(method, raw_params)
         except APIInternalError as e:
             current_app.logger.exception("{} {}", api_name, e.details)
             abort(e.http_status, message=str(e))
@@ -1772,7 +1769,11 @@ def _dispatch(
                 "attributes": None,
             }
 
-        context = {"auditing": auditing, "attributes": schema.attributes}
+        context = {
+            "auditing": auditing,
+            "attributes": schema.attributes,
+            "raw_params": raw_params,
+        }
         try:
             response = execute(params, request, context)
         except APIInternalError as e:
diff --git a/lib/pbench/server/api/resources/datasets_list.py b/lib/pbench/server/api/resources/datasets_list.py
index af49172bba..376ff5b5e1 100644
--- a/lib/pbench/server/api/resources/datasets_list.py
+++ b/lib/pbench/server/api/resources/datasets_list.py
@@ -296,7 +296,7 @@ def __init__(self, config: PbenchServerConfig):
         )
 
     def get_paginated_obj(
-        self, query: Query, json: JSON, url: str
+        self, query: Query, json: JSON, raw_params: ApiParams, url: str
     ) -> tuple[list[JSONOBJECT], dict[str, str]]:
         """Helper function to return a slice of datasets (constructed according
         to the user specified limit and an offset number) and a paginated object
@@ -309,10 +309,15 @@ def get_paginated_obj(
             "limit": 10 -> dataset[0: 10]
             "offset": 20 -> dataset[20: total_items_count]
 
-        TODO: We may need to optimize the pagination
-        e.g Use of unique pointers to record the last returned row and then
-        use this pointer in subsequent page request instead of an initial
-        start to narrow down the result.
+        Args:
+            query: A SQLAlchemy query object
+            json: The query parameters in normalized JSON form
+            raw_params: The original API parameters for reference
+            url: The API URL
+
+        Returns:
+            The list of Dataset objects matched by the query and a pagination
+            framework object.
""" paginated_result = {} query = query.distinct() @@ -333,14 +338,19 @@ def get_paginated_obj( Database.dump_query(query, current_app.logger) items = query.all() + raw = raw_params.query.copy() next_offset = offset + len(items) if next_offset < total_count: - json["offset"] = next_offset + json["offset"] = str(next_offset) + raw["offset"] = str(next_offset) parsed_url = urlparse(url) next_url = parsed_url._replace(query=urlencode_json(json)).geturl() else: + if limit: + raw["offset"] = str(total_count) next_url = "" + paginated_result["parameters"] = raw paginated_result["next_url"] = next_url paginated_result["total"] = total_count return items, paginated_result @@ -600,7 +610,12 @@ def daterange(self, query: Query) -> JSONOBJECT: return {} def datasets( - self, request: Request, aliases: dict[str, Any], json: JSONOBJECT, query: Query + self, + request: Request, + aliases: dict[str, Any], + json: JSONOBJECT, + raw_params: ApiParams, + query: Query, ) -> JSONOBJECT: """Gather and paginate the selected datasets @@ -611,6 +626,7 @@ def datasets( request: The HTTP Request object aliases: Map of join column aliases for each Metadata namespace json: The JSON query parameters + raw_params: The original API parameters (used for pagination) query: The basic filtered SQLAlchemy query object Returns: @@ -666,7 +682,7 @@ def datasets( try: datasets, paginated_result = self.get_paginated_obj( - query=query, json=json, url=request.url + query=query, json=json, raw_params=raw_params, url=request.url ) except (AttributeError, ProgrammingError, StatementError) as e: raise APIInternalError( @@ -812,5 +828,5 @@ def _get( result.update(self.daterange(query)) done = True if not done: - result = self.datasets(request, aliases, json, query) + result = self.datasets(request, aliases, json, context["raw_params"], query) return jsonify(result) diff --git a/lib/pbench/test/unit/server/test_datasets_list.py b/lib/pbench/test/unit/server/test_datasets_list.py index 6e15a162ba..300c3395bd 100644 --- a/lib/pbench/test/unit/server/test_datasets_list.py +++ b/lib/pbench/test/unit/server/test_datasets_list.py @@ -1,7 +1,7 @@ import datetime from http import HTTPStatus import re -from typing import Optional +from typing import Any, Optional import pytest import requests @@ -10,7 +10,7 @@ from sqlalchemy.orm import aliased, Query from pbench.server import JSON, JSONARRAY, JSONOBJECT -from pbench.server.api.resources import APIAbort +from pbench.server.api.resources import APIAbort, ApiParams from pbench.server.api.resources.datasets_list import DatasetsList, urlencode_json from pbench.server.database.database import Database from pbench.server.database.models.datasets import Dataset, Metadata @@ -129,17 +129,27 @@ def get_results(self, name_list: list[str], query: JSON, server_config) -> JSON: Returns: Paginated JSON object containing list of dataset values """ + + def convert(k: str, v: Any) -> Any: + if isinstance(v, str) and k in ("filter", "sort", "metadata"): + return [v] + elif isinstance(v, int): + return str(v) + else: + return v + results: list[JSON] = [] - offset = query.get("offset", 0) + offset = int(query.get("offset", "0")) limit = query.get("limit") if limit: - next_offset = offset + limit + next_offset = offset + int(limit) paginated_name_list = name_list[offset:next_offset] if next_offset >= len(name_list): + query["offset"] = str(len(name_list)) next_url = "" else: - query["offset"] = next_offset + query["offset"] = str(next_offset) next_url = ( f"http://localhost{server_config.rest_uri}/datasets?" 
                    + urlencode_json(query)
@@ -161,7 +171,15 @@ def get_results(self, name_list: list[str], query: JSON, server_config) -> JSON:
                 },
             }
         )
-        return {"next_url": next_url, "results": results, "total": len(name_list)}
+        q1 = {k: convert(k, v) for k, v in query.items()}
+        if "metadata" not in q1:
+            q1["metadata"] = ["dataset.uploaded"]
+        return {
+            "parameters": q1,
+            "next_url": next_url,
+            "results": results,
+            "total": len(name_list),
+        }
 
     def compare_results(
         self, result: JSONOBJECT, name_list: list[str], query: JSON, server_config
@@ -190,8 +208,8 @@ def compare_results(
             (None, {}, ["fio_1", "fio_2"]),
             (None, {"access": "public"}, ["fio_1", "fio_2"]),
             ("drb", {"name": "fio"}, ["fio_1", "fio_2"]),
-            ("drb", {"name": "fio", "limit": 1}, ["fio_1", "fio_2"]),
-            ("drb", {"name": "fio", "limit": 1, "offset": 2}, ["fio_1", "fio_2"]),
+            ("drb", {"name": "fio", "limit": "1"}, ["fio_1", "fio_2"]),
+            ("drb", {"name": "fio", "limit": 1, "offset": "2"}, ["fio_1", "fio_2"]),
             ("drb", {"name": "fio", "offset": 1}, ["fio_1", "fio_2"]),
             ("drb", {"name": "fio", "offset": 2}, ["fio_1", "fio_2"]),
             ("drb", {"owner": "drb"}, ["drb", "fio_1"]),
@@ -256,6 +274,17 @@ def compare_results(
     def test_dataset_list(self, server_config, query_as, login, query, results):
         """Test `datasets/list` filters
 
+        NOTE: Several of these queries use the "limit" and/or "offset" options
+        to test how the result set is segmented during pagination. These are
+        represented in the parametrization above interchangeably as integers or
+        strings. This is because (1) the actual input to the Pbench Server API
+        is always in string form as a URI query parameter, but (2) the requests
+        package understands this and stringifies integer parameters, while (3)
+        the Pbench Server API framework recognizes these are integer values and
+        presents them to the API code as integers. Mixing integer and string
+        representations here must have no impact on the operation of the API,
+        so it's worth testing.
+
         Args:
             server_config: The PbenchServerConfig object
             query_as: A fixture to provide a helper that executes the API call
@@ -311,7 +340,9 @@ def test_mine_novalue(self, server_config, client, more_datasets, get_token_func
             headers=headers,
         )
         assert response.status_code == HTTPStatus.OK
-        self.compare_results(response.json, ["drb", "fio_1"], {}, server_config)
+        self.compare_results(
+            response.json, ["drb", "fio_1"], {"mine": ""}, server_config
+        )
 
     @pytest.mark.parametrize(
         "login,query,results",
@@ -336,7 +367,7 @@ def test_dataset_paged_list(self, query_as, login, query, results, server_config
             results: A list of the dataset names we expect to be returned
             server_config: The PbenchServerConfig object
         """
-        query.update({"metadata": ["dataset.uploaded"], "limit": 5})
+        query.update({"metadata": ["dataset.uploaded"], "limit": "5"})
         result = query_as(query, login, HTTPStatus.OK)
         self.compare_results(result.json, results, query, server_config)
 
@@ -384,6 +415,7 @@ def test_get_key_errors(self, query_as):
         )
         assert response.json == {
             "next_url": "",
+            "parameters": {"metadata": ["global.test.foo"]},
             "results": [
                 {
                     "metadata": {"global.test.foo": None},
@@ -444,6 +476,12 @@ def test_use_funk_metalog_keys(self, query_as):
         )
         assert response.json == {
             "next_url": "",
+            "parameters": {
+                "filter": [
+                    "dataset.metalog.iterations/fooBar=10-what_else@weird.iteration_name:~10"
+                ],
+                "metadata": ["dataset.metalog.iterations/fooBar=10-what_else@weird"],
+            },
             "results": [
                 {
                     "metadata": {
@@ -725,7 +763,7 @@ def test_mismatched_json_cast(self, query_as, server_config, query, results):
             "drb",
             HTTPStatus.OK,
         )
-        self.compare_results(response.json, results, {}, server_config)
+        self.compare_results(response.json, results, {"filter": query}, server_config)
 
     @pytest.mark.parametrize(
         "query,message",
@@ -769,7 +807,7 @@ def test_pagination_error(self, caplog, monkeypatch, query_as, exception, error)
         """
 
         def do_error(
-            self, query: Query, json: JSONOBJECT, url: str
+            self, query: Query, json: JSONOBJECT, raw_params: ApiParams, url: str
        ) -> tuple[JSONARRAY, JSONOBJECT]:
             raise exception
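
To illustrate the "random access" pagination this change enables: rather than
parsing "next_url", a client can copy the echoed "parameters" object, rewrite
its "offset" value, and resubmit the query. Below is a minimal client-side
sketch, not part of the patch itself; the base URL, the "/api/v1" prefix, and
the get_page helper are assumptions for illustration, while the "parameters",
"next_url", and "total" response fields come from the change above.

    import requests

    BASE_URL = "http://localhost/api/v1"  # assumed server location and API prefix

    def get_page(session: requests.Session, params: dict, offset: int) -> dict:
        """Fetch one page of GET /datasets, overriding the offset.

        'params' should be the "parameters" object echoed by a previous
        response; copy it and rewrite "offset" instead of parsing "next_url".
        """
        query = dict(params)
        query["offset"] = str(offset)  # the server reports offsets as strings
        response = session.get(f"{BASE_URL}/datasets", params=query)
        response.raise_for_status()
        return response.json()

    with requests.Session() as session:
        first = get_page(session, {"limit": "10"}, 0)
        # Jump directly to the fourth page of 10 without walking "next_url":
        page4 = get_page(session, first["parameters"], 30)
        print(page4["total"], len(page4["results"]))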
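
Point (2) of the NOTE added to test_dataset_list can be checked directly
against the requests package: integer and string query parameter values
encode to the same URL, so the mixed parametrizations above reach the server
identically. A quick demonstration using the public PreparedRequest API (the
URL here is arbitrary):

    from requests.models import PreparedRequest

    int_req, str_req = PreparedRequest(), PreparedRequest()
    int_req.prepare_url("http://localhost/api/v1/datasets", {"limit": 1, "offset": 2})
    str_req.prepare_url("http://localhost/api/v1/datasets", {"limit": "1", "offset": "2"})
    assert int_req.url == str_req.url  # both encode as ...?limit=1&offset=2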