From 1f5cd39ca8f4f39bf3f040791bd173a1b9c04caa Mon Sep 17 00:00:00 2001
From: David Butenhof
Date: Fri, 17 Mar 2023 07:18:28 -0400
Subject: [PATCH] Expose aggregate metadata namespace for UI (#3345)

* Expose aggregate metadata namespace for UI

PBENCH-1091

Managing the display of metadata in the dashboard is complicated by the
fact that much of the namespace is highly dynamic: the `global` and
`user` spaces are defined entirely by clients and can differ completely
from one dataset to the next, while the `dataset.metalog` namespace
depends on the agent benchmark parameters and postprocessing.

To provide a reference point, this PR proposes a way to acquire a JSON
document representing the aggregated namespace across a set of selected
datasets. It leverages the existing `GET /datasets` collection query for
this purpose, including all the filter keywords; however, instead of
returning a list of matching datasets, it builds a JSON document showing
the nested key namespace. A partial example follows:

```
GET /api/v1/datasets?keysummary=true
-----
{
    "dataset": {
        "access": null,
        "id": null,
        "metalog": {
            "iterations/0__linpack-binary=:root:linpack:xlinpack_xeon64": {
                "clients": null,
                "iteration_name": null,
                "iteration_number": null,
                "linpack-binary": null
            },
            [...]
        }
    }
}
```
---
 .../server/api/resources/datasets_list.py    | 153 +++++++++++++-----
 .../test/unit/server/test_datasets_list.py   |  91 +++++++++++
 2 files changed, 208 insertions(+), 36 deletions(-)

diff --git a/lib/pbench/server/api/resources/datasets_list.py b/lib/pbench/server/api/resources/datasets_list.py
index 4bdffaab20..f3853415b5 100644
--- a/lib/pbench/server/api/resources/datasets_list.py
+++ b/lib/pbench/server/api/resources/datasets_list.py
@@ -1,4 +1,5 @@
 from http import HTTPStatus
+from typing import Any
 from urllib.parse import urlencode, urlparse
 
 from flask import current_app
@@ -50,14 +51,24 @@ def __init__(self, config: PbenchServerConfig):
             ApiMethod.GET,
             OperationCode.READ,
             query_schema=Schema(
+                # Filter criteria
                 Parameter("mine", ParamType.BOOLEAN),
                 Parameter("name", ParamType.STRING),
                 Parameter("owner", ParamType.USER),
                 Parameter("access", ParamType.ACCESS),
                 Parameter("start", ParamType.DATE),
                 Parameter("end", ParamType.DATE),
+                Parameter(
+                    "filter",
+                    ParamType.LIST,
+                    element_type=ParamType.STRING,
+                    string_list=",",
+                ),
+                # Pagination
                 Parameter("offset", ParamType.INT),
                 Parameter("limit", ParamType.INT),
+                # Output control
+                Parameter("keysummary", ParamType.BOOLEAN),
                 Parameter(
                     "metadata",
                     ParamType.LIST,
@@ -66,12 +77,6 @@ def __init__(self, config: PbenchServerConfig):
                     key_path=True,
                     string_list=",",
                 ),
-                Parameter(
-                    "filter",
-                    ParamType.LIST,
-                    element_type=ParamType.STRING,
-                    string_list=",",
-                ),
             ),
             authorization=ApiAuthorizationType.USER_ACCESS,
         ),
@@ -289,6 +294,108 @@ def filter_query(filters: list[str], query: Query) -> Query:
 
         return query.filter(and_(*and_list))
 
+    def accumulate(self, aggregate: JSONOBJECT, key: str, value: Any):
+        """Recursive helper to accumulate the metadata namespace
+
+        Iterate through a list of metadata key/value pairs to construct a
+        hierarchical aggregation of all metadata keys across the selected
+        datasets. Each key in the hierarchy is represented as a key in a
+        nested JSON object. "Leaf" keys have the value None.
+        E.g.,
+
+        {
+            "dataset": {"name": None, "metalog": {"pbench": {"script": None}}},
+            "server": {"deletion": None, "tarball-path": None},
+            "global": {"server": {"legacy": {"sha1": None}}}
+        }
+
+        Args:
+            aggregate: a JSONOBJECT to update with the recursive key/value
+            key: the current metadata key path element
+            value: the current metadata key's value
+        """
+        if isinstance(value, dict):
+            p = aggregate.get(key)
+            if p is None:
+                p = {}
+                aggregate[key] = p
+            for k, v in value.items():
+                self.accumulate(p, k, v)
+        elif key not in aggregate:
+            aggregate[key] = None
+
+    def keyspace(self, query: Query) -> JSONOBJECT:
+        """Aggregate the dataset metadata keyspace
+
+        Run the query we've compiled, but instead of returning Dataset
+        proxies, we only want the metadata key/value pairs we've selected.
+
+        NOTE: The SQL left outer join returns a row for each row in the
+        "left" table (Dataset) even if there is no matching foreign key in
+        the "right" table (Metadata). This means a dataset with no metadata
+        will result in a join row here with key and value of None. The
+        `elif` in the loop will silently ignore rows with a null key to
+        handle this case.
+
+        Args:
+            query: The basic filtered SQLAlchemy query object
+
+        Returns:
+            The aggregated keyspace JSON object
+        """
+        aggregate: JSONOBJECT = {
+            "dataset": {c.name: None for c in Dataset.__table__._columns}
+        }
+        list = query.with_entities(Metadata.key, Metadata.value).all()
+        for k, v in list:
+            # "metalog" is a top-level key in the Metadata schema, but we
+            # report it as a sub-key of "dataset".
+            if k == Metadata.METALOG:
+                self.accumulate(aggregate["dataset"], k, v)
+            elif k:
+                self.accumulate(aggregate, k, v)
+        return aggregate
+
+    def datasets(self, request: Request, json: JSONOBJECT, query: Query) -> JSONOBJECT:
+        """Gather and paginate the selected datasets
+
+        Run the query we've compiled, with pagination limits applied; collect
+        results into a list of JSON objects including selected metadata keys.
+
+        Args:
+            request: The HTTP Request object
+            json: The JSON query parameters
+            query: The basic filtered SQLAlchemy query object
+
+        Returns:
+            The paginated dataset listing
+        """
+        try:
+            datasets, paginated_result = self.get_paginated_obj(
+                query=query, json=json, url=request.url
+            )
+        except (AttributeError, ProgrammingError, StatementError) as e:
+            raise APIInternalError(
+                f"Constructed SQL for {json} isn't executable"
+            ) from e
+        except Exception as e:
+            raise APIInternalError(f"Unexpected SQL exception: {e}") from e
+
+        keys = json.get("metadata")
+
+        response = []
+        for dataset in datasets:
+            d = {
+                "name": dataset.name,
+                "resource_id": dataset.resource_id,
+            }
+            try:
+                d["metadata"] = self._get_dataset_metadata(dataset, keys)
+            except MetadataError:
+                d["metadata"] = None
+            response.append(d)
+
+        paginated_result["results"] = response
+        return paginated_result
+
     def _get(
         self, params: ApiParams, request: Request, context: ApiContext
     ) -> Response:
@@ -346,33 +453,7 @@ def _get(
         else:
             owner = json.get("owner")
             query = self._build_sql_query(owner, json.get("access"), query)
-
-        try:
-            datasets, paginated_result = self.get_paginated_obj(
-                query=query, json=json, url=request.url
-            )
-        except (AttributeError, ProgrammingError, StatementError) as e:
-            raise APIInternalError(
-                f"Constructed SQL for {json} isn't executable"
-            ) from e
-        except Exception as e:
-            raise APIInternalError(f"Unexpected SQL exception: {e}") from e
-
-        keys = json.get("metadata")
-
-        response = []
-        for dataset in datasets:
-            d = {
-                "name": dataset.name,
-                "resource_id": dataset.resource_id,
-            }
-            try:
-                d["metadata"] = self._get_dataset_metadata(dataset, keys)
-            except MetadataError as e:
-                current_app.logger.warning(
-                    "Error getting metadata {} for dataset {}: {}", keys, dataset, e
-                )
-            response.append(d)
-
-        paginated_result["results"] = response
-        return jsonify(paginated_result)
+        if json.get("keysummary"):
+            return jsonify(self.keyspace(query))
+        else:
+            return jsonify(self.datasets(request, json, query))
diff --git a/lib/pbench/test/unit/server/test_datasets_list.py b/lib/pbench/test/unit/server/test_datasets_list.py
index bf277405c6..ff589e8306 100644
--- a/lib/pbench/test/unit/server/test_datasets_list.py
+++ b/lib/pbench/test/unit/server/test_datasets_list.py
@@ -345,6 +345,43 @@ def test_get_bad_keys(self, query_as):
             )
         }
 
+    def test_get_key_errors(self, server_config, query_as):
+        """Test case reporting key errors
+
+        Args:
+            query_as: Query helper fixture
+        """
+        fio_1 = Dataset.query(name="fio_1")
+        fio_2 = Dataset.query(name="fio_2")
+        Metadata.setvalue(dataset=fio_1, key="global.test", value="ABC")
+        Metadata.setvalue(dataset=fio_2, key="global.test.foo", value="ABC")
+        response = query_as(
+            {"metadata": "global.test.foo"},
+            "drb",
+            HTTPStatus.OK,
+        )
+        assert response.json == {
+            "next_url": "",
+            "results": [
+                {
+                    "metadata": {"global.test.foo": None},
+                    "name": "drb",
+                    "resource_id": "random_md5_string1",
+                },
+                {
+                    "metadata": {"global.test.foo": None},
+                    "name": "fio_1",
+                    "resource_id": "random_md5_string3",
+                },
+                {
+                    "metadata": {"global.test.foo": "ABC"},
+                    "name": "fio_2",
+                    "resource_id": "random_md5_string4",
+                },
+            ],
+            "total": 3,
+        }
+
     def test_get_unknown_keys(self, query_as):
         """Test case requesting non-existent query parameter keys.
 
@@ -496,3 +533,57 @@ def do_error(
                 if key in m:
                     assert error in m
                     break
+
+    def test_key_summary(self, query_as):
+        """Test keyspace summary.
+
+        With the `keysummary` query parameter, /datasets returns an
+        aggregation of defined metadata key namespaces for the selected
+        datasets.
+
+        We add a few metadata keys to the ones provided by the fixture to
+        show aggregation across multiple selected datasets. Note that,
+        without filter criteria, the query here should return drb's "drb"
+        and "fio_1" datasets and test's public "fio_2" dataset.
+        """
+        drb = Dataset.query(name="drb")
+        fio_1 = Dataset.query(name="fio_1")
+
+        # Make sure we aggregate distinct namespaces across the three
+        # visible datasets by setting some varied keys. We leave fio_2
+        # "pristine" to prove that the aggregator doesn't fail when we find
+        # no metadata for a dataset. We deliberately create the conflicting
+        # "global.legacy" and "global.legacy.server" to show that the
+        # conflict doesn't cause a problem.
+        Metadata.setvalue(dataset=drb, key="global.legacy", value="Truish")
+        Metadata.setvalue(dataset=fio_1, key="server.origin", value="SAT")
+        Metadata.setvalue(dataset=fio_1, key="global.legacy.server", value="ABC")
+        response = query_as({"keysummary": "true"}, "drb", HTTPStatus.OK)
+        assert response.json == {
+            "dataset": {
+                "access": None,
+                "id": None,
+                "metalog": {
+                    "pbench": {
+                        "config": None,
+                        "date": None,
+                        "name": None,
+                        "script": None,
+                    },
+                    "run": {"controller": None},
+                },
+                "name": None,
+                "owner_id": None,
+                "resource_id": None,
+                "uploaded": None,
+            },
+            "global": {"contact": None, "legacy": {"server": None}},
+            "server": {
+                "deletion": None,
+                "index-map": {
+                    "unit-test.v5.result-data-sample.2020-08": None,
+                    "unit-test.v6.run-data.2020-08": None,
+                    "unit-test.v6.run-toc.2020-05": None,
+                },
+                "origin": None,
+            },
+        }
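
For context, a minimal sketch of how a dashboard client might consume the new
`keysummary` aggregation, flattening the nested namespace into the dotted key
paths a metadata selection menu would display. The server URL and the
`flatten` helper are hypothetical illustrations, not code from this patch:

```
import requests

SERVER = "https://pbench.example.com"  # hypothetical server host


def flatten(namespace: dict, prefix: str = "") -> list[str]:
    """Walk the nested key summary; "leaf" keys have the value None."""
    keys = []
    for name, value in namespace.items():
        path = f"{prefix}.{name}" if prefix else name
        if isinstance(value, dict):
            # Recurse into nested namespaces, e.g. dataset.metalog.pbench
            keys.extend(flatten(value, path))
        else:
            keys.append(path)
    return keys


# Ask /datasets for the aggregated key namespace instead of a dataset list
response = requests.get(
    f"{SERVER}/api/v1/datasets", params={"keysummary": "true"}
)
response.raise_for_status()

# Prints, e.g., dataset.access, dataset.metalog.pbench.script, global.contact
for key in sorted(flatten(response.json())):
    print(key)
```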