diff --git a/lib/pbench/server/api/resources/__init__.py b/lib/pbench/server/api/resources/__init__.py index 2ad4c53fee..b5ea4999aa 100644 --- a/lib/pbench/server/api/resources/__init__.py +++ b/lib/pbench/server/api/resources/__init__.py @@ -4,6 +4,7 @@ from http import HTTPStatus import json from json.decoder import JSONDecodeError +import re from typing import Any, Callable, Dict, List, NamedTuple, Optional, Union import uuid @@ -25,7 +26,6 @@ Dataset, DatasetNotFound, Metadata, - MetadataBadKey, MetadataError, OperationName, ) @@ -428,9 +428,11 @@ def convert_json(value: JSONOBJECT, parameter: "Parameter") -> JSONOBJECT: """Validate a parameter of JSON type. If the Parameter object defines a list of keywords, the JSON key values are - validated against that list. If the Parameter key_path attribute is set, - then only the first element of a dotted path (e.g., "user" for - "user.contact.email") is validated. + validated against that list. + + If the Parameter key_path attribute is set, validate that the key is legal + and if the value is a JSON object (dict) that all keys within the JSON are + legal. Args: value : JSON dict @@ -443,6 +445,25 @@ def convert_json(value: JSONOBJECT, parameter: "Parameter") -> JSONOBJECT: Returns: The JSON dict """ + _valid_key = re.compile(r"[a-z0-9_-]+") + + def keysafe(value: dict[str, Any]) -> list[str]: + """Test whether nested JSON keys are legal. + + Args: + value: A JSON structure + + Returns: + A list of bad keys (empty if all are good) + """ + bad = [] + for k, v in value.items(): + if not _valid_key.fullmatch(k): + bad.append(k) + if isinstance(v, dict): + bad.extend(keysafe(v)) + return bad + try: washed = json.loads(json.dumps(value)) except JSONDecodeError as e: @@ -455,12 +476,17 @@ def convert_json(value: JSONOBJECT, parameter: "Parameter") -> JSONOBJECT: if parameter.keywords: bad = [] - for k in value.keys(): - if parameter.key_path: - if not Metadata.is_key_path(k, parameter.keywords): + try: + for k, v in value.items(): + if parameter.key_path: + if not Metadata.is_key_path(k, parameter.keywords): + bad.append(k) + if isinstance(v, dict): + bad.extend(keysafe(v)) + elif k not in parameter.keywords: bad.append(k) - elif k not in parameter.keywords: - bad.append(k) + except Exception as e: + raise ConversionError(value, "JSON") from e if bad: raise KeywordError(parameter, f"JSON key{'s' if len(bad) > 1 else ''}", bad) @@ -546,10 +572,12 @@ def convert_keyword(value: str, parameter: "Parameter") -> Union[str, Enum]: except ValueError as e: raise KeywordError(parameter, "enum", [value]) from e - keyword = value.lower() + keyword = value if parameter.metalog_ok else value.lower() if not parameter.keywords: return keyword - elif parameter.key_path and Metadata.is_key_path(keyword, parameter.keywords): + elif parameter.key_path and Metadata.is_key_path( + keyword, parameter.keywords, metalog_key_ok=parameter.metalog_ok + ): return keyword elif keyword in parameter.keywords: return keyword @@ -715,6 +743,7 @@ def __init__( key_path: bool = False, string_list: Optional[str] = None, enum: Optional[type[Enum]] = None, + metalog_ok: bool = False, ): """Initialize a Parameter object describing a JSON parameter with its type and attributes. @@ -731,6 +760,9 @@ def __init__( will be split into lists. enum : An Enum subclass to which an upcased keyword should be converted. + metalog_ok : Normal key_path keywords are alphanumeric; in order to + accommodate `metadata.log` keys, we allow an expanded character + set in restricted cases. """ self.name = name self.type = type @@ -740,6 +772,7 @@ def __init__( self.key_path = key_path self.string_list = string_list self.enum = enum + self.metalog_ok = metalog_ok def invalid(self, json: JSONOBJECT) -> bool: """Check whether the value of this parameter in the JSON document is @@ -1535,8 +1568,7 @@ def _build_sql_query( def _get_dataset_metadata( self, dataset: Dataset, requested_items: List[str] ) -> JSON: - """Get requested metadata about a specific Dataset and return a JSON - fragment that can be added to other data about the Dataset. + """Get requested metadata for a specific Dataset This supports strict Metadata key/value items associated with the Dataset as well as selected columns from the Dataset model. @@ -1554,30 +1586,27 @@ def _get_dataset_metadata( metadata = {} for i in requested_items: - if Metadata.is_key_path(i, Metadata.METADATA_KEYS): - native_key = Metadata.get_native_key(i) - user_id = None - if native_key == Metadata.USER: - user_id = Auth.get_current_user_id() - try: - metadata[i] = Metadata.getvalue( - dataset=dataset, key=i, user_id=user_id - ) - except MetadataError: - metadata[i] = None - else: - raise MetadataBadKey(i) + native_key = Metadata.get_native_key(i) + user_id = None + if native_key == Metadata.USER: + user_id = Auth.get_current_user_id() + try: + metadata[i] = Metadata.getvalue(dataset=dataset, key=i, user_id=user_id) + except MetadataError: + metadata[i] = None return metadata def _set_dataset_metadata( self, dataset: Dataset, metadata: dict[str, JSONVALUE] ) -> dict[str, str]: - """Set metadata on a specific Dataset and return a summary of failures. + """Set metadata on a specific Dataset This supports strict Metadata key/value items associated with the Dataset as well as selected columns from the Dataset model. + Errors are collected and returned. + Args: dataset: Dataset object metadata: dict of key/value pairs diff --git a/lib/pbench/server/api/resources/datasets_list.py b/lib/pbench/server/api/resources/datasets_list.py index f3853415b5..ec41c758e9 100644 --- a/lib/pbench/server/api/resources/datasets_list.py +++ b/lib/pbench/server/api/resources/datasets_list.py @@ -76,6 +76,7 @@ def __init__(self, config: PbenchServerConfig): keywords=Metadata.METADATA_KEYS, key_path=True, string_list=",", + metalog_ok=True, ), ), authorization=ApiAuthorizationType.USER_ACCESS, diff --git a/lib/pbench/server/api/resources/datasets_metadata.py b/lib/pbench/server/api/resources/datasets_metadata.py index 1d0f4b84af..fcce2c418e 100644 --- a/lib/pbench/server/api/resources/datasets_metadata.py +++ b/lib/pbench/server/api/resources/datasets_metadata.py @@ -67,6 +67,7 @@ def __init__(self, config: PbenchServerConfig): keywords=Metadata.METADATA_KEYS, key_path=True, string_list=",", + metalog_ok=True, ) ), authorization=ApiAuthorizationType.DATASET, diff --git a/lib/pbench/server/api/resources/query_apis/datasets/datasets_detail.py b/lib/pbench/server/api/resources/query_apis/datasets/datasets_detail.py index d5c2159724..f46da76e72 100644 --- a/lib/pbench/server/api/resources/query_apis/datasets/datasets_detail.py +++ b/lib/pbench/server/api/resources/query_apis/datasets/datasets_detail.py @@ -45,6 +45,7 @@ def __init__(self, config: PbenchServerConfig): keywords=Metadata.METADATA_KEYS, key_path=True, string_list=",", + metalog_ok=True, ), ), authorization=ApiAuthorizationType.DATASET, diff --git a/lib/pbench/server/database/models/datasets.py b/lib/pbench/server/database/models/datasets.py index 83d0e81d76..6efc32c8a4 100644 --- a/lib/pbench/server/database/models/datasets.py +++ b/lib/pbench/server/database/models/datasets.py @@ -779,8 +779,25 @@ def is_key_path(key: str, valid: List[str], metalog_key_ok: bool = False) -> boo def getvalue( dataset: Dataset, key: str, user_id: Optional[str] = None ) -> Optional[JSON]: - """Returns the value of the specified key, which may be a dotted - hierarchical path (e.g., "server.deleted"). + """Returns the value of the specified metadata key. + + A metadata key may be a dotted hierarchical path, like "server.deleted" + or "dataset.metalog.pbench.script". + + This code assumes that any necessary syntax checks have been made by a + caller. Normally, user key update is restricted to a specific set of + keys and namespaces (first level key like "server" or "global"), and + keys are lowercase and alphanumeric. + + In the "dataset.metalog" sub-namespace, keys originate from a Pbench + Agent "metadata.log" config file, which may include non-alphanumeric + keys such as "iterations/1-tcp_stream-64B-1i". Pbench Server APIs may + allow access to these keys in limited circumstances for display and + filtering, and we don't interfere with that ability here. The namespace + key is always lowercased to find the SQL row; however other keys in the + path are opportunistically lowercased only if an exact match isn't + found. This allows "DATASET.SERVER" to match "dataset.server" but also + allows matching "dataset.metalog.iterations/1-tcp_stream-64B-1i". The specific value of the dotted key path is returned, not the top level Metadata object. The full JSON value of a top level key can be @@ -812,10 +829,11 @@ def getvalue( Returns: Value of the key path """ - if not Metadata.is_key_path(key, Metadata.METADATA_KEYS): + keys = key.split(".") + # Disallow ".." and trailing "." + if "" in keys: raise MetadataBadKey(key) - keys = key.lower().split(".") - native_key = keys.pop(0) + native_key = keys.pop(0).lower() if native_key == "dataset": value = dataset.as_dict() else: @@ -831,10 +849,13 @@ def getvalue( # the stored data; let the caller know with an exception. if type(value) is not dict: raise MetadataBadStructure(dataset, key, name) - if i not in value: + if i in value: + name = i + elif i.lower() in value: + name = i.lower() + else: return None - value = value[i] - name = i + value = value[name] return value @staticmethod diff --git a/lib/pbench/test/unit/server/query_apis/test_datasets_detail.py b/lib/pbench/test/unit/server/query_apis/test_datasets_detail.py index 5dcd6265cc..399491ec76 100644 --- a/lib/pbench/test/unit/server/query_apis/test_datasets_detail.py +++ b/lib/pbench/test/unit/server/query_apis/test_datasets_detail.py @@ -6,6 +6,7 @@ from pbench.server.api.resources.query_apis.datasets.datasets_detail import ( DatasetsDetail, ) +from pbench.server.database.models.datasets import Dataset, Metadata from pbench.test.unit.server.conftest import generate_token from pbench.test.unit.server.query_apis.commons import Commons @@ -197,7 +198,29 @@ def test_metadata( focus on the database dataset metadata... but necessarily has to borrow much of the setup. """ - query_params = {"metadata": ["global.seen", "server.deletion"]} + + # In order to test that this API supports getting non-alphanumeric + # metalog metadata keys, we decorate the dataset with one. To keep it + # simple, just remove the existing Metadata row and create a new one. + drb = Dataset.query(name="drb") + Metadata.delete(Metadata.get(drb, Metadata.METALOG)) + Metadata.create( + dataset=drb, + key=Metadata.METALOG, + value={ + "iterations/fooBar=10-what_else@weird": { + "iteration_name": "fooBar=10-what_else@weird" + }, + "run": {"controller": "node1.example.com"}, + }, + ) + query_params = { + "metadata": [ + "global.seen", + "server.deletion", + "dataset.metalog.iterations/fooBar=10-what_else@weird.iteration_name", + ] + } response_payload = { "took": 112, @@ -319,6 +342,7 @@ def test_metadata( "serverMetadata": { "global.seen": None, "server.deletion": "2022-12-26", + "dataset.metalog.iterations/fooBar=10-what_else@weird.iteration_name": "fooBar=10-what_else@weird", }, } assert expected == res_json diff --git a/lib/pbench/test/unit/server/test_datasets_list.py b/lib/pbench/test/unit/server/test_datasets_list.py index ff589e8306..c3f64c0665 100644 --- a/lib/pbench/test/unit/server/test_datasets_list.py +++ b/lib/pbench/test/unit/server/test_datasets_list.py @@ -309,6 +309,7 @@ def test_dataset_paged_list(self, query_as, login, query, results, server_config query: A JSON representation of the query parameters (these will be automatically supplemented with a metadata request term) results: A list of the dataset names we expect to be returned + server_config: The PbenchServerConfig object """ query.update({"metadata": ["dataset.uploaded"], "limit": 5}) result = query_as(query, login, HTTPStatus.OK) @@ -319,10 +320,6 @@ def test_unauth_dataset_list(self, query_as): Args: query_as: A fixture to provide a helper that executes the API call - login: The username as which to perform a query - query: A JSON representation of the query parameters (these will be - automatically supplemented with a metadata request term) - results: A list of the dataset names we expect to be returned """ query_as({"access": "private"}, None, HTTPStatus.UNAUTHORIZED) @@ -345,7 +342,7 @@ def test_get_bad_keys(self, query_as): ) } - def test_get_get_errors(self, server_config, query_as): + def test_get_key_errors(self, query_as): """Test case reporting key errors Args: @@ -382,6 +379,60 @@ def test_get_get_errors(self, server_config, query_as): "total": 3, } + def test_use_funk_metalog_keys(self, query_as): + """Test funky metadata.log keys + + Normally we constrain metadata keys to lowercase alphanumeric strings. + Traditional Pbench Agent `metadata.log` files contain keys constructed + from benchmark iteration values that can contain mixed case and symbol + characters. We allow these keys to be filtered and retrieved, but not + created, so test that we can filter on a funky key value and return + the key. + + Args: + query_as: Query helper fixture + """ + fio_1 = Dataset.query(name="fio_1") + Metadata.create( + dataset=fio_1, + key=Metadata.METALOG, + value={ + "pbench": { + "date": "2020-02-15T00:00:00", + "config": "test1", + "script": "unit-test", + "name": "fio_1", + }, + "iterations/fooBar=10-what_else@weird": { + "iteration_name": "fooBar=10-what_else@weird" + }, + "run": {"controller": "node1.example.com"}, + }, + ) + response = query_as( + { + "metadata": "dataset.metalog.iterations/fooBar=10-what_else@weird", + "filter": "dataset.metalog.iterations/fooBar=10-what_else@weird.iteration_name:~10", + }, + "drb", + HTTPStatus.OK, + ) + assert response.json == { + "next_url": "", + "results": [ + { + "metadata": { + "dataset.metalog.iterations/fooBar=10-what_else@weird": { + "iteration_name": "fooBar=10-what_else@weird" + } + }, + "name": "fio_1", + "resource_id": "random_md5_string3", + } + ], + "total": 1, + } + def test_get_unknown_keys(self, query_as): """Test case requesting non-existent query parameter keys. diff --git a/lib/pbench/test/unit/server/test_datasets_metadata.py b/lib/pbench/test/unit/server/test_datasets_metadata.py index 902a94a87c..b584522164 100644 --- a/lib/pbench/test/unit/server/test_datasets_metadata.py +++ b/lib/pbench/test/unit/server/test_datasets_metadata.py @@ -5,7 +5,7 @@ from pbench.server import JSON, OperationCode from pbench.server.database.models.audit import Audit, AuditStatus, AuditType -from pbench.server.database.models.datasets import Dataset, DatasetNotFound +from pbench.server.database.models.datasets import Dataset, DatasetNotFound, Metadata class TestDatasetsMetadataGet: @@ -209,6 +209,50 @@ def test_get_bad_query_2(self, query_get_as): ) assert response.json == {"message": "Unknown URL query keys: controller,plugh"} + def test_get_funky_metalog_key(self, query_get_as): + """Test funky metadata.log key + + Normally we constrain metadata keys to lowercase alphanumeric strings. + Traditional Pbench Agent `metadata.log` files contain keys constructed + from benchmark iteration values that can contain mixed case and symbol + characters. We allow these keys to be filtered and retrieved, but not + created, so test that we can filter on a funky key value and return + the key. + + Args: + query_get_as: Query helper fixture + """ + fio_1 = Dataset.query(name="fio_1") + Metadata.create( + dataset=fio_1, + key=Metadata.METALOG, + value={ + "pbench": { + "date": "2020-02-15T00:00:00", + "config": "test1", + "script": "unit-test", + "name": "fio_1", + }, + "iterations/fooBar=10-what_else@weird": { + "iteration_name": "fooBar=10-what_else@weird" + }, + "run": {"controller": "node1.example.com"}, + }, + ) + response = query_get_as( + "fio_1", + { + "metadata": ["dataset.metalog.iterations/fooBar=10-what_else@weird"], + }, + "drb", + HTTPStatus.OK, + ) + assert response.json == { + "dataset.metalog.iterations/fooBar=10-what_else@weird": { + "iteration_name": "fooBar=10-what_else@weird" + } + } + class TestDatasetsMetadataPut(TestDatasetsMetadataGet): @pytest.fixture() @@ -285,17 +329,39 @@ def test_put_no_dataset(self, client, server_config, attach_dataset): assert response.status_code == HTTPStatus.NOT_FOUND assert response.json == {"message": "Dataset 'foobar' not found"} - def test_put_bad_keys(self, client, server_config, attach_dataset): + @pytest.mark.parametrize( + "keys,keyerr", + ( + ( + {"xyzzy": "private", "what": "sup", "global.saved": True}, + "'what', 'xyzzy'", + ), + ({"global": {"Ab.foo": True}}, "'Ab.foo'"), + ({"global": {"ab@": True}}, "'ab@'"), + ({"global": {"abc": {"#$": "bad key"}}}, "'#$'"), + ( + { + "global": { + "a": { + "#bad": {"still@bad": "ok", "good": True}, + ".no": {"Yes": 0, "no?": 1}, + } + } + }, + "'#bad', '.no', 'Yes', 'no?', 'still@bad'", + ), + ({"global.AbC@foo=y": True}, "'global.AbC@foo=y'"), + ({"global..foo": True}, "'global..foo'"), + ), + ) + def test_put_bad_keys(self, client, server_config, attach_dataset, keys, keyerr): response = client.put( f"{server_config.rest_uri}/datasets/drb/metadata", - json={ - "metadata": {"xyzzy": "private", "what": "sup", "global.saved": True} - }, + json={"metadata": keys}, ) - assert response.status_code == HTTPStatus.BAD_REQUEST - assert response.json == { - "message": "Unrecognized JSON keys ['what', 'xyzzy'] for parameter metadata." - } + assert response.status_code == HTTPStatus.BAD_REQUEST, response.json["message"] + msg = response.json["message"] + assert "Unrecognized JSON key" in msg and keyerr in msg def test_put_reserved_metadata(self, client, server_config, attach_dataset): response = client.put( @@ -456,31 +522,41 @@ def test_put_set_errors(self, capinternal, monkeypatch, query_get_as, query_put_ def test_put(self, query_get_as, query_put_as): response = query_put_as( "drb", - {"metadata": {"global.seen": False, "global.saved": True}}, + {"metadata": {"global.seen": False, "global.under-score_hyphen": True}}, "drb", HTTPStatus.OK, ) assert response.json == { - "metadata": {"global.saved": True, "global.seen": False}, + "metadata": {"global.under-score_hyphen": True, "global.seen": False}, "errors": {}, } response = query_get_as( "drb", {"metadata": "global,dataset.access"}, "drb", HTTPStatus.OK ) assert response.json == { - "global": {"contact": "me@example.com", "saved": True, "seen": False}, + "global": { + "contact": "me@example.com", + "under-score_hyphen": True, + "seen": False, + }, "dataset.access": "private", } # Try a second GET, returning "global" fields separately: response = query_get_as( "drb", - {"metadata": ["global.seen", "global.saved", "dataset.access"]}, + { + "metadata": [ + "global.seen", + "global.under-score_hyphen", + "dataset.access", + ] + }, "drb", HTTPStatus.OK, ) assert response.json == { - "global.saved": True, + "global.under-score_hyphen": True, "global.seen": False, "dataset.access": "private", } @@ -510,7 +586,7 @@ def test_put(self, query_get_as, query_put_as): assert audit[1].user_name == "drb" assert audit[1].reason is None assert audit[1].attributes == { - "updated": {"global.seen": False, "global.saved": True} + "updated": {"global.seen": False, "global.under-score_hyphen": True} } def test_put_user(self, query_get_as, query_put_as): diff --git a/lib/pbench/test/unit/server/test_schema.py b/lib/pbench/test/unit/server/test_schema.py index 316567af84..df8668b796 100644 --- a/lib/pbench/test/unit/server/test_schema.py +++ b/lib/pbench/test/unit/server/test_schema.py @@ -326,21 +326,57 @@ def test_boolean(self, value, expected): assert x.normalize(value) == expected @pytest.mark.parametrize( - "input,expected", + "input,funky,expected", ( - ("yes", "yes"), - ("Yes", "yes"), - ("me.you", "me.you"), - ("me.You.him", "me.you.him"), - ("ME.US.HER.THEM", "me.us.her.them"), + ("yes", False, "yes"), + ("Yes", False, "yes"), + ("me.you", False, "me.you"), + ("me.You.him", False, "me.you.him"), + ("ME.US.HER.THEM", False, "me.us.her.them"), + ("me.a.b.fooBar:test@y=z", True, "me.a.b.fooBar:test@y=z"), + ("yes.a-7.b_2.foo", False, "yes.a-7.b_2.foo"), + ("yes.a_7.b-2.@foo", True, "yes.a_7.b-2.@foo"), + ("me.;baR.#f0o.x@y=10", True, "me.;baR.#f0o.x@y=10"), ), ) - def test_keyword_namespace(self, input, expected): + def test_keyword_namespace(self, input, funky, expected): + """Test parameter normalization for a keyword parameter.""" + x = Parameter( + "data", + ParamType.KEYWORD, + keywords=["yes", "me"], + key_path=True, + metalog_ok=funky, + ) + assert x.normalize(input) == expected + + @pytest.mark.parametrize( + "input,funky", + ( + ("me..you", False), + ("me..you", True), + ("me.#You.him", False), + ("them.us", True), + ("a.b.fooBar:test@y=z", True), + ("wow.a7.b2.foo", False), + ), + ) + def test_keyword_namespace_bad(self, input, funky): """ Test parameter normalization for a keyword parameter. """ - x = Parameter("data", ParamType.KEYWORD, keywords=["yes", "me"], key_path=True) - assert x.normalize(input) == expected + x = Parameter( + "data", + ParamType.KEYWORD, + keywords=["yes", "me"], + key_path=True, + metalog_ok=funky, + ) + with pytest.raises( + KeywordError, + match=f"Unrecognized keyword \\[{input!r}\\] for parameter data.", + ): + x.normalize(input) @pytest.mark.parametrize( "input,expected", diff --git a/lib/pbench/test/unit/server/test_upload.py b/lib/pbench/test/unit/server/test_upload.py index b696d5d3f4..4a0aa3a4f6 100644 --- a/lib/pbench/test/unit/server/test_upload.py +++ b/lib/pbench/test/unit/server/test_upload.py @@ -204,7 +204,7 @@ def test_bad_metadata_upload( "Content-Length": "STRING", }, query_string={ - "metadata": "foobar.badpath:data,server.deletion:3000-12-25T23:59:59+00:00" + "metadata": "global.xyz#A@b=z:y,foobar.badpath:data,server.deletion:3000-12-25T23:59:59+00:00" }, ) assert response.status_code == HTTPStatus.BAD_REQUEST @@ -212,6 +212,7 @@ def test_bad_metadata_upload( assert "errors" in json and "message" in json assert json["message"] == "at least one specified metadata key is invalid" assert json["errors"] == [ + "Key global.xyz#a@b=z is invalid or isn't settable", "Key foobar.badpath is invalid or isn't settable", "Metadata key 'server.deletion' value '3000-12-25T23:59:59+00:00' for dataset must be a date/time before 1979-12-30", ]