Skip to content

Commit

Permalink
Support limited non-alphanumeric metadata keys (#3354)
Browse files Browse the repository at this point in the history
* Support limited non-alphanumeric metadata keys

PBENCH-1111

Internally defined and managed Pbench Server metadata keys are lowercase
alphanumeric strings; however the dataset.metalog namespace is extracted
directly from the tarball metadata.log file and some benchmarks (especially
fio and uperf) generate tool and iteration keys that aren't accessible
because they fail the normal key path validation checks.

This PR loosens validation checks in some paths so `GET` APIs that retrieve
metadata will be able to find these keys, for example uperf iterations like
`dataset.metalog.iterations/1-tcp_stream-64B-1i`.

This PR also tightens the key namespace check within an incoming JSON
body parameter, e.g. for `PUT /datasets/{id}/metadata` to ensure that all
nested keys conform to the rules.

(NOTE: I first noticed this problem when I added general metadata filtering,
and made special allowances in the `filter` keyword processing, but at the
time didn't want to expand the scope to deal with more general handling. I
was reminded when I added the key aggregator, and finally I'm fixing it.)
  • Loading branch information
dbutenhof authored Mar 22, 2023
1 parent 97e17ca commit afb9bef
Show file tree
Hide file tree
Showing 10 changed files with 307 additions and 66 deletions.
83 changes: 56 additions & 27 deletions lib/pbench/server/api/resources/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from http import HTTPStatus
import json
from json.decoder import JSONDecodeError
import re
from typing import Any, Callable, Dict, List, NamedTuple, Optional, Union
import uuid

Expand All @@ -25,7 +26,6 @@
Dataset,
DatasetNotFound,
Metadata,
MetadataBadKey,
MetadataError,
OperationName,
)
Expand Down Expand Up @@ -428,9 +428,11 @@ def convert_json(value: JSONOBJECT, parameter: "Parameter") -> JSONOBJECT:
"""Validate a parameter of JSON type.
If the Parameter object defines a list of keywords, the JSON key values are
validated against that list. If the Parameter key_path attribute is set,
then only the first element of a dotted path (e.g., "user" for
"user.contact.email") is validated.
validated against that list.
If the Parameter key_path attribute is set, validate that the key is legal
and if the value is a JSON object (dict) that all keys within the JSON are
legal.
Args:
value : JSON dict
Expand All @@ -443,6 +445,25 @@ def convert_json(value: JSONOBJECT, parameter: "Parameter") -> JSONOBJECT:
Returns:
The JSON dict
"""
# Keys we manage directly must be lowercase alphanumeric (plus "_" and "-").
_valid_key = re.compile(r"[a-z0-9_-]+")

def keysafe(value: dict[str, Any]) -> list[str]:
    """Collect illegal keys from a nested JSON object.

    A key is legal only when it consists entirely of lowercase letters,
    digits, underscore, and hyphen; nested JSON objects are checked
    recursively.

    Args:
        value: A JSON structure

    Returns:
        A list of bad keys (empty if all are good)
    """
    offenders: list[str] = []
    for key, subvalue in value.items():
        if _valid_key.fullmatch(key) is None:
            offenders.append(key)
        if isinstance(subvalue, dict):
            offenders += keysafe(subvalue)
    return offenders

try:
washed = json.loads(json.dumps(value))
except JSONDecodeError as e:
Expand All @@ -455,12 +476,17 @@ def convert_json(value: JSONOBJECT, parameter: "Parameter") -> JSONOBJECT:

if parameter.keywords:
bad = []
for k in value.keys():
if parameter.key_path:
if not Metadata.is_key_path(k, parameter.keywords):
try:
for k, v in value.items():
if parameter.key_path:
if not Metadata.is_key_path(k, parameter.keywords):
bad.append(k)
if isinstance(v, dict):
bad.extend(keysafe(v))
elif k not in parameter.keywords:
bad.append(k)
elif k not in parameter.keywords:
bad.append(k)
except Exception as e:
raise ConversionError(value, "JSON") from e
if bad:
raise KeywordError(parameter, f"JSON key{'s' if len(bad) > 1 else ''}", bad)

Expand Down Expand Up @@ -546,10 +572,12 @@ def convert_keyword(value: str, parameter: "Parameter") -> Union[str, Enum]:
except ValueError as e:
raise KeywordError(parameter, "enum", [value]) from e

keyword = value.lower()
keyword = value if parameter.metalog_ok else value.lower()
if not parameter.keywords:
return keyword
elif parameter.key_path and Metadata.is_key_path(keyword, parameter.keywords):
elif parameter.key_path and Metadata.is_key_path(
keyword, parameter.keywords, metalog_key_ok=parameter.metalog_ok
):
return keyword
elif keyword in parameter.keywords:
return keyword
Expand Down Expand Up @@ -715,6 +743,7 @@ def __init__(
key_path: bool = False,
string_list: Optional[str] = None,
enum: Optional[type[Enum]] = None,
metalog_ok: bool = False,
):
"""Initialize a Parameter object describing a JSON parameter with its
type and attributes.
Expand All @@ -731,6 +760,9 @@ def __init__(
will be split into lists.
enum : An Enum subclass to which an upcased keyword should be
converted.
metalog_ok : Normal key_path keywords are alphanumeric; in order to
accommodate `metadata.log` keys, we allow an expanded character
set in restricted cases.
"""
self.name = name
self.type = type
Expand All @@ -740,6 +772,7 @@ def __init__(
self.key_path = key_path
self.string_list = string_list
self.enum = enum
self.metalog_ok = metalog_ok

def invalid(self, json: JSONOBJECT) -> bool:
"""Check whether the value of this parameter in the JSON document is
Expand Down Expand Up @@ -1535,8 +1568,7 @@ def _build_sql_query(
def _get_dataset_metadata(
self, dataset: Dataset, requested_items: List[str]
) -> JSON:
"""Get requested metadata about a specific Dataset and return a JSON
fragment that can be added to other data about the Dataset.
"""Get requested metadata for a specific Dataset
This supports strict Metadata key/value items associated with the
Dataset as well as selected columns from the Dataset model.
Expand All @@ -1554,30 +1586,27 @@ def _get_dataset_metadata(

metadata = {}
for i in requested_items:
if Metadata.is_key_path(i, Metadata.METADATA_KEYS):
native_key = Metadata.get_native_key(i)
user_id = None
if native_key == Metadata.USER:
user_id = Auth.get_current_user_id()
try:
metadata[i] = Metadata.getvalue(
dataset=dataset, key=i, user_id=user_id
)
except MetadataError:
metadata[i] = None
else:
raise MetadataBadKey(i)
native_key = Metadata.get_native_key(i)
user_id = None
if native_key == Metadata.USER:
user_id = Auth.get_current_user_id()
try:
metadata[i] = Metadata.getvalue(dataset=dataset, key=i, user_id=user_id)
except MetadataError:
metadata[i] = None

return metadata

def _set_dataset_metadata(
self, dataset: Dataset, metadata: dict[str, JSONVALUE]
) -> dict[str, str]:
"""Set metadata on a specific Dataset and return a summary of failures.
"""Set metadata on a specific Dataset
This supports strict Metadata key/value items associated with the
Dataset as well as selected columns from the Dataset model.
Errors are collected and returned.
Args:
dataset: Dataset object
metadata: dict of key/value pairs
Expand Down
1 change: 1 addition & 0 deletions lib/pbench/server/api/resources/datasets_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ def __init__(self, config: PbenchServerConfig):
keywords=Metadata.METADATA_KEYS,
key_path=True,
string_list=",",
metalog_ok=True,
),
),
authorization=ApiAuthorizationType.USER_ACCESS,
Expand Down
1 change: 1 addition & 0 deletions lib/pbench/server/api/resources/datasets_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ def __init__(self, config: PbenchServerConfig):
keywords=Metadata.METADATA_KEYS,
key_path=True,
string_list=",",
metalog_ok=True,
)
),
authorization=ApiAuthorizationType.DATASET,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ def __init__(self, config: PbenchServerConfig):
keywords=Metadata.METADATA_KEYS,
key_path=True,
string_list=",",
metalog_ok=True,
),
),
authorization=ApiAuthorizationType.DATASET,
Expand Down
37 changes: 29 additions & 8 deletions lib/pbench/server/database/models/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -779,8 +779,25 @@ def is_key_path(key: str, valid: List[str], metalog_key_ok: bool = False) -> boo
def getvalue(
dataset: Dataset, key: str, user_id: Optional[str] = None
) -> Optional[JSON]:
"""Returns the value of the specified key, which may be a dotted
hierarchical path (e.g., "server.deleted").
"""Returns the value of the specified metadata key.
A metadata key may be a dotted hierarchical path, like "server.deleted"
or "dataset.metalog.pbench.script".
This code assumes that any necessary syntax checks have been made by a
caller. Normally, user key update is restricted to a specific set of
keys and namespaces (first level key like "server" or "global"), and
keys are lowercase and alphanumeric.
In the "dataset.metalog" sub-namespace, keys originate from a Pbench
Agent "metadata.log" config file, which may include non-alphanumeric
keys such as "iterations/1-tcp_stream-64B-1i". Pbench Server APIs may
allow access to these keys in limited circumstances for display and
filtering, and we don't interfere with that ability here. The namespace
key is always lowercased to find the SQL row; however other keys in the
path are opportunistically lowercased only if an exact match isn't
found. This allows "DATASET.SERVER" to match "dataset.server" but also
allows matching "dataset.metalog.iterations/1-tcp_stream-64B-1i".
The specific value of the dotted key path is returned, not the top
level Metadata object. The full JSON value of a top level key can be
Expand Down Expand Up @@ -812,10 +829,11 @@ def getvalue(
Returns:
Value of the key path
"""
if not Metadata.is_key_path(key, Metadata.METADATA_KEYS):
keys = key.split(".")
# Disallow ".." and trailing "."
if "" in keys:
raise MetadataBadKey(key)
keys = key.lower().split(".")
native_key = keys.pop(0)
native_key = keys.pop(0).lower()
if native_key == "dataset":
value = dataset.as_dict()
else:
Expand All @@ -831,10 +849,13 @@ def getvalue(
# the stored data; let the caller know with an exception.
if type(value) is not dict:
raise MetadataBadStructure(dataset, key, name)
if i not in value:
if i in value:
name = i
elif i.lower() in value:
name = i.lower()
else:
return None
value = value[i]
name = i
value = value[name]
return value

@staticmethod
Expand Down
26 changes: 25 additions & 1 deletion lib/pbench/test/unit/server/query_apis/test_datasets_detail.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from pbench.server.api.resources.query_apis.datasets.datasets_detail import (
DatasetsDetail,
)
from pbench.server.database.models.datasets import Dataset, Metadata
from pbench.test.unit.server.conftest import generate_token
from pbench.test.unit.server.query_apis.commons import Commons

Expand Down Expand Up @@ -197,7 +198,29 @@ def test_metadata(
focus on the database dataset metadata... but necessarily has to borrow
much of the setup.
"""
query_params = {"metadata": ["global.seen", "server.deletion"]}

# In order to test that this API supports getting non-alphanumeric
# metalog metadata keys, we decorate the dataset with one. To keep it
# simple, just remove the existing Metadata row and create a new one.
drb = Dataset.query(name="drb")
Metadata.delete(Metadata.get(drb, Metadata.METALOG))
Metadata.create(
dataset=drb,
key=Metadata.METALOG,
value={
"iterations/fooBar=10-what_else@weird": {
"iteration_name": "fooBar=10-what_else@weird"
},
"run": {"controller": "node1.example.com"},
},
)
query_params = {
"metadata": [
"global.seen",
"server.deletion",
"dataset.metalog.iterations/fooBar=10-what_else@weird.iteration_name",
]
}

response_payload = {
"took": 112,
Expand Down Expand Up @@ -319,6 +342,7 @@ def test_metadata(
"serverMetadata": {
"global.seen": None,
"server.deletion": "2022-12-26",
"dataset.metalog.iterations/fooBar=10-what_else@weird.iteration_name": "fooBar=10-what_else@weird",
},
}
assert expected == res_json
Expand Down
61 changes: 56 additions & 5 deletions lib/pbench/test/unit/server/test_datasets_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,7 @@ def test_dataset_paged_list(self, query_as, login, query, results, server_config
query: A JSON representation of the query parameters (these will be
automatically supplemented with a metadata request term)
results: A list of the dataset names we expect to be returned
server_config: The PbenchServerConfig object
"""
query.update({"metadata": ["dataset.uploaded"], "limit": 5})
result = query_as(query, login, HTTPStatus.OK)
Expand All @@ -319,10 +320,6 @@ def test_unauth_dataset_list(self, query_as):
Args:
query_as: A fixture to provide a helper that executes the API call
login: The username as which to perform a query
query: A JSON representation of the query parameters (these will be
automatically supplemented with a metadata request term)
results: A list of the dataset names we expect to be returned
"""
query_as({"access": "private"}, None, HTTPStatus.UNAUTHORIZED)

Expand All @@ -345,7 +342,7 @@ def test_get_bad_keys(self, query_as):
)
}

def test_get_get_errors(self, server_config, query_as):
def test_get_key_errors(self, query_as):
"""Test case reporting key errors
Args:
Expand Down Expand Up @@ -382,6 +379,60 @@ def test_get_get_errors(self, server_config, query_as):
"total": 3,
}

def test_use_funk_metalog_keys(self, query_as):
    """Test funky metadata.log keys

    Normally we constrain metadata keys to lowercase alphanumeric strings.
    Traditional Pbench Agent `metadata.log` files contain keys constructed
    from benchmark iteration values that can contain mixed case and symbol
    characters. We allow these keys to be filtered and retrieved, but not
    created, so test that we can filter on a funky key value and return
    the key.

    Args:
        query_as: Query helper fixture
    """
    fio_1 = Dataset.query(name="fio_1")
    # Decorate the dataset with a metadata.log namespace containing a
    # benchmark-generated key using mixed case and symbol characters.
    Metadata.create(
        dataset=fio_1,
        key=Metadata.METALOG,
        value={
            "pbench": {
                "date": "2020-02-15T00:00:00",
                "config": "test1",
                "script": "unit-test",
                "name": "fio_1",
            },
            "iterations/fooBar=10-what_else@weird": {
                "iteration_name": "fooBar=10-what_else@weird"
            },
            "run": {"controller": "node1.example.com"},
        },
    )
    # Both the "metadata" retrieval path and the "filter" expression must
    # accept the expanded metadata.log key character set.
    response = query_as(
        {
            "metadata": "dataset.metalog.iterations/fooBar=10-what_else@weird",
            "filter": "dataset.metalog.iterations/fooBar=10-what_else@weird.iteration_name:~10",
        },
        "drb",
        HTTPStatus.OK,
    )
    assert response.json == {
        "next_url": "",
        "results": [
            {
                "metadata": {
                    "dataset.metalog.iterations/fooBar=10-what_else@weird": {
                        "iteration_name": "fooBar=10-what_else@weird"
                    }
                },
                "name": "fio_1",
                "resource_id": "random_md5_string3",
            }
        ],
        "total": 1,
    }

def test_get_unknown_keys(self, query_as):
"""Test case requesting non-existent query parameter keys.
Expand Down
Loading

0 comments on commit afb9bef

Please sign in to comment.