Skip to content

Commit

Permalink
Support limited non-alphanumeric metadata keys (#3354)
Browse files Browse the repository at this point in the history
* Support limited non-alphanumeric metadata keys

PBENCH-1111

Internally defined and managed Pbench Server metadata keys are lowercase
alphanumeric strings; however the dataset.metalog namespace is extracted
directly from the tarball metadata.log file and some benchmarks (especially
fio and uperf) generate tool and iteration keys that aren't accessible
because they fail the normal key path validation checks.

This PR loosens validation checks in some paths so `GET` APIs that retrieve
metadata will be able to find these keys, for example uperf iterations like
`dataset.metalog.iterations/1-tcp_stream-64B-1i`.

This PR also tightens the key namespace check within an incoming JSON
body parameter, e.g. for `PUT /datasets/{id}/metadata` to ensure that all
nested keys conform to the rules.

(NOTE: I first noticed this problem when I added general metadata filtering,
and made special allowances in the `filter` keyword processing, but at the
time didn't want to expand the scope to deal with more general handling. I
was reminded when I added the key aggregator, and finally I'm fixing it.)
  • Loading branch information
dbutenhof authored Mar 22, 2023
1 parent 97e17ca commit afb9bef
Show file tree
Hide file tree
Showing 10 changed files with 307 additions and 66 deletions.
83 changes: 56 additions & 27 deletions lib/pbench/server/api/resources/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from http import HTTPStatus
import json
from json.decoder import JSONDecodeError
import re
from typing import Any, Callable, Dict, List, NamedTuple, Optional, Union
import uuid

Expand All @@ -25,7 +26,6 @@
Dataset,
DatasetNotFound,
Metadata,
MetadataBadKey,
MetadataError,
OperationName,
)
Expand Down Expand Up @@ -428,9 +428,11 @@ def convert_json(value: JSONOBJECT, parameter: "Parameter") -> JSONOBJECT:
"""Validate a parameter of JSON type.
If the Parameter object defines a list of keywords, the JSON key values are
validated against that list. If the Parameter key_path attribute is set,
then only the first element of a dotted path (e.g., "user" for
"user.contact.email") is validated.
validated against that list.
If the Parameter key_path attribute is set, validate that the key is legal
and if the value is a JSON object (dict) that all keys within the JSON are
legal.
Args:
value : JSON dict
Expand All @@ -443,6 +445,25 @@ def convert_json(value: JSONOBJECT, parameter: "Parameter") -> JSONOBJECT:
Returns:
The JSON dict
"""
# Keys we manage directly must be lowercase alphanumeric (plus "_" and "-").
_valid_key = re.compile(r"[a-z0-9_-]+")

def keysafe(value: dict[str, Any]) -> list[str]:
    """Collect illegal keys from a nested JSON object.

    A key is legal only when it consists entirely of lowercase letters,
    digits, underscore, and hyphen; nested JSON objects are checked
    recursively.

    Args:
        value: A JSON structure

    Returns:
        A list of bad keys (empty if all are good)
    """
    offenders: list[str] = []
    for key, subvalue in value.items():
        if _valid_key.fullmatch(key) is None:
            offenders.append(key)
        if isinstance(subvalue, dict):
            offenders += keysafe(subvalue)
    return offenders

try:
washed = json.loads(json.dumps(value))
except JSONDecodeError as e:
Expand All @@ -455,12 +476,17 @@ def convert_json(value: JSONOBJECT, parameter: "Parameter") -> JSONOBJECT:

if parameter.keywords:
bad = []
for k in value.keys():
if parameter.key_path:
if not Metadata.is_key_path(k, parameter.keywords):
try:
for k, v in value.items():
if parameter.key_path:
if not Metadata.is_key_path(k, parameter.keywords):
bad.append(k)
if isinstance(v, dict):
bad.extend(keysafe(v))
elif k not in parameter.keywords:
bad.append(k)
elif k not in parameter.keywords:
bad.append(k)
except Exception as e:
raise ConversionError(value, "JSON") from e
if bad:
raise KeywordError(parameter, f"JSON key{'s' if len(bad) > 1 else ''}", bad)

Expand Down Expand Up @@ -546,10 +572,12 @@ def convert_keyword(value: str, parameter: "Parameter") -> Union[str, Enum]:
except ValueError as e:
raise KeywordError(parameter, "enum", [value]) from e

keyword = value.lower()
keyword = value if parameter.metalog_ok else value.lower()
if not parameter.keywords:
return keyword
elif parameter.key_path and Metadata.is_key_path(keyword, parameter.keywords):
elif parameter.key_path and Metadata.is_key_path(
keyword, parameter.keywords, metalog_key_ok=parameter.metalog_ok
):
return keyword
elif keyword in parameter.keywords:
return keyword
Expand Down Expand Up @@ -715,6 +743,7 @@ def __init__(
key_path: bool = False,
string_list: Optional[str] = None,
enum: Optional[type[Enum]] = None,
metalog_ok: bool = False,
):
"""Initialize a Parameter object describing a JSON parameter with its
type and attributes.
Expand All @@ -731,6 +760,9 @@ def __init__(
will be split into lists.
enum : An Enum subclass to which an upcased keyword should be
converted.
metalog_ok : Normal key_path keywords are alphanumeric; in order to
accommodate `metadata.log` keys, we allow an expanded character
set in restricted cases.
"""
self.name = name
self.type = type
Expand All @@ -740,6 +772,7 @@ def __init__(
self.key_path = key_path
self.string_list = string_list
self.enum = enum
self.metalog_ok = metalog_ok

def invalid(self, json: JSONOBJECT) -> bool:
"""Check whether the value of this parameter in the JSON document is
Expand Down Expand Up @@ -1535,8 +1568,7 @@ def _build_sql_query(
def _get_dataset_metadata(
self, dataset: Dataset, requested_items: List[str]
) -> JSON:
"""Get requested metadata about a specific Dataset and return a JSON
fragment that can be added to other data about the Dataset.
"""Get requested metadata for a specific Dataset
This supports strict Metadata key/value items associated with the
Dataset as well as selected columns from the Dataset model.
Expand All @@ -1554,30 +1586,27 @@ def _get_dataset_metadata(

metadata = {}
for i in requested_items:
if Metadata.is_key_path(i, Metadata.METADATA_KEYS):
native_key = Metadata.get_native_key(i)
user_id = None
if native_key == Metadata.USER:
user_id = Auth.get_current_user_id()
try:
metadata[i] = Metadata.getvalue(
dataset=dataset, key=i, user_id=user_id
)
except MetadataError:
metadata[i] = None
else:
raise MetadataBadKey(i)
native_key = Metadata.get_native_key(i)
user_id = None
if native_key == Metadata.USER:
user_id = Auth.get_current_user_id()
try:
metadata[i] = Metadata.getvalue(dataset=dataset, key=i, user_id=user_id)
except MetadataError:
metadata[i] = None

return metadata

def _set_dataset_metadata(
self, dataset: Dataset, metadata: dict[str, JSONVALUE]
) -> dict[str, str]:
"""Set metadata on a specific Dataset and return a summary of failures.
"""Set metadata on a specific Dataset
This supports strict Metadata key/value items associated with the
Dataset as well as selected columns from the Dataset model.
Errors are collected and returned.
Args:
dataset: Dataset object
metadata: dict of key/value pairs
Expand Down
1 change: 1 addition & 0 deletions lib/pbench/server/api/resources/datasets_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ def __init__(self, config: PbenchServerConfig):
keywords=Metadata.METADATA_KEYS,
key_path=True,
string_list=",",
metalog_ok=True,
),
),
authorization=ApiAuthorizationType.USER_ACCESS,
Expand Down
1 change: 1 addition & 0 deletions lib/pbench/server/api/resources/datasets_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ def __init__(self, config: PbenchServerConfig):
keywords=Metadata.METADATA_KEYS,
key_path=True,
string_list=",",
metalog_ok=True,
)
),
authorization=ApiAuthorizationType.DATASET,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ def __init__(self, config: PbenchServerConfig):
keywords=Metadata.METADATA_KEYS,
key_path=True,
string_list=",",
metalog_ok=True,
),
),
authorization=ApiAuthorizationType.DATASET,
Expand Down
37 changes: 29 additions & 8 deletions lib/pbench/server/database/models/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -779,8 +779,25 @@ def is_key_path(key: str, valid: List[str], metalog_key_ok: bool = False) -> boo
def getvalue(
dataset: Dataset, key: str, user_id: Optional[str] = None
) -> Optional[JSON]:
"""Returns the value of the specified key, which may be a dotted
hierarchical path (e.g., "server.deleted").
"""Returns the value of the specified metadata key.
A metadata key may be a dotted hierarchical path, like "server.deleted"
or "dataset.metalog.pbench.script".
This code assumes that any necessary syntax checks have been made by a
caller. Normally, user key update is restricted to a specific set of
keys and namespaces (first level key like "server" or "global"), and
keys are lowercase and alphanumeric.
In the "dataset.metalog" sub-namespace, keys originate from a Pbench
Agent "metadata.log" config file, which may include non-alphanumeric
keys such as "iterations/1-tcp_stream-64B-1i". Pbench Server APIs may
allow access to these keys in limited circumstances for display and
filtering, and we don't interfere with that ability here. The namespace
key is always lowercased to find the SQL row; however other keys in the
path are opportunistically lowercased only if an exact match isn't
found. This allows "DATASET.SERVER" to match "dataset.server" but also
allows matching "dataset.metalog.iterations/1-tcp_stream-64B-1i".
The specific value of the dotted key path is returned, not the top
level Metadata object. The full JSON value of a top level key can be
Expand Down Expand Up @@ -812,10 +829,11 @@ def getvalue(
Returns:
Value of the key path
"""
if not Metadata.is_key_path(key, Metadata.METADATA_KEYS):
keys = key.split(".")
# Disallow ".." and trailing "."
if "" in keys:
raise MetadataBadKey(key)
keys = key.lower().split(".")
native_key = keys.pop(0)
native_key = keys.pop(0).lower()
if native_key == "dataset":
value = dataset.as_dict()
else:
Expand All @@ -831,10 +849,13 @@ def getvalue(
# the stored data; let the caller know with an exception.
if type(value) is not dict:
raise MetadataBadStructure(dataset, key, name)
if i not in value:
if i in value:
name = i
elif i.lower() in value:
name = i.lower()
else:
return None
value = value[i]
name = i
value = value[name]
return value

@staticmethod
Expand Down
26 changes: 25 additions & 1 deletion lib/pbench/test/unit/server/query_apis/test_datasets_detail.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from pbench.server.api.resources.query_apis.datasets.datasets_detail import (
DatasetsDetail,
)
from pbench.server.database.models.datasets import Dataset, Metadata
from pbench.test.unit.server.conftest import generate_token
from pbench.test.unit.server.query_apis.commons import Commons

Expand Down Expand Up @@ -197,7 +198,29 @@ def test_metadata(
focus on the database dataset metadata... but necessarily has to borrow
much of the setup.
"""
query_params = {"metadata": ["global.seen", "server.deletion"]}

# In order to test that this API supports getting non-alphanumeric
# metalog metadata keys, we decorate the dataset with one. To keep it
# simple, just remove the existing Metadata row and create a new one.
drb = Dataset.query(name="drb")
Metadata.delete(Metadata.get(drb, Metadata.METALOG))
Metadata.create(
dataset=drb,
key=Metadata.METALOG,
value={
"iterations/fooBar=10-what_else@weird": {
"iteration_name": "fooBar=10-what_else@weird"
},
"run": {"controller": "node1.example.com"},
},
)
query_params = {
"metadata": [
"global.seen",
"server.deletion",
"dataset.metalog.iterations/fooBar=10-what_else@weird.iteration_name",
]
}

response_payload = {
"took": 112,
Expand Down Expand Up @@ -319,6 +342,7 @@ def test_metadata(
"serverMetadata": {
"global.seen": None,
"server.deletion": "2022-12-26",
"dataset.metalog.iterations/fooBar=10-what_else@weird.iteration_name": "fooBar=10-what_else@weird",
},
}
assert expected == res_json
Expand Down
61 changes: 56 additions & 5 deletions lib/pbench/test/unit/server/test_datasets_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,7 @@ def test_dataset_paged_list(self, query_as, login, query, results, server_config
query: A JSON representation of the query parameters (these will be
automatically supplemented with a metadata request term)
results: A list of the dataset names we expect to be returned
server_config: The PbenchServerConfig object
"""
query.update({"metadata": ["dataset.uploaded"], "limit": 5})
result = query_as(query, login, HTTPStatus.OK)
Expand All @@ -319,10 +320,6 @@ def test_unauth_dataset_list(self, query_as):
Args:
query_as: A fixture to provide a helper that executes the API call
login: The username as which to perform a query
query: A JSON representation of the query parameters (these will be
automatically supplemented with a metadata request term)
results: A list of the dataset names we expect to be returned
"""
query_as({"access": "private"}, None, HTTPStatus.UNAUTHORIZED)

Expand All @@ -345,7 +342,7 @@ def test_get_bad_keys(self, query_as):
)
}

def test_get_get_errors(self, server_config, query_as):
def test_get_key_errors(self, query_as):
"""Test case reporting key errors
Args:
Expand Down Expand Up @@ -382,6 +379,60 @@ def test_get_get_errors(self, server_config, query_as):
"total": 3,
}

def test_use_funk_metalog_keys(self, query_as):
    """Test funky metadata.log keys

    Normally we constrain metadata keys to lowercase alphanumeric strings.
    Traditional Pbench Agent `metadata.log` files contain keys constructed
    from benchmark iteration values that can contain mixed case and symbol
    characters. We allow these keys to be filtered and retrieved, but not
    created, so test that we can filter on a funky key value and return
    the key.

    Args:
        query_as: Query helper fixture
    """
    fio_1 = Dataset.query(name="fio_1")
    # Decorate the dataset with a metadata.log namespace containing a
    # benchmark-generated key using mixed case and symbol characters.
    Metadata.create(
        dataset=fio_1,
        key=Metadata.METALOG,
        value={
            "pbench": {
                "date": "2020-02-15T00:00:00",
                "config": "test1",
                "script": "unit-test",
                "name": "fio_1",
            },
            "iterations/fooBar=10-what_else@weird": {
                "iteration_name": "fooBar=10-what_else@weird"
            },
            "run": {"controller": "node1.example.com"},
        },
    )
    # Both the "metadata" retrieval path and the "filter" expression must
    # accept the expanded metadata.log key character set.
    response = query_as(
        {
            "metadata": "dataset.metalog.iterations/fooBar=10-what_else@weird",
            "filter": "dataset.metalog.iterations/fooBar=10-what_else@weird.iteration_name:~10",
        },
        "drb",
        HTTPStatus.OK,
    )
    assert response.json == {
        "next_url": "",
        "results": [
            {
                "metadata": {
                    "dataset.metalog.iterations/fooBar=10-what_else@weird": {
                        "iteration_name": "fooBar=10-what_else@weird"
                    }
                },
                "name": "fio_1",
                "resource_id": "random_md5_string3",
            }
        ],
        "total": 1,
    }

def test_get_unknown_keys(self, query_as):
"""Test case requesting non-existent query parameter keys.
Expand Down
Loading

0 comments on commit afb9bef

Please sign in to comment.