From 83411ab0cbacd35e172d8fa3bf17f0139d711bf1 Mon Sep 17 00:00:00 2001 From: David Butenhof Date: Fri, 14 Apr 2023 17:36:21 -0400 Subject: [PATCH 1/6] Add typed filtering PBENCH-1124 Support type-cast filter expressions in `GET /datasets`. The primary objective is to support a paginated "Expiring Soon" view in the dashboard, requiring the ability to look for datasets expiring before a fixed timestamp. Previously, `GET /datasets?filter` worked by casting all SQL data to "string" and then comparing against the raw extracted string from the query parameter. Now it's possible to identify a type as well as additional comparison operators. For example, `GET /datasets?filter=server.deletion:<2023-05-01:date` will select all datasets with expiration timestamps earlier than 2023-05-1. To target this specific capability, the functional tests now override the default `server.deletion` on some uploads and verify that those datasets are returned by the filtered query. --- jenkins/run-server-func-tests | 13 +- lib/pbench/client/__init__.py | 13 + .../server/api/resources/datasets_list.py | 304 +++++++++++++++--- lib/pbench/test/functional/server/test_put.py | 92 +++++- .../test/unit/server/test_datasets_list.py | 101 +++++- 5 files changed, 449 insertions(+), 74 deletions(-) diff --git a/jenkins/run-server-func-tests b/jenkins/run-server-func-tests index d5eb268ad8..09708af384 100755 --- a/jenkins/run-server-func-tests +++ b/jenkins/run-server-func-tests @@ -21,11 +21,19 @@ elif [[ -n "${1}" ]]; then exit 2 fi +function dump_journal { + printf -- "+++ journalctl dump +++\n" + # Try to capture the functional test container's logs. + podman exec ${PB_SERVER_CONTAINER_NAME} journalctl + printf -- "\n--- journalctl dump ---\n\n" +} + function cleanup { if [[ -n "${cleanup_flag}" ]]; then # Remove the Pbench Server container and the dependencies pod which we # just created and ran; remove any dangling containers; and then remove # any dangling images. + dump_journal echo "Forcefully removing the Pbench Server container..." >&2 podman rm --force --ignore ${PB_SERVER_CONTAINER_NAME} echo "Forcefully removing the Pbench Support Services pod..." >&2 @@ -85,10 +93,7 @@ fi if [[ ${exit_status} -ne 0 ]]; then printf -- "\nFunctional tests exited with code %s\n" ${exit_status} - printf -- "+++ journalctl dump +++\n" - # Try to capture the functional test container's logs. - podman exec ${PB_SERVER_CONTAINER_NAME} journalctl - printf -- "\n--- journalctl dump ---\n\n" + dump_journal fi if [[ -z "${cleanup_flag}" ]]; then diff --git a/lib/pbench/client/__init__.py b/lib/pbench/client/__init__.py index 6015509644..c7135f9a14 100644 --- a/lib/pbench/client/__init__.py +++ b/lib/pbench/client/__init__.py @@ -489,3 +489,16 @@ def update( uri_params={"dataset": dataset_id}, params=params, ).json() + + def get_settings(self, key: Optional[str] = None) -> JSONOBJECT: + """Return requested metadata for a specified dataset. 
+ + Args: + dataset_id: the resource ID of the targeted dataset + metadata: a list of metadata keys to return + + Returns: + A JSON document containing the requested key values + """ + params = {"key": key} if key else None + return self.get(api=API.SERVER_SETTINGS, uri_params=params).json() diff --git a/lib/pbench/server/api/resources/datasets_list.py b/lib/pbench/server/api/resources/datasets_list.py index 58e1dae0ec..cf70fb2ddf 100644 --- a/lib/pbench/server/api/resources/datasets_list.py +++ b/lib/pbench/server/api/resources/datasets_list.py @@ -1,14 +1,16 @@ +from dataclasses import dataclass from http import HTTPStatus -from typing import Any +import logging +from typing import Any, Callable, Optional from urllib.parse import urlencode, urlparse from flask import current_app from flask.json import jsonify from flask.wrappers import Request, Response -from sqlalchemy import and_, asc, cast, desc, func, or_, String +from sqlalchemy import and_, asc, Boolean, cast, desc, func, Integer, or_, String from sqlalchemy.exc import ProgrammingError, StatementError from sqlalchemy.orm import aliased, Query -from sqlalchemy.sql.expression import Alias +from sqlalchemy.sql.expression import Alias, BinaryExpression, ColumnElement from pbench.server import JSON, JSONOBJECT, OperationCode, PbenchServerConfig from pbench.server.api.resources import ( @@ -20,12 +22,17 @@ ApiMethod, ApiParams, ApiSchema, + convert_boolean, + convert_date, + convert_int, + convert_string, Parameter, ParamType, Schema, ) import pbench.server.auth.auth as Auth from pbench.server.database.database import Database +from pbench.server.database.models import TZDateTime from pbench.server.database.models.datasets import ( Dataset, Metadata, @@ -34,6 +41,67 @@ ) +@dataclass +class Type: + """Link query filter type data + + Link the filter type name, the equivalent SQLAlchemy type, and the Pbench + API conversion method to create a compatible object from a string. + + Fields: + sqtype: The base SQLAlchemy type corresponding to a Python target type + convert: The Pbench API conversion method to verify and convert a + string + """ + + sqltype: Any + convert: Callable[[str, Any], Any] + + +"""Associate the name of a filter type to the Type record describing it.""" +TYPES = { + "bool": Type(Boolean, convert_boolean), + "date": Type(TZDateTime, convert_date), + "int": Type(Integer, convert_int), + "str": Type(String, convert_string), +} + + +"""Define the set of operators we allow in query filters. + +This maps a symbolic name to a SQLAlchemy filter method on the column. +""" +OPERATORS = { + "~": "contains", + "=": "__eq__", + "<": "__lt__", + ">": "__gt__", + "<=": "__le__", + ">=": "__ge__", + "!=": "__ne__", +} + + +def make_operator( + expression: ColumnElement, operator: str, value: Any +) -> BinaryExpression: + """Return the SQLAlchemy operator method of a column or JSON expression. + + Args: + expression: A SQLAlchemy expression or column + operator: The operator prefix + value: The value to be compared against + + Returns: + A SQLAlchemy filter expression. 
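+
+    For example (illustrative only), given the OPERATORS table above:
+
+        make_operator(Dataset.name, "~", "fio")  # -> Dataset.name.contains("fio")
+        make_operator(Dataset.name, "=", "fio")  # -> Dataset.name == "fio"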
+ """ + x = getattr(expression, OPERATORS[operator])(value) + current_app.logger.info( + "Expression type {} yields type {}", type(expression).__name__, type(x).__name__ + ) + return x + + def urlencode_json(json: JSON) -> str: """We must properly encode the metadata query parameter as a list of keys.""" new_json = {} @@ -42,6 +110,135 @@ def urlencode_json(json: JSON) -> str: return urlencode(new_json) +class Term: + """Help parsing a filter term.""" + + def __init__(self, term: str): + self.chain = None + self.key = None + self.value = None + self.term = term + self.buffer = term + self.logger = logging.getLogger("term") + + def _remove_prefix( + self, prefixes: tuple[str], default: Optional[str] = None + ) -> Optional[str]: + """Remove a string prefix. + + Looking at the current buffer, remove a known prefix before processing + the next character. A list of prefixes may be specified: they're sorted + longest first so that we don't accidentally match a substring. + + Args: + prefixes: a collection of prefix strings + default: the default prefix if none appears in the string. + + Returns: + The prefix, or the default. + """ + prefix = default + for p in sorted(prefixes, key=lambda k: len(k), reverse=True): + if self.buffer.startswith(p): + prefix = self.buffer[: len(p)] + self.buffer = self.buffer[len(p) :] + break + return prefix + + def _next_token(self, optional: bool = False) -> str: + """Extract a string from the head of the buffer. + + To filter using metadata keys (e.g., unusual benchmark generated + metadata.log keys) or values (e.g. a date/time string) which may + contain ":" symbols, the key or value may be enclosed in quotes. + Either single or double quotes are allowed. + + We're looking for the ":" symbol delimiting the next term of the filter + expression, or (if that's marked "optional") the end of the buffer. + + Args: + optional: The terminating ":" separator is optional. + + Returns: + The token, unquoted + """ + buffer = self.buffer + if buffer.startswith(("'", '"')): + quote = buffer[:1] + end = buffer.find(quote, 1) + if end < 0: + raise APIAbort( + HTTPStatus.BAD_REQUEST, f"Bad quote termination in {self.term!r}" + ) + next = buffer[1:end] + end += 1 + else: + end = buffer.find(":") + if end < 0: + if not optional: + raise APIAbort( + HTTPStatus.BAD_REQUEST, f"Missing terminator for {buffer!r}" + ) + next = buffer + end = len(buffer) + else: + next = buffer[:end] + buffer = buffer[end:] + if not optional and (len(buffer) == 0 or not buffer.startswith(":")): + self.logger.info( + "[optional %s, len %d, next %s]", optional, len(buffer), buffer[0] + ) + raise APIAbort(HTTPStatus.BAD_REQUEST, f"Missing terminator for {buffer!r}") + buffer = buffer[1:] + self.buffer = buffer + return next + + def parse_filter(self) -> "Term": + """Parse a filter term like "server.deletion:[]value:type" + + Returns a dictionary with "native_key", "full_key", "operator" (default + "="), "value", and "type" fields. + + The key and value can be quoted to include mixed case and symbols, + including the ":"character, by starting and starting and ending the key + or value with the "'" or '"' character, e.g., + "'dataset.metalog.tool/iteration:1':'foo:1#bar':str". 
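+
+        For example (illustrative), "server.deletion:<2023-05-01:date" parses
+        into key "server.deletion", operator "<", value "2023-05-01", and type
+        "date", while "^dataset.name:~fio" parses into chain "^", key
+        "dataset.name", operator "~", value "fio", and the default "str" type.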
+ + Returns: + self + """ + + # The "chain" allows chaining multiple expressions as an OR + self.chain = self._remove_prefix(("^",)) + + # The metadata key token + self.key = self._next_token() + if not Metadata.is_key_path( + self.key, Metadata.METADATA_KEYS, metalog_key_ok=True + ): + raise APIAbort(HTTPStatus.BAD_REQUEST, str(MetadataBadKey(self.key))) + + # The comparison operator (defaults to "=") + self.operator = self._remove_prefix(OPERATORS.keys(), default="=") + + # The filter value + self.value = self._next_token(optional=True) + + # The comparison type, defaults to "str" + self.type = self.buffer.lower() + if self.type and self.type not in TYPES: + raise APIAbort( + HTTPStatus.BAD_REQUEST, + f"The filter type {self.type!r} must be one of {','.join(t for t in TYPES.keys())}", + ) + if not self.key or not self.value: + raise APIAbort( + HTTPStatus.BAD_REQUEST, + f"filter {self.term!r} must have the form 'k:v[:type]'", + ) + return self + + class DatasetsList(ApiBase): """API class to list datasets based on database metadata.""" @@ -159,14 +356,41 @@ def filter_query( 'saved': True}` because there's no mechanism to parse JSON expressions. This might be added in the future. - To match against a key value, separate the key path from the value with - a `:` character, e.g., "dataset.name:fio". This produces an exact match - expression as an AND (required) term in the query: only datasets with - the exact name "fio" will be returned. - - To match against a subset of a key value, insert a tilde (~) following - the `:`, like "dataset.name:~fio". This produces a "contains" filter - that will match against any name containing the substring "fio". + Specify the value to match separated from the key path by a `:` + character, e.g., "dataset.name:fio". By default this produces an exact + match expression as an AND (required) term in the query: only datasets + with the exact name "fio" will be returned. + + The value may be prefixed by an optional operator in order to perform a + comparison other than strict equality: + + = Look for matches strictly equal to the specified value (default) + ~ Look for matches containing the specified value as a substring + > Look for matches strictly greater than the specified value + < Look for matches strictly less than the specified value + >= Look for matches greater than or equal to the specified value + <= Look for matches less than or equal to the specified value + != Look for matches not strictly equal to the specified value + + After the value, you can optionally specify a type for the comparison. + Note that an incompatible type (other than the default "str") for a + primary "dataset" column will be rejected, but general Metadata + (including "dataset.metalog") is stored as generic JSON and the API + will attempt to cast the selected data to the specified type. + + str Perform a string match + int Perform an integer match + bool Perform a boolean match (boolean values are t[rue], f[alse], + y[es], and n[o]) + date Perform a date match: the selected key must have a string value + in ISO-8601 format, and will be interpreted as UTC if no time + zone is specified. + + For example + + dataset.uploaded:>2023-05-01:date + global.dashboard.seen:t:bool + dataset.metalog.pbench.script:!=fio To produce an OR (optional) filter term, you can prefix the `key:value` with the caret (^) character. 
Adjacent OR terms will be combined in a @@ -177,8 +401,8 @@ def filter_query( AND and OR terms can be chained by ordering them: - "dataset.name:~fio,^global.dashboard.saved:true, - ^user.dashboard.favorite:true,server.origin:RIYA" + "dataset.name:~fio,^global.dashboard.saved:true:bool, + ^user.dashboard.favorite:true:bool,server.origin:RIYA" will match any dataset with a name containing the substring "fio" which ALSO has been marked in the dashboard as either "saved" OR (by the @@ -223,24 +447,13 @@ def filter_query( and_list = [] for kw in filters: combine_or = False - contains = False - try: - k, v = kw.split(":", maxsplit=1) - except ValueError: - raise APIAbort( - HTTPStatus.BAD_REQUEST, f"filter {kw!r} must have the form 'k:v'" - ) - if k.startswith("^"): + term = Term(kw).parse_filter() + if term.chain == "^": combine_or = True - k = k[1:] - if v.startswith("~"): - contains = True - v = v[1:] - - if not Metadata.is_key_path(k, Metadata.METADATA_KEYS, metalog_key_ok=True): - raise APIAbort(HTTPStatus.BAD_REQUEST, str(MetadataBadKey(k))) - keys = k.split(".") + keys = term.key.split(".") native_key = keys.pop(0).lower() + vtype = term.type if term.type else "str" + value = TYPES[vtype].convert(term.value, None) filter = None if native_key == Metadata.DATASET: @@ -254,19 +467,20 @@ def filter_query( else: try: c = getattr(Dataset, second) - try: - is_str = c.type.python_type is str - except NotImplementedError: - is_str = False - if is_str: - column = c - else: + if vtype == "str" and not isinstance(c.type, String): column = cast(getattr(Dataset, second), String) + else: + if not isinstance(c.type, TYPES[vtype].sqltype): + raise APIAbort( + HTTPStatus.BAD_REQUEST, + f"Type {vtype} of value {value} is not compatible with dataset column {c.name}", + ) + column = c except AttributeError as e: raise APIAbort( - HTTPStatus.BAD_REQUEST, str(MetadataBadKey(k)) + HTTPStatus.BAD_REQUEST, str(MetadataBadKey(term.key)) ) from e - filter = column.contains(v) if contains else column == v + filter = make_operator(column, term.operator, value) elif native_key == Metadata.USER: # The user namespace requires special handling because the # values are always qualified by the owning user rather than @@ -275,14 +489,20 @@ def filter_query( if not user_id: raise APIAbort( HTTPStatus.UNAUTHORIZED, - f"Metadata key {k} cannot be used by an unauthenticated client", + f"Metadata key {term.key} cannot be used by an unauthenticated client", ) # NOTE: We don't want to *evaluate* the filter expression here, so - # check explicitly for None. + # check explicitly for None. I.e., "we have no filter" rather than + # "the evaluated result of this filter is falsey". 
if filter is None: - expression = aliases[native_key].value[keys].as_string() - filter = expression.contains(v) if contains else expression == v + expression = ( + aliases[native_key] + .value[keys] + .as_string() + .cast(TYPES[vtype].sqltype) + ) + filter = make_operator(expression, term.operator, value) if combine_or: or_list.append(filter) diff --git a/lib/pbench/test/functional/server/test_put.py b/lib/pbench/test/functional/server/test_put.py index b28061ee4a..b5389890ca 100644 --- a/lib/pbench/test/functional/server/test_put.py +++ b/lib/pbench/test/functional/server/test_put.py @@ -1,11 +1,12 @@ from dataclasses import dataclass -from datetime import datetime, timedelta +from datetime import datetime, timedelta, timezone from http import HTTPStatus import os.path from pathlib import Path import re import time +import dateutil.parser import pytest from requests.exceptions import HTTPError @@ -14,6 +15,11 @@ TARBALL_DIR = Path("lib/pbench/test/functional/server/tarballs") SPECIAL_DIR = TARBALL_DIR / "special" +SHORT_EXPIRATION_DAYS = 10 + + +def utc_from_str(date: str) -> datetime: + return dateutil.parser.parse(date).replace(tzinfo=timezone.utc) @dataclass @@ -32,17 +38,22 @@ def test_upload_all(self, server_client: PbenchServerClient, login_user): """Upload each of the pregenerated tarballs, and perform some basic sanity checks on the resulting server state. """ - print(" ... reporting behaviors ...") + print(" ... uploading tarballs ...") tarballs: dict[str, Tarball] = {} access = ["private", "public"] cur_access = 0 + # We're going to make some datasets expire "soon" (much sooner than the + # default) so we can test filtering typed metadata. + expire_soon = datetime.now(timezone.utc) + timedelta(days=SHORT_EXPIRATION_DAYS) + for t in TARBALL_DIR.glob("*.tar.xz"): a = access[cur_access] if a == "public": metadata = ( - "server.origin:test,user.pbench.access:public,server.archiveonly:n" + "server.origin:test,user.pbench.access:public,server.archiveonly:n", + f"server.deletion:{expire_soon:%Y-%m-%d %H:%M%z}", ) else: metadata = None @@ -53,7 +64,7 @@ def test_upload_all(self, server_client: PbenchServerClient, login_user): assert ( response.status_code == HTTPStatus.CREATED ), f"upload returned unexpected status {response.status_code}, {response.text} ({t})" - print(f"\t... uploaded {t.name}") + print(f"\t... uploaded {t.name}: {a}") datasets = server_client.get_list( metadata=["dataset.access", "server.tarball-path", "dataset.operations"] @@ -182,7 +193,7 @@ def check_indexed(server_client: PbenchServerClient, datasets): not_indexed = [] try: for dataset in datasets: - print(f"\t... on {dataset.metadata['server.tarball-path']}") + print(f"\t... on {dataset.name}") metadata = server_client.get_metadata( dataset.resource_id, ["dataset.operations"] ) @@ -217,8 +228,7 @@ def test_index_all(self, server_client: PbenchServerClient, login_user): state and metadata look good. """ tarball_names = frozenset(t.name for t in TARBALL_DIR.glob("*.tar.xz")) - - print(" ... reporting behaviors ...") + print(" ... reporting dataset status ...") # Test get_list pagination: to avoid forcing a list, we'll count the # iterations separately. 
(Note that this is really an implicit test @@ -233,6 +243,7 @@ def test_index_all(self, server_client: PbenchServerClient, login_user): "dataset.access", "server.archiveonly", "server.origin", + "server.deletion", "user.pbench.access", ], ) @@ -258,11 +269,11 @@ def test_index_all(self, server_client: PbenchServerClient, login_user): f" = {exc.response.status_code}, text = {exc.response.text!r}" ) - # For each dataset we find, poll the state until it reaches Indexed - # state, or until we time out. Since the cron jobs run once a minute - # and they start on the minute, we make our 1st check 45 seconds into - # the next minute, and then check at 45 seconds past the minute until - # we reached 5 minutes past the original start time. + # For each dataset we find, poll the state until it has been indexed, + # or until we time out. The indexer runs once a minute slightly after + # the minute, so we make our 1st check 45 seconds into the next minute, + # and then check at 45 seconds past the minute until we reach 5 minutes + # past the original start time. oneminute = timedelta(minutes=1) now = start = datetime.utcnow() timeout = start + timedelta(minutes=5) @@ -280,7 +291,7 @@ def test_index_all(self, server_client: PbenchServerClient, login_user): if os.path.basename(tp) not in tarball_names: continue count += 1 - print(f"\t... indexed {tp}") + print(f"\t... indexed {dataset.name}") now = datetime.utcnow() if not not_indexed or now >= timeout: break @@ -289,7 +300,7 @@ def test_index_all(self, server_client: PbenchServerClient, login_user): now = datetime.utcnow() assert not not_indexed, ( f"Timed out after {(now - start).total_seconds()} seconds; unindexed datasets: " - + ", ".join(d.metadata["server.tarball-path"] for d in not_indexed) + + ", ".join(d.name for d in not_indexed) ) assert ( len(tarball_names) == count @@ -421,6 +432,58 @@ def test_list_filter_and_or(self, server_client: PbenchServerClient, login_user) "2018" in date or "2019" in date ), f"Dataset {dataset.name} date is {date}" + @pytest.mark.dependency(name="list_type", depends=["upload"], scope="session") + def test_list_filter_typed(self, server_client: PbenchServerClient, login_user): + """Test filtering using typed matches. + + We compare the actual `server.deletion` time against what we believe we + set when uploading. Note that the metadata validator adds a full day to + the specified timestamp and then drops the time of day, effectively + rounding up to midnight, so here we add a day to the offset. + """ + test_sets = {} + for t in TARBALL_DIR.glob("*.tar.xz"): + r = Dataset.md5(t) + m = server_client.get_metadata( + r, + [ + "dataset.access", + "dataset.name", + "dataset.uploaded", + "server.deletion", + ], + ) + test_sets[r] = m + soonish = datetime.now(timezone.utc) + timedelta(days=20) + print( + f"... find matches for 'server.deletion:<{soonish:%Y-%m-%dT%H:%M%z}:date'" + ) + + datasets = server_client.get_list( + metadata=[ + "dataset.metalog.pbench.script", + "dataset.uploaded", + "server.deletion", + ], + owner="tester", + # NOTE: we use archaic "US standard" date notation here to minimize + # the risk of succeeding via a simple alphanumeric comparison. 
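+            # (The rendered filter looks something like
+            # "server.deletion:<'05/01/2023 00:00+0000':date"; the value is
+            # quoted because it contains ":" characters.)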
+ filter=[f"server.deletion:<'{soonish:%m/%d/%Y %H:%M%z}':date"], + ) + + for dataset in datasets: + deletion = utc_from_str(dataset.metadata["server.deletion"]) + assert ( + deletion < soonish + ), f"Filter returned {dataset.name}, with expiration out of range ({deletion:%Y-%m-%d})" + test_sets.pop(dataset.resource_id, None) + + for r, m in test_sets.items(): + deletion = utc_from_str(m["server.deletion"]) + assert ( + deletion >= soonish + ), f"Filter failed to return {m['dataset.name']}, with expiration in range ({deletion:%Y-%m-%d})" + class TestUpdate: @pytest.mark.dependency(name="publish", depends=["index"], scope="session") @@ -447,6 +510,7 @@ class TestDelete: "list_and", "list_none", "list_or", + "list_type", "publish", ], scope="session", diff --git a/lib/pbench/test/unit/server/test_datasets_list.py b/lib/pbench/test/unit/server/test_datasets_list.py index 2583d584f7..60692054ab 100644 --- a/lib/pbench/test/unit/server/test_datasets_list.py +++ b/lib/pbench/test/unit/server/test_datasets_list.py @@ -492,18 +492,58 @@ def test_get_repeat_keys(self, query_as): @pytest.mark.parametrize( "filters,expected", [ - (["dataset.name:fio"], "datasets.name = 'fio'"), + (["dataset.name:=fio"], "datasets.name = 'fio'"), ( - ["dataset.metalog.pbench.script:fio"], - "dataset_metadata_1.value[['pbench', 'script']] = 'fio'", + ["dataset.uploaded:>2023-01-01:date"], + "datasets.uploaded > '2023-01-01 00:00:00'", + ), + ( + ["dataset.uploaded:!=2023-01-01:date"], + "datasets.uploaded != '2023-01-01 00:00:00'", + ), + ( + ["server.deletion:<2023-05-01:date"], + "CAST(dataset_metadata_2.value[['deletion']] AS DATETIME) < '2023-05-01 00:00:00'", + ), + ( + ["'dataset.metalog.pbench.script':='fio'"], + "CAST(dataset_metadata_1.value[['pbench', 'script']] AS VARCHAR) = 'fio'", + ), + ( + ["'dataset.metalog.pbench.script':!=fio"], + "CAST(dataset_metadata_1.value[['pbench', 'script']] AS VARCHAR) != 'fio'", + ), + ( + ["dataset.metalog.run.date:~fio"], + "(CAST(dataset_metadata_1.value[['run', 'date']] AS VARCHAR) LIKE '%' || 'fio' || '%')", + ), + ( + ["global.something.integer:<1:int"], + "CAST(dataset_metadata_3.value[['something', 'integer']] AS INTEGER) < 1", + ), + ( + ["global.something.integer:>2:int"], + "CAST(dataset_metadata_3.value[['something', 'integer']] AS INTEGER) > 2", + ), + ( + ["global.something.integer:<=1:int"], + "CAST(dataset_metadata_3.value[['something', 'integer']] AS INTEGER) <= 1", + ), + ( + ["global.something.integer:>=2:int"], + "CAST(dataset_metadata_3.value[['something', 'integer']] AS INTEGER) >= 2", + ), + ( + ["global.something.boolean:t:bool"], + "CAST(dataset_metadata_3.value[['something', 'boolean']] AS BOOLEAN) = true", ), ( ["user.d.f:1"], - "dataset_metadata_4.value[['d', 'f']] = '1'", + "CAST(dataset_metadata_4.value[['d', 'f']] AS VARCHAR) = '1'", ), ( - ["dataset.name:~fio", "^global.x:1", "^user.y:~yes"], - "(datasets.name LIKE '%' || 'fio' || '%') AND (dataset_metadata_3.value[['x']] = '1' OR ((dataset_metadata_4.value[['y']]) LIKE '%' || 'yes' || '%'))", + ["dataset.name:~fio", "^'global.x':1", "^user.y:~yes"], + "(datasets.name LIKE '%' || 'fio' || '%') AND (CAST(dataset_metadata_3.value[['x']] AS VARCHAR) = '1' OR (CAST(dataset_metadata_4.value[['y']] AS VARCHAR) LIKE '%' || 'yes' || '%'))", ), ( ["dataset.uploaded:~2000"], @@ -544,11 +584,11 @@ def test_filter_query(self, monkeypatch, client, filters, expected): (["dataset.name:fio"], "datasets.name = 'fio'"), ( ["dataset.metalog.pbench.script:fio"], - "dataset_metadata_1.value[['pbench', 'script']] = 
'fio'", + "CAST(dataset_metadata_1.value[['pbench', 'script']] AS VARCHAR) = 'fio'", ), ( ["dataset.name:~fio", "^global.x:1"], - "(datasets.name LIKE '%' || 'fio' || '%') AND dataset_metadata_3.value[['x']] = '1'", + "(datasets.name LIKE '%' || 'fio' || '%') AND CAST(dataset_metadata_3.value[['x']] AS VARCHAR) = '1'", ), ( ["dataset.uploaded:~2000"], @@ -596,15 +636,47 @@ def test_user_no_auth(self, monkeypatch, db_session): assert e.value.http_status == HTTPStatus.UNAUTHORIZED @pytest.mark.parametrize( - "meta,error", + "meta,error,message", [ - ("user.foo:1", HTTPStatus.UNAUTHORIZED), - ("x.y:1", HTTPStatus.BAD_REQUEST), - ("global.x=3", HTTPStatus.BAD_REQUEST), - ("dataset.notright:10", HTTPStatus.BAD_REQUEST), + ( + "user.foo:1", + HTTPStatus.UNAUTHORIZED, + "Metadata key user.foo cannot be used by an unauthenticated client", + ), + ("x.y:1", HTTPStatus.BAD_REQUEST, "Metadata key 'x.y' is not supported"), + ( + "global.x=3", + HTTPStatus.BAD_REQUEST, + "Missing terminator for 'global.x=3'", + ), + ( + "dataset.notright:10", + HTTPStatus.BAD_REQUEST, + "Metadata key 'dataset.notright' is not supported", + ), + ( + "'dataset.name:foo", + HTTPStatus.BAD_REQUEST, + 'Bad quote termination in "\'dataset.name:foo"', + ), + ( + "dataset.name:'foo", + HTTPStatus.BAD_REQUEST, + 'Bad quote termination in "dataset.name:\'foo"', + ), + ( + "server.deletion:<2023-05-01:time", + HTTPStatus.BAD_REQUEST, + "The filter type 'time' must be one of bool,date,int,str", + ), + ( + "server.deletion:2023-05-01:date:", + HTTPStatus.BAD_REQUEST, + "The filter type 'date:' must be one of bool,date,int,str", + ), ], ) - def test_filter_errors(self, monkeypatch, db_session, meta, error): + def test_filter_errors(self, monkeypatch, client, meta, error, message): """Test invalid filter expressions.""" monkeypatch.setattr( "pbench.server.api.resources.datasets_list.Auth.get_current_user_id", @@ -614,6 +686,7 @@ def test_filter_errors(self, monkeypatch, db_session, meta, error): with pytest.raises(APIAbort) as e: DatasetsList.filter_query([meta], aliases, query) assert e.value.http_status == error + assert str(e.value) == message @pytest.mark.parametrize( "exception,error", From e3b016ee55560a4536c57348ca77fd1a5823fe1a Mon Sep 17 00:00:00 2001 From: David Butenhof Date: Mon, 17 Apr 2023 07:47:56 -0400 Subject: [PATCH 2/6] Minor cleanup --- .../server/api/resources/datasets_list.py | 6 +--- lib/pbench/test/functional/server/test_put.py | 28 ++++++------------- 2 files changed, 10 insertions(+), 24 deletions(-) diff --git a/lib/pbench/server/api/resources/datasets_list.py b/lib/pbench/server/api/resources/datasets_list.py index cf70fb2ddf..618720e78e 100644 --- a/lib/pbench/server/api/resources/datasets_list.py +++ b/lib/pbench/server/api/resources/datasets_list.py @@ -95,11 +95,7 @@ def make_operator( Returns: A SQLAlchemy filter expression. 
""" - x = getattr(expression, OPERATORS[operator])(value) - current_app.logger.info( - "Expression type {} yields type {}", type(expression).__name__, type(x).__name__ - ) - return x + return getattr(expression, OPERATORS[operator])(value) def urlencode_json(json: JSON) -> str: diff --git a/lib/pbench/test/functional/server/test_put.py b/lib/pbench/test/functional/server/test_put.py index b5389890ca..a2de16f4c1 100644 --- a/lib/pbench/test/functional/server/test_put.py +++ b/lib/pbench/test/functional/server/test_put.py @@ -436,41 +436,30 @@ def test_list_filter_and_or(self, server_client: PbenchServerClient, login_user) def test_list_filter_typed(self, server_client: PbenchServerClient, login_user): """Test filtering using typed matches. - We compare the actual `server.deletion` time against what we believe we - set when uploading. Note that the metadata validator adds a full day to - the specified timestamp and then drops the time of day, effectively - rounding up to midnight, so here we add a day to the offset. + We set an early "server.deletion" time on alternating datasets as we + uploaded them. Now prove that we can use a typed "date" filter to find + them. We'll further validate that none of the other datasets we + uploaded from TARBALL_DIR should have been returned. """ test_sets = {} for t in TARBALL_DIR.glob("*.tar.xz"): r = Dataset.md5(t) m = server_client.get_metadata( r, - [ - "dataset.access", - "dataset.name", - "dataset.uploaded", - "server.deletion", - ], + ["dataset.name", "server.deletion"], ) test_sets[r] = m soonish = datetime.now(timezone.utc) + timedelta(days=20) - print( - f"... find matches for 'server.deletion:<{soonish:%Y-%m-%dT%H:%M%z}:date'" - ) datasets = server_client.get_list( - metadata=[ - "dataset.metalog.pbench.script", - "dataset.uploaded", - "server.deletion", - ], + metadata=["server.deletion"], owner="tester", - # NOTE: we use archaic "US standard" date notation here to minimize + # NOTE: using weird "US standard" date notation here minimizes # the risk of succeeding via a simple alphanumeric comparison. filter=[f"server.deletion:<'{soonish:%m/%d/%Y %H:%M%z}':date"], ) + # Confirm that the returned datasets match for dataset in datasets: deletion = utc_from_str(dataset.metadata["server.deletion"]) assert ( @@ -478,6 +467,7 @@ def test_list_filter_typed(self, server_client: PbenchServerClient, login_user): ), f"Filter returned {dataset.name}, with expiration out of range ({deletion:%Y-%m-%d})" test_sets.pop(dataset.resource_id, None) + # Confirm that the remaining TARBALL_DIR datasets don't match for r, m in test_sets.items(): deletion = utc_from_str(m["server.deletion"]) assert ( From 75ca0286766eea5b373602ddeca0a70974cb707f Mon Sep 17 00:00:00 2001 From: David Butenhof Date: Tue, 18 Apr 2023 10:46:27 -0400 Subject: [PATCH 3/6] Updates --- docs/API/V1/list.md | 71 +++++++++++++------ jenkins/run-server-func-tests | 4 +- lib/pbench/client/__init__.py | 9 ++- .../server/api/resources/datasets_list.py | 50 ++++++------- lib/pbench/test/functional/server/test_put.py | 2 +- .../test/unit/server/test_datasets_list.py | 67 ++++++++++++++++- 6 files changed, 145 insertions(+), 58 deletions(-) diff --git a/docs/API/V1/list.md b/docs/API/V1/list.md index 7cb6872300..b7790dc339 100644 --- a/docs/API/V1/list.md +++ b/docs/API/V1/list.md @@ -45,20 +45,49 @@ specified date. `filter` metadata filtering \ Select datasets matching the metadata expressions specified via `filter` -query parameters. 
Each expression is the name of a metadata key (for example, -`dataset.name`), followed by a colon (`:`) and the comparison string. The -comparison string may be prefixed with a tilde (`~`) to make it a partial -("contains") comparison instead of an exact match. For example, -`dataset.name:foo` looks for datasets with the name "foo" exactly, whereas -`dataset.name:~foo` looks for datasets with a name containing the substring -"foo". - -These may be combined across multiple `filter` query parameters or as -comma-separated lists in a single query parameter. Multiple filter expressions -form an `AND` expression, however consecutive filter expressions can be joined -in an `OR` expression by using the circumflex (`^`) character prior to each. -(The first expression with `^` begins an `OR` list while the first subsequent -expression outout `^` ends the `OR` list and is combined with an `AND`.) +query parameters. Each expression has the format `[chain]key:[op]value[:type]`: + +* `chain` Prefix an expression with `^` (circumflex) to allow combining a set +of expressions with `OR` rather than the default `AND`. +* `key` The name of a metadata key (for example, `dataset.name`) + +* `op` An operator to specify how to compare the key value: + + * `=` (Default) Compare for equality + * `~` Compare against a substring + * `>` Greater than + * `<` Less than + * `>=` Greater than or equal to + * `<=` Less than or equal to + * `!=` Not equal + +* `value` The value to compare against. This will be interpreted based on the specified type. +* `type` The string value will be cast to this type. Any value can be cast to +type `str`. General metadata keys (`server`, `global`, `user`, and +`dataset.metalog` namespaces) that have values incompatible with the specified +type will be ignored. If you specify an incompatible type for a primary +`dataset` key, an error will be returned as these types are defined by the +Pbench schema so no match would be possible. (For example, `dataset.name:2:int` +or `dataset.access:2023-05-01:date`.) + + * `str` (Default) Compare as a string + * `bool` Compare as a boolean + * `int` Compare as an integer + * `date` Compare as a date-time string. ISO-8601 recommended, and UTC is + assumed if no timezone is specified. + +For example, `dataset.name:foo` looks for datasets with the name "foo" exactly, +whereas `dataset.name:~foo` looks for datasets with a name containing the +substring "foo". + +Multiple expressions may be combined across multiple `filter` query parameters +or as comma-separated lists in a single query parameter. Multiple filter +expressions are combined as an `AND` expression, matching only when all +expressions match. However a set of consecutive filter expressions can form +an `OR` expression by using the circumflex (`^`) "chain" character on each +expression term. The first expression with `^` begins an `OR` list while the +first subsequent expression without `^` ends the `OR` list and is combined with +the nested `OR` expression as an `AND`. For example, - `filter=dataset.name:a,server.origin:EC2` returns datasets with a name of @@ -67,12 +96,14 @@ For example, returns datasets with a name of "a" and *either* an origin of "EC2" or generated from the "pbench-fio" script. -_NOTE_: `filter` expression values, like the `true` in -`GET /api/v1/datasets?filter=server.archiveonly:true`, are always interpreted -as strings, so be careful about the string representation of the value (in this -case, a boolean, which is represented in JSON as `true` or `false`). 
Beware -especially when attempting to match a JSON document (such as -`dataset.metalog.pbench`). +_NOTE_: `filter` expression term values, like the `true` in +`GET /api/v1/datasets?filter=server.archiveonly:true`, are by default +interpreted as strings, so be careful about the string representation of the +value. In this case, `server.archiveonly` is a boolean, which will be matched +as a string value `true` or `false`. You can instead specify the expression +term as `server.archiveonly:t:bool` which will treat the specified match value +as a boolean (`t[rue]` or `y[es]` for true, `f[alse]` or `n[o]` for false) and +match against the boolean metadata value. `keysummary` boolean \ Instead of displaying a list of selected datasets and metadata, use the set of diff --git a/jenkins/run-server-func-tests b/jenkins/run-server-func-tests index 09708af384..ccd6a49647 100755 --- a/jenkins/run-server-func-tests +++ b/jenkins/run-server-func-tests @@ -33,7 +33,6 @@ function cleanup { # Remove the Pbench Server container and the dependencies pod which we # just created and ran; remove any dangling containers; and then remove # any dangling images. - dump_journal echo "Forcefully removing the Pbench Server container..." >&2 podman rm --force --ignore ${PB_SERVER_CONTAINER_NAME} echo "Forcefully removing the Pbench Support Services pod..." >&2 @@ -67,6 +66,7 @@ until curl -s -o /dev/null ${SERVER_API_ENDPOINTS}; do if [[ $(date +"%s") -ge ${end_in_epoch_secs} ]]; then echo "Timed out waiting for the reverse proxy to show up!" >&2 exit_status=1 + dump_journal exit ${exit_status} fi sleep 1 @@ -92,8 +92,8 @@ else fi if [[ ${exit_status} -ne 0 ]]; then - printf -- "\nFunctional tests exited with code %s\n" ${exit_status} dump_journal + printf -- "\nFunctional tests exited with code %s\n" ${exit_status} fi if [[ -z "${cleanup_flag}" ]]; then diff --git a/lib/pbench/client/__init__.py b/lib/pbench/client/__init__.py index c7135f9a14..d9358dcfa0 100644 --- a/lib/pbench/client/__init__.py +++ b/lib/pbench/client/__init__.py @@ -490,15 +490,14 @@ def update( params=params, ).json() - def get_settings(self, key: Optional[str] = None) -> JSONOBJECT: - """Return requested metadata for a specified dataset. + def get_settings(self, key: str = "") -> JSONOBJECT: + """Return requested server setting. Args: - dataset_id: the resource ID of the targeted dataset - metadata: a list of metadata keys to return + key: A server settings key; if omitted, return all settings Returns: A JSON document containing the requested key values """ - params = {"key": key} if key else None + params = {"key": key} return self.get(api=API.SERVER_SETTINGS, uri_params=params).json() diff --git a/lib/pbench/server/api/resources/datasets_list.py b/lib/pbench/server/api/resources/datasets_list.py index 618720e78e..a81825ce4e 100644 --- a/lib/pbench/server/api/resources/datasets_list.py +++ b/lib/pbench/server/api/resources/datasets_list.py @@ -43,10 +43,10 @@ @dataclass class Type: - """Link query filter type data + """Help keep track of filter types - Link the filter type name, the equivalent SQLAlchemy type, and the Pbench - API conversion method to create a compatible object from a string. + Used to link a filter type name to the equivalent SQLAlchemy type and the + Pbench API conversion method to create a compatible object from a string. 
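+
+    For example (illustrative), TYPES["date"] below pairs the TZDateTime
+    SQLAlchemy type with the convert_date conversion helper.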
Fields: sqtype: The base SQLAlchemy type corresponding to a Python target type @@ -112,7 +112,9 @@ class Term: def __init__(self, term: str): self.chain = None self.key = None + self.operator = None self.value = None + self.type = None self.term = term self.buffer = term self.logger = logging.getLogger("term") @@ -171,29 +173,25 @@ def _next_token(self, optional: bool = False) -> str: else: end = buffer.find(":") if end < 0: - if not optional: - raise APIAbort( - HTTPStatus.BAD_REQUEST, f"Missing terminator for {buffer!r}" - ) next = buffer end = len(buffer) else: next = buffer[:end] buffer = buffer[end:] - if not optional and (len(buffer) == 0 or not buffer.startswith(":")): - self.logger.info( - "[optional %s, len %d, next %s]", optional, len(buffer), buffer[0] + if buffer.startswith(":"): + buffer = buffer[1:] + elif not optional: + raise APIAbort( + HTTPStatus.BAD_REQUEST, f"Missing ':' terminator in {next!r}" ) - raise APIAbort(HTTPStatus.BAD_REQUEST, f"Missing terminator for {buffer!r}") - buffer = buffer[1:] self.buffer = buffer return next def parse_filter(self) -> "Term": - """Parse a filter term like "server.deletion:[]value:type" + """Parse a filter term like ":[]value[:type]" - Returns a dictionary with "native_key", "full_key", "operator" (default - "="), "value", and "type" fields. + Returns a dictionary with "key", "operator" (default "="), "value", and + "type" fields. The key and value can be quoted to include mixed case and symbols, including the ":"character, by starting and starting and ending the key @@ -366,7 +364,7 @@ def filter_query( < Look for matches strictly less than the specified value >= Look for matches greater than or equal to the specified value <= Look for matches less than or equal to the specified value - != Look for matches not strictly equal to the specified value + != Look for matches not equal to the specified value After the value, you can optionally specify a type for the comparison. Note that an incompatible type (other than the default "str") for a @@ -378,9 +376,9 @@ def filter_query( int Perform an integer match bool Perform a boolean match (boolean values are t[rue], f[alse], y[es], and n[o]) - date Perform a date match: the selected key must have a string value - in ISO-8601 format, and will be interpreted as UTC if no time - zone is specified. + date Perform a date match: the selected key must have a representing + a date-time string (ISO-8601 preferred). UTC is assumed if no + timezone is specified. For example @@ -442,10 +440,8 @@ def filter_query( or_list = [] and_list = [] for kw in filters: - combine_or = False term = Term(kw).parse_filter() - if term.chain == "^": - combine_or = True + combine_or = term.chain == "^" keys = term.key.split(".") native_key = keys.pop(0).lower() vtype = term.type if term.type else "str" @@ -469,7 +465,7 @@ def filter_query( if not isinstance(c.type, TYPES[vtype].sqltype): raise APIAbort( HTTPStatus.BAD_REQUEST, - f"Type {vtype} of value {value} is not compatible with dataset column {c.name}", + f"Type {vtype!r} of value {value!r} is not compatible with dataset column {c.name}", ) column = c except AttributeError as e: @@ -492,12 +488,8 @@ def filter_query( # check explicitly for None. I.e., "we have no filter" rather than # "the evaluated result of this filter is falsey". 
if filter is None: - expression = ( - aliases[native_key] - .value[keys] - .as_string() - .cast(TYPES[vtype].sqltype) - ) + expression = aliases[native_key].value[keys].as_string() + expression = expression.cast(TYPES[vtype].sqltype) filter = make_operator(expression, term.operator, value) if combine_or: diff --git a/lib/pbench/test/functional/server/test_put.py b/lib/pbench/test/functional/server/test_put.py index a2de16f4c1..f868c3ee30 100644 --- a/lib/pbench/test/functional/server/test_put.py +++ b/lib/pbench/test/functional/server/test_put.py @@ -449,7 +449,7 @@ def test_list_filter_typed(self, server_client: PbenchServerClient, login_user): ["dataset.name", "server.deletion"], ) test_sets[r] = m - soonish = datetime.now(timezone.utc) + timedelta(days=20) + soonish = datetime.now(timezone.utc) + timedelta(days=SHORT_EXPIRATION_DAYS * 2) datasets = server_client.get_list( metadata=["server.deletion"], diff --git a/lib/pbench/test/unit/server/test_datasets_list.py b/lib/pbench/test/unit/server/test_datasets_list.py index 60692054ab..616bfdf7c9 100644 --- a/lib/pbench/test/unit/server/test_datasets_list.py +++ b/lib/pbench/test/unit/server/test_datasets_list.py @@ -647,7 +647,17 @@ def test_user_no_auth(self, monkeypatch, db_session): ( "global.x=3", HTTPStatus.BAD_REQUEST, - "Missing terminator for 'global.x=3'", + "Missing ':' terminator in 'global.x=3'", + ), + ( + "'global.x'", + HTTPStatus.BAD_REQUEST, + "Missing ':' terminator in 'global.x'", + ), + ( + "dataset.x':v", + HTTPStatus.BAD_REQUEST, + 'Metadata key "dataset.x\'" is not supported', ), ( "dataset.notright:10", @@ -688,6 +698,61 @@ def test_filter_errors(self, monkeypatch, client, meta, error, message): assert e.value.http_status == error assert str(e.value) == message + @pytest.mark.parametrize( + "query,results", + [ + ("global.legacy:t:bool", ["drb"]), + ("global.legacy:>2:int", ["fio_2"]), + ("global.legacy:>2023-05-01:date", []), + ], + ) + def test_mismatched_json_cast(self, query_as, server_config, query, results): + """Verify DB engine behavior for mismatched metadata casts. + + Verify that a typed filter ignores datasets where the metadata key + type isn't compatible with the implicit cast. + """ + drb = Dataset.query(name="drb") + fio_1 = Dataset.query(name="fio_1") + fio_2 = Dataset.query(name="fio_2") + + Metadata.setvalue(dataset=drb, key="global.legacy", value=True) + Metadata.setvalue(dataset=fio_1, key="global.legacy.server", value="ABC") + Metadata.setvalue(dataset=fio_2, key="global.legacy", value=4) + + response = query_as( + {"filter": query, "metadata": ["dataset.uploaded"]}, + "drb", + HTTPStatus.OK, + ) + self.compare_results(response.json, results, {}, server_config) + + @pytest.mark.parametrize( + "query,message", + [ + ( + "dataset.name:t:bool", + "Type 'bool' of value True is not compatible with dataset column name", + ), + ( + "dataset.uploaded:>2:int", + "Type 'int' of value 2 is not compatible with dataset column uploaded", + ), + ], + ) + def test_mismatched_dataset_cast(self, query_as, server_config, query, message): + """Verify DB engine behavior for mismatched metadata casts. + + Verify that a typed filter ignores datasets where the metadata key + type isn't compatible with the implicit cast. 
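+
+        For example (illustrative), "dataset.name:t:bool" applies a "bool"
+        filter type to the string-typed primary column "dataset.name".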
+ """ + response = query_as( + {"filter": query, "metadata": ["dataset.uploaded"]}, + "drb", + HTTPStatus.BAD_REQUEST, + ) + assert response.json["message"] == message + @pytest.mark.parametrize( "exception,error", ( From fd563bbada910ebd1d95ce1d1705dd221d922e48 Mon Sep 17 00:00:00 2001 From: David Butenhof Date: Wed, 19 Apr 2023 16:01:02 -0400 Subject: [PATCH 4/6] Add `name` documentation! --- docs/API/V1/list.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/API/V1/list.md b/docs/API/V1/list.md index b7790dc339..3b4f550c64 100644 --- a/docs/API/V1/list.md +++ b/docs/API/V1/list.md @@ -136,6 +136,10 @@ Allows filtering for datasets owned by the authenticated client (if the value is omitted, e.g., `?mine` or `?mine=true`) or owned by *other* users (e.g., `?mine=false`). +`name` string \ +Select only datasets with a specified substring in their name. The filter +`?name=fio` is semantically equivalent to `?filter=dataset.name:~fio`. + `offset` integer \ "Paginate" the selected datasets by skipping the first `offset` datasets that would have been selected by the other query terms. This can be used with From ced6fc56a2d09bad5ca5efc1dc2c2e00b93c5e2d Mon Sep 17 00:00:00 2001 From: David Butenhof Date: Thu, 20 Apr 2023 16:07:39 -0400 Subject: [PATCH 5/6] More tweaks --- docs/API/V1/list.md | 19 +++++++------- .../server/api/resources/datasets_list.py | 25 +++++++++++++------ .../test/unit/server/test_datasets_list.py | 10 ++++---- 3 files changed, 32 insertions(+), 22 deletions(-) diff --git a/docs/API/V1/list.md b/docs/API/V1/list.md index 3b4f550c64..66868118c0 100644 --- a/docs/API/V1/list.md +++ b/docs/API/V1/list.md @@ -83,24 +83,25 @@ substring "foo". Multiple expressions may be combined across multiple `filter` query parameters or as comma-separated lists in a single query parameter. Multiple filter expressions are combined as an `AND` expression, matching only when all -expressions match. However a set of consecutive filter expressions can form -an `OR` expression by using the circumflex (`^`) "chain" character on each -expression term. The first expression with `^` begins an `OR` list while the -first subsequent expression without `^` ends the `OR` list and is combined with -the nested `OR` expression as an `AND`. +expressions match. However any consecutive set of expressions starting with `^` +are collected into an "`OR` list" that will be `AND`-ed with the surrounding +terms. For example, - `filter=dataset.name:a,server.origin:EC2` returns datasets with a name of "a" and an origin of "EC2". -- `filter=dataset.name:a,^server.origin:EC2,^dataset.metalog.pbench.script:fio` -returns datasets with a name of "a" and *either* an origin of "EC2" or generated -from the "pbench-fio" script. +- `filter=dataset.name:~andy,^server.origin:EC2,^server.origin:RIYA, +dataset.access:public` +returns only "public" datasets with a name containing the string "andy" which also +have an origin of either "EC2" or "RIYA". As a SQL query, we might write it +as `dataset.name like "%andy%" and (server.origin = 'EC2' or +server.origin = 'RIYA') and dataset.access = 'public'`. _NOTE_: `filter` expression term values, like the `true` in `GET /api/v1/datasets?filter=server.archiveonly:true`, are by default interpreted as strings, so be careful about the string representation of the value. In this case, `server.archiveonly` is a boolean, which will be matched -as a string value `true` or `false`. You can instead specify the expression +as a string value "true" or "false". 
You can instead specify the expression term as `server.archiveonly:t:bool` which will treat the specified match value as a boolean (`t[rue]` or `y[es]` for true, `f[alse]` or `n[o]` for false) and match against the boolean metadata value. diff --git a/lib/pbench/server/api/resources/datasets_list.py b/lib/pbench/server/api/resources/datasets_list.py index a81825ce4e..76eed314f9 100644 --- a/lib/pbench/server/api/resources/datasets_list.py +++ b/lib/pbench/server/api/resources/datasets_list.py @@ -119,6 +119,15 @@ def __init__(self, term: str): self.buffer = term self.logger = logging.getLogger("term") + @classmethod + def parse(cls, term: str) -> "Term": + """Factory method to construct an instance and parse a string + + Args: + term: A filter expression to parse + """ + return cls(term).parse_filter() + def _remove_prefix( self, prefixes: tuple[str], default: Optional[str] = None ) -> Optional[str]: @@ -188,7 +197,7 @@ def _next_token(self, optional: bool = False) -> str: return next def parse_filter(self) -> "Term": - """Parse a filter term like ":[]value[:type]" + """Parse a filter term like ":[][:]" Returns a dictionary with "key", "operator" (default "="), "value", and "type" fields. @@ -219,7 +228,7 @@ def parse_filter(self) -> "Term": self.value = self._next_token(optional=True) # The comparison type, defaults to "str" - self.type = self.buffer.lower() + self.type = self.buffer.lower() if self.buffer else "str" if self.type and self.type not in TYPES: raise APIAbort( HTTPStatus.BAD_REQUEST, @@ -376,9 +385,9 @@ def filter_query( int Perform an integer match bool Perform a boolean match (boolean values are t[rue], f[alse], y[es], and n[o]) - date Perform a date match: the selected key must have a representing - a date-time string (ISO-8601 preferred). UTC is assumed if no - timezone is specified. + date Perform a date match: the selected key value (and supplied + filter value) must be strings representing a date-time, ideally + in ISO-8601 format. UTC is assumed if no timezone is specified. For example @@ -440,11 +449,11 @@ def filter_query( or_list = [] and_list = [] for kw in filters: - term = Term(kw).parse_filter() + term = Term.parse(kw) combine_or = term.chain == "^" keys = term.key.split(".") native_key = keys.pop(0).lower() - vtype = term.type if term.type else "str" + vtype = term.type value = TYPES[vtype].convert(term.value, None) filter = None @@ -465,7 +474,7 @@ def filter_query( if not isinstance(c.type, TYPES[vtype].sqltype): raise APIAbort( HTTPStatus.BAD_REQUEST, - f"Type {vtype!r} of value {value!r} is not compatible with dataset column {c.name}", + f"Filter of type {vtype!r} is not compatible with key 'dataset.{c.name}'", ) column = c except AttributeError as e: diff --git a/lib/pbench/test/unit/server/test_datasets_list.py b/lib/pbench/test/unit/server/test_datasets_list.py index 616bfdf7c9..6e15a162ba 100644 --- a/lib/pbench/test/unit/server/test_datasets_list.py +++ b/lib/pbench/test/unit/server/test_datasets_list.py @@ -710,7 +710,7 @@ def test_mismatched_json_cast(self, query_as, server_config, query, results): """Verify DB engine behavior for mismatched metadata casts. Verify that a typed filter ignores datasets where the metadata key - type isn't compatible with the implicit cast. + type isn't compatible with the required cast. 
""" drb = Dataset.query(name="drb") fio_1 = Dataset.query(name="fio_1") @@ -732,19 +732,19 @@ def test_mismatched_json_cast(self, query_as, server_config, query, results): [ ( "dataset.name:t:bool", - "Type 'bool' of value True is not compatible with dataset column name", + "Filter of type 'bool' is not compatible with key 'dataset.name'", ), ( "dataset.uploaded:>2:int", - "Type 'int' of value 2 is not compatible with dataset column uploaded", + "Filter of type 'int' is not compatible with key 'dataset.uploaded'", ), ], ) def test_mismatched_dataset_cast(self, query_as, server_config, query, message): """Verify DB engine behavior for mismatched metadata casts. - Verify that a typed filter ignores datasets where the metadata key - type isn't compatible with the implicit cast. + Verify that a typed filter generates an error when it targets a primary + dataset key with an incompatible type. """ response = query_as( {"filter": query, "metadata": ["dataset.uploaded"]}, From 2a59acb4c408ed1bb73d0065e951959e9bf3ec81 Mon Sep 17 00:00:00 2001 From: David Butenhof Date: Fri, 21 Apr 2023 07:24:12 -0400 Subject: [PATCH 6/6] A few tweaks more --- jenkins/run-server-func-tests | 2 +- lib/pbench/server/api/resources/datasets_list.py | 14 +++++++++----- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/jenkins/run-server-func-tests b/jenkins/run-server-func-tests index ccd6a49647..c55ab3417f 100755 --- a/jenkins/run-server-func-tests +++ b/jenkins/run-server-func-tests @@ -93,7 +93,7 @@ fi if [[ ${exit_status} -ne 0 ]]; then dump_journal - printf -- "\nFunctional tests exited with code %s\n" ${exit_status} + printf -- "\nFunctional tests exited with code %s\n" ${exit_status} >&2 fi if [[ -z "${cleanup_flag}" ]]; then diff --git a/lib/pbench/server/api/resources/datasets_list.py b/lib/pbench/server/api/resources/datasets_list.py index 76eed314f9..af49172bba 100644 --- a/lib/pbench/server/api/resources/datasets_list.py +++ b/lib/pbench/server/api/resources/datasets_list.py @@ -197,16 +197,20 @@ def _next_token(self, optional: bool = False) -> str: return next def parse_filter(self) -> "Term": - """Parse a filter term like ":[][:]" + """Parse a filter term like ":[][:]" - Returns a dictionary with "key", "operator" (default "="), "value", and - "type" fields. + Separates the "key", "operator", "value", and "type" fields of a filter + expression. The key and value can be quoted to include mixed case and symbols, - including the ":"character, by starting and starting and ending the key - or value with the "'" or '"' character, e.g., + including the ":" character, by starting and starting and ending the + key or value with the "'" or '"' character, e.g., "'dataset.metalog.tool/iteration:1':'foo:1#bar':str". + The "operator", if omitted, defaults to "=", and "type" defaults to + "str" (string), so that `dataset.name:foo` has the same effect as + `dataset.name:=foo:str`. + Returns: self """