From 6d9c40f0b67d9d7b7fda79cbc4410493a50266e0 Mon Sep 17 00:00:00 2001 From: David Butenhof Date: Mon, 24 Apr 2023 07:48:55 -0400 Subject: [PATCH] Add typed filtering (#3385) * Add typed filtering PBENCH-1124 Support type-cast filter expressions in `GET /datasets`. The primary objective is to support a paginated "Expiring Soon" view in the dashboard, requiring the ability to look for datasets expiring before a fixed timestamp. Previously, `GET /datasets?filter` worked by casting all SQL data to "string" and then comparing against the raw extracted string from the query parameter. Now it's possible to identify a type as well as additional comparison operators. For example, `GET /datasets?filter=server.deletion:<2023-05-01:date` will select all datasets with expiration timestamps earlier than 2023-05-1. To target this specific capability, the functional tests now override the default `server.deletion` on some uploads and verify that those datasets are returned by the filtered query. --- docs/API/V1/list.md | 84 +++-- jenkins/run-server-func-tests | 15 +- lib/pbench/client/__init__.py | 12 + .../server/api/resources/datasets_list.py | 307 +++++++++++++++--- lib/pbench/test/functional/server/test_put.py | 82 ++++- .../test/unit/server/test_datasets_list.py | 166 +++++++++- 6 files changed, 566 insertions(+), 100 deletions(-) diff --git a/docs/API/V1/list.md b/docs/API/V1/list.md index 7cb6872300..66868118c0 100644 --- a/docs/API/V1/list.md +++ b/docs/API/V1/list.md @@ -45,34 +45,66 @@ specified date. `filter` metadata filtering \ Select datasets matching the metadata expressions specified via `filter` -query parameters. Each expression is the name of a metadata key (for example, -`dataset.name`), followed by a colon (`:`) and the comparison string. The -comparison string may be prefixed with a tilde (`~`) to make it a partial -("contains") comparison instead of an exact match. For example, -`dataset.name:foo` looks for datasets with the name "foo" exactly, whereas -`dataset.name:~foo` looks for datasets with a name containing the substring -"foo". - -These may be combined across multiple `filter` query parameters or as -comma-separated lists in a single query parameter. Multiple filter expressions -form an `AND` expression, however consecutive filter expressions can be joined -in an `OR` expression by using the circumflex (`^`) character prior to each. -(The first expression with `^` begins an `OR` list while the first subsequent -expression outout `^` ends the `OR` list and is combined with an `AND`.) +query parameters. Each expression has the format `[chain]key:[op]value[:type]`: + +* `chain` Prefix an expression with `^` (circumflex) to allow combining a set +of expressions with `OR` rather than the default `AND`. +* `key` The name of a metadata key (for example, `dataset.name`) + +* `op` An operator to specify how to compare the key value: + + * `=` (Default) Compare for equality + * `~` Compare against a substring + * `>` Greater than + * `<` Less than + * `>=` Greater than or equal to + * `<=` Less than or equal to + * `!=` Not equal + +* `value` The value to compare against. This will be interpreted based on the specified type. +* `type` The string value will be cast to this type. Any value can be cast to +type `str`. General metadata keys (`server`, `global`, `user`, and +`dataset.metalog` namespaces) that have values incompatible with the specified +type will be ignored. If you specify an incompatible type for a primary +`dataset` key, an error will be returned as these types are defined by the +Pbench schema so no match would be possible. (For example, `dataset.name:2:int` +or `dataset.access:2023-05-01:date`.) + + * `str` (Default) Compare as a string + * `bool` Compare as a boolean + * `int` Compare as an integer + * `date` Compare as a date-time string. ISO-8601 recommended, and UTC is + assumed if no timezone is specified. + +For example, `dataset.name:foo` looks for datasets with the name "foo" exactly, +whereas `dataset.name:~foo` looks for datasets with a name containing the +substring "foo". + +Multiple expressions may be combined across multiple `filter` query parameters +or as comma-separated lists in a single query parameter. Multiple filter +expressions are combined as an `AND` expression, matching only when all +expressions match. However any consecutive set of expressions starting with `^` +are collected into an "`OR` list" that will be `AND`-ed with the surrounding +terms. For example, - `filter=dataset.name:a,server.origin:EC2` returns datasets with a name of "a" and an origin of "EC2". -- `filter=dataset.name:a,^server.origin:EC2,^dataset.metalog.pbench.script:fio` -returns datasets with a name of "a" and *either* an origin of "EC2" or generated -from the "pbench-fio" script. - -_NOTE_: `filter` expression values, like the `true` in -`GET /api/v1/datasets?filter=server.archiveonly:true`, are always interpreted -as strings, so be careful about the string representation of the value (in this -case, a boolean, which is represented in JSON as `true` or `false`). Beware -especially when attempting to match a JSON document (such as -`dataset.metalog.pbench`). +- `filter=dataset.name:~andy,^server.origin:EC2,^server.origin:RIYA, +dataset.access:public` +returns only "public" datasets with a name containing the string "andy" which also +have an origin of either "EC2" or "RIYA". As a SQL query, we might write it +as `dataset.name like "%andy%" and (server.origin = 'EC2' or +server.origin = 'RIYA') and dataset.access = 'public'`. + +_NOTE_: `filter` expression term values, like the `true` in +`GET /api/v1/datasets?filter=server.archiveonly:true`, are by default +interpreted as strings, so be careful about the string representation of the +value. In this case, `server.archiveonly` is a boolean, which will be matched +as a string value "true" or "false". You can instead specify the expression +term as `server.archiveonly:t:bool` which will treat the specified match value +as a boolean (`t[rue]` or `y[es]` for true, `f[alse]` or `n[o]` for false) and +match against the boolean metadata value. `keysummary` boolean \ Instead of displaying a list of selected datasets and metadata, use the set of @@ -105,6 +137,10 @@ Allows filtering for datasets owned by the authenticated client (if the value is omitted, e.g., `?mine` or `?mine=true`) or owned by *other* users (e.g., `?mine=false`). +`name` string \ +Select only datasets with a specified substring in their name. The filter +`?name=fio` is semantically equivalent to `?filter=dataset.name:~fio`. + `offset` integer \ "Paginate" the selected datasets by skipping the first `offset` datasets that would have been selected by the other query terms. This can be used with diff --git a/jenkins/run-server-func-tests b/jenkins/run-server-func-tests index d5eb268ad8..c55ab3417f 100755 --- a/jenkins/run-server-func-tests +++ b/jenkins/run-server-func-tests @@ -21,6 +21,13 @@ elif [[ -n "${1}" ]]; then exit 2 fi +function dump_journal { + printf -- "+++ journalctl dump +++\n" + # Try to capture the functional test container's logs. + podman exec ${PB_SERVER_CONTAINER_NAME} journalctl + printf -- "\n--- journalctl dump ---\n\n" +} + function cleanup { if [[ -n "${cleanup_flag}" ]]; then # Remove the Pbench Server container and the dependencies pod which we @@ -59,6 +66,7 @@ until curl -s -o /dev/null ${SERVER_API_ENDPOINTS}; do if [[ $(date +"%s") -ge ${end_in_epoch_secs} ]]; then echo "Timed out waiting for the reverse proxy to show up!" >&2 exit_status=1 + dump_journal exit ${exit_status} fi sleep 1 @@ -84,11 +92,8 @@ else fi if [[ ${exit_status} -ne 0 ]]; then - printf -- "\nFunctional tests exited with code %s\n" ${exit_status} - printf -- "+++ journalctl dump +++\n" - # Try to capture the functional test container's logs. - podman exec ${PB_SERVER_CONTAINER_NAME} journalctl - printf -- "\n--- journalctl dump ---\n\n" + dump_journal + printf -- "\nFunctional tests exited with code %s\n" ${exit_status} >&2 fi if [[ -z "${cleanup_flag}" ]]; then diff --git a/lib/pbench/client/__init__.py b/lib/pbench/client/__init__.py index 6015509644..d9358dcfa0 100644 --- a/lib/pbench/client/__init__.py +++ b/lib/pbench/client/__init__.py @@ -489,3 +489,15 @@ def update( uri_params={"dataset": dataset_id}, params=params, ).json() + + def get_settings(self, key: str = "") -> JSONOBJECT: + """Return requested server setting. + + Args: + key: A server settings key; if omitted, return all settings + + Returns: + A JSON document containing the requested key values + """ + params = {"key": key} + return self.get(api=API.SERVER_SETTINGS, uri_params=params).json() diff --git a/lib/pbench/server/api/resources/datasets_list.py b/lib/pbench/server/api/resources/datasets_list.py index 58e1dae0ec..af49172bba 100644 --- a/lib/pbench/server/api/resources/datasets_list.py +++ b/lib/pbench/server/api/resources/datasets_list.py @@ -1,14 +1,16 @@ +from dataclasses import dataclass from http import HTTPStatus -from typing import Any +import logging +from typing import Any, Callable, Optional from urllib.parse import urlencode, urlparse from flask import current_app from flask.json import jsonify from flask.wrappers import Request, Response -from sqlalchemy import and_, asc, cast, desc, func, or_, String +from sqlalchemy import and_, asc, Boolean, cast, desc, func, Integer, or_, String from sqlalchemy.exc import ProgrammingError, StatementError from sqlalchemy.orm import aliased, Query -from sqlalchemy.sql.expression import Alias +from sqlalchemy.sql.expression import Alias, BinaryExpression, ColumnElement from pbench.server import JSON, JSONOBJECT, OperationCode, PbenchServerConfig from pbench.server.api.resources import ( @@ -20,12 +22,17 @@ ApiMethod, ApiParams, ApiSchema, + convert_boolean, + convert_date, + convert_int, + convert_string, Parameter, ParamType, Schema, ) import pbench.server.auth.auth as Auth from pbench.server.database.database import Database +from pbench.server.database.models import TZDateTime from pbench.server.database.models.datasets import ( Dataset, Metadata, @@ -34,6 +41,63 @@ ) +@dataclass +class Type: + """Help keep track of filter types + + Used to link a filter type name to the equivalent SQLAlchemy type and the + Pbench API conversion method to create a compatible object from a string. + + Fields: + sqtype: The base SQLAlchemy type corresponding to a Python target type + convert: The Pbench API conversion method to verify and convert a + string + """ + + sqltype: Any + convert: Callable[[str, Any], Any] + + +"""Associate the name of a filter type to the Type record describing it.""" +TYPES = { + "bool": Type(Boolean, convert_boolean), + "date": Type(TZDateTime, convert_date), + "int": Type(Integer, convert_int), + "str": Type(String, convert_string), +} + + +"""Define the set of operators we allow in query filters. + +This maps a symbolic name to a SQLAlchemy filter method on the column. +""" +OPERATORS = { + "~": "contains", + "=": "__eq__", + "<": "__lt__", + ">": "__gt__", + "<=": "__le__", + ">=": "__ge__", + "!=": "__ne__", +} + + +def make_operator( + expression: ColumnElement, operator: str, value: Any +) -> BinaryExpression: + """Return the SQLAlchemy operator method of a column or JSON expression. + + Args: + expression: A SQLAlchemy expression or column + operator: The operator prefix + value: The value to be compared against + + Returns: + A SQLAlchemy filter expression. + """ + return getattr(expression, OPERATORS[operator])(value) + + def urlencode_json(json: JSON) -> str: """We must properly encode the metadata query parameter as a list of keys.""" new_json = {} @@ -42,6 +106,146 @@ def urlencode_json(json: JSON) -> str: return urlencode(new_json) +class Term: + """Help parsing a filter term.""" + + def __init__(self, term: str): + self.chain = None + self.key = None + self.operator = None + self.value = None + self.type = None + self.term = term + self.buffer = term + self.logger = logging.getLogger("term") + + @classmethod + def parse(cls, term: str) -> "Term": + """Factory method to construct an instance and parse a string + + Args: + term: A filter expression to parse + """ + return cls(term).parse_filter() + + def _remove_prefix( + self, prefixes: tuple[str], default: Optional[str] = None + ) -> Optional[str]: + """Remove a string prefix. + + Looking at the current buffer, remove a known prefix before processing + the next character. A list of prefixes may be specified: they're sorted + longest first so that we don't accidentally match a substring. + + Args: + prefixes: a collection of prefix strings + default: the default prefix if none appears in the string. + + Returns: + The prefix, or the default. + """ + prefix = default + for p in sorted(prefixes, key=lambda k: len(k), reverse=True): + if self.buffer.startswith(p): + prefix = self.buffer[: len(p)] + self.buffer = self.buffer[len(p) :] + break + return prefix + + def _next_token(self, optional: bool = False) -> str: + """Extract a string from the head of the buffer. + + To filter using metadata keys (e.g., unusual benchmark generated + metadata.log keys) or values (e.g. a date/time string) which may + contain ":" symbols, the key or value may be enclosed in quotes. + Either single or double quotes are allowed. + + We're looking for the ":" symbol delimiting the next term of the filter + expression, or (if that's marked "optional") the end of the buffer. + + Args: + optional: The terminating ":" separator is optional. + + Returns: + The token, unquoted + """ + buffer = self.buffer + if buffer.startswith(("'", '"')): + quote = buffer[:1] + end = buffer.find(quote, 1) + if end < 0: + raise APIAbort( + HTTPStatus.BAD_REQUEST, f"Bad quote termination in {self.term!r}" + ) + next = buffer[1:end] + end += 1 + else: + end = buffer.find(":") + if end < 0: + next = buffer + end = len(buffer) + else: + next = buffer[:end] + buffer = buffer[end:] + if buffer.startswith(":"): + buffer = buffer[1:] + elif not optional: + raise APIAbort( + HTTPStatus.BAD_REQUEST, f"Missing ':' terminator in {next!r}" + ) + self.buffer = buffer + return next + + def parse_filter(self) -> "Term": + """Parse a filter term like ":[][:]" + + Separates the "key", "operator", "value", and "type" fields of a filter + expression. + + The key and value can be quoted to include mixed case and symbols, + including the ":" character, by starting and starting and ending the + key or value with the "'" or '"' character, e.g., + "'dataset.metalog.tool/iteration:1':'foo:1#bar':str". + + The "operator", if omitted, defaults to "=", and "type" defaults to + "str" (string), so that `dataset.name:foo` has the same effect as + `dataset.name:=foo:str`. + + Returns: + self + """ + + # The "chain" allows chaining multiple expressions as an OR + self.chain = self._remove_prefix(("^",)) + + # The metadata key token + self.key = self._next_token() + if not Metadata.is_key_path( + self.key, Metadata.METADATA_KEYS, metalog_key_ok=True + ): + raise APIAbort(HTTPStatus.BAD_REQUEST, str(MetadataBadKey(self.key))) + + # The comparison operator (defaults to "=") + self.operator = self._remove_prefix(OPERATORS.keys(), default="=") + + # The filter value + self.value = self._next_token(optional=True) + + # The comparison type, defaults to "str" + self.type = self.buffer.lower() if self.buffer else "str" + if self.type and self.type not in TYPES: + raise APIAbort( + HTTPStatus.BAD_REQUEST, + f"The filter type {self.type!r} must be one of {','.join(t for t in TYPES.keys())}", + ) + if not self.key or not self.value: + raise APIAbort( + HTTPStatus.BAD_REQUEST, + f"filter {self.term!r} must have the form 'k:v[:type]'", + ) + return self + + class DatasetsList(ApiBase): """API class to list datasets based on database metadata.""" @@ -159,14 +363,41 @@ def filter_query( 'saved': True}` because there's no mechanism to parse JSON expressions. This might be added in the future. - To match against a key value, separate the key path from the value with - a `:` character, e.g., "dataset.name:fio". This produces an exact match - expression as an AND (required) term in the query: only datasets with - the exact name "fio" will be returned. - - To match against a subset of a key value, insert a tilde (~) following - the `:`, like "dataset.name:~fio". This produces a "contains" filter - that will match against any name containing the substring "fio". + Specify the value to match separated from the key path by a `:` + character, e.g., "dataset.name:fio". By default this produces an exact + match expression as an AND (required) term in the query: only datasets + with the exact name "fio" will be returned. + + The value may be prefixed by an optional operator in order to perform a + comparison other than strict equality: + + = Look for matches strictly equal to the specified value (default) + ~ Look for matches containing the specified value as a substring + > Look for matches strictly greater than the specified value + < Look for matches strictly less than the specified value + >= Look for matches greater than or equal to the specified value + <= Look for matches less than or equal to the specified value + != Look for matches not equal to the specified value + + After the value, you can optionally specify a type for the comparison. + Note that an incompatible type (other than the default "str") for a + primary "dataset" column will be rejected, but general Metadata + (including "dataset.metalog") is stored as generic JSON and the API + will attempt to cast the selected data to the specified type. + + str Perform a string match + int Perform an integer match + bool Perform a boolean match (boolean values are t[rue], f[alse], + y[es], and n[o]) + date Perform a date match: the selected key value (and supplied + filter value) must be strings representing a date-time, ideally + in ISO-8601 format. UTC is assumed if no timezone is specified. + + For example + + dataset.uploaded:>2023-05-01:date + global.dashboard.seen:t:bool + dataset.metalog.pbench.script:!=fio To produce an OR (optional) filter term, you can prefix the `key:value` with the caret (^) character. Adjacent OR terms will be combined in a @@ -177,8 +408,8 @@ def filter_query( AND and OR terms can be chained by ordering them: - "dataset.name:~fio,^global.dashboard.saved:true, - ^user.dashboard.favorite:true,server.origin:RIYA" + "dataset.name:~fio,^global.dashboard.saved:true:bool, + ^user.dashboard.favorite:true:bool,server.origin:RIYA" will match any dataset with a name containing the substring "fio" which ALSO has been marked in the dashboard as either "saved" OR (by the @@ -222,25 +453,12 @@ def filter_query( or_list = [] and_list = [] for kw in filters: - combine_or = False - contains = False - try: - k, v = kw.split(":", maxsplit=1) - except ValueError: - raise APIAbort( - HTTPStatus.BAD_REQUEST, f"filter {kw!r} must have the form 'k:v'" - ) - if k.startswith("^"): - combine_or = True - k = k[1:] - if v.startswith("~"): - contains = True - v = v[1:] - - if not Metadata.is_key_path(k, Metadata.METADATA_KEYS, metalog_key_ok=True): - raise APIAbort(HTTPStatus.BAD_REQUEST, str(MetadataBadKey(k))) - keys = k.split(".") + term = Term.parse(kw) + combine_or = term.chain == "^" + keys = term.key.split(".") native_key = keys.pop(0).lower() + vtype = term.type + value = TYPES[vtype].convert(term.value, None) filter = None if native_key == Metadata.DATASET: @@ -254,19 +472,20 @@ def filter_query( else: try: c = getattr(Dataset, second) - try: - is_str = c.type.python_type is str - except NotImplementedError: - is_str = False - if is_str: - column = c - else: + if vtype == "str" and not isinstance(c.type, String): column = cast(getattr(Dataset, second), String) + else: + if not isinstance(c.type, TYPES[vtype].sqltype): + raise APIAbort( + HTTPStatus.BAD_REQUEST, + f"Filter of type {vtype!r} is not compatible with key 'dataset.{c.name}'", + ) + column = c except AttributeError as e: raise APIAbort( - HTTPStatus.BAD_REQUEST, str(MetadataBadKey(k)) + HTTPStatus.BAD_REQUEST, str(MetadataBadKey(term.key)) ) from e - filter = column.contains(v) if contains else column == v + filter = make_operator(column, term.operator, value) elif native_key == Metadata.USER: # The user namespace requires special handling because the # values are always qualified by the owning user rather than @@ -275,14 +494,16 @@ def filter_query( if not user_id: raise APIAbort( HTTPStatus.UNAUTHORIZED, - f"Metadata key {k} cannot be used by an unauthenticated client", + f"Metadata key {term.key} cannot be used by an unauthenticated client", ) # NOTE: We don't want to *evaluate* the filter expression here, so - # check explicitly for None. + # check explicitly for None. I.e., "we have no filter" rather than + # "the evaluated result of this filter is falsey". if filter is None: expression = aliases[native_key].value[keys].as_string() - filter = expression.contains(v) if contains else expression == v + expression = expression.cast(TYPES[vtype].sqltype) + filter = make_operator(expression, term.operator, value) if combine_or: or_list.append(filter) diff --git a/lib/pbench/test/functional/server/test_put.py b/lib/pbench/test/functional/server/test_put.py index b28061ee4a..f868c3ee30 100644 --- a/lib/pbench/test/functional/server/test_put.py +++ b/lib/pbench/test/functional/server/test_put.py @@ -1,11 +1,12 @@ from dataclasses import dataclass -from datetime import datetime, timedelta +from datetime import datetime, timedelta, timezone from http import HTTPStatus import os.path from pathlib import Path import re import time +import dateutil.parser import pytest from requests.exceptions import HTTPError @@ -14,6 +15,11 @@ TARBALL_DIR = Path("lib/pbench/test/functional/server/tarballs") SPECIAL_DIR = TARBALL_DIR / "special" +SHORT_EXPIRATION_DAYS = 10 + + +def utc_from_str(date: str) -> datetime: + return dateutil.parser.parse(date).replace(tzinfo=timezone.utc) @dataclass @@ -32,17 +38,22 @@ def test_upload_all(self, server_client: PbenchServerClient, login_user): """Upload each of the pregenerated tarballs, and perform some basic sanity checks on the resulting server state. """ - print(" ... reporting behaviors ...") + print(" ... uploading tarballs ...") tarballs: dict[str, Tarball] = {} access = ["private", "public"] cur_access = 0 + # We're going to make some datasets expire "soon" (much sooner than the + # default) so we can test filtering typed metadata. + expire_soon = datetime.now(timezone.utc) + timedelta(days=SHORT_EXPIRATION_DAYS) + for t in TARBALL_DIR.glob("*.tar.xz"): a = access[cur_access] if a == "public": metadata = ( - "server.origin:test,user.pbench.access:public,server.archiveonly:n" + "server.origin:test,user.pbench.access:public,server.archiveonly:n", + f"server.deletion:{expire_soon:%Y-%m-%d %H:%M%z}", ) else: metadata = None @@ -53,7 +64,7 @@ def test_upload_all(self, server_client: PbenchServerClient, login_user): assert ( response.status_code == HTTPStatus.CREATED ), f"upload returned unexpected status {response.status_code}, {response.text} ({t})" - print(f"\t... uploaded {t.name}") + print(f"\t... uploaded {t.name}: {a}") datasets = server_client.get_list( metadata=["dataset.access", "server.tarball-path", "dataset.operations"] @@ -182,7 +193,7 @@ def check_indexed(server_client: PbenchServerClient, datasets): not_indexed = [] try: for dataset in datasets: - print(f"\t... on {dataset.metadata['server.tarball-path']}") + print(f"\t... on {dataset.name}") metadata = server_client.get_metadata( dataset.resource_id, ["dataset.operations"] ) @@ -217,8 +228,7 @@ def test_index_all(self, server_client: PbenchServerClient, login_user): state and metadata look good. """ tarball_names = frozenset(t.name for t in TARBALL_DIR.glob("*.tar.xz")) - - print(" ... reporting behaviors ...") + print(" ... reporting dataset status ...") # Test get_list pagination: to avoid forcing a list, we'll count the # iterations separately. (Note that this is really an implicit test @@ -233,6 +243,7 @@ def test_index_all(self, server_client: PbenchServerClient, login_user): "dataset.access", "server.archiveonly", "server.origin", + "server.deletion", "user.pbench.access", ], ) @@ -258,11 +269,11 @@ def test_index_all(self, server_client: PbenchServerClient, login_user): f" = {exc.response.status_code}, text = {exc.response.text!r}" ) - # For each dataset we find, poll the state until it reaches Indexed - # state, or until we time out. Since the cron jobs run once a minute - # and they start on the minute, we make our 1st check 45 seconds into - # the next minute, and then check at 45 seconds past the minute until - # we reached 5 minutes past the original start time. + # For each dataset we find, poll the state until it has been indexed, + # or until we time out. The indexer runs once a minute slightly after + # the minute, so we make our 1st check 45 seconds into the next minute, + # and then check at 45 seconds past the minute until we reach 5 minutes + # past the original start time. oneminute = timedelta(minutes=1) now = start = datetime.utcnow() timeout = start + timedelta(minutes=5) @@ -280,7 +291,7 @@ def test_index_all(self, server_client: PbenchServerClient, login_user): if os.path.basename(tp) not in tarball_names: continue count += 1 - print(f"\t... indexed {tp}") + print(f"\t... indexed {dataset.name}") now = datetime.utcnow() if not not_indexed or now >= timeout: break @@ -289,7 +300,7 @@ def test_index_all(self, server_client: PbenchServerClient, login_user): now = datetime.utcnow() assert not not_indexed, ( f"Timed out after {(now - start).total_seconds()} seconds; unindexed datasets: " - + ", ".join(d.metadata["server.tarball-path"] for d in not_indexed) + + ", ".join(d.name for d in not_indexed) ) assert ( len(tarball_names) == count @@ -421,6 +432,48 @@ def test_list_filter_and_or(self, server_client: PbenchServerClient, login_user) "2018" in date or "2019" in date ), f"Dataset {dataset.name} date is {date}" + @pytest.mark.dependency(name="list_type", depends=["upload"], scope="session") + def test_list_filter_typed(self, server_client: PbenchServerClient, login_user): + """Test filtering using typed matches. + + We set an early "server.deletion" time on alternating datasets as we + uploaded them. Now prove that we can use a typed "date" filter to find + them. We'll further validate that none of the other datasets we + uploaded from TARBALL_DIR should have been returned. + """ + test_sets = {} + for t in TARBALL_DIR.glob("*.tar.xz"): + r = Dataset.md5(t) + m = server_client.get_metadata( + r, + ["dataset.name", "server.deletion"], + ) + test_sets[r] = m + soonish = datetime.now(timezone.utc) + timedelta(days=SHORT_EXPIRATION_DAYS * 2) + + datasets = server_client.get_list( + metadata=["server.deletion"], + owner="tester", + # NOTE: using weird "US standard" date notation here minimizes + # the risk of succeeding via a simple alphanumeric comparison. + filter=[f"server.deletion:<'{soonish:%m/%d/%Y %H:%M%z}':date"], + ) + + # Confirm that the returned datasets match + for dataset in datasets: + deletion = utc_from_str(dataset.metadata["server.deletion"]) + assert ( + deletion < soonish + ), f"Filter returned {dataset.name}, with expiration out of range ({deletion:%Y-%m-%d})" + test_sets.pop(dataset.resource_id, None) + + # Confirm that the remaining TARBALL_DIR datasets don't match + for r, m in test_sets.items(): + deletion = utc_from_str(m["server.deletion"]) + assert ( + deletion >= soonish + ), f"Filter failed to return {m['dataset.name']}, with expiration in range ({deletion:%Y-%m-%d})" + class TestUpdate: @pytest.mark.dependency(name="publish", depends=["index"], scope="session") @@ -447,6 +500,7 @@ class TestDelete: "list_and", "list_none", "list_or", + "list_type", "publish", ], scope="session", diff --git a/lib/pbench/test/unit/server/test_datasets_list.py b/lib/pbench/test/unit/server/test_datasets_list.py index 2583d584f7..6e15a162ba 100644 --- a/lib/pbench/test/unit/server/test_datasets_list.py +++ b/lib/pbench/test/unit/server/test_datasets_list.py @@ -492,18 +492,58 @@ def test_get_repeat_keys(self, query_as): @pytest.mark.parametrize( "filters,expected", [ - (["dataset.name:fio"], "datasets.name = 'fio'"), + (["dataset.name:=fio"], "datasets.name = 'fio'"), ( - ["dataset.metalog.pbench.script:fio"], - "dataset_metadata_1.value[['pbench', 'script']] = 'fio'", + ["dataset.uploaded:>2023-01-01:date"], + "datasets.uploaded > '2023-01-01 00:00:00'", + ), + ( + ["dataset.uploaded:!=2023-01-01:date"], + "datasets.uploaded != '2023-01-01 00:00:00'", + ), + ( + ["server.deletion:<2023-05-01:date"], + "CAST(dataset_metadata_2.value[['deletion']] AS DATETIME) < '2023-05-01 00:00:00'", + ), + ( + ["'dataset.metalog.pbench.script':='fio'"], + "CAST(dataset_metadata_1.value[['pbench', 'script']] AS VARCHAR) = 'fio'", + ), + ( + ["'dataset.metalog.pbench.script':!=fio"], + "CAST(dataset_metadata_1.value[['pbench', 'script']] AS VARCHAR) != 'fio'", + ), + ( + ["dataset.metalog.run.date:~fio"], + "(CAST(dataset_metadata_1.value[['run', 'date']] AS VARCHAR) LIKE '%' || 'fio' || '%')", + ), + ( + ["global.something.integer:<1:int"], + "CAST(dataset_metadata_3.value[['something', 'integer']] AS INTEGER) < 1", + ), + ( + ["global.something.integer:>2:int"], + "CAST(dataset_metadata_3.value[['something', 'integer']] AS INTEGER) > 2", + ), + ( + ["global.something.integer:<=1:int"], + "CAST(dataset_metadata_3.value[['something', 'integer']] AS INTEGER) <= 1", + ), + ( + ["global.something.integer:>=2:int"], + "CAST(dataset_metadata_3.value[['something', 'integer']] AS INTEGER) >= 2", + ), + ( + ["global.something.boolean:t:bool"], + "CAST(dataset_metadata_3.value[['something', 'boolean']] AS BOOLEAN) = true", ), ( ["user.d.f:1"], - "dataset_metadata_4.value[['d', 'f']] = '1'", + "CAST(dataset_metadata_4.value[['d', 'f']] AS VARCHAR) = '1'", ), ( - ["dataset.name:~fio", "^global.x:1", "^user.y:~yes"], - "(datasets.name LIKE '%' || 'fio' || '%') AND (dataset_metadata_3.value[['x']] = '1' OR ((dataset_metadata_4.value[['y']]) LIKE '%' || 'yes' || '%'))", + ["dataset.name:~fio", "^'global.x':1", "^user.y:~yes"], + "(datasets.name LIKE '%' || 'fio' || '%') AND (CAST(dataset_metadata_3.value[['x']] AS VARCHAR) = '1' OR (CAST(dataset_metadata_4.value[['y']] AS VARCHAR) LIKE '%' || 'yes' || '%'))", ), ( ["dataset.uploaded:~2000"], @@ -544,11 +584,11 @@ def test_filter_query(self, monkeypatch, client, filters, expected): (["dataset.name:fio"], "datasets.name = 'fio'"), ( ["dataset.metalog.pbench.script:fio"], - "dataset_metadata_1.value[['pbench', 'script']] = 'fio'", + "CAST(dataset_metadata_1.value[['pbench', 'script']] AS VARCHAR) = 'fio'", ), ( ["dataset.name:~fio", "^global.x:1"], - "(datasets.name LIKE '%' || 'fio' || '%') AND dataset_metadata_3.value[['x']] = '1'", + "(datasets.name LIKE '%' || 'fio' || '%') AND CAST(dataset_metadata_3.value[['x']] AS VARCHAR) = '1'", ), ( ["dataset.uploaded:~2000"], @@ -596,15 +636,57 @@ def test_user_no_auth(self, monkeypatch, db_session): assert e.value.http_status == HTTPStatus.UNAUTHORIZED @pytest.mark.parametrize( - "meta,error", + "meta,error,message", [ - ("user.foo:1", HTTPStatus.UNAUTHORIZED), - ("x.y:1", HTTPStatus.BAD_REQUEST), - ("global.x=3", HTTPStatus.BAD_REQUEST), - ("dataset.notright:10", HTTPStatus.BAD_REQUEST), + ( + "user.foo:1", + HTTPStatus.UNAUTHORIZED, + "Metadata key user.foo cannot be used by an unauthenticated client", + ), + ("x.y:1", HTTPStatus.BAD_REQUEST, "Metadata key 'x.y' is not supported"), + ( + "global.x=3", + HTTPStatus.BAD_REQUEST, + "Missing ':' terminator in 'global.x=3'", + ), + ( + "'global.x'", + HTTPStatus.BAD_REQUEST, + "Missing ':' terminator in 'global.x'", + ), + ( + "dataset.x':v", + HTTPStatus.BAD_REQUEST, + 'Metadata key "dataset.x\'" is not supported', + ), + ( + "dataset.notright:10", + HTTPStatus.BAD_REQUEST, + "Metadata key 'dataset.notright' is not supported", + ), + ( + "'dataset.name:foo", + HTTPStatus.BAD_REQUEST, + 'Bad quote termination in "\'dataset.name:foo"', + ), + ( + "dataset.name:'foo", + HTTPStatus.BAD_REQUEST, + 'Bad quote termination in "dataset.name:\'foo"', + ), + ( + "server.deletion:<2023-05-01:time", + HTTPStatus.BAD_REQUEST, + "The filter type 'time' must be one of bool,date,int,str", + ), + ( + "server.deletion:2023-05-01:date:", + HTTPStatus.BAD_REQUEST, + "The filter type 'date:' must be one of bool,date,int,str", + ), ], ) - def test_filter_errors(self, monkeypatch, db_session, meta, error): + def test_filter_errors(self, monkeypatch, client, meta, error, message): """Test invalid filter expressions.""" monkeypatch.setattr( "pbench.server.api.resources.datasets_list.Auth.get_current_user_id", @@ -614,6 +696,62 @@ def test_filter_errors(self, monkeypatch, db_session, meta, error): with pytest.raises(APIAbort) as e: DatasetsList.filter_query([meta], aliases, query) assert e.value.http_status == error + assert str(e.value) == message + + @pytest.mark.parametrize( + "query,results", + [ + ("global.legacy:t:bool", ["drb"]), + ("global.legacy:>2:int", ["fio_2"]), + ("global.legacy:>2023-05-01:date", []), + ], + ) + def test_mismatched_json_cast(self, query_as, server_config, query, results): + """Verify DB engine behavior for mismatched metadata casts. + + Verify that a typed filter ignores datasets where the metadata key + type isn't compatible with the required cast. + """ + drb = Dataset.query(name="drb") + fio_1 = Dataset.query(name="fio_1") + fio_2 = Dataset.query(name="fio_2") + + Metadata.setvalue(dataset=drb, key="global.legacy", value=True) + Metadata.setvalue(dataset=fio_1, key="global.legacy.server", value="ABC") + Metadata.setvalue(dataset=fio_2, key="global.legacy", value=4) + + response = query_as( + {"filter": query, "metadata": ["dataset.uploaded"]}, + "drb", + HTTPStatus.OK, + ) + self.compare_results(response.json, results, {}, server_config) + + @pytest.mark.parametrize( + "query,message", + [ + ( + "dataset.name:t:bool", + "Filter of type 'bool' is not compatible with key 'dataset.name'", + ), + ( + "dataset.uploaded:>2:int", + "Filter of type 'int' is not compatible with key 'dataset.uploaded'", + ), + ], + ) + def test_mismatched_dataset_cast(self, query_as, server_config, query, message): + """Verify DB engine behavior for mismatched metadata casts. + + Verify that a typed filter generates an error when it targets a primary + dataset key with an incompatible type. + """ + response = query_as( + {"filter": query, "metadata": ["dataset.uploaded"]}, + "drb", + HTTPStatus.BAD_REQUEST, + ) + assert response.json["message"] == message @pytest.mark.parametrize( "exception,error",