From fd16ec63cb05525525f89ee93793db5bf18c229f Mon Sep 17 00:00:00 2001 From: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> Date: Fri, 24 Nov 2023 11:22:46 +0100 Subject: [PATCH] refactor: Add support for new filters declaration (#6397) * Rework filter logic for InMemoryDocumentStore to support new filters declaration * Fix legacy filters tests * Simplify logic and handle dates comparison * Rework MetadataRouter to support new filters * Update docstrings * Add release notes * Fix linting * Avoid duplicating filters specifications * Handle corner case * Simplify docstring * Fix filters logic and tests * Fix Document Store testing legacy filters tests --- .../pipelines/test_preprocessing_pipeline.py | 2 +- .../components/routers/metadata_router.py | 41 +- .../in_memory/document_store.py | 82 +- haystack/preview/document_stores/protocols.py | 99 +- haystack/preview/testing/document_store.py | 25 +- haystack/preview/utils/filters.py | 365 ++---- .../rework-filters-1bb103d196a1912b.yaml | 87 ++ .../routers/test_metadata_router.py | 16 +- .../preview/document_stores/test_in_memory.py | 4 - test/preview/utils/test_filters.py | 1030 +++++++++-------- 10 files changed, 872 insertions(+), 879 deletions(-) create mode 100644 releasenotes/notes/rework-filters-1bb103d196a1912b.yaml diff --git a/e2e/preview/pipelines/test_preprocessing_pipeline.py b/e2e/preview/pipelines/test_preprocessing_pipeline.py index 60c6dbe396..2f16f1d993 100644 --- a/e2e/preview/pipelines/test_preprocessing_pipeline.py +++ b/e2e/preview/pipelines/test_preprocessing_pipeline.py @@ -18,7 +18,7 @@ def test_preprocessing_pipeline(tmp_path): preprocessing_pipeline.add_component(instance=TextFileToDocument(), name="text_file_converter") preprocessing_pipeline.add_component(instance=DocumentLanguageClassifier(), name="language_classifier") preprocessing_pipeline.add_component( - instance=MetadataRouter(rules={"en": {"language": {"$eq": "en"}}}), name="router" + instance=MetadataRouter(rules={"en": {"field": "language", "operator": "==", "value": "en"}}), name="router" ) preprocessing_pipeline.add_component(instance=DocumentCleaner(), name="cleaner") preprocessing_pipeline.add_component( diff --git a/haystack/preview/components/routers/metadata_router.py b/haystack/preview/components/routers/metadata_router.py index 6f7df9b9e5..f83b1e5542 100644 --- a/haystack/preview/components/routers/metadata_router.py +++ b/haystack/preview/components/routers/metadata_router.py @@ -1,7 +1,7 @@ from typing import Dict, List from haystack.preview import component, Document -from haystack.preview.utils.filters import document_matches_filter +from haystack.preview.utils.filters import document_matches_filter, convert @component @@ -19,12 +19,36 @@ def __init__(self, rules: Dict[str, Dict]): follow the format of filtering expressions in Haystack. For example: ```python { - "edge_1": {"created_at": {"$gte": "2023-01-01", "$lt": "2023-04-01"}}, - "edge_2": {"created_at": {"$gte": "2023-04-01", "$lt": "2023-07-01"}}, - "edge_3": {"created_at": {"$gte": "2023-07-01", "$lt": "2023-10-01"}}, - "edge_4": {"created_at": {"$gte": "2023-10-01", "$lt": "2024-01-01"}}, - } - ``` + "edge_1": { + "operator": "AND", + "conditions": [ + {"field": "meta.created_at", "operator": ">=", "value": "2023-01-01"}, + {"field": "meta.created_at", "operator": "<", "value": "2023-04-01"}, + ], + }, + "edge_2": { + "operator": "AND", + "conditions": [ + {"field": "meta.created_at", "operator": ">=", "value": "2023-04-01"}, + {"field": "meta.created_at", "operator": "<", "value": "2023-07-01"}, + ], + }, + "edge_3": { + "operator": "AND", + "conditions": [ + {"field": "meta.created_at", "operator": ">=", "value": "2023-07-01"}, + {"field": "meta.created_at", "operator": "<", "value": "2023-10-01"}, + ], + }, + "edge_4": { + "operator": "AND", + "conditions": [ + {"field": "meta.created_at", "operator": ">=", "value": "2023-10-01"}, + {"field": "meta.created_at", "operator": "<", "value": "2024-01-01"}, + ], + }, + } + ``` """ self.rules = rules component.set_output_types(self, unmatched=List[Document], **{edge: List[Document] for edge in rules}) @@ -43,6 +67,9 @@ def run(self, documents: List[Document]): for document in documents: cur_document_matched = False for edge, rule in self.rules.items(): + if "operator" not in rule: + # Must be a legacy filter, convert it + rule = convert(rule) if document_matches_filter(rule, document): output[edge].append(document) cur_document_matched = True diff --git a/haystack/preview/document_stores/in_memory/document_store.py b/haystack/preview/document_stores/in_memory/document_store.py index f52359760f..f00c4199bd 100644 --- a/haystack/preview/document_stores/in_memory/document_store.py +++ b/haystack/preview/document_stores/in_memory/document_store.py @@ -11,7 +11,7 @@ from haystack.preview.document_stores.decorator import document_store from haystack.preview.dataclasses import Document from haystack.preview.document_stores.protocols import DuplicatePolicy -from haystack.preview.utils.filters import document_matches_filter +from haystack.preview.utils.filters import document_matches_filter, convert from haystack.preview.document_stores.errors import DuplicateDocumentError, DocumentStoreError from haystack.preview.utils import expit @@ -92,75 +92,15 @@ def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Doc """ Returns the documents that match the filters provided. - Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical operator (`"$and"`, - `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `$ne`, `"$in"`, `$nin`, `"$gt"`, `"$gte"`, `"$lt"`, - `"$lte"`) or a metadata field name. - - Logical operator keys take a dictionary of metadata field names and/or logical operators as value. Metadata - field names take a dictionary of comparison operators as value. Comparison operator keys take a single value or - (in case of `"$in"`) a list of values as value. If no logical operator is provided, `"$and"` is used as default - operation. If no comparison operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used - as default operation. - - Example: - - ```python - filters = { - "$and": { - "type": {"$eq": "article"}, - "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, - "rating": {"$gte": 3}, - "$or": { - "genre": {"$in": ["economy", "politics"]}, - "publisher": {"$eq": "nytimes"} - } - } - } - # or simpler using default operators - filters = { - "type": "article", - "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, - "rating": {"$gte": 3}, - "$or": { - "genre": ["economy", "politics"], - "publisher": "nytimes" - } - } - ``` - - To use the same logical operator multiple times on the same level, logical operators can take a list of - dictionaries as value. - - Example: - - ```python - filters = { - "$or": [ - { - "$and": { - "Type": "News Paper", - "Date": { - "$lt": "2019-01-01" - } - } - }, - { - "$and": { - "Type": "Blog Post", - "Date": { - "$gte": "2019-01-01" - } - } - } - ] - } - ``` + For a detailed specification of the filters, refer to the DocumentStore.filter_documents() protocol documentation. :param filters: The filters to apply to the document list. :return: A list of Documents that match the given filters. """ if filters: - return [doc for doc in self.storage.values() if document_matches_filter(conditions=filters, document=doc)] + if "operator" not in filters: + filters = convert(filters) + return [doc for doc in self.storage.values() if document_matches_filter(filters=filters, document=doc)] return list(self.storage.values()) def write_documents(self, documents: List[Document], policy: DuplicatePolicy = DuplicatePolicy.FAIL) -> int: @@ -220,9 +160,17 @@ def bm25_retrieval( if not query: raise ValueError("Query should be a non-empty string") - content_type_filter = {"$or": {"content": {"$not": None}, "dataframe": {"$not": None}}} + content_type_filter = { + "operator": "OR", + "conditions": [ + {"field": "content", "operator": "!=", "value": None}, + {"field": "dataframe", "operator": "!=", "value": None}, + ], + } if filters: - filters = {"$and": [content_type_filter, filters]} + if "operator" not in filters: + filters = convert(filters) + filters = {"operator": "AND", "conditions": [content_type_filter, filters]} else: filters = content_type_filter all_documents = self.filter_documents(filters=filters) diff --git a/haystack/preview/document_stores/protocols.py b/haystack/preview/document_stores/protocols.py index d011552892..6a27f19551 100644 --- a/haystack/preview/document_stores/protocols.py +++ b/haystack/preview/document_stores/protocols.py @@ -51,69 +51,64 @@ def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Doc """ Returns the documents that match the filters provided. - Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical operator (`"$and"`, - `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `$ne`, `"$in"`, `$nin`, `"$gt"`, `"$gte"`, `"$lt"`, - `"$lte"`) or a metadata field name. + Filters are defined as nested dictionaries that can be of two types: + - Comparison + - Logic - Logical operator keys take a dictionary of metadata field names and/or logical operators as value. Metadata - field names take a dictionary of comparison operators as value. Comparison operator keys take a single value or - (in case of `"$in"`) a list of values as value. If no logical operator is provided, `"$and"` is used as default - operation. If no comparison operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used - as default operation. + Comparison dictionaries must contain the keys: - Example: + - `field` + - `operator` + - `value` - ```python - filters = { - "$and": { - "type": {"$eq": "article"}, - "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, - "rating": {"$gte": 3}, - "$or": { - "genre": {"$in": ["economy", "politics"]}, - "publisher": {"$eq": "nytimes"} - } - } - } - # or simpler using default operators - filters = { - "type": "article", - "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, - "rating": {"$gte": 3}, - "$or": { - "genre": ["economy", "politics"], - "publisher": "nytimes" - } - } - ``` + Logic dictionaries must contain the keys: + + - `operator` + - `conditions` + + The `conditions` key must be a list of dictionaries, either of type Comparison or Logic. - To use the same logical operator multiple times on the same level, logical operators can take a list of - dictionaries as value. + The `operator` value in Comparison dictionaries must be one of: - Example: + - `==` + - `!=` + - `>` + - `>=` + - `<` + - `<=` + - `in` + - `not in` + The `operator` values in Logic dictionaries must be one of: + + - `NOT` + - `OR` + - `AND` + + + A simple filter: + ```python + filters = {"field": "meta.type", "operator": "==", "value": "article"} + ``` + + A more complex filter: ```python filters = { - "$or": [ + "operator": "AND", + "conditions": [ + {"field": "meta.type", "operator": "==", "value": "article"}, + {"field": "meta.date", "operator": ">=", "value": 1420066800}, + {"field": "meta.date", "operator": "<", "value": 1609455600}, + {"field": "meta.rating", "operator": ">=", "value": 3}, { - "$and": { - "Type": "News Paper", - "Date": { - "$lt": "2019-01-01" - } - } + "operator": "OR", + "conditions": [ + {"field": "meta.genre", "operator": "in", "value": ["economy", "politics"]}, + {"field": "meta.publisher", "operator": "==", "value": "nytimes"}, + ], }, - { - "$and": { - "Type": "Blog Post", - "Date": { - "$gte": "2019-01-01" - } - } - } - ] + ], } - ``` :param filters: the filters to apply to the document list. :return: a list of Documents that match the given filters. diff --git a/haystack/preview/testing/document_store.py b/haystack/preview/testing/document_store.py index 04b0541e5f..bb8c1f15fb 100644 --- a/haystack/preview/testing/document_store.py +++ b/haystack/preview/testing/document_store.py @@ -236,7 +236,7 @@ def document_store(self): @pytest.mark.unit def test_incorrect_filter_type(self, document_store: DocumentStore, filterable_docs: List[Document]): document_store.write_documents(filterable_docs) - with pytest.raises(FilterError): + with pytest.raises(ValueError): document_store.filter_documents(filters="something odd") # type: ignore @pytest.mark.unit @@ -574,7 +574,9 @@ def document_store(self): def test_lt_filter(self, document_store: DocumentStore, filterable_docs: List[Document]): document_store.write_documents(filterable_docs) result = document_store.filter_documents(filters={"number": {"$lt": 0.0}}) - assert result == [doc for doc in filterable_docs if "number" in doc.meta and doc.meta["number"] < 0] + assert result == [ + doc for doc in filterable_docs if doc.meta.get("number") is not None and doc.meta["number"] < 0 + ] @pytest.mark.unit def test_lt_filter_non_numeric(self, document_store: DocumentStore, filterable_docs: List[Document]): @@ -614,7 +616,9 @@ def document_store(self): def test_lte_filter(self, document_store: DocumentStore, filterable_docs: List[Document]): document_store.write_documents(filterable_docs) result = document_store.filter_documents(filters={"number": {"$lte": 2.0}}) - assert result == [doc for doc in filterable_docs if "number" in doc.meta and doc.meta["number"] <= 2.0] + assert result == [ + doc for doc in filterable_docs if doc.meta.get("number") is not None and doc.meta["number"] <= 2.0 + ] @pytest.mark.unit def test_lte_filter_non_numeric(self, document_store: DocumentStore, filterable_docs: List[Document]): @@ -658,7 +662,8 @@ def test_filter_simple_or(self, document_store: DocumentStore, filterable_docs: assert result == [ doc for doc in filterable_docs - if (("number" in doc.meta and doc.meta["number"] < 1) or doc.meta.get("name") in ["name_0", "name_1"]) + if (doc.meta.get("number") is not None and doc.meta["number"] < 1) + or doc.meta.get("name") in ["name_0", "name_1"] ] @pytest.mark.unit @@ -733,7 +738,10 @@ def test_filter_nested_or(self, document_store: DocumentStore, filterable_docs: assert result == [ doc for doc in filterable_docs - if (doc.meta.get("name") in ["name_0", "name_1"] or ("number" in doc.meta and doc.meta["number"] < 1)) + if ( + doc.meta.get("name") in ["name_0", "name_1"] + or (doc.meta.get("number") is not None and doc.meta["number"] < 1) + ) ] @pytest.mark.unit @@ -783,11 +791,8 @@ def test_filter_nested_or_and(self, document_store: DocumentStore, filterable_do doc for doc in filterable_docs if ( - ("number" in doc.meta and doc.meta["number"] < 1) - or ( - doc.meta.get("name") in ["name_0", "name_1"] - and ("chapter" in doc.meta and doc.meta["chapter"] != "intro") - ) + (doc.meta.get("number") is not None and doc.meta["number"] < 1) + or (doc.meta.get("name") in ["name_0", "name_1"] and (doc.meta.get("chapter") != "intro")) ) ] diff --git a/haystack/preview/utils/filters.py b/haystack/preview/utils/filters.py index 61016f993f..35475c15db 100644 --- a/haystack/preview/utils/filters.py +++ b/haystack/preview/utils/filters.py @@ -1,297 +1,174 @@ from typing import List, Any, Union, Dict +from dataclasses import fields from datetime import datetime -import numpy as np import pandas as pd from haystack.preview.dataclasses import Document from haystack.preview.errors import FilterError -GT_TYPES = (int, float, np.number) -IN_TYPES = (list, set, tuple) - - -def not_operation(conditions: List[Any], document: Document, _current_key: str): +def document_matches_filter(filters: Dict[str, Any], document: Document) -> bool: """ - Applies a NOT to all the nested conditions. - - :param conditions: the filters dictionary. - :param document: the document to test. - :param _current_key: internal, don't use. - :return: True if the document matches the negated filters, False otherwise + Return whether `filters` match the Document. + For a detailed specification of the filters, refer to the DocumentStore.filter_documents() protocol documentation. """ - return not and_operation(conditions=conditions, document=document, _current_key=_current_key) + if "field" in filters: + return _comparison_condition(filters, document) + return _logic_condition(filters, document) -def and_operation(conditions: List[Any], document: Document, _current_key: str): - """ - Applies an AND to all the nested conditions. +def _and(document: Document, conditions: List[Dict[str, Any]]) -> bool: + return all(_comparison_condition(condition, document) for condition in conditions) - :param conditions: the filters dictionary. - :param document: the document to test. - :param _current_key: internal, don't use. - :return: True if the document matches all the filters, False otherwise - """ - return all( - document_matches_filter(conditions=condition, document=document, _current_key=_current_key) - for condition in conditions - ) +def _or(document: Document, conditions: List[Dict[str, Any]]) -> bool: + return any(_comparison_condition(condition, document) for condition in conditions) -def or_operation(conditions: List[Any], document: Document, _current_key: str): - """ - Applies an OR to all the nested conditions. - :param conditions: the filters dictionary. - :param document: the document to test. - :param _current_key: internal, don't use. - :return: True if the document matches any of the filters, False otherwise - """ - return any( - document_matches_filter(conditions=condition, document=document, _current_key=_current_key) - for condition in conditions - ) +def _not(document: Document, conditions: List[Dict[str, Any]]) -> bool: + return not _and(document, conditions) -def _safe_eq(first: Any, second: Any) -> bool: - """ - Compares objects for equality, even np.ndarrays and pandas DataFrames. - """ +LOGICAL_OPERATORS = {"NOT": _not, "OR": _or, "AND": _and} - if isinstance(first, pd.DataFrame): - first = first.to_json() - if isinstance(second, pd.DataFrame): - second = second.to_json() +def _equal(document_value: Any, filter_value: Any) -> bool: + if isinstance(document_value, pd.DataFrame): + document_value = document_value.to_json() - if isinstance(first, np.ndarray): - first = first.tolist() + if isinstance(filter_value, pd.DataFrame): + filter_value = filter_value.to_json() - if isinstance(second, np.ndarray): - second = second.tolist() + return document_value == filter_value - return first == second +def _not_equal(document_value: Any, filter_value: Any) -> bool: + return not _equal(document_value=document_value, filter_value=filter_value) -def _safe_gt(first: Any, second: Any) -> bool: - """ - Checks if first is bigger than second. - Works only for numerical values and dates in ISO format (YYYY-MM-DD). Strings, lists, tables and tensors all raise exceptions. - """ - if not isinstance(first, GT_TYPES) or not isinstance(second, GT_TYPES): +def _greater_than(document_value: Any, filter_value: Any) -> bool: + if document_value is None or filter_value is None: + # We can't compare None values reliably using operators '>', '>=', '<', '<=' + return False + + if isinstance(document_value, str) or isinstance(filter_value, str): try: - first = datetime.fromisoformat(first) - second = datetime.fromisoformat(second) - except (ValueError, TypeError): - raise FilterError( - f"Can't evaluate '{type(first).__name__} > {type(second).__name__}'. " - f"Convert these values into one of the following types: {[type_.__name__ for type_ in GT_TYPES]} " - f"or a datetime string in ISO 8601 format." + document_value = datetime.fromisoformat(document_value) + filter_value = datetime.fromisoformat(filter_value) + except (ValueError, TypeError) as exc: + msg = ( + "Can't compare strings using operators '>', '>=', '<', '<='. " + "Strings are only comparable if they are ISO formatted dates." ) - return bool(first > second) + raise FilterError(msg) from exc + if type(filter_value) in [list, pd.DataFrame]: + msg = f"Filter value can't be of type {type(filter_value)} using operators '>', '>=', '<', '<='" + raise FilterError(msg) + return document_value > filter_value -def eq_operation(fields, field_name, value): - """ - Checks for equality between the document's field value value and a fixed value. - - :param fields: all the document's field value - :param field_name: the field to test - :param value: the fixed value to compare against - :return: True if the values are equal, False otherwise - """ - if not field_name in fields: +def _greater_than_equal(document_value: Any, filter_value: Any) -> bool: + if document_value is None or filter_value is None: + # We can't compare None values reliably using operators '>', '>=', '<', '<=' return False - return _safe_eq(fields[field_name], value) - + return _equal(document_value=document_value, filter_value=filter_value) or _greater_than( + document_value=document_value, filter_value=filter_value + ) -def in_operation(fields, field_name, value): - """ - Checks for whether the document's field value value is present into the given list. - :param fields: all the document's field value - :param field_name: the field to test - :param value; the fixed value to compare against - :return: True if the document's value is included in the given list, False otherwise - """ - if not field_name in fields: +def _less_than(document_value: Any, filter_value: Any) -> bool: + if document_value is None or filter_value is None: + # We can't compare None values reliably using operators '>', '>=', '<', '<=' return False - if not isinstance(value, IN_TYPES): - raise FilterError("$in accepts only iterable values like lists, sets and tuples.") - - return any(_safe_eq(fields[field_name], v) for v in value) - - -def ne_operation(fields, field_name, value): - """ - Checks for inequality between the document's field value value and a fixed value. - - :param fields: all the document's field value - :param field_name: the field to test - :param value; the fixed value to compare against - :return: True if the values are different, False otherwise - """ - return not eq_operation(fields, field_name, value) - - -def nin_operation(fields, field_name, value): - """ - Checks whether the document's field value value is absent from the given list. - - :param fields: all the document's field value - :param field_name: the field to test - :param value; the fixed value to compare against - :return: True if the document's value is not included in the given list, False otherwise - """ - return not in_operation(fields, field_name, value) + return not _greater_than_equal(document_value=document_value, filter_value=filter_value) -def gt_operation(fields, field_name, value): - """ - Checks whether the document's field value value is (strictly) larger than the given value. - - :param fields: all the document's field value - :param field_name: the field to test - :param value; the fixed value to compare against - :return: True if the document's value is strictly larger than the fixed value, False otherwise - """ - if not field_name in fields: +def _less_than_equal(document_value: Any, filter_value: Any) -> bool: + if document_value is None or filter_value is None: + # We can't compare None values reliably using operators '>', '>=', '<', '<=' return False - return _safe_gt(fields[field_name], value) - -def gte_operation(fields, field_name, value): - """ - Checks whether the document's field value value is larger than or equal to the given value. - - :param fields: all the document's field value - :param field_name: the field to test - :param value; the fixed value to compare against - :return: True if the document's value is larger than or equal to the fixed value, False otherwise - """ - return gt_operation(fields, field_name, value) or eq_operation(fields, field_name, value) + return not _greater_than(document_value=document_value, filter_value=filter_value) -def lt_operation(fields, field_name, value): - """ - Checks whether the document's field value value is (strictly) smaller than the given value. +def _in(document_value: Any, filter_value: Any) -> bool: + if not isinstance(filter_value, list): + msg = ( + f"Filter value must be a `list` when using operator 'in' or 'not in', received type '{type(filter_value)}'" + ) + raise FilterError(msg) + return any(_equal(e, document_value) for e in filter_value) - :param fields: all the document's field value - :param field_name: the field to test - :param value; the fixed value to compare against - :return: True if the document's value is strictly smaller than the fixed value, False otherwise - """ - if not field_name in fields: - return False - return not _safe_gt(fields[field_name], value) and not _safe_eq(fields[field_name], value) +def _not_in(document_value: Any, filter_value: Any) -> bool: + return not _in(document_value=document_value, filter_value=filter_value) -def lte_operation(fields, field_name, value): - """ - Checks whether the document's field value value is smaller than or equal to the given value. - :param fields: all the document's field value - :param field_name: the field to test - :param value; the fixed value to compare against - :return: True if the document's value is smaller than or equal to the fixed value, False otherwise - """ - if not field_name in fields: - return False - return not _safe_gt(fields[field_name], value) - - -LOGICAL_STATEMENTS = {"$not": not_operation, "$and": and_operation, "$or": or_operation} -OPERATORS = { - "$eq": eq_operation, - "$in": in_operation, - "$ne": ne_operation, - "$nin": nin_operation, - "$gt": gt_operation, - "$gte": gte_operation, - "$lt": lt_operation, - "$lte": lte_operation, +COMPARISON_OPERATORS = { + "==": _equal, + "!=": _not_equal, + ">": _greater_than, + ">=": _greater_than_equal, + "<": _less_than, + "<=": _less_than_equal, + "in": _in, + "not in": _not_in, } -RESERVED_KEYS = [*LOGICAL_STATEMENTS.keys(), *OPERATORS.keys()] - - -def document_matches_filter(conditions: Union[Dict, List], document: Document, _current_key=None): - """ - Check if a document's metadata matches the provided filter conditions. - This function evaluates the specified conditions against the metadata of the given document - and returns True if the conditions are met, otherwise it returns False. - :param conditions: A dictionary or list containing filter conditions to be applied to the document's metadata. - :param document: The document whose metadata will be evaluated against the conditions. - :param _current_key: internal parameter, don't use. - :return: True if the document's metadata matches the filter conditions, False otherwise. - """ - if isinstance(conditions, dict): - # Check for malformed filters, like {"name": {"year": "2020"}} - if _current_key and any(key not in RESERVED_KEYS for key in conditions.keys()): - raise FilterError( - f"This filter ({{{_current_key}: {conditions}}}) seems to be malformed. " - "Comparisons between dictionaries are not currently supported. " - "Check the documentation to learn more about filters syntax." - ) - - if len(conditions.keys()) > 1: - # The default operation for a list of sibling conditions is $and - return and_operation(conditions=_list_conditions(conditions), document=document, _current_key=_current_key) - - field_key, field_value = list(conditions.items())[0] - - # Nested logical statement ($and, $or, $not) - if field_key in LOGICAL_STATEMENTS.keys(): - return LOGICAL_STATEMENTS[field_key]( - conditions=_list_conditions(field_value), document=document, _current_key=_current_key - ) - - # A comparison operator ($eq, $in, $gte, ...) - if field_key in OPERATORS.keys(): - if not _current_key: - raise FilterError( - "Filters can't start with an operator like $eq and $in. You have to specify the field name first. " - "See the examples in the documentation." - ) - return OPERATORS[field_key](fields=document.to_dict(), field_name=_current_key, value=field_value) - - # Otherwise fall back to the defaults - conditions = _list_conditions(field_value) - _current_key = field_key - - # Defaults for implicit filters - if isinstance(conditions, list): - if all(isinstance(cond, dict) for cond in conditions): - # The default operation for a list of sibling conditions is $and - return and_operation(conditions=_list_conditions(conditions), document=document, _current_key=_current_key) - else: - # The default operator for a {key: [value1, value2]} filter is $in - return in_operation(fields=document.to_dict(), field_name=_current_key, value=conditions) - - if _current_key: - # The default operator for a {key: value} filter is $eq - return eq_operation(fields=document.to_dict(), field_name=_current_key, value=conditions) - - raise FilterError("Filters must be dictionaries or lists. See the examples in the documentation.") - - -def _list_conditions(conditions: Any) -> List[Any]: - """ - Make sure all nested conditions are not dictionaries or single values, but always lists. - - :param conditions: the conditions to transform into a list - :returns: a list of filters - """ - if isinstance(conditions, list): - return conditions - if isinstance(conditions, dict): - return [{key: value} for key, value in conditions.items()] - return [conditions] +def _logic_condition(condition: Dict[str, Any], document: Document) -> bool: + if "operator" not in condition: + msg = f"'operator' key missing in {condition}" + raise FilterError(msg) + if "conditions" not in condition: + msg = f"'conditions' key missing in {condition}" + raise FilterError(msg) + operator: str = condition["operator"] + conditions: List[Dict[str, Any]] = condition["conditions"] + return LOGICAL_OPERATORS[operator](document, conditions) + + +def _comparison_condition(condition: Dict[str, Any], document: Document) -> bool: + if "field" not in condition: + # 'field' key is only found in comparison dictionaries. + # We assume this is a logic dictionary since it's not present. + return _logic_condition(condition, document) + field: str = condition["field"] + + if "operator" not in condition: + msg = f"'operator' key missing in {condition}" + raise FilterError(msg) + if "value" not in condition: + msg = f"'value' key missing in {condition}" + raise FilterError(msg) + + if "." in field: + # Handles fields formatted like so: + # 'meta.person.name' + parts = field.split(".") + document_value = getattr(document, parts[0]) + for part in parts[1:]: + if part not in document_value: + # If a field is not found we treat it as None + document_value = None + break + document_value = document_value[part] + elif field not in [f.name for f in fields(document)]: + # Converted legacy filters don't add the `meta.` prefix, so we assume + # that all filter fields that are not actual fields in Document are converted + # filters. + # + # We handle this to avoid breaking compatibility with converted legacy filters. + # This will be removed as soon as we stop supporting legacy filters. + document_value = document.meta.get(field) + else: + document_value = getattr(document, field) + operator: str = condition["operator"] + filter_value: Any = condition["value"] + return COMPARISON_OPERATORS[operator](filter_value=filter_value, document_value=document_value) def convert(filters: Dict[str, Any]) -> Dict[str, Any]: diff --git a/releasenotes/notes/rework-filters-1bb103d196a1912b.yaml b/releasenotes/notes/rework-filters-1bb103d196a1912b.yaml new file mode 100644 index 0000000000..57509ec334 --- /dev/null +++ b/releasenotes/notes/rework-filters-1bb103d196a1912b.yaml @@ -0,0 +1,87 @@ +--- +prelude: > + With proposal [#6001](https://github.com/deepset-ai/haystack/pull/6001) we introduced a better specification to declare filters in Haystack 2.x. + The new syntax is a bit more verbose but less confusing and ambiguous as there are no implicit operators. + This will simplify conversion from this common syntax to a Document Store specific filtering logic, so it will ease + development of new Document Store. + Since everything must be declared explicitly it will also make it easier for user to understand the filters just + by reading them. + + The full specification is as follow. + + --- + + Filters top level must be a dictionary. + + There are two types of dictionaries: + + - Comparison + - Logic + + Top level can be either be a Comparison or Logic dictionary. + + Comparison dictionaries must contain the keys: + + - `field` + - `operator` + - `value` + + Logic dictionaries must contain the keys: + + - `operator` + - `conditions` + + `conditions` key must be a list of dictionaries, either Comparison or Logic. + + `operator` values in Comparison dictionaries must be: + + - `==` + - `!=` + - `>` + - `>=` + - `<` + - `<=` + - `in` + - `not in` + + `operator` values in Logic dictionaries must be: + + - `NOT` + - `OR` + - `AND` + + --- + + A simple filter: + + ```python + filters = {"field": "meta.type", "operator": "==", "value": "article"} + ``` + + A more complex filter: + ```python + filters = { + "operator": "AND", + "conditions": [ + {"field": "meta.type", "operator": "==", "value": "article"}, + {"field": "meta.date", "operator": ">=", "value": 1420066800}, + {"field": "meta.date", "operator": "<", "value": 1609455600}, + {"field": "meta.rating", "operator": ">=", "value": 3}, + { + "operator": "OR", + "conditions": [ + {"field": "meta.genre", "operator": "in", "value": ["economy", "politics"]}, + {"field": "meta.publisher", "operator": "==", "value": "nytimes"}, + ], + }, + ], + } + ``` + + --- + + To avoid causing too much disruption for users using legacy filters we'll keep supporting them for the time being. + We also provide a utility `convert` function for developers implementing their Document Store to do the same. +preview: + - | + Refactored `InMemoryDocumentStore` and `MetadataRouter` filtering logic to support new filters declaration. diff --git a/test/preview/components/routers/test_metadata_router.py b/test/preview/components/routers/test_metadata_router.py index c20a66a3e5..5109f8db6c 100644 --- a/test/preview/components/routers/test_metadata_router.py +++ b/test/preview/components/routers/test_metadata_router.py @@ -8,8 +8,20 @@ class TestMetadataRouter: @pytest.mark.unit def test_run(self): rules = { - "edge_1": {"created_at": {"$gte": "2023-01-01", "$lt": "2023-04-01"}}, - "edge_2": {"created_at": {"$gte": "2023-04-01", "$lt": "2023-07-01"}}, + "edge_1": { + "operator": "AND", + "conditions": [ + {"field": "meta.created_at", "operator": ">=", "value": "2023-01-01"}, + {"field": "meta.created_at", "operator": "<", "value": "2023-04-01"}, + ], + }, + "edge_2": { + "operator": "AND", + "conditions": [ + {"field": "meta.created_at", "operator": ">=", "value": "2023-04-01"}, + {"field": "meta.created_at", "operator": "<", "value": "2023-07-01"}, + ], + }, } router = MetadataRouter(rules=rules) documents = [ diff --git a/test/preview/document_stores/test_in_memory.py b/test/preview/document_stores/test_in_memory.py index 65d148c124..ce37754338 100644 --- a/test/preview/document_stores/test_in_memory.py +++ b/test/preview/document_stores/test_in_memory.py @@ -146,10 +146,6 @@ def test_bm25_retrieval_with_two_queries(self, document_store: InMemoryDocumentS results = document_store.bm25_retrieval(query="Python", top_k=1) assert results[0].content == "Python is a popular programming language" - @pytest.mark.skip(reason="Filter is not working properly, see https://github.com/deepset-ai/haystack/issues/6153") - def test_eq_filter_embedding(self, document_store: InMemoryDocumentStore, filterable_docs): - pass - # Test a query, add a new document and make sure results are appropriately updated @pytest.mark.unit def test_bm25_retrieval_with_updated_docs(self, document_store: InMemoryDocumentStore): diff --git a/test/preview/utils/test_filters.py b/test/preview/utils/test_filters.py index 2ca7826a86..1b3baaf771 100644 --- a/test/preview/utils/test_filters.py +++ b/test/preview/utils/test_filters.py @@ -1,508 +1,554 @@ import pytest import pandas as pd -import numpy as np from haystack.preview import Document from haystack.preview.errors import FilterError from haystack.preview.utils.filters import convert, document_matches_filter +document_matches_filter_data = [ + # == operator params + pytest.param( + {"field": "meta.name", "operator": "==", "value": "test"}, + Document(meta={"name": "test"}), + True, + id="== operator with equal values", + ), + pytest.param( + {"field": "meta.name", "operator": "==", "value": "test"}, + Document(meta={"name": "different value"}), + False, + id="== operator with different values", + ), + pytest.param( + {"field": "meta.name", "operator": "==", "value": "test"}, + Document(meta={"name": ["test"]}), + False, + id="== operator with different types values", + ), + pytest.param( + {"field": "dataframe", "operator": "==", "value": pd.DataFrame([1])}, + Document(dataframe=pd.DataFrame([1])), + True, + id="== operator with equal pandas.DataFrame values", + ), + pytest.param( + {"field": "dataframe", "operator": "==", "value": pd.DataFrame([1])}, + Document(dataframe=pd.DataFrame([10])), + False, + id="== operator with different pandas.DataFrame values", + ), + pytest.param( + {"field": "meta.name", "operator": "==", "value": "test"}, + Document(), + False, + id="== operator with missing Document value", + ), + pytest.param( + {"field": "meta.name", "operator": "==", "value": "test"}, + Document(meta={"name": None}), + False, + id="== operator with None Document value", + ), + pytest.param( + {"field": "meta.name", "operator": "==", "value": None}, + Document(meta={"name": "test"}), + False, + id="== operator with None filter value", + ), + # != operator params + pytest.param( + {"field": "meta.name", "operator": "!=", "value": "test"}, + Document(meta={"name": "test"}), + False, + id="!= operator with equal values", + ), + pytest.param( + {"field": "meta.name", "operator": "!=", "value": "test"}, + Document(meta={"name": "different value"}), + True, + id="!= operator with different values", + ), + pytest.param( + {"field": "meta.name", "operator": "!=", "value": "test"}, + Document(meta={"name": ["test"]}), + True, + id="!= operator with different types values", + ), + pytest.param( + {"field": "dataframe", "operator": "!=", "value": pd.DataFrame([1])}, + Document(dataframe=pd.DataFrame([1])), + False, + id="!= operator with equal pandas.DataFrame values", + ), + pytest.param( + {"field": "dataframe", "operator": "!=", "value": pd.DataFrame([1])}, + Document(dataframe=pd.DataFrame([10])), + True, + id="!= operator with different pandas.DataFrame values", + ), + pytest.param( + {"field": "meta.name", "operator": "!=", "value": "test"}, Document(), True, id="!= operator with missing value" + ), + pytest.param( + {"field": "meta.name", "operator": "!=", "value": "test"}, + Document(meta={"name": None}), + True, + id="!= operator with None Document value", + ), + pytest.param( + {"field": "meta.name", "operator": "!=", "value": None}, + Document(meta={"name": "test"}), + True, + id="!= operator with None filter value", + ), + # > operator params + pytest.param( + {"field": "meta.page", "operator": ">", "value": 10}, + Document(meta={"page": 10}), + False, + id="> operator with equal Document value", + ), + pytest.param( + {"field": "meta.page", "operator": ">", "value": 10}, + Document(meta={"page": 11}), + True, + id="> operator with greater Document value", + ), + pytest.param( + {"field": "meta.page", "operator": ">", "value": 10}, + Document(meta={"page": 9}), + False, + id="> operator with smaller Document value", + ), + pytest.param( + {"field": "meta.date", "operator": ">", "value": "1969-07-21T20:17:40"}, + Document(meta={"date": "1969-07-21T20:17:40"}), + False, + id="> operator with equal ISO 8601 datetime Document value", + ), + pytest.param( + {"field": "meta.date", "operator": ">", "value": "1969-07-21T20:17:40"}, + Document(meta={"date": "1972-12-11T19:54:58"}), + True, + id="> operator with greater ISO 8601 datetime Document value", + ), + pytest.param( + {"field": "meta.date", "operator": ">", "value": "1972-12-11T19:54:58"}, + Document(meta={"date": "1969-07-21T20:17:40"}), + False, + id="> operator with smaller ISO 8601 datetime Document value", + ), + pytest.param( + {"field": "meta.page", "operator": ">", "value": 10}, + Document(), + False, + id="> operator with missing Document value", + ), + pytest.param( + {"field": "meta.page", "operator": ">", "value": 10}, + Document(meta={"page": None}), + False, + id="> operator with None Document value", + ), + pytest.param( + {"field": "meta.page", "operator": ">", "value": None}, + Document(meta={"page": 10}), + False, + id="> operator with None filter value", + ), + pytest.param( + {"field": "meta.page", "operator": ">", "value": None}, + Document(meta={"page": None}), + False, + id="> operator with None Document and filter value", + ), + # >= operator params + pytest.param( + {"field": "meta.page", "operator": ">=", "value": 10}, + Document(meta={"page": 10}), + True, + id=">= operator with equal Document value", + ), + pytest.param( + {"field": "meta.page", "operator": ">=", "value": 10}, + Document(meta={"page": 11}), + True, + id=">= operator with greater Document value", + ), + pytest.param( + {"field": "meta.page", "operator": ">=", "value": 10}, + Document(meta={"page": 9}), + False, + id=">= operator with smaller Document value", + ), + pytest.param( + {"field": "meta.date", "operator": ">=", "value": "1969-07-21T20:17:40"}, + Document(meta={"date": "1969-07-21T20:17:40"}), + True, + id=">= operator with equal ISO 8601 datetime Document value", + ), + pytest.param( + {"field": "meta.date", "operator": ">=", "value": "1969-07-21T20:17:40"}, + Document(meta={"date": "1972-12-11T19:54:58"}), + True, + id=">= operator with greater ISO 8601 datetime Document value", + ), + pytest.param( + {"field": "meta.date", "operator": ">=", "value": "1972-12-11T19:54:58"}, + Document(meta={"date": "1969-07-21T20:17:40"}), + False, + id=">= operator with smaller ISO 8601 datetime Document value", + ), + pytest.param( + {"field": "meta.page", "operator": ">=", "value": 10}, + Document(), + False, + id=">= operator with missing Document value", + ), + pytest.param( + {"field": "meta.page", "operator": ">=", "value": 10}, + Document(meta={"page": None}), + False, + id=">= operator with None Document value", + ), + pytest.param( + {"field": "meta.page", "operator": ">=", "value": None}, + Document(meta={"page": 10}), + False, + id=">= operator with None filter value", + ), + pytest.param( + {"field": "meta.page", "operator": ">=", "value": None}, + Document(meta={"page": None}), + False, + id=">= operator with None Document and filter value", + ), + # < operator params + pytest.param( + {"field": "meta.page", "operator": "<", "value": 10}, + Document(meta={"page": 10}), + False, + id="< operator with equal Document value", + ), + pytest.param( + {"field": "meta.page", "operator": "<", "value": 10}, + Document(meta={"page": 11}), + False, + id="< operator with greater Document value", + ), + pytest.param( + {"field": "meta.page", "operator": "<", "value": 10}, + Document(meta={"page": 9}), + True, + id="< operator with smaller Document value", + ), + pytest.param( + {"field": "meta.date", "operator": "<", "value": "1969-07-21T20:17:40"}, + Document(meta={"date": "1969-07-21T20:17:40"}), + False, + id="< operator with equal ISO 8601 datetime Document value", + ), + pytest.param( + {"field": "meta.date", "operator": "<", "value": "1969-07-21T20:17:40"}, + Document(meta={"date": "1972-12-11T19:54:58"}), + False, + id="< operator with greater ISO 8601 datetime Document value", + ), + pytest.param( + {"field": "meta.date", "operator": "<", "value": "1972-12-11T19:54:58"}, + Document(meta={"date": "1969-07-21T20:17:40"}), + True, + id="< operator with smaller ISO 8601 datetime Document value", + ), + pytest.param( + {"field": "meta.page", "operator": "<", "value": 10}, + Document(), + False, + id="< operator with missing Document value", + ), + pytest.param( + {"field": "meta.page", "operator": "<", "value": 10}, + Document(meta={"page": None}), + False, + id="< operator with None Document value", + ), + pytest.param( + {"field": "meta.page", "operator": "<", "value": None}, + Document(meta={"page": 10}), + False, + id="< operator with None filter value", + ), + pytest.param( + {"field": "meta.page", "operator": "<", "value": None}, + Document(meta={"page": None}), + False, + id="< operator with None Document and filter value", + ), + # <= operator params + pytest.param( + {"field": "meta.page", "operator": "<=", "value": 10}, + Document(meta={"page": 10}), + True, + id="<= operator with equal Document value", + ), + pytest.param( + {"field": "meta.page", "operator": "<=", "value": 10}, + Document(meta={"page": 11}), + False, + id="<= operator with greater Document value", + ), + pytest.param( + {"field": "meta.page", "operator": "<=", "value": 10}, + Document(meta={"page": 9}), + True, + id="<= operator with smaller Document value", + ), + pytest.param( + {"field": "meta.date", "operator": "<=", "value": "1969-07-21T20:17:40"}, + Document(meta={"date": "1969-07-21T20:17:40"}), + True, + id="<= operator with equal ISO 8601 datetime Document value", + ), + pytest.param( + {"field": "meta.date", "operator": "<=", "value": "1969-07-21T20:17:40"}, + Document(meta={"date": "1972-12-11T19:54:58"}), + False, + id="<= operator with greater ISO 8601 datetime Document value", + ), + pytest.param( + {"field": "meta.date", "operator": "<=", "value": "1972-12-11T19:54:58"}, + Document(meta={"date": "1969-07-21T20:17:40"}), + True, + id="<= operator with smaller ISO 8601 datetime Document value", + ), + pytest.param( + {"field": "meta.page", "operator": "<=", "value": 10}, + Document(), + False, + id="<= operator with missing Document value", + ), + pytest.param( + {"field": "meta.page", "operator": "<=", "value": 10}, + Document(meta={"page": None}), + False, + id="<= operator with None Document value", + ), + pytest.param( + {"field": "meta.page", "operator": "<=", "value": None}, + Document(meta={"page": 10}), + False, + id="<= operator with None filter value", + ), + pytest.param( + {"field": "meta.page", "operator": "<=", "value": None}, + Document(meta={"page": None}), + False, + id="<= operator with None Document and filter value", + ), + # in operator params + pytest.param( + {"field": "meta.page", "operator": "in", "value": [9, 10]}, + Document(meta={"page": 1}), + False, + id="in operator with filter value not containing Document value", + ), + pytest.param( + {"field": "meta.page", "operator": "in", "value": [9, 10]}, + Document(meta={"page": 10}), + True, + id="in operator with filter value containing Document value", + ), + # not in operator params + pytest.param( + {"field": "meta.page", "operator": "not in", "value": [9, 10]}, + Document(meta={"page": 1}), + True, + id="not in operator with filter value not containing Document value", + ), + pytest.param( + {"field": "meta.page", "operator": "not in", "value": [9, 10]}, + Document(meta={"page": 10}), + False, + id="not in operator with filter value containing Document value", + ), + # AND operator params + pytest.param( + { + "operator": "AND", + "conditions": [ + {"field": "meta.page", "operator": "==", "value": 10}, + {"field": "meta.type", "operator": "==", "value": "article"}, + ], + }, + Document(meta={"page": 10, "type": "article"}), + True, + id="AND operator with Document matching all conditions", + ), + pytest.param( + { + "operator": "AND", + "conditions": [ + {"field": "meta.page", "operator": "==", "value": 10}, + {"field": "meta.type", "operator": "==", "value": "article"}, + ], + }, + Document(meta={"page": 20, "type": "article"}), + False, + id="AND operator with Document matching a single condition", + ), + pytest.param( + { + "operator": "AND", + "conditions": [ + {"field": "meta.page", "operator": "==", "value": 10}, + {"field": "meta.type", "operator": "==", "value": "article"}, + ], + }, + Document(meta={"page": 11, "value": "blog post"}), + False, + id="AND operator with Document matching no condition", + ), + # OR operator params + pytest.param( + { + "operator": "OR", + "conditions": [ + {"field": "meta.page", "operator": "==", "value": 10}, + {"field": "meta.type", "operator": "==", "value": "article"}, + ], + }, + Document(meta={"page": 10, "type": "article"}), + True, + id="OR operator with Document matching all conditions", + ), + pytest.param( + { + "operator": "OR", + "conditions": [ + {"field": "meta.page", "operator": "==", "value": 10}, + {"field": "meta.type", "operator": "==", "value": "article"}, + ], + }, + Document(meta={"page": 20, "type": "article"}), + True, + id="OR operator with Document matching a single condition", + ), + pytest.param( + { + "operator": "OR", + "conditions": [ + {"field": "meta.page", "operator": "==", "value": 10}, + {"field": "meta.type", "operator": "==", "value": "article"}, + ], + }, + Document(meta={"page": 11, "value": "blog post"}), + False, + id="OR operator with Document matching no condition", + ), + # NOT operator params + pytest.param( + { + "operator": "NOT", + "conditions": [ + {"field": "meta.page", "operator": "==", "value": 10}, + {"field": "meta.type", "operator": "==", "value": "article"}, + ], + }, + Document(meta={"page": 10, "type": "article"}), + False, + id="NOT operator with Document matching all conditions", + ), + pytest.param( + { + "operator": "NOT", + "conditions": [ + {"field": "meta.page", "operator": "==", "value": 10}, + {"field": "meta.type", "operator": "==", "value": "article"}, + ], + }, + Document(meta={"page": 20, "type": "article"}), + True, + id="NOT operator with Document matching a single condition", + ), + pytest.param( + { + "operator": "NOT", + "conditions": [ + {"field": "meta.page", "operator": "==", "value": 10}, + {"field": "meta.type", "operator": "==", "value": "article"}, + ], + }, + Document(meta={"page": 11, "value": "blog post"}), + True, + id="NOT operator with Document matching no condition", + ), +] -class TestFilterUtils: # pylint: disable=R0904 - @pytest.mark.unit - def test_eq_match(self): - document = Document(meta={"name": "test"}) - filter = {"name": "test"} - assert document_matches_filter(filter, document) - - @pytest.mark.unit - def test_eq_no_match(self): - document = Document(meta={"name": "test"}) - filter = {"name": "test1"} - assert not document_matches_filter(filter, document) - - @pytest.mark.unit - def test_eq_no_match_missing_key(self): - document = Document(meta={"name": "test"}) - filter = {"name1": "test"} - assert not document_matches_filter(filter, document) - - @pytest.mark.unit - def test_explicit_eq(self): - document = Document(meta={"name": "test"}) - filter = {"name": {"$eq": "test"}} - assert document_matches_filter(filter, document) - - @pytest.mark.unit - def test_eq_different_types(self): - document = Document(meta={"name": 1}) - filter = {"name": "1"} - assert not document_matches_filter(filter, document) - - @pytest.mark.unit - def test_eq_dataframes(self): - document = Document(meta={"name": pd.DataFrame({"a": [1, 2, 3]})}) - filter = {"name": pd.DataFrame({"a": [1, 2, 3]})} - assert document_matches_filter(filter, document) - - @pytest.mark.unit - def test_eq_dataframes_no_match(self): - document = Document(meta={"name": pd.DataFrame({"a": [1, 2, 3]})}) - filter = {"name": pd.DataFrame({"a": [1, 2, 4]})} - assert not document_matches_filter(filter, document) - - @pytest.mark.unit - def test_eq_np_arrays(self): - document = Document(meta={"name": np.array([1, 2, 3])}) - filter = {"name": np.array([1, 2, 3])} - assert document_matches_filter(filter, document) - - @pytest.mark.unit - def test_eq_np_arrays_no_match(self): - document = Document(meta={"name": np.array([1, 2, 3])}) - filter = {"name": np.array([1, 2, 4])} - assert not document_matches_filter(filter, document) - - @pytest.mark.unit - def test_ne_match(self): - document = Document(meta={"name": "test"}) - filter = {"name": {"$ne": "test1"}} - assert document_matches_filter(filter, document) - - @pytest.mark.unit - def test_ne_no_match(self): - document = Document(meta={"name": "test"}) - filter = {"name": {"$ne": "test"}} - assert not document_matches_filter(filter, document) - - @pytest.mark.unit - def test_ne_no_match_missing_key(self): - document = Document(meta={"name": "test"}) - filter = {"name1": {"$ne": "test"}} - assert document_matches_filter(filter, document) - - @pytest.mark.unit - def test_ne_different_types(self): - document = Document(meta={"name": 1}) - filter = {"name": {"$ne": "1"}} - assert document_matches_filter(filter, document) - - @pytest.mark.unit - def test_ne_dataframes(self): - document = Document(meta={"name": pd.DataFrame({"a": [1, 2, 3]})}) - filter = {"name": {"$ne": pd.DataFrame({"a": [1, 2, 4]})}} - assert document_matches_filter(filter, document) - - @pytest.mark.unit - def test_ne_dataframes_no_match(self): - document = Document(meta={"name": pd.DataFrame({"a": [1, 2, 3]})}) - filter = {"name": {"$ne": pd.DataFrame({"a": [1, 2, 3]})}} - assert not document_matches_filter(filter, document) - - @pytest.mark.unit - def test_ne_np_arrays(self): - document = Document(meta={"name": np.array([1, 2, 3])}) - filter = {"name": {"$ne": np.array([1, 2, 4])}} - assert document_matches_filter(filter, document) - - @pytest.mark.unit - def test_ne_np_arrays_no_match(self): - document = Document(meta={"name": np.array([1, 2, 3])}) - filter = {"name": {"$ne": np.array([1, 2, 3])}} - assert not document_matches_filter(filter, document) - - @pytest.mark.unit - def test_in_match_list(self): - document = Document(meta={"name": "test"}) - filter = {"name": {"$in": ["test", "test1"]}} - assert document_matches_filter(filter, document) - - @pytest.mark.unit - def test_in_no_match_list(self): - document = Document(meta={"name": "test"}) - filter = {"name": {"$in": ["test2", "test3"]}} - assert not document_matches_filter(filter, document) - - @pytest.mark.unit - def test_in_implicit(self): - document = Document(meta={"name": "test"}) - filter = {"name": ["test", "test1"]} - assert document_matches_filter(filter, document) - - @pytest.mark.unit - def test_in_match_set(self): - document = Document(meta={"name": "test"}) - filter = {"name": {"$in": {"test", "test1"}}} - assert document_matches_filter(filter, document) - - @pytest.mark.unit - def test_in_no_match_set(self): - document = Document(meta={"name": "test"}) - filter = {"name": {"$in": {"test2", "test3"}}} - assert not document_matches_filter(filter, document) - - @pytest.mark.unit - def test_in_match_tuple(self): - document = Document(meta={"name": "test"}) - filter = {"name": {"$in": ("test", "test1")}} - assert document_matches_filter(filter, document) - - @pytest.mark.unit - def test_in_no_match_tuple(self): - document = Document(meta={"name": "test"}) - filter = {"name": {"$in": ("test2", "test3")}} - assert not document_matches_filter(filter, document) - - @pytest.mark.unit - def test_in_no_match_missing_key(self): - document = Document(meta={"name": "test"}) - filter = {"name1": {"$in": ["test", "test1"]}} - assert not document_matches_filter(filter, document) - - @pytest.mark.unit - def test_in_unsupported_type(self): - document = Document(meta={"name": "test"}) - filter = {"name": {"$in": "unsupported"}} - with pytest.raises(FilterError, match=r"\$in accepts only iterable values like lists, sets and tuples"): - document_matches_filter(filter, document) - - @pytest.mark.unit - def test_nin_match_list(self): - document = Document(meta={"name": "test"}) - filter = {"name": {"$nin": ["test1", "test2"]}} - assert document_matches_filter(filter, document) - - @pytest.mark.unit - def test_nin_no_match_list(self): - document = Document(meta={"name": "test"}) - filter = {"name": {"$nin": ["test", "test1"]}} - assert not document_matches_filter(filter, document) - - @pytest.mark.unit - def test_nin_match_set(self): - document = Document(meta={"name": "test"}) - filter = {"name": {"$nin": {"test1", "test2"}}} - assert document_matches_filter(filter, document) - - @pytest.mark.unit - def test_nin_no_match_set(self): - document = Document(meta={"name": "test"}) - filter = {"name": {"$nin": {"test", "test1"}}} - assert not document_matches_filter(filter, document) - - @pytest.mark.unit - def test_nin_match_tuple(self): - document = Document(meta={"name": "test"}) - filter = {"name": {"$nin": ("test1", "test2")}} - assert document_matches_filter(filter, document) - - @pytest.mark.unit - def test_nin_no_match_tuple(self): - document = Document(meta={"name": "test"}) - filter = {"name": {"$nin": ("test", "test1")}} - assert not document_matches_filter(filter, document) - - @pytest.mark.unit - def test_nin_no_match_missing_key(self): - document = Document(meta={"name": "test"}) - filter = {"name1": {"$nin": ["test", "test1"]}} - assert document_matches_filter(filter, document) - - @pytest.mark.unit - def test_nin_unsupported_type(self): - document = Document(meta={"name": "test"}) - filter = {"name": {"$nin": "unsupported"}} - with pytest.raises(FilterError, match=r"\$in accepts only iterable values like lists, sets and tuples"): - document_matches_filter(filter, document) - - @pytest.mark.unit - def test_gt_match_int(self): - document = Document(meta={"age": 21}) - filter = {"age": {"$gt": 20}} - assert document_matches_filter(filter, document) - - @pytest.mark.unit - def test_gt_no_match_int(self): - document = Document(meta={"age": 19}) - filter = {"age": {"$gt": 20}} - assert not document_matches_filter(filter, document) - - @pytest.mark.unit - def test_gt_match_float(self): - document = Document(meta={"number": 90.5}) - filter = {"number": {"$gt": 90.0}} - assert document_matches_filter(filter, document) - - @pytest.mark.unit - def test_gt_no_match_float(self): - document = Document(meta={"number": 89.5}) - filter = {"number": {"$gt": 90.0}} - assert not document_matches_filter(filter, document) - - @pytest.mark.unit - def test_gt_match_np_number(self): - document = Document(meta={"value": np.float64(7.5)}) - filter = {"value": {"$gt": np.float64(7.0)}} - assert document_matches_filter(filter, document) - - @pytest.mark.unit - def test_gt_no_match_np_number(self): - document = Document(meta={"value": np.float64(6.5)}) - filter = {"value": {"$gt": np.float64(7.0)}} - assert not document_matches_filter(filter, document) - - @pytest.mark.unit - def test_gt_match_date_string(self): - document = Document(meta={"date": "2022-01-02"}) - filter = {"date": {"$gt": "2022-01-01"}} - assert document_matches_filter(filter, document) - - @pytest.mark.unit - def test_gt_no_match_date_string(self): - document = Document(meta={"date": "2022-01-01"}) - filter = {"date": {"$gt": "2022-01-01"}} - assert not document_matches_filter(filter, document) - - @pytest.mark.unit - def test_gt_no_match_missing_key(self): - document = Document(meta={"age": 21}) - filter = {"age1": {"$gt": 20}} - assert not document_matches_filter(filter, document) - - @pytest.mark.unit - def test_gt_unsupported_type(self): - document = Document(meta={"age": 21}) - filter = {"age": {"$gt": "unsupported"}} - with pytest.raises( - FilterError, - match=( - r"Convert these values into one of the following types: \['int', 'float', 'number'\] or a datetime string " - "in ISO 8601 format" - ), - ): - document_matches_filter(filter, document) - - @pytest.mark.unit - def test_gte_match_int(self): - document = Document(meta={"age": 21}) - filter_1 = {"age": {"$gte": 21}} - filter_2 = {"age": {"$gte": 20}} - assert document_matches_filter(filter_1, document) - assert document_matches_filter(filter_2, document) - - @pytest.mark.unit - def test_gte_no_match_int(self): - document = Document(meta={"age": 20}) - filter = {"age": {"$gte": 21}} - assert not document_matches_filter(filter, document) - - @pytest.mark.unit - def test_gte_match_float(self): - document = Document(meta={"number": 90.5}) - filter_1 = {"number": {"$gte": 90.5}} - filter_2 = {"number": {"$gte": 90.4}} - assert document_matches_filter(filter_1, document) - assert document_matches_filter(filter_2, document) - - @pytest.mark.unit - def test_gte_no_match_float(self): - document = Document(meta={"number": 90.4}) - filter = {"number": {"$gte": 90.5}} - assert not document_matches_filter(filter, document) - - @pytest.mark.unit - def test_gte_match_np_number(self): - document = Document(meta={"value": np.float64(7.5)}) - filter_1 = {"value": {"$gte": np.float64(7.5)}} - filter_2 = {"value": {"$gte": np.float64(7.4)}} - assert document_matches_filter(filter_1, document) - assert document_matches_filter(filter_2, document) - - @pytest.mark.unit - def test_gte_no_match_np_number(self): - document = Document(meta={"value": np.float64(7.4)}) - filter = {"value": {"$gte": np.float64(7.5)}} - assert not document_matches_filter(filter, document) - - @pytest.mark.unit - def test_gte_match_date_string(self): - document = Document(meta={"date": "2022-01-02"}) - filter_1 = {"date": {"$gte": "2022-01-02"}} - filter_2 = {"date": {"$gte": "2022-01-01"}} - assert document_matches_filter(filter_1, document) - assert document_matches_filter(filter_2, document) - - @pytest.mark.unit - def test_gte_no_match_date_string(self): - document = Document(meta={"date": "2022-01-01"}) - filter = {"date": {"$gte": "2022-01-02"}} - assert not document_matches_filter(filter, document) - - @pytest.mark.unit - def test_gte_unsupported_type(self): - document = Document(meta={"age": 21}) - filter = {"age": {"$gte": "unsupported"}} - with pytest.raises( - FilterError, - match=( - r"Convert these values into one of the following types: \['int', 'float', 'number'\] or a datetime string " - "in ISO 8601 format" - ), - ): - document_matches_filter(filter, document) - - @pytest.mark.unit - def test_lt_match_int(self): - document = Document(meta={"age": 19}) - filter = {"age": {"$lt": 20}} - assert document_matches_filter(filter, document) - - @pytest.mark.unit - def test_lt_no_match_int(self): - document = Document(meta={"age": 20}) - filter = {"age": {"$lt": 20}} - assert not document_matches_filter(filter, document) - - @pytest.mark.unit - def test_lt_match_float(self): - document = Document(meta={"number": 89.9}) - filter = {"number": {"$lt": 90.0}} - assert document_matches_filter(filter, document) - - @pytest.mark.unit - def test_lt_no_match_float(self): - document = Document(meta={"number": 90.0}) - filter = {"number": {"$lt": 90.0}} - assert not document_matches_filter(filter, document) - - @pytest.mark.unit - def test_lt_match_np_number(self): - document = Document(meta={"value": np.float64(6.9)}) - filter = {"value": {"$lt": np.float64(7.0)}} - assert document_matches_filter(filter, document) - - @pytest.mark.unit - def test_lt_no_match_np_number(self): - document = Document(meta={"value": np.float64(7.0)}) - filter = {"value": {"$lt": np.float64(7.0)}} - assert not document_matches_filter(filter, document) - - @pytest.mark.unit - def test_lt_match_date_string(self): - document = Document(meta={"date": "2022-01-01"}) - filter = {"date": {"$lt": "2022-01-02"}} - assert document_matches_filter(filter, document) - - @pytest.mark.unit - def test_lt_no_match_date_string(self): - document = Document(meta={"date": "2022-01-02"}) - filter = {"date": {"$lt": "2022-01-02"}} - assert not document_matches_filter(filter, document) - - @pytest.mark.unit - def test_lt_unsupported_type(self): - document = Document(meta={"age": 21}) - filter = {"age": {"$lt": "unsupported"}} - with pytest.raises( - FilterError, - match=( - r"Convert these values into one of the following types: \['int', 'float', 'number'\] or a datetime string " - "in ISO 8601 format" - ), - ): - document_matches_filter(filter, document) - - @pytest.mark.unit - def test_lte_match_int(self): - document = Document(meta={"age": 21}) - filter_1 = {"age": {"$lte": 21}} - filter_2 = {"age": {"$lte": 20}} - assert not document_matches_filter(filter_2, document) - assert document_matches_filter(filter_1, document) - - @pytest.mark.unit - def test_lte_no_match_int(self): - document = Document(meta={"age": 22}) - filter = {"age": {"$lte": 21}} - assert not document_matches_filter(filter, document) - - @pytest.mark.unit - def test_lte_match_float(self): - document = Document(meta={"number": 90.5}) - filter_1 = {"number": {"$lte": 90.5}} - filter_2 = {"number": {"$lte": 90.4}} - assert not document_matches_filter(filter_2, document) - assert document_matches_filter(filter_1, document) - - @pytest.mark.unit - def test_lte_no_match_float(self): - document = Document(meta={"number": 90.6}) - filter = {"number": {"$lte": 90.5}} - assert not document_matches_filter(filter, document) - - @pytest.mark.unit - def test_lte_match_np_number(self): - document = Document(meta={"value": np.float64(7.5)}) - filter_1 = {"value": {"$lte": np.float64(7.5)}} - filter_2 = {"value": {"$lte": np.float64(7.4)}} - assert not document_matches_filter(filter_2, document) - assert document_matches_filter(filter_1, document) - - @pytest.mark.unit - def test_lte_no_match_np_number(self): - document = Document(meta={"value": np.float64(7.6)}) - filter = {"value": {"$lte": np.float64(7.5)}} - assert not document_matches_filter(filter, document) - - @pytest.mark.unit - def test_lte_match_date_string(self): - document = Document(meta={"date": "2022-01-02"}) - filter_1 = {"date": {"$lte": "2022-01-02"}} - filter_2 = {"date": {"$lte": "2022-01-01"}} - assert not document_matches_filter(filter_2, document) - assert document_matches_filter(filter_1, document) - - @pytest.mark.unit - def test_lte_no_match_date_string(self): - document = Document(meta={"date": "2022-01-03"}) - filter = {"date": {"$lte": "2022-01-02"}} - assert not document_matches_filter(filter, document) - @pytest.mark.unit - def test_lte_unsupported_type(self): - document = Document(meta={"age": 21}) - filter = {"age": {"$lte": "unsupported"}} - with pytest.raises( - FilterError, - match=( - r"Convert these values into one of the following types: \['int', 'float', 'number'\] or a datetime string " - "in ISO 8601 format" - ), - ): - document_matches_filter(filter, document) +@pytest.mark.parametrize("filter, document, expected_result", document_matches_filter_data) +def test_document_matches_filter(filter, document, expected_result): + assert document_matches_filter(filter, document) == expected_result - @pytest.mark.unit - def test_implicit_and(self): - document = Document(meta={"age": 21, "name": "John"}) - filter = {"age": {"$gt": 18}, "name": "John"} - assert document_matches_filter(filter, document) - @pytest.mark.unit - def test_explicit_and(self): - document = Document(meta={"age": 21}) - filter = {"age": {"$and": {"$gt": 18}, "$lt": 25}} - assert document_matches_filter(filter, document) +document_matches_filter_raises_error_data = [ + # > operator params + pytest.param({"field": "meta.page", "operator": ">", "value": "10"}, id="> operator with string filter value"), + pytest.param({"field": "meta.page", "operator": ">", "value": [10]}, id="> operator with list filter value"), + pytest.param( + {"field": "meta.page", "operator": ">", "value": pd.DataFrame([10])}, + id="> operator with pandas.DataFrame filter value", + ), + # >= operator params + pytest.param({"field": "meta.page", "operator": ">=", "value": "10"}, id=">= operator with string filter value"), + pytest.param({"field": "meta.page", "operator": ">=", "value": [10]}, id=">= operator with list filter value"), + pytest.param( + {"field": "meta.page", "operator": ">=", "value": pd.DataFrame([10])}, + id=">= operator with pandas.DataFrame filter value", + ), + # < operator params + pytest.param({"field": "meta.page", "operator": "<", "value": "10"}, id="< operator with string filter value"), + pytest.param({"field": "meta.page", "operator": "<", "value": [10]}, id="< operator with list filter value"), + pytest.param( + {"field": "meta.page", "operator": "<", "value": pd.DataFrame([10])}, + id="< operator with pandas.DataFrame filter value", + ), + # <= operator params + pytest.param({"field": "meta.page", "operator": "<=", "value": "10"}, id="<= operator with string filter value"), + pytest.param({"field": "meta.page", "operator": "<=", "value": [10]}, id="<= operator with list filter value"), + pytest.param( + {"field": "meta.page", "operator": "<=", "value": pd.DataFrame([10])}, + id="<= operator with pandas.DataFrame filter value", + ), + # in operator params + pytest.param({"field": "meta.page", "operator": "in", "value": 1}, id="in operator with non list filter value"), + # at some point we might want to support any iterable and this test should fail + pytest.param( + {"field": "meta.page", "operator": "in", "value": (10, 11)}, id="in operator with non list filter value" + ), + # not in operator params + pytest.param( + {"field": "meta.page", "operator": "not in", "value": 1}, id="not in operator with non list filter value" + ), + # at some point we might want to support any iterable and this test should fail + pytest.param( + {"field": "meta.page", "operator": "not in", "value": (10, 11)}, id="not in operator with non list filter value" + ), + # Malformed filters + pytest.param( + {"conditions": [{"field": "meta.name", "operator": "==", "value": "test"}]}, id="Missing root operator key" + ), + pytest.param({"operator": "AND"}, id="Missing root conditions key"), + pytest.param({"operator": "==", "value": "test"}, id="Missing condition field key"), + pytest.param({"field": "meta.name", "value": "test"}, id="Missing condition operator key"), + pytest.param({"field": "meta.name", "operator": "=="}, id="Missing condition value key"), +] - @pytest.mark.unit - def test_or(self): - document = Document(meta={"age": 26}) - filter = {"age": {"$or": [{"$gt": 18}, {"$lt": 25}]}} - assert document_matches_filter(filter, document) - @pytest.mark.unit - def test_not(self): - document = Document(meta={"age": 17}) - filter = {"age": {"$not": {"$gt": 18}}} - assert document_matches_filter(filter, document) +@pytest.mark.parametrize("filter", document_matches_filter_raises_error_data) +def test_document_matches_filter_raises_error(filter): + with pytest.raises(FilterError): + document = Document(meta={"page": 10}) + document_matches_filter(filter, document) filters_data = [