From 042350031468d0479f20d186e2b1114c6ad81e56 Mon Sep 17 00:00:00 2001 From: sarayourfriend <24264157+sarayourfriend@users.noreply.github.com> Date: Mon, 21 Nov 2022 15:34:12 +1100 Subject: [PATCH 1/4] Fix quoted audio search example escaping --- api/catalog/api/examples/audio_requests.py | 2 +- api/catalog/api/examples/image_requests.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/api/catalog/api/examples/audio_requests.py b/api/catalog/api/examples/audio_requests.py index 5fab43653..b817ede4f 100644 --- a/api/catalog/api/examples/audio_requests.py +++ b/api/catalog/api/examples/audio_requests.py @@ -10,7 +10,7 @@ syntax_examples = { "using single query parameter": "test", "using multiple query parameters": "test&license=pdm,by&categories=illustration&page_size=1&page=1", # noqa: E501 - "that is an exact match of Giacomo Puccini": '"Giacomo Puccini"', + "that is an exact match of Giacomo Puccini": r"%22Giacomo%20Puccini%22", "related to both dog and cat": "dog+cat", "related to dog or cat, but not necessarily both": "dog|cat", "related to dog but won't include results related to 'pug'": "dog -pug", diff --git a/api/catalog/api/examples/image_requests.py b/api/catalog/api/examples/image_requests.py index 4f846a036..3524e4745 100644 --- a/api/catalog/api/examples/image_requests.py +++ b/api/catalog/api/examples/image_requests.py @@ -10,7 +10,7 @@ syntax_examples = { "using single query parameter": "test", "using multiple query parameters": "test&license=pdm,by&categories=illustration&page_size=1&page=1", # noqa: E501 - "that are an exact match of Claude Monet": '"Claude Monet"', + "that are an exact match of Claude Monet": "%22Claude%20Monet%22", "related to both dog and cat": "dog+cat", "related to dog or cat, but not necessarily both": "dog|cat", "related to dog but won't include results related to 'pug'": "dog -pug", From 2eee346d2b4ab06c9b4def9f04c1443768f22b4e Mon Sep 17 00:00:00 2001 From: sarayourfriend <24264157+sarayourfriend@users.noreply.github.com> Date: Mon, 21 Nov 2022 15:48:18 +1100 Subject: [PATCH 2/4] Make quoted queries behave as described in API documentation --- .../api/controllers/search_controller.py | 19 +++++++++++++------ api/test/audio_integration_test.py | 6 ++++++ api/test/image_integration_test.py | 6 ++++++ api/test/media_integration.py | 19 +++++++++++++++++++ 4 files changed, 44 insertions(+), 6 deletions(-) diff --git a/api/catalog/api/controllers/search_controller.py b/api/catalog/api/controllers/search_controller.py index 99fb71aaf..cd2742b59 100644 --- a/api/catalog/api/controllers/search_controller.py +++ b/api/catalog/api/controllers/search_controller.py @@ -340,18 +340,25 @@ def search( search_fields = ["tags.name", "title", "description"] if "q" in search_params.data: query = _quote_escape(search_params.data["q"]) + base_query_kwargs = { + "query": query, + "fields": search_fields, + "default_operator": "AND", + } + + if '"' in query: + base_query_kwargs["quote_field_suffix"] = ".exact" + s = s.query( "simple_query_string", - query=query, - fields=search_fields, - default_operator="AND", + **base_query_kwargs, ) - # Boost exact matches + # Boost exact matches on the title quotes_stripped = query.replace('"', "") exact_match_boost = Q( "simple_query_string", - fields=["title"], - query=f'"{quotes_stripped}"', + fields=["title.exact"], + query=f"{quotes_stripped}", boost=10000, ) s = search_client.query(Q("bool", must=s.query, should=exact_match_boost)) diff --git a/api/test/audio_integration_test.py b/api/test/audio_integration_test.py index 4e0bf6e28..69ad8b8e6 100644 --- a/api/test/audio_integration_test.py +++ b/api/test/audio_integration_test.py @@ -15,6 +15,7 @@ search_by_category, search_consistency, search_quotes, + search_quotes_exact, search_source_and_excluded, search_special_chars, stats, @@ -101,6 +102,11 @@ def test_search_quotes(): search_quotes("audio", "love") +def test_search_quotes_exact(): + # ``water running`` returns different results when quoted vs unquoted + search_quotes_exact("audio", "water running") + + def test_search_with_special_characters(): search_special_chars("audio", "love") diff --git a/api/test/image_integration_test.py b/api/test/image_integration_test.py index 8c8dc63e5..acb5d0d44 100644 --- a/api/test/image_integration_test.py +++ b/api/test/image_integration_test.py @@ -15,6 +15,7 @@ search_all_excluded, search_consistency, search_quotes, + search_quotes_exact, search_source_and_excluded, search_special_chars, stats, @@ -53,6 +54,11 @@ def test_search_quotes(): search_quotes("images", "dog") +def test_search_quotes_exact(): + # ``bird perched`` returns different results when quoted vs unquoted + search_quotes_exact("images", "bird perched") + + def test_search_with_special_characters(): search_special_chars("images", "dog") diff --git a/api/test/media_integration.py b/api/test/media_integration.py index 99d30e474..835147f03 100644 --- a/api/test/media_integration.py +++ b/api/test/media_integration.py @@ -45,6 +45,25 @@ def search_quotes(media_path, q="test"): assert response.status_code == 200 +def search_quotes_exact(media_path, q): + """Only returns exact matches for the given query""" + unquoted_response = requests.get(f"{API_URL}/v1/{media_path}?q={q}", verify=False) + assert unquoted_response.status_code == 200 + unquoted_results = unquoted_response.json()["results"] + assert len(unquoted_results) > 0 + + quoted_response = requests.get(f'{API_URL}/v1/{media_path}?q="{q}"', verify=False) + assert quoted_response.status_code == 200 + quoted_results = quoted_response.json()["results"] + assert len(quoted_results) > 0 + + # The rationale here is that the unquoted results will match more records due + # to the query being overall less strict. Quoting the query will make it more + # strict causing it to return fewer results. + # Above we check that the results are not 0 to confirm that we do still get results back. + assert len(quoted_results) < len(unquoted_results) + + def search_special_chars(media_path, q="test"): """Returns a response when query includes special characters.""" response = requests.get(f"{API_URL}/v1/{media_path}?q={q}!", verify=False) From c6619b40e16b350c47bc3f118f1dc9d00c62717d Mon Sep 17 00:00:00 2001 From: sarayourfriend <24264157+sarayourfriend@users.noreply.github.com> Date: Thu, 24 Nov 2022 17:10:43 +1100 Subject: [PATCH 3/4] Undo change breaking title match boosting --- api/catalog/api/controllers/search_controller.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/catalog/api/controllers/search_controller.py b/api/catalog/api/controllers/search_controller.py index cd2742b59..e02170ffa 100644 --- a/api/catalog/api/controllers/search_controller.py +++ b/api/catalog/api/controllers/search_controller.py @@ -357,7 +357,7 @@ def search( quotes_stripped = query.replace('"', "") exact_match_boost = Q( "simple_query_string", - fields=["title.exact"], + fields=["title"], query=f"{quotes_stripped}", boost=10000, ) From 595a18e463909c8f2c88fcdcafa8e2fef03288a2 Mon Sep 17 00:00:00 2001 From: sarayourfriend <24264157+sarayourfriend@users.noreply.github.com> Date: Tue, 29 Nov 2022 14:20:03 +1100 Subject: [PATCH 4/4] Fix and future proof tests against additional test data --- api/test/media_integration.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/api/test/media_integration.py b/api/test/media_integration.py index 835147f03..f90acb059 100644 --- a/api/test/media_integration.py +++ b/api/test/media_integration.py @@ -49,19 +49,19 @@ def search_quotes_exact(media_path, q): """Only returns exact matches for the given query""" unquoted_response = requests.get(f"{API_URL}/v1/{media_path}?q={q}", verify=False) assert unquoted_response.status_code == 200 - unquoted_results = unquoted_response.json()["results"] - assert len(unquoted_results) > 0 + unquoted_result_count = unquoted_response.json()["result_count"] + assert unquoted_result_count > 0 quoted_response = requests.get(f'{API_URL}/v1/{media_path}?q="{q}"', verify=False) assert quoted_response.status_code == 200 - quoted_results = quoted_response.json()["results"] - assert len(quoted_results) > 0 + quoted_result_count = quoted_response.json()["result_count"] + assert quoted_result_count > 0 # The rationale here is that the unquoted results will match more records due # to the query being overall less strict. Quoting the query will make it more # strict causing it to return fewer results. # Above we check that the results are not 0 to confirm that we do still get results back. - assert len(quoted_results) < len(unquoted_results) + assert quoted_result_count < unquoted_result_count def search_special_chars(media_path, q="test"):