From 35d00e3675d5268cbb6ee0128b3595a5b008c407 Mon Sep 17 00:00:00 2001 From: sarayourfriend Date: Thu, 4 Apr 2024 11:51:49 +1100 Subject: [PATCH 1/3] Add warning to search response when source parameter has mixed validity --- api/api/examples/audio_responses.py | 1 + api/api/examples/image_responses.py | 1 + api/api/serializers/media_serializers.py | 44 ++++++++++++--- api/api/utils/pagination.py | 54 +++++++++++++++---- api/api/views/media_views.py | 1 + api/test/fixtures/rest_framework.py | 2 +- .../integration/test_media_integration.py | 51 ++++++++++++++++++ .../serializers/test_media_serializers.py | 28 +++++----- 8 files changed, 149 insertions(+), 33 deletions(-) diff --git a/api/api/examples/audio_responses.py b/api/api/examples/audio_responses.py index 4b53e9f62f1..c434b863974 100644 --- a/api/api/examples/audio_responses.py +++ b/api/api/examples/audio_responses.py @@ -64,6 +64,7 @@ "results": [ base_audio | {"fields_matched": ["title"]}, ], + "warnings": [], }, } diff --git a/api/api/examples/image_responses.py b/api/api/examples/image_responses.py index 7db0ff3a18c..ec387c8a237 100644 --- a/api/api/examples/image_responses.py +++ b/api/api/examples/image_responses.py @@ -72,6 +72,7 @@ "page_size": 20, "page": 1, "results": [base_image | {"fields_matched": ["title"]}], + "warnings": [], }, } diff --git a/api/api/serializers/media_serializers.py b/api/api/serializers/media_serializers.py index c47219b693b..1d4ff103c58 100644 --- a/api/api/serializers/media_serializers.py +++ b/api/api/serializers/media_serializers.py @@ -1,17 +1,21 @@ import logging from collections import namedtuple +from typing import TypedDict from django.conf import settings from django.core.exceptions import ValidationError as DjangoValidationError from django.core.validators import MaxValueValidator +from django.urls import reverse from rest_framework import serializers from rest_framework.exceptions import NotAuthenticated, ValidationError +from rest_framework.request import Request from drf_spectacular.utils import extend_schema_serializer from elasticsearch_dsl.response import Hit from api.constants import sensitivity from api.constants.licenses import LICENSE_GROUPS +from api.constants.media_types import MediaType from api.constants.parameters import COLLECTION, TAG from api.constants.sorting import DESCENDING, RELEVANCE, SORT_DIRECTIONS, SORT_FIELDS from api.controllers import search_controller @@ -294,8 +298,16 @@ class MediaSearchRequestSerializer(PaginatedRequestSerializer): required=False, ) + class Context(TypedDict, total=True): + warnings: list[dict] + media_type: MediaType + request: Request + + context: Context + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + self.context["warnings"] = [] self.media_type = self.context.get("media_type") if not self.media_type: raise ValueError( @@ -398,15 +410,31 @@ def validate_source(self, value): ) return value else: - sources = value.lower().split(",") - valid_sources = set( - [source for source in sources if source in allowed_sources] - ) - if len(sources) > len(valid_sources): - invalid_sources = set(sources).difference(valid_sources) - logger.warning( - f"Invalid sources in search query: {invalid_sources}; sources query: '{value}'" + sources = set(value.lower().split(",")) + valid_sources = {source for source in sources if source in allowed_sources} + if not valid_sources: + # Raise only if there are _no_ valid sources selected + # If the requester passed only `mispelled_museum_name1,mispelled_musesum_name2` + # the request cannot move forward, as all the top responses will likely be from Flickr + # which provides radically different responses than most other providers. + # If even one source is valid, it won't be a problem, in which case we'll issue a warning + raise serializers.ValidationError( + f"Invalid source parameter '{value}'. No valid sources selected. " + f"Refer to the source list for valid options: {sources_list}." + ) + elif invalid_sources := (sources - valid_sources): + self.context["warnings"].append( + { + "code": "partially invalid source parameter", + "message": "The source parameter was partially invalid.", + "invalid_sources": invalid_sources, + "referenced_sources": valid_sources, + "available_sources": self.context["request"].build_absolute_uri( + reverse(f"{self.media_type}-stats") + ), + } ) + return ",".join(valid_sources) def validate_excluded_source(self, input_sources): diff --git a/api/api/utils/pagination.py b/api/api/utils/pagination.py index 85e2cf24b3a..f2ede4b278a 100644 --- a/api/api/utils/pagination.py +++ b/api/api/utils/pagination.py @@ -7,11 +7,17 @@ class StandardPagination(PageNumberPagination): page_size_query_param = "page_size" page_query_param = "page" + result_count: int | None + page_count: int | None + page: int + warnings: list[dict] + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.result_count = None # populated later self.page_count = None # populated later self.page = 1 # default, gets updated when necessary + self.warnings = [] # populated later as needed def get_paginated_response(self, data): return Response( @@ -21,6 +27,7 @@ def get_paginated_response(self, data): "page_size": self.page_size, "page": self.page, "results": data, + "warnings": list(self.warnings), } ) @@ -39,15 +46,44 @@ def get_paginated_response_schema(self, schema): "page_size": ("The number of items per page.", 20), "page": ("The current page number returned in the response.", 1), } + + properties = { + field: { + "type": "integer", + "description": description, + "example": example, + } + for field, (description, example) in field_descriptions.items() + } | { + "results": schema, + "warnings": { + "type": "array", + "items": { + "type": "object", + }, + "description": ( + "Warnings pertinent to the request. " + "If there are no warnings, this list will be empty. " + "Warnings are non-critical problems with the request. " + "Responses with warnings should be treated as unstable. " + "Warning descriptions must not be treated as machine readable " + "and their schema can change at any time." + ), + "example": [ + { + "code": "partially invalid request parameter", + "message": ( + "Some of the request parameters were bad, " + "but we processed the request anywhere. " + "Here's some information that might help you " + "fix the problem for future requests." + ), + } + ], + }, + } + return { "type": "object", - "properties": { - field: { - "type": "integer", - "description": description, - "example": example, - } - for field, (description, example) in field_descriptions.items() - } - | {"results": schema}, + "properties": properties, } diff --git a/api/api/views/media_views.py b/api/api/views/media_views.py index b242eceb833..1018ef94c78 100644 --- a/api/api/views/media_views.py +++ b/api/api/views/media_views.py @@ -160,6 +160,7 @@ def get_media_results( ): page_size = self.paginator.page_size = params.data["page_size"] page = self.paginator.page = params.data["page"] + self.paginator.warnings = params.context["warnings"] hashed_ip = hash(self._get_user_ip(request)) filter_dead = params.validated_data.get("filter_dead", True) diff --git a/api/test/fixtures/rest_framework.py b/api/test/fixtures/rest_framework.py index 3359b0a81df..f96eeb0aacd 100644 --- a/api/test/fixtures/rest_framework.py +++ b/api/test/fixtures/rest_framework.py @@ -4,7 +4,7 @@ @pytest.fixture -def api_client(): +def api_client() -> APIClient: return APIClient() diff --git a/api/test/integration/test_media_integration.py b/api/test/integration/test_media_integration.py index c933e57cc5e..fe1e148d2fa 100644 --- a/api/test/integration/test_media_integration.py +++ b/api/test/integration/test_media_integration.py @@ -329,6 +329,57 @@ def test_detail_view_for_invalid_uuids_returns_not_found( assert res.status_code == 404 +def test_search_with_only_valid_sources_produces_no_warning(media_type, api_client): + stats = api_client.get(f"/v1/{media_type.path}/stats/") + assert stats.status_code == 200 + + search = api_client.get( + f"/v1/{media_type.path}/", + {"source": ",".join([s["source_name"] for s in stats.json()])}, + ) + assert search.status_code == 200 + assert search.json()["warnings"] == [] + + +def test_search_with_partially_invalid_sources_produces_warning_but_still_succeeds( + media_type: MediaType, api_client +): + stats = api_client.get(f"/v1/{media_type.path}/stats/") + assert stats.status_code == 200 + valid_source = stats.json()[0]["source_name"] + + invalid_sources = [ + "surely_neither_this_one", + "this_is_sure_not_to_ever_be_a_real_source_name", + ] + + search = api_client.get( + f"/v1/{media_type.path}/", + {"source": ",".join([valid_source] + invalid_sources)}, + ) + assert search.status_code == 200 + result = search.json() + + assert {w["code"] for w in result["warnings"]} == { + "partially invalid source parameter" + } + warning = result["warnings"][0] + assert set(warning["invalid_sources"]) == set(invalid_sources) + assert warning["referenced_sources"] == [valid_source] + assert f"v1/{media_type.path}/stats/" in warning["available_sources"] + + +def test_search_with_all_invalid_sources_fails(media_type, api_client): + invalid_sources = [ + "this_is_sure_not_to_ever_be_a_real_source_name", + "surely_neither_this_one", + ] + search = api_client.get( + f"/v1/{media_type.path}/", {"source": ",".join(invalid_sources)} + ) + assert search.status_code == 400 + + def test_detail_view_returns_ok(single_result, api_client): media_type, item = single_result res = api_client.get(f"/v1/{media_type.path}/{item['id']}/") diff --git a/api/test/unit/serializers/test_media_serializers.py b/api/test/unit/serializers/test_media_serializers.py index ea2500a2681..79059210ee4 100644 --- a/api/test/unit/serializers/test_media_serializers.py +++ b/api/test/unit/serializers/test_media_serializers.py @@ -99,25 +99,23 @@ def test_media_serializer_adds_license_url_if_missing( assert repr["license_url"] == "https://creativecommons.org/publicdomain/zero/1.0/" -def test_media_serializer_logs_when_invalid_or_duplicate_source(media_type_config): +def test_media_serializer_recovers_invalid_or_duplicate_source( + media_type_config, request_factory +): sources = { "image": ("flickr,flickr,invalid", "flickr"), "audio": ("freesound,freesound,invalid", "freesound"), } - with patch("api.serializers.media_serializers.logger.warning") as mock_logger: - serializer_class = media_type_config.search_request_serializer( - context={"media_type": media_type_config.media_type}, - data={"source": sources[media_type_config.media_type][0]}, - ) - assert serializer_class.is_valid() - assert ( - serializer_class.validated_data["source"] - == sources[media_type_config.media_type][1] - ) - mock_logger.assert_called_with( - f"Invalid sources in search query: {{'invalid'}}; " - f"sources query: '{sources[media_type_config.media_type][0]}'" - ) + request = request_factory.get("/v1/images/") + serializer_class = media_type_config.search_request_serializer( + context={"media_type": media_type_config.media_type, "request": request}, + data={"source": sources[media_type_config.media_type][0]}, + ) + assert serializer_class.is_valid() + assert ( + serializer_class.validated_data["source"] + == sources[media_type_config.media_type][1] + ) @pytest.mark.parametrize( From caddd9eab6bce3dd81d8155872d62ee28e881163 Mon Sep 17 00:00:00 2001 From: sarayourfriend Date: Thu, 4 Apr 2024 15:39:29 +1100 Subject: [PATCH 2/3] Use fixture source list --- api/test/integration/test_media_integration.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/api/test/integration/test_media_integration.py b/api/test/integration/test_media_integration.py index fe1e148d2fa..4e999e72e47 100644 --- a/api/test/integration/test_media_integration.py +++ b/api/test/integration/test_media_integration.py @@ -330,12 +330,9 @@ def test_detail_view_for_invalid_uuids_returns_not_found( def test_search_with_only_valid_sources_produces_no_warning(media_type, api_client): - stats = api_client.get(f"/v1/{media_type.path}/stats/") - assert stats.status_code == 200 - search = api_client.get( f"/v1/{media_type.path}/", - {"source": ",".join([s["source_name"] for s in stats.json()])}, + {"source": ",".join(media_type.providers)}, ) assert search.status_code == 200 assert search.json()["warnings"] == [] @@ -344,10 +341,6 @@ def test_search_with_only_valid_sources_produces_no_warning(media_type, api_clie def test_search_with_partially_invalid_sources_produces_warning_but_still_succeeds( media_type: MediaType, api_client ): - stats = api_client.get(f"/v1/{media_type.path}/stats/") - assert stats.status_code == 200 - valid_source = stats.json()[0]["source_name"] - invalid_sources = [ "surely_neither_this_one", "this_is_sure_not_to_ever_be_a_real_source_name", @@ -355,7 +348,7 @@ def test_search_with_partially_invalid_sources_produces_warning_but_still_succee search = api_client.get( f"/v1/{media_type.path}/", - {"source": ",".join([valid_source] + invalid_sources)}, + {"source": ",".join([media_type.providers[0]] + invalid_sources)}, ) assert search.status_code == 200 result = search.json() @@ -365,7 +358,7 @@ def test_search_with_partially_invalid_sources_produces_warning_but_still_succee } warning = result["warnings"][0] assert set(warning["invalid_sources"]) == set(invalid_sources) - assert warning["referenced_sources"] == [valid_source] + assert warning["referenced_sources"] == [media_type.providers[0]] assert f"v1/{media_type.path}/stats/" in warning["available_sources"] From 4eea42f280fb2ddc32fc71a3fae91da5828aaf79 Mon Sep 17 00:00:00 2001 From: sarayourfriend Date: Mon, 8 Apr 2024 11:07:53 +1000 Subject: [PATCH 3/3] Move warnings to top and use clearer message --- api/api/examples/audio_responses.py | 1 - api/api/examples/image_responses.py | 1 - api/api/serializers/media_serializers.py | 13 +++++---- api/api/utils/pagination.py | 27 ++++++++++++------- .../integration/test_media_integration.py | 6 ++--- 5 files changed, 29 insertions(+), 19 deletions(-) diff --git a/api/api/examples/audio_responses.py b/api/api/examples/audio_responses.py index c434b863974..4b53e9f62f1 100644 --- a/api/api/examples/audio_responses.py +++ b/api/api/examples/audio_responses.py @@ -64,7 +64,6 @@ "results": [ base_audio | {"fields_matched": ["title"]}, ], - "warnings": [], }, } diff --git a/api/api/examples/image_responses.py b/api/api/examples/image_responses.py index ec387c8a237..7db0ff3a18c 100644 --- a/api/api/examples/image_responses.py +++ b/api/api/examples/image_responses.py @@ -72,7 +72,6 @@ "page_size": 20, "page": 1, "results": [base_image | {"fields_matched": ["title"]}], - "warnings": [], }, } diff --git a/api/api/serializers/media_serializers.py b/api/api/serializers/media_serializers.py index 1d4ff103c58..9bc3988cf0b 100644 --- a/api/api/serializers/media_serializers.py +++ b/api/api/serializers/media_serializers.py @@ -423,15 +423,18 @@ def validate_source(self, value): f"Refer to the source list for valid options: {sources_list}." ) elif invalid_sources := (sources - valid_sources): + available_sources_uri = self.context["request"].build_absolute_uri( + reverse(f"{self.media_type}-stats") + ) self.context["warnings"].append( { "code": "partially invalid source parameter", - "message": "The source parameter was partially invalid.", - "invalid_sources": invalid_sources, - "referenced_sources": valid_sources, - "available_sources": self.context["request"].build_absolute_uri( - reverse(f"{self.media_type}-stats") + "message": ( + "The source parameter included non-existent sources. " + f"For a list of available sources, see {available_sources_uri}" ), + "invalid_sources": invalid_sources, + "valid_sources": valid_sources, } ) diff --git a/api/api/utils/pagination.py b/api/api/utils/pagination.py index f2ede4b278a..8c3534504ec 100644 --- a/api/api/utils/pagination.py +++ b/api/api/utils/pagination.py @@ -20,15 +20,23 @@ def __init__(self, *args, **kwargs): self.warnings = [] # populated later as needed def get_paginated_response(self, data): + response = { + "result_count": self.result_count, + "page_count": min(settings.MAX_PAGINATION_DEPTH, self.page_count), + "page_size": self.page_size, + "page": self.page, + "results": data, + } return Response( - { - "result_count": self.result_count, - "page_count": min(settings.MAX_PAGINATION_DEPTH, self.page_count), - "page_size": self.page_size, - "page": self.page, - "results": data, - "warnings": list(self.warnings), - } + ( + { + # Put ``warnings`` first so it is as visible as possible. + "warnings": list(self.warnings), + } + if self.warnings + else {} + ) + | response ) def get_paginated_response_schema(self, schema): @@ -63,7 +71,7 @@ def get_paginated_response_schema(self, schema): }, "description": ( "Warnings pertinent to the request. " - "If there are no warnings, this list will be empty. " + "If there are no warnings, this property will not be present on the response. " "Warnings are non-critical problems with the request. " "Responses with warnings should be treated as unstable. " "Warning descriptions must not be treated as machine readable " @@ -86,4 +94,5 @@ def get_paginated_response_schema(self, schema): return { "type": "object", "properties": properties, + "required": list(set(properties.keys()) - {"warnings"}), } diff --git a/api/test/integration/test_media_integration.py b/api/test/integration/test_media_integration.py index 4e999e72e47..68bb1e8d797 100644 --- a/api/test/integration/test_media_integration.py +++ b/api/test/integration/test_media_integration.py @@ -335,7 +335,7 @@ def test_search_with_only_valid_sources_produces_no_warning(media_type, api_clie {"source": ",".join(media_type.providers)}, ) assert search.status_code == 200 - assert search.json()["warnings"] == [] + assert "warnings" not in search.json() def test_search_with_partially_invalid_sources_produces_warning_but_still_succeeds( @@ -358,8 +358,8 @@ def test_search_with_partially_invalid_sources_produces_warning_but_still_succee } warning = result["warnings"][0] assert set(warning["invalid_sources"]) == set(invalid_sources) - assert warning["referenced_sources"] == [media_type.providers[0]] - assert f"v1/{media_type.path}/stats/" in warning["available_sources"] + assert warning["valid_sources"] == [media_type.providers[0]] + assert f"v1/{media_type.path}/stats/" in warning["message"] def test_search_with_all_invalid_sources_fails(media_type, api_client):