Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add warning to search response when source parameter has mixed validity #4031

Merged
merged 3 commits into from
Apr 8, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions api/api/examples/audio_responses.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@
"results": [
base_audio | {"fields_matched": ["title"]},
],
"warnings": [],
},
}

Expand Down
1 change: 1 addition & 0 deletions api/api/examples/image_responses.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@
"page_size": 20,
"page": 1,
"results": [base_image | {"fields_matched": ["title"]}],
"warnings": [],
},
}

Expand Down
44 changes: 36 additions & 8 deletions api/api/serializers/media_serializers.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,21 @@
import logging
from collections import namedtuple
from typing import TypedDict

from django.conf import settings
from django.core.exceptions import ValidationError as DjangoValidationError
from django.core.validators import MaxValueValidator
from django.urls import reverse
from rest_framework import serializers
from rest_framework.exceptions import NotAuthenticated, ValidationError
from rest_framework.request import Request

from drf_spectacular.utils import extend_schema_serializer
from elasticsearch_dsl.response import Hit

from api.constants import sensitivity
from api.constants.licenses import LICENSE_GROUPS
from api.constants.media_types import MediaType
from api.constants.parameters import COLLECTION, TAG
from api.constants.sorting import DESCENDING, RELEVANCE, SORT_DIRECTIONS, SORT_FIELDS
from api.controllers import search_controller
Expand Down Expand Up @@ -294,8 +298,16 @@ class MediaSearchRequestSerializer(PaginatedRequestSerializer):
required=False,
)

class Context(TypedDict):
    """Keys this serializer expects to find in its ``context`` mapping."""

    # Non-fatal validation problems, one dict per warning; the constructor
    # resets this to an empty list and validators append to it.
    warnings: list[dict]
    # Media type being searched; the constructor reads it from the context.
    media_type: MediaType
    # Incoming request, used to build absolute URIs included in warnings.
    request: Request

context: Context

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.context["warnings"] = []
self.media_type = self.context.get("media_type")
if not self.media_type:
raise ValueError(
Expand Down Expand Up @@ -398,15 +410,31 @@ def validate_source(self, value):
)
return value
else:
sources = value.lower().split(",")
valid_sources = set(
[source for source in sources if source in allowed_sources]
)
if len(sources) > len(valid_sources):
invalid_sources = set(sources).difference(valid_sources)
logger.warning(
f"Invalid sources in search query: {invalid_sources}; sources query: '{value}'"
sources = set(value.lower().split(","))
valid_sources = {source for source in sources if source in allowed_sources}
if not valid_sources:
# Raise only if there are _no_ valid sources selected
# If the requester passed only `mispelled_museum_name1,mispelled_musesum_name2`
# the request cannot move forward, as all the top responses will likely be from Flickr
# which provides radically different responses than most other providers.
# If even one source is valid, it won't be a problem, in which case we'll issue a warning
raise serializers.ValidationError(
f"Invalid source parameter '{value}'. No valid sources selected. "
f"Refer to the source list for valid options: {sources_list}."
)
elif invalid_sources := (sources - valid_sources):
self.context["warnings"].append(
{
"code": "partially invalid source parameter",
"message": "The source parameter was partially invalid.",
"invalid_sources": invalid_sources,
"referenced_sources": valid_sources,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The name "valid" feels clearer to me than "referenced".

Suggested change
"referenced_sources": valid_sources,
"valid_sources": valid_sources,

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I found juggling the list of valid and available sources confusing. Usually when I see an error or warning about invalid values, the valid values listed are all the possible valid values, if that makes sense? I'm torn, so I'll wait and see what the other reviewer says, and change it if they want it changed as well, if that's okay.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"valid", "invalid" and "available"/"all" potentially. But yeah, let's allow one more review to see what they think.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

referenced did not feel clear to me either, but I see what Sara's saying as well.

Catching up on the linked discussions, Dhruv mentioned it seems like that is the most common way of handling cases where the input is partially acceptable and can generate a valid response. Are you referencing something in particular you can link to? I'm not familiar with this type of response.

Along those lines, is the shape of this response following some established convention, or could it be changed? For example, do we need to explicitly list which of the provided sources were valid at all, or can we just have invalid_sources and available_sources? Or could this information all be spelled out in the "message" instead of in separate named fields?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Or could this information all be spelled out in the "message" instead of in separate named fields?

I actually originally had the warnings just be a list of strings, but I found it hard to do a meaningful test without just duplicating the string almost word-for-word in the test case 😰 On top of that, because of using sets instead of lists, the order of sources in the strings was non-deterministic, making it even harder to test against a simple string.

We should use whatever format here we want. I've included in the documentation for the response field that it is meant to be human readable rather than read by a machine, and that the contents of each dict are not stable.

Maybe discarded_sources, kept_sources, and available_sources? 🤷 whatever folks want here, happy to change it, I am not attached to any specific language, even if I found something or other personally confusing. I think it will get the idea across that something isn't right about the parameter on the request and that the developer needs to take a closer look at it.

Which also makes me wonder whether the warnings should go first in the JSON, rather than at the end? On a page of 20 results, I don't know whether it's easier to miss at the front or end of the document.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are you referencing something in particular you can link to?

I didn't keep a record of my search when looking for a good pattern but I went through my browser history and found these references.

To be clear, this is not an established convention. It's the simplest, backwards compatible way I could think of to stick with a 200 OK status code but also convey problems in their input to the user.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Gotcha! And thanks for the links -- I wanted to make sure I wasn't suggesting deviating from some widely accepted pattern :)

I actually originally had the warnings just be a list of strings, but I found it hard to do a meaningful test without just duplicating the string almost word-for-word in the test case 😰 On top of that, because of using sets instead of lists, the order of sources in the strings was non-deterministic, making it even harder to test against a simple string.

Dang, that makes sense. One final suggestion -- what if we moved just the link to the available sources into the message? So the warning could be something like:

        {
            "code": "partially invalid source parameter",
            "message": "The source parameter was partially invalid. For a list of available sources, see http://localhost:50280/v1/images/stats",
            "invalid_sources": [
                "foo"
            ],
            "valid_sources": [
                "flickr"
            ]
        }

I think that would fix the problem with testing but make it a little clearer.

Which also makes me wonder whether the warnings should go first in the JSON, rather than at the end? On a page of 20 results, I don't know whether it's easier to miss at the front or end of the document.

+1 for putting it first in the JSON, now you mention it.

"available_sources": self.context["request"].build_absolute_uri(
reverse(f"{self.media_type}-stats")
),
}
)

return ",".join(valid_sources)

def validate_excluded_source(self, input_sources):
Expand Down
54 changes: 45 additions & 9 deletions api/api/utils/pagination.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,17 @@ class StandardPagination(PageNumberPagination):
page_size_query_param = "page_size"
page_query_param = "page"

result_count: int | None
page_count: int | None
page: int
warnings: list[dict]

def __init__(self, *args, **kwargs):
    """Initialise the paginator with placeholder state.

    All positional and keyword arguments are forwarded unchanged to the
    parent paginator.
    """
    super().__init__(*args, **kwargs)
    # Starts empty; populated later as needed.
    self.warnings = []
    # Defaults to the first page and gets updated when necessary.
    self.page = 1
    # Counts are unknown at construction time and are populated later.
    self.page_count = None
    self.result_count = None

def get_paginated_response(self, data):
return Response(
Expand All @@ -21,6 +27,7 @@ def get_paginated_response(self, data):
"page_size": self.page_size,
"page": self.page,
"results": data,
"warnings": list(self.warnings),
}
)

Expand All @@ -39,15 +46,44 @@ def get_paginated_response_schema(self, schema):
"page_size": ("The number of items per page.", 20),
"page": ("The current page number returned in the response.", 1),
}

properties = {
field: {
"type": "integer",
"description": description,
"example": example,
}
for field, (description, example) in field_descriptions.items()
} | {
"results": schema,
"warnings": {
"type": "array",
"items": {
"type": "object",
},
"description": (
"Warnings pertinent to the request. "
"If there are no warnings, this list will be empty. "
"Warnings are non-critical problems with the request. "
"Responses with warnings should be treated as unstable. "
"Warning descriptions must not be treated as machine readable "
"and their schema can change at any time."
),
"example": [
{
"code": "partially invalid request parameter",
"message": (
"Some of the request parameters were bad, "
"but we processed the request anyway. "
"Here's some information that might help you "
"fix the problem for future requests."
),
}
],
},
}

return {
"type": "object",
"properties": {
field: {
"type": "integer",
"description": description,
"example": example,
}
for field, (description, example) in field_descriptions.items()
}
| {"results": schema},
"properties": properties,
}
1 change: 1 addition & 0 deletions api/api/views/media_views.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,7 @@ def get_media_results(
):
page_size = self.paginator.page_size = params.data["page_size"]
page = self.paginator.page = params.data["page"]
self.paginator.warnings = params.context["warnings"]

hashed_ip = hash(self._get_user_ip(request))
filter_dead = params.validated_data.get("filter_dead", True)
Expand Down
2 changes: 1 addition & 1 deletion api/test/fixtures/rest_framework.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@


@pytest.fixture
def api_client():
def api_client() -> APIClient:
return APIClient()


Expand Down
44 changes: 44 additions & 0 deletions api/test/integration/test_media_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -329,6 +329,50 @@ def test_detail_view_for_invalid_uuids_returns_not_found(
assert res.status_code == 404


def test_search_with_only_valid_sources_produces_no_warning(media_type, api_client):
sarayourfriend marked this conversation as resolved.
Show resolved Hide resolved
search = api_client.get(
f"/v1/{media_type.path}/",
{"source": ",".join(media_type.providers)},
)
assert search.status_code == 200
assert search.json()["warnings"] == []


def test_search_with_partially_invalid_sources_produces_warning_but_still_succeeds(
    media_type: MediaType, api_client
):
    """Mixing one real source with unknown ones yields a 200 plus a warning."""
    known_source = media_type.providers[0]
    bogus_sources = [
        "surely_neither_this_one",
        "this_is_sure_not_to_ever_be_a_real_source_name",
    ]
    source_param = ",".join([known_source, *bogus_sources])

    response = api_client.get(f"/v1/{media_type.path}/", {"source": source_param})
    assert response.status_code == 200
    payload = response.json()

    reported_codes = {entry["code"] for entry in payload["warnings"]}
    assert reported_codes == {"partially invalid source parameter"}

    first_warning = payload["warnings"][0]
    # Invalid sources are compared as sets, so their order is not checked.
    assert set(first_warning["invalid_sources"]) == set(bogus_sources)
    assert first_warning["referenced_sources"] == [known_source]
    assert f"v1/{media_type.path}/stats/" in first_warning["available_sources"]


def test_search_with_all_invalid_sources_fails(media_type, api_client):
    """A search whose ``source`` values are all unknown is rejected with a 400."""
    bogus_sources = (
        "this_is_sure_not_to_ever_be_a_real_source_name",
        "surely_neither_this_one",
    )
    source_param = ",".join(bogus_sources)
    response = api_client.get(f"/v1/{media_type.path}/", {"source": source_param})
    assert response.status_code == 400


def test_detail_view_returns_ok(single_result, api_client):
media_type, item = single_result
res = api_client.get(f"/v1/{media_type.path}/{item['id']}/")
Expand Down
28 changes: 13 additions & 15 deletions api/test/unit/serializers/test_media_serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,25 +99,23 @@ def test_media_serializer_adds_license_url_if_missing(
assert repr["license_url"] == "https://creativecommons.org/publicdomain/zero/1.0/"


def test_media_serializer_logs_when_invalid_or_duplicate_source(media_type_config):
def test_media_serializer_recovers_invalid_or_duplicate_source(
media_type_config, request_factory
):
sources = {
"image": ("flickr,flickr,invalid", "flickr"),
"audio": ("freesound,freesound,invalid", "freesound"),
}
with patch("api.serializers.media_serializers.logger.warning") as mock_logger:
serializer_class = media_type_config.search_request_serializer(
context={"media_type": media_type_config.media_type},
data={"source": sources[media_type_config.media_type][0]},
)
assert serializer_class.is_valid()
assert (
serializer_class.validated_data["source"]
== sources[media_type_config.media_type][1]
)
mock_logger.assert_called_with(
f"Invalid sources in search query: {{'invalid'}}; "
f"sources query: '{sources[media_type_config.media_type][0]}'"
)
request = request_factory.get("/v1/images/")
serializer_class = media_type_config.search_request_serializer(
context={"media_type": media_type_config.media_type, "request": request},
data={"source": sources[media_type_config.media_type][0]},
)
assert serializer_class.is_valid()
assert (
serializer_class.validated_data["source"]
== sources[media_type_config.media_type][1]
)


@pytest.mark.parametrize(
Expand Down