This repository has been archived by the owner on Feb 22, 2023. It is now read-only.

Limit page depth of searches #859

Merged: 3 commits, Aug 8, 2022
1 change: 1 addition & 0 deletions api/Pipfile
@@ -7,6 +7,7 @@ verify_ssl = true
ipython = "*"
pycodestyle = "*"
pytest-django = ">=3.5"
pytest-raises = "*"
sarayourfriend marked this conversation as resolved.
remote-pdb = "*"
pre-commit = "*"
locust = "*"
110 changes: 67 additions & 43 deletions api/Pipfile.lock

Some generated files are not rendered by default.

4 changes: 2 additions & 2 deletions api/catalog/api/controllers/search_controller.py
@@ -18,6 +18,7 @@

import catalog.api.models as models
from catalog.api.utils.dead_link_mask import get_query_hash, get_query_mask
from catalog.api.utils.pagination import MAX_TOTAL_PAGE_COUNT
from catalog.api.utils.validate_images import validate_images


@@ -427,8 +428,7 @@ def _get_result_and_page_count(
     natural_page_count = int(result_count / page_size)
     if natural_page_count % page_size != 0:
         natural_page_count += 1
-    last_allowed_page = int((5000 + page_size / 2) / page_size)
-    page_count = min(natural_page_count, last_allowed_page)
+    page_count = min(natural_page_count, MAX_TOTAL_PAGE_COUNT)
     if len(results) < page_size and page_count == 0:
         result_count = len(results)
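In effect, the change swaps a cap derived from a 5,000-result depth limit for a flat page-count constant. A minimal sketch of the resulting behaviour (`page_count_for` is an illustrative stand-in, not the controller's actual function):

```python
MAX_TOTAL_PAGE_COUNT = 20


def page_count_for(result_count: int, page_size: int) -> int:
    natural_page_count = int(result_count / page_size)
    if natural_page_count % page_size != 0:
        natural_page_count += 1
    # Before this PR, the cap was derived from a 5000-result depth limit:
    #     last_allowed_page = int((5000 + page_size / 2) / page_size)
    return min(natural_page_count, MAX_TOTAL_PAGE_COUNT)


# 5000 hits at 10 per page would naturally be 500 pages; now capped at 20,
# matching the (5000, 10, 10, (5000, MAX_TOTAL_PAGE_COUNT)) test case below.
assert page_count_for(5000, 10) == MAX_TOTAL_PAGE_COUNT
```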

5 changes: 5 additions & 0 deletions api/catalog/api/utils/pagination.py
@@ -4,6 +4,9 @@
from catalog.api.utils.exceptions import get_api_exception


MAX_TOTAL_PAGE_COUNT = 20


class StandardPagination(PageNumberPagination):
    page_size_query_param = "page_size"
    page_query_param = "page"
@@ -42,6 +45,8 @@ def page(self, value):
        value = int(value)  # convert str params to int
        if value <= 0:
            raise get_api_exception("Page must be greater than 0.", 400)
        elif value > 20:
            raise get_api_exception("Searches are limited to 20 pages.", 400)
Comment on lines 46 to +49
Contributor:
This validation should be removed from here and the new validation should be added to the serializer:

def validate_page_size(self, value):
    request = self.context.get("request")
    is_anonymous = bool(request and request.user and request.user.is_anonymous)
    if is_anonymous and value > 20:
        raise get_api_exception(
            "Page size must be between 1 & 20 for unauthenticated requests.", 401
        )
    return value

Suggested change:

-        if value <= 0:
-            raise get_api_exception("Page must be greater than 0.", 400)
-        elif value > 20:
-            raise get_api_exception("Searches are limited to 20 pages.", 400)

It was erroneously added back to this area during the revert.
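For context, DRF serializers pick up field-level validators by naming convention (`validate_<field_name>`). A minimal sketch of where the suggested validator could live — the serializer class and field declaration are assumptions for illustration, not part of this PR, and `serializers.ValidationError` stands in for the project's `get_api_exception` helper:

```python
from rest_framework import serializers


class SearchRequestSerializer(serializers.Serializer):
    # Hypothetical serializer and field, assumed for illustration only.
    page_size = serializers.IntegerField(default=20)

    def validate_page_size(self, value):
        # DRF invokes validate_<field_name> hooks automatically during is_valid().
        request = self.context.get("request")
        is_anonymous = bool(request and request.user and request.user.is_anonymous)
        if is_anonymous and value > 20:
            # ValidationError stands in for the project's get_api_exception.
            raise serializers.ValidationError(
                "Page size must be between 1 & 20 for unauthenticated requests."
            )
        return value
```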

Contributor:
Actually, this is inaccurate. The validation does need to be moved to the serializer, but I think it should be done in a different PR dedicated to a small refactor of the pagination parameter validation. We're under-utilising serializer validation at the moment and we can delete a significant amount of code.

zackkrida marked this conversation as resolved.
        self._page = value

    def get_paginated_response(self, data):
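Pieced together, the new check lives in a property setter on StandardPagination. A minimal reconstruction from the diff fragments above — the real class subclasses PageNumberPagination and carries more fields and logic than shown, and the backing-field default here is assumed:

```python
from catalog.api.utils.exceptions import get_api_exception


class StandardPaginationSketch:
    page_size_query_param = "page_size"
    page_query_param = "page"
    _page = None  # backing field; the default is illustrative

    @property
    def page(self):
        return self._page

    @page.setter
    def page(self, value):
        value = int(value)  # convert str params to int
        if value <= 0:
            raise get_api_exception("Page must be greater than 0.", 400)
        elif value > 20:
            raise get_api_exception("Searches are limited to 20 pages.", 400)
        self._page = value
```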
12 changes: 11 additions & 1 deletion api/test/dead_link_filter_test.py
@@ -4,6 +4,8 @@
import pytest
import requests

from catalog.api.utils.pagination import MAX_TOTAL_PAGE_COUNT


def _patch_redis():
    def redis_mget(keys, *_, **__):
@@ -115,7 +117,7 @@ def test_page_consistency_removing_dead_links(search_without_dead_links):
     Test the results returned in consecutive pages are never repeated when
     filtering out dead links.
     """
-    total_pages = 30
+    total_pages = MAX_TOTAL_PAGE_COUNT
     page_size = 5

     page_results = []
@@ -134,3 +136,11 @@ def no_duplicates(xs):
    ids = list(map(lambda x: x["id"], page_results))
    # No results should be repeated so we should have no duplicate ids
    assert no_duplicates(ids)
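The body of the `no_duplicates` helper is collapsed by the diff; purely for readability, one plausible implementation (an assumption, not the project's actual code):

```python
def no_duplicates(xs):
    # Assumed implementation; the actual helper body is not shown in this diff.
    return len(xs) == len(set(xs))
```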


@pytest.mark.django_db
def test_max_page_count():
    response = requests.get(
        f"{API_URL}/v1/images", params={"page": MAX_TOTAL_PAGE_COUNT + 1}, verify=False
    )
    assert response.status_code == 400
51 changes: 51 additions & 0 deletions api/test/unit/controllers/test_search_controller.py
@@ -0,0 +1,51 @@
from unittest import mock

import pytest

from catalog.api.controllers import search_controller
from catalog.api.utils.pagination import MAX_TOTAL_PAGE_COUNT


@pytest.mark.parametrize(
"total_hits, real_result_count, page_size, expected",
[
# No results
(0, 0, 10, (0, 0)),
# Setting page size to 0 raises an exception
pytest.param(
Contributor:

Does it make sense to add a negative page size, too?

Contributor (@sarayourfriend, Aug 8, 2022):

Upstream validation catches negative values here and just uses the default of 1 instead.

            0, 0, 0, (0, 0), marks=pytest.mark.raises(exception=ZeroDivisionError)
        ),
        # Fewer results than page size caps the count at the result total
        (5, 5, 10, (5, 0)),
        # Even if no real results exist, total result count and page count are returned
        # (seems like an impossible case IRL)
        (100, 0, 10, (100, 10)),
        # If there are real results and ES reports no hits, nothing is expected
        # (seems like an impossible case IRL)
        (0, 100, 10, (0, 0)),
        # Evenly divisible number of pages
        (25, 5, 5, (25, 5)),
        # Unevenly divisible number of pages
        (21, 5, 5, (21, 5)),
        # My assumption would be that this yields (20, 4), but when the "natural"
        # page count isn't evenly divisible by the page size, the code increments
        # it by one: the check is `natural_page_count % page_size`, and
        # int(20 / 5) == 4 with 4 % 5 != 0, so the count is bumped to 5.
        # 20 results at 5 per page would seem to be 4 pages, not 5 🤷‍♀️
        (20, 5, 5, (20, 5)),
        # Fewer hits than page size, but result list somehow differs, use that for count
        (48, 20, 50, (20, 0)),
        # Page count gets truncated always
        (5000, 10, 10, (5000, MAX_TOTAL_PAGE_COUNT)),
    ],
)
def test_get_result_and_page_count(total_hits, real_result_count, page_size, expected):
    response_obj = mock.MagicMock()
    response_obj.hits.total.value = total_hits
    results = [mock.MagicMock() for _ in range(real_result_count)]

    actual = search_controller._get_result_and_page_count(
        response_obj,
        results,
        page_size,
    )
    assert actual == expected
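The `marks=pytest.mark.raises(...)` usage above is what the new pytest-raises dev dependency in the Pipfile provides: a marker that asserts a parametrized case raises a given exception. A self-contained toy illustration (`divide` and `test_divide` are examples, not project code):

```python
import pytest


def divide(a, b):
    return a / b


@pytest.mark.parametrize(
    "a, b, expected",
    [
        (10, 2, 5),
        # The marker tells pytest-raises to require this exception from the test.
        pytest.param(
            1, 0, None, marks=pytest.mark.raises(exception=ZeroDivisionError)
        ),
    ],
)
def test_divide(a, b, expected):
    assert divide(a, b) == expected
```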