diff --git a/api/catalog/api/controllers/search_controller.py b/api/catalog/api/controllers/search_controller.py
index 9e34f6ae4..da7981431 100644
--- a/api/catalog/api/controllers/search_controller.py
+++ b/api/catalog/api/controllers/search_controller.py
@@ -44,6 +44,9 @@ def _paginate_with_dead_link_mask(
     Given a query, a page and page_size, return the start and end
     of the slice of results.
 
+    In almost all cases the ``DEAD_LINK_RATIO`` will effectively double
+    the page size (given the current configuration of 0.5).
+
     :param s: The elasticsearch Search object
     :param page_size: How big the page should be.
     :param page: The page number.
@@ -103,7 +106,7 @@ def _quote_escape(query_string):
 
 def _post_process_results(
     s, start, end, page_size, search_results, request, filter_dead
-) -> List[Hit]:
+) -> Optional[List[Hit]]:
     """
     After fetching the search results from the back end, iterate through the
     results, perform image validation, and route certain thumbnails through our
@@ -131,6 +134,10 @@ def _post_process_results(
         query_hash = get_query_hash(s)
         validate_images(query_hash, start, results, to_validate)
 
+        if len(results) == 0:
+            # first page is all dead links
+            return None
+
         if len(results) < page_size:
             end += int(end / 2)
             if start + end > ELASTICSEARCH_MAX_RESULT_WINDOW:
@@ -331,7 +338,7 @@ def search(
     result_count, page_count = _get_result_and_page_count(
         search_response, results, page_size
     )
-    return results, page_count, result_count
+    return results or [], page_count, result_count
 
 
 def related_media(uuid, index, request, filter_dead):
@@ -367,7 +374,7 @@ def related_media(uuid, index, request, filter_dead):
 
     result_count, _ = _get_result_and_page_count(response, results, page_size)
 
-    return results, result_count
+    return results or [], result_count
 
 
 def get_sources(index):
@@ -414,7 +421,7 @@ def get_sources(index):
 
 
 def _get_result_and_page_count(
-    response_obj: Response, results: List[Hit], page_size: int
+    response_obj: Response, results: Optional[List[Hit]], page_size: int
 ) -> Tuple[int, int]:
     """
     Elasticsearch does not allow deep pagination of ranked queries.
@@ -424,6 +431,9 @@ def _get_result_and_page_count(
     :param results: The list of filtered result Hits.
     :return: Result and page count.
     """
+    if results is None:
+        return 0, 1
+
     result_count = response_obj.hits.total.value
     natural_page_count = int(result_count / page_size)
     if natural_page_count % page_size != 0:
diff --git a/api/catalog/api/utils/validate_images.py b/api/catalog/api/utils/validate_images.py
index 9a94e682c..6ad23f6f7 100644
--- a/api/catalog/api/utils/validate_images.py
+++ b/api/catalog/api/utils/validate_images.py
@@ -10,6 +10,14 @@
 parent_logger = logging.getLogger(__name__)
 
 
+CACHE_PREFIX = "valid:"
+
+
+def _get_cached_statuses(redis, image_urls):
+    cached_statuses = redis.mget([CACHE_PREFIX + url for url in image_urls])
+    return [int(b.decode("utf-8")) if b is not None else None for b in cached_statuses]
+
+
 def validate_images(query_hash, start_slice, results, image_urls):
     """
     Make sure images exist before we display them. Treat redirects as broken
@@ -28,11 +36,7 @@ def validate_images(query_hash, start_slice, results, image_urls):
     start_time = time.time()
     # Pull matching images from the cache.
     redis = django_redis.get_redis_connection("default")
-    cache_prefix = "valid:"
-    cached_statuses = redis.mget([cache_prefix + url for url in image_urls])
-    cached_statuses = [
-        int(b.decode("utf-8")) if b is not None else None for b in cached_statuses
-    ]
+    cached_statuses = _get_cached_statuses(redis, image_urls)
     logger.debug(f"len(cached_statuses)={len(cached_statuses)}")
     # Anything that isn't in the cache needs to be validated via HEAD request.
     to_verify = {}
@@ -48,7 +52,7 @@ def validate_images(query_hash, start_slice, results, image_urls):
     # Cache newly verified image statuses.
     to_cache = {}
     for idx, url in enumerate(to_verify.keys()):
-        cache_key = cache_prefix + url
+        cache_key = CACHE_PREFIX + url
         if verified[idx]:
             status = verified[idx].status_code
         # Response didn't arrive in time. Try again later.
diff --git a/api/test/dead_link_filter_test.py b/api/test/dead_link_filter_test.py
index 89fc87d31..0d295de2b 100644
--- a/api/test/dead_link_filter_test.py
+++ b/api/test/dead_link_filter_test.py
@@ -1,23 +1,50 @@
 from test.constants import API_URL
 from unittest.mock import MagicMock, patch
+from uuid import uuid4
 
 import pytest
 import requests
+from fakeredis import FakeRedis
 
+from catalog.api.controllers.search_controller import DEAD_LINK_RATIO
 from catalog.api.utils.pagination import MAX_TOTAL_PAGE_COUNT
 
 
-def _patch_redis():
-    def redis_mget(keys, *_, **__):
-        """
-        Patch for ``redis.mget`` used by ``validate_images`` to use validity
-        information from the cache
-        """
-        return [None] * len(keys)
+@pytest.fixture(autouse=True)
+def redis(monkeypatch) -> FakeRedis:
+    fake_redis = FakeRedis()
+
+    def get_redis_connection(*args, **kwargs):
+        return fake_redis
+
+    monkeypatch.setattr(
+        "catalog.api.utils.dead_link_mask.get_redis_connection", get_redis_connection
+    )
+    monkeypatch.setattr("django_redis.get_redis_connection", get_redis_connection)
 
-    mock_conn = MagicMock()
-    mock_conn.mget = MagicMock(side_effect=redis_mget)
-    return patch("django_redis.get_redis_connection", return_value=mock_conn)
+    yield fake_redis
+    fake_redis.client().close()
+
+
+@pytest.fixture
+def unique_query_hash(redis, monkeypatch):
+    def get_unique_hash(*args, **kwargs):
+        return str(uuid4())
+
+    monkeypatch.setattr(
+        "catalog.api.controllers.search_controller.get_query_hash", get_unique_hash
+    )
+
+
+@pytest.fixture
+def empty_validation_cache(monkeypatch):
+    def get_empty_cached_statuses(_, image_urls):
+        return [None] * len(image_urls)
+
+    monkeypatch.setattr(
+        "catalog.api.utils.validate_images._get_cached_statuses",
+        get_empty_cached_statuses,
+    )
 
 
 def _patch_grequests():
@@ -36,10 +63,29 @@ def grequests_map(reqs, *_, **__):
     return patch("grequests.map", side_effect=grequests_map)
 
 
+def patch_link_validation_dead_for_count(count):
+    total_res_count = 0
+
+    def grequests_map(reqs, *_, **__):
+        """
+        Patch for ``grequests.map`` used by ``validate_images`` to mark the
+        first ``count`` responses as dead links
+        """
+        nonlocal total_res_count
+        responses = []
+        for idx in range(len(list(reqs))):
+            total_res_count += 1
+            mocked_res = MagicMock()
+            mocked_res.status_code = 404 if total_res_count <= count else 200
+            responses.append(mocked_res)
+        return responses
+
+    return patch("grequests.map", side_effect=grequests_map)
+
+
 @pytest.mark.django_db
-@_patch_redis()
 @_patch_grequests()
-def test_dead_link_filtering(mocked_map, _, client):
+def test_dead_link_filtering(mocked_map, client):
     path = "/v1/images/"
     query_params = {"q": "*", "page_size": 20}
 
@@ -67,9 +113,48 @@ def test_dead_link_filtering(mocked_map, _, client):
     res_1_ids = set(result["id"] for result in data_with_dead_links["results"])
     res_2_ids = set(result["id"] for result in data_without_dead_links["results"])
 
+    # In this case, both have 20 results, as the dead link filter has "back-filled"
+    # the pages with live results. See the subsequent test for the case where this
+    # does not occur (i.e., when the entire first page of links is dead).
+    assert len(res_1_ids) == 20
+    assert len(res_2_ids) == 20
     assert bool(res_1_ids - res_2_ids)
 
 
+@pytest.mark.django_db
+@pytest.mark.parametrize(
+    ("filter_dead", "page_size", "expected_result_count"),
+    (
+        (True, 20, 0),
+        (False, 20, 20),
+    ),
+)
+def test_dead_link_filtering_all_dead_links(
+    client,
+    filter_dead,
+    page_size,
+    expected_result_count,
+    unique_query_hash,
+    empty_validation_cache,
+):
+    path = "/v1/images/"
+    query_params = {"q": "*", "page_size": page_size}
+
+    with patch_link_validation_dead_for_count(page_size / DEAD_LINK_RATIO):
+        response = client.get(
+            path,
+            query_params | {"filter_dead": filter_dead},
+        )
+
+    assert response.status_code == 200
+
+    res_json = response.json()
+
+    assert len(res_json["results"]) == expected_result_count
+    if expected_result_count == 0:
+        assert res_json["result_count"] == 0
+
+
 @pytest.fixture
 def search_factory(client):
     """
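
A minimal sketch of the back-fill arithmetic described in the docstring added to
``_paginate_with_dead_link_mask``: with ``DEAD_LINK_RATIO`` at 0.5, the slice
requested from Elasticsearch is twice the requested page size, so a page can
survive up to half of its links being dead. This is not the controller's actual
implementation (which also consults the per-query dead link mask stored in
Redis); the helper name and the simplified arithmetic below are assumptions for
illustration only.

    # Hypothetical, simplified stand-in for _paginate_with_dead_link_mask;
    # it ignores the Redis-backed dead link mask entirely.
    DEAD_LINK_RATIO = 0.5

    def paginate_with_dead_link_ratio(page: int, page_size: int) -> tuple[int, int]:
        # Over-fetch so that, after dead links are filtered out, roughly
        # page_size live results remain: page_size / 0.5 == 2 * page_size.
        window = int(page_size / DEAD_LINK_RATIO)
        start = (page - 1) * window
        return start, start + window

    # A 20-item page fetches a 40-item window: the "effectively double the
    # page size" behaviour the new docstring describes.
    assert paginate_with_dead_link_ratio(1, 20) == (0, 40)
    assert paginate_with_dead_link_ratio(2, 20) == (40, 80)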