This repository has been archived by the owner on Feb 22, 2023. It is now read-only.

Only backfill dead links if at least one on first page was not dead #865

Merged: 3 commits, merged on Aug 10, 2022
api/catalog/api/controllers/search_controller.py (18 changes: 14 additions & 4 deletions)
@@ -44,6 +44,9 @@ def _paginate_with_dead_link_mask(
Given a query, a page and page_size, return the start and end
of the slice of results.

In almost all cases the ``DEAD_LINK_RATIO`` will effectively double
the page size (given the current configuration of 0.5).

:param s: The elasticsearch Search object
:param page_size: How big the page should be.
:param page: The page number.
@@ -103,7 +106,7 @@ def _quote_escape(query_string):

def _post_process_results(
s, start, end, page_size, search_results, request, filter_dead
) -> List[Hit]:
) -> Optional[List[Hit]]:
"""
After fetching the search results from the back end, iterate through the
results, perform image validation, and route certain thumbnails through our
@@ -131,6 +134,10 @@ def _post_process_results(
query_hash = get_query_hash(s)
validate_images(query_hash, start, results, to_validate)

if len(results) == 0:
# first page is all dead links
return None

if len(results) < page_size:
end += int(end / 2)
if start + end > ELASTICSEARCH_MAX_RESULT_WINDOW:
@@ -331,7 +338,7 @@ def search(
result_count, page_count = _get_result_and_page_count(
search_response, results, page_size
)
return results, page_count, result_count
return results or [], page_count, result_count


def related_media(uuid, index, request, filter_dead):
@@ -367,7 +374,7 @@ def related_media(uuid, index, request, filter_dead):

result_count, _ = _get_result_and_page_count(response, results, page_size)

return results, result_count
return results or [], result_count


def get_sources(index):
@@ -414,7 +421,7 @@ def get_sources(index):


def _get_result_and_page_count(
response_obj: Response, results: List[Hit], page_size: int
response_obj: Response, results: Optional[List[Hit]], page_size: int
) -> Tuple[int, int]:
"""
Elasticsearch does not allow deep pagination of ranked queries.
@@ -424,6 +431,9 @@
:param results: The list of filtered result Hits.
:return: Result and page count.
"""
if results is None:
return 0, 1

result_count = response_obj.hits.total.value
natural_page_count = int(result_count / page_size)
if natural_page_count % page_size != 0:
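To make the new control flow concrete: with the documented DEAD_LINK_RATIO of 0.5, a request for 20 results fetches a window of 40 hits; if every one of them turns out to be dead, _post_process_results now returns None instead of backfilling, and _get_result_and_page_count reports an empty, single-page result set. A minimal sketch of that flow (simplified stand-ins, not the actual controller code):

```python
from typing import List, Optional

DEAD_LINK_RATIO = 0.5  # current configuration noted in the docstring above


def post_process(hits: List[dict], page_size: int) -> Optional[List[dict]]:
    live = [h for h in hits if not h.get("dead")]
    if not live:
        return None  # entire first page is dead links: skip backfilling
    # (the real code grows the search window and retries while 0 < len(live) < page_size)
    return live[:page_size]


def result_and_page_count(total_hits: int, results: Optional[List[dict]], page_size: int):
    if results is None:
        return 0, 1  # mirror the new early return: no results, a single empty page
    pages = max(1, -(-total_hits // page_size))  # simplified ceiling division
    return total_hits, pages


window = int(20 / DEAD_LINK_RATIO)                     # 40 hits fetched for a 20-item page
results = post_process([{"dead": True}] * window, 20)  # all 40 are dead
count, pages = result_and_page_count(1000, results, 20)
assert (results or []) == [] and count == 0 and pages == 1
```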
api/catalog/api/utils/validate_images.py (16 changes: 10 additions & 6 deletions)
@@ -10,6 +10,14 @@
parent_logger = logging.getLogger(__name__)


CACHE_PREFIX = "valid:"


def _get_cached_statuses(redis, image_urls):
cached_statuses = redis.mget([CACHE_PREFIX + url for url in image_urls])
return [int(b.decode("utf-8")) if b is not None else None for b in cached_statuses]
Comment on lines +13 to +18

Contributor (author): This change is to facilitate mocking the cached statuses in the test. Mostly this is to avoid the status cache being prepopulated by other tests' request/response cycles.

Member: It's also a welcome modularization!



def validate_images(query_hash, start_slice, results, image_urls):
"""
Make sure images exist before we display them. Treat redirects as broken
@@ -28,11 +36,7 @@ def validate_images(query_hash, start_slice, results, image_urls):
start_time = time.time()
# Pull matching images from the cache.
redis = django_redis.get_redis_connection("default")
cache_prefix = "valid:"
cached_statuses = redis.mget([cache_prefix + url for url in image_urls])
cached_statuses = [
int(b.decode("utf-8")) if b is not None else None for b in cached_statuses
]
cached_statuses = _get_cached_statuses(redis, image_urls)
logger.debug(f"len(cached_statuses)={len(cached_statuses)}")
# Anything that isn't in the cache needs to be validated via HEAD request.
to_verify = {}
@@ -48,7 +52,7 @@ def validate_images(query_hash, start_slice, results, image_urls):
# Cache newly verified image statuses.
to_cache = {}
for idx, url in enumerate(to_verify.keys()):
cache_key = cache_prefix + url
cache_key = CACHE_PREFIX + url
if verified[idx]:
status = verified[idx].status_code
# Response didn't arrive in time. Try again later.
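As the review comment above explains, pulling the cache read into _get_cached_statuses makes it easy to substitute in tests (the empty_validation_cache fixture in the test diff below monkeypatches exactly this helper). The decode step itself is simple: Redis returns bytes for cached statuses and None for misses. A small sketch of that behavior, using fakeredis (which the tests already depend on) and a hypothetical URL:

```python
from fakeredis import FakeRedis

CACHE_PREFIX = "valid:"

redis = FakeRedis()
redis.set(CACHE_PREFIX + "https://example.com/a.jpg", 200)  # previously validated URL

urls = ["https://example.com/a.jpg", "https://example.com/b.jpg"]
raw = redis.mget([CACHE_PREFIX + url for url in urls])

# Mirror the helper's decoding: cached bytes become ints, cache misses stay None.
statuses = [int(b.decode("utf-8")) if b is not None else None for b in raw]
assert statuses == [200, None]
```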
api/test/dead_link_filter_test.py (109 changes: 97 additions & 12 deletions)
@@ -1,23 +1,50 @@
from test.constants import API_URL
from unittest.mock import MagicMock, patch
from uuid import uuid4

import pytest
import requests
from fakeredis import FakeRedis

from catalog.api.controllers.search_controller import DEAD_LINK_RATIO
from catalog.api.utils.pagination import MAX_TOTAL_PAGE_COUNT


def _patch_redis():
def redis_mget(keys, *_, **__):
"""
Patch for ``redis.mget`` used by ``validate_images`` to use validity
information from the cache
"""
return [None] * len(keys)
@pytest.fixture(autouse=True)
def redis(monkeypatch) -> FakeRedis:
fake_redis = FakeRedis()

def get_redis_connection(*args, **kwargs):
return fake_redis

monkeypatch.setattr(
"catalog.api.utils.dead_link_mask.get_redis_connection", get_redis_connection
)
monkeypatch.setattr("django_redis.get_redis_connection", get_redis_connection)

mock_conn = MagicMock()
mock_conn.mget = MagicMock(side_effect=redis_mget)
return patch("django_redis.get_redis_connection", return_value=mock_conn)
yield fake_redis
fake_redis.client().close()


@pytest.fixture
def unique_query_hash(redis, monkeypatch):
def get_unique_hash(*args, **kwargs):
return str(uuid4())

monkeypatch.setattr(
"catalog.api.controllers.search_controller.get_query_hash", get_unique_hash
)


@pytest.fixture
def empty_validation_cache(monkeypatch):
def get_empty_cached_statuses(_, image_urls):
return [None] * len(image_urls)

monkeypatch.setattr(
"catalog.api.utils.validate_images._get_cached_statuses",
get_empty_cached_statuses,
)


def _patch_grequests():
@@ -36,10 +63,29 @@ def grequests_map(reqs, *_, **__):
return patch("grequests.map", side_effect=grequests_map)


def patch_link_validation_dead_for_count(count):
total_res_count = 0

def grequests_map(reqs, *_, **__):
nonlocal total_res_count
"""
Patch for ``grequests.map`` used by ``validate_images`` to filter
and remove dead links
"""
responses = []
for idx in range(len(list(reqs))):
total_res_count += 1
mocked_res = MagicMock()
mocked_res.status_code = 404 if total_res_count <= count else 200
responses.append(mocked_res)
return responses

return patch("grequests.map", side_effect=grequests_map)


@pytest.mark.django_db
@_patch_redis()
@_patch_grequests()
def test_dead_link_filtering(mocked_map, _, client):
def test_dead_link_filtering(mocked_map, client):
path = "/v1/images/"
query_params = {"q": "*", "page_size": 20}

@@ -67,9 +113,48 @@ def test_dead_link_filtering(mocked_map, _, client):

res_1_ids = set(result["id"] for result in data_with_dead_links["results"])
res_2_ids = set(result["id"] for result in data_without_dead_links["results"])
# In this case, both have 20 results as the dead link filter has "back filled" the
# pages of dead links. See the subsequent test for the case when this does not
# occur (i.e., when the entire first page of links is dead).
assert len(res_1_ids) == 20
assert len(res_2_ids) == 20
assert bool(res_1_ids - res_2_ids)


@pytest.mark.django_db
@pytest.mark.parametrize(
("filter_dead", "page_size", "expected_result_count"),
(
(True, 20, 0),
(False, 20, 20),
),
)
def test_dead_link_filtering_all_dead_links(
client,
filter_dead,
page_size,
expected_result_count,
unique_query_hash,
empty_validation_cache,
):
path = "/v1/images/"
query_params = {"q": "*", "page_size": page_size}

with patch_link_validation_dead_for_count(page_size / DEAD_LINK_RATIO):
response = client.get(
path,
query_params | {"filter_dead": filter_dead},
)

assert response.status_code == 200

res_json = response.json()

assert len(res_json["results"]) == expected_result_count
if expected_result_count == 0:
assert res_json["result_count"] == 0


@pytest.fixture
def search_factory(client):
"""
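A note on the magic number in the new test above: patch_link_validation_dead_for_count(page_size / DEAD_LINK_RATIO) marks the first 40 HEAD checks as 404 because, with page_size=20 and a ratio of 0.5, the controller fetches a 40-hit window for the first page. Making that whole window dead is what triggers the new empty-result path. The arithmetic, as a quick sketch under those assumed values:

```python
DEAD_LINK_RATIO = 0.5
page_size = 20

fetched_window = int(page_size / DEAD_LINK_RATIO)  # 40 hits pulled for one 20-item page
dead_links = int(page_size / DEAD_LINK_RATIO)      # the test marks the first 40 responses as 404

# Every fetched hit is dead, so filtering leaves nothing to return and
# result_count comes back as 0 (whereas all 20 survive when filter_dead=False).
assert fetched_window - dead_links == 0
```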