Skip to content
This repository has been archived by the owner on Feb 22, 2023. It is now read-only.

Commit

Permalink
Only backfill dead links if at least one on first page was not dead
Browse files Browse the repository at this point in the history
  • Loading branch information
sarayourfriend committed Aug 9, 2022
1 parent 30b77c2 commit 7b09e21
Show file tree
Hide file tree
Showing 3 changed files with 113 additions and 18 deletions.
7 changes: 7 additions & 0 deletions api/catalog/api/controllers/search_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,9 @@ def _paginate_with_dead_link_mask(
Given a query, a page and page_size, return the start and end
of the slice of results.
In almost all cases the ``DEAD_LINK_RATIO`` will effectively double
the page size (given the current configuration of 0.5).
:param s: The elasticsearch Search object
:param page_size: How big the page should be.
:param page: The page number.
Expand Down Expand Up @@ -130,6 +133,10 @@ def _post_process_results(
query_hash = get_query_hash(s)
validate_images(query_hash, start, results, to_validate)

if len(results) == 0:
# first page is all dead links
return []

if len(results) < page_size:
end += int(end / 2)
if start + end > ELASTICSEARCH_MAX_RESULT_WINDOW:
Expand Down
16 changes: 10 additions & 6 deletions api/catalog/api/utils/validate_images.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,14 @@
parent_logger = logging.getLogger(__name__)


CACHE_PREFIX = "valid:"


def _get_cached_statuses(redis, image_urls):
cached_statuses = redis.mget([CACHE_PREFIX + url for url in image_urls])
return [int(b.decode("utf-8")) if b is not None else None for b in cached_statuses]


def validate_images(query_hash, start_slice, results, image_urls):
"""
Make sure images exist before we display them. Treat redirects as broken
Expand All @@ -28,11 +36,7 @@ def validate_images(query_hash, start_slice, results, image_urls):
start_time = time.time()
# Pull matching images from the cache.
redis = django_redis.get_redis_connection("default")
cache_prefix = "valid:"
cached_statuses = redis.mget([cache_prefix + url for url in image_urls])
cached_statuses = [
int(b.decode("utf-8")) if b is not None else None for b in cached_statuses
]
cached_statuses = _get_cached_statuses(redis, image_urls)
logger.debug(f"len(cached_statuses)={len(cached_statuses)}")
# Anything that isn't in the cache needs to be validated via HEAD request.
to_verify = {}
Expand All @@ -48,7 +52,7 @@ def validate_images(query_hash, start_slice, results, image_urls):
# Cache newly verified image statuses.
to_cache = {}
for idx, url in enumerate(to_verify.keys()):
cache_key = cache_prefix + url
cache_key = CACHE_PREFIX + url
if verified[idx]:
status = verified[idx].status_code
# Response didn't arrive in time. Try again later.
Expand Down
108 changes: 96 additions & 12 deletions api/test/dead_link_filter_test.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,49 @@
from test.constants import API_URL
from unittest.mock import MagicMock, patch
from uuid import uuid4

import pytest
import requests
from fakeredis import FakeRedis

from catalog.api.controllers.search_controller import DEAD_LINK_RATIO

def _patch_redis():
def redis_mget(keys, *_, **__):
"""
Patch for ``redis.mget`` used by ``validate_images`` to use validity
information from the cache
"""
return [None] * len(keys)

mock_conn = MagicMock()
mock_conn.mget = MagicMock(side_effect=redis_mget)
return patch("django_redis.get_redis_connection", return_value=mock_conn)
@pytest.fixture(autouse=True)
def redis(monkeypatch) -> FakeRedis:
    """Route every Redis connection the application opens to a single
    in-memory :class:`FakeRedis`, closing it after the test."""
    fake = FakeRedis()

    def _connection(*_args, **_kwargs):
        return fake

    monkeypatch.setattr(
        "catalog.api.utils.dead_link_mask.get_redis_connection", _connection
    )
    monkeypatch.setattr("django_redis.get_redis_connection", _connection)

    yield fake
    fake.client().close()


@pytest.fixture
def unique_query_hash(redis, monkeypatch):
    """Force every search to produce a distinct query hash so no test run
    shares cached state keyed on the hash with another."""

    def _random_hash(*_args, **_kwargs):
        return str(uuid4())

    monkeypatch.setattr(
        "catalog.api.controllers.search_controller.get_query_hash",
        _random_hash,
    )


@pytest.fixture
def empty_validation_cache(monkeypatch):
    """Simulate a cold validation cache: every URL status lookup misses."""

    def _all_misses(_redis, image_urls):
        return [None for _ in image_urls]

    monkeypatch.setattr(
        "catalog.api.utils.validate_images._get_cached_statuses",
        _all_misses,
    )


def _patch_grequests():
Expand All @@ -34,10 +62,29 @@ def grequests_map(reqs, *_, **__):
return patch("grequests.map", side_effect=grequests_map)


def patch_link_validation_dead_for_count(count):
    """Patch ``grequests.map`` so that, counting across all calls, the first
    ``count`` responses are dead (HTTP 404) and every later response is live
    (HTTP 200).

    :param count: total number of dead responses to emit before switching
        to live ones.
    :return: a ``unittest.mock.patch`` patcher for ``grequests.map``.
    """
    total_res_count = 0

    def grequests_map(reqs, *_, **__):
        """Replacement for ``grequests.map`` used by ``validate_images``:
        returns one mocked response per request, dead until ``count`` total
        responses have been produced.
        """
        # NOTE: the original placed this docstring after ``nonlocal``, where
        # it was a no-op expression statement rather than a docstring.
        nonlocal total_res_count
        responses = []
        # ``list(reqs)`` materializes the (possibly lazy) request iterable
        # exactly once, matching how many responses the caller expects.
        for _ in list(reqs):
            total_res_count += 1
            mocked_res = MagicMock()
            mocked_res.status_code = 404 if total_res_count <= count else 200
            responses.append(mocked_res)
        return responses

    return patch("grequests.map", side_effect=grequests_map)


@pytest.mark.django_db
@_patch_redis()
@_patch_grequests()
def test_dead_link_filtering(mocked_map, _, client):
def test_dead_link_filtering(mocked_map, client):
path = "/v1/images/"
query_params = {"q": "*", "page_size": 20}

Expand Down Expand Up @@ -65,9 +112,46 @@ def test_dead_link_filtering(mocked_map, _, client):

res_1_ids = set(result["id"] for result in data_with_dead_links["results"])
res_2_ids = set(result["id"] for result in data_without_dead_links["results"])
# In this case, both have 20 results as the dead link filter has "back filled" the
# pages of dead links. See the subsequent test for the case when this does not
# occur (i.e., when the entire first page of links is dead).
assert len(res_1_ids) == 20
assert len(res_2_ids) == 20
assert bool(res_1_ids - res_2_ids)


@pytest.mark.django_db
@pytest.mark.parametrize(
    ("filter_dead", "page_size", "expected_result_count"),
    (
        (True, 20, 0),
        (False, 20, 20),
    ),
)
def test_dead_link_filtering_all_dead_links(
    client,
    filter_dead,
    page_size,
    expected_result_count,
    unique_query_hash,
    empty_validation_cache,
):
    """When every link on the first page is dead, a filtered search returns
    zero results (no backfilling), while an unfiltered search still returns
    the full page."""
    # Mark enough links dead to cover the whole ratio-inflated first page.
    dead_count = page_size / DEAD_LINK_RATIO

    with patch_link_validation_dead_for_count(dead_count):
        response = client.get(
            "/v1/images/",
            {"q": "*", "page_size": page_size} | {"filter_dead": filter_dead},
        )

    assert response.status_code == 200
    assert len(response.json()["results"]) == expected_result_count


@pytest.fixture
def search_factory(client):
"""
Expand Down

0 comments on commit 7b09e21

Please sign in to comment.