Skip to content

Commit

Permalink
Treat 403s from Flickr as dead links (#1201)
Browse files Browse the repository at this point in the history
* Remove Elasticsearch mem limit

Can we make this "opt-in" rather than "opt-out"?

Impossible to run ES locally for me without removing this

* Add per-provider status checks in dead link validation

* Add default overrides and unambiguous way of disabling them

* Use more descriptive module docstring

Co-authored-by: Krystle Salazar <[email protected]>

---------

Co-authored-by: Krystle Salazar <[email protected]>
  • Loading branch information
sarayourfriend and krysal authored Apr 18, 2023
1 parent 0f048cd commit 6cdfb59
Show file tree
Hide file tree
Showing 6 changed files with 53 additions and 21 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@
from decouple import config
from elasticsearch_dsl.response import Hit

from catalog.api.utils.check_dead_links.provider_status_mappings import (
provider_status_mappings,
)
from catalog.api.utils.dead_link_mask import get_query_mask, save_query_mask


Expand Down Expand Up @@ -119,24 +122,23 @@ def check_dead_links(
for idx, _ in enumerate(cached_statuses):
del_idx = len(cached_statuses) - idx - 1
status = cached_statuses[del_idx]
# thingiverse treated as failure despite the suspect status code
# due to issues described here:
# https://github.com/WordPress/openverse/issues/900
if (
status == 429
or status == 403
and results[del_idx]["provider"] != "thingiverse"
):

provider = results[del_idx]["provider"]
status_mapping = provider_status_mappings[provider]

if status in status_mapping.unknown:
logger.warning(
"Image validation failed due to rate limiting or blocking. "
f"url={image_urls[idx]} "
f"status={status} "
f"provider={provider} "
)
elif status != 200:
elif status not in status_mapping.live:
logger.info(
"Deleting broken image from results "
f"id={results[del_idx]['identifier']} "
f"status={status} "
f"provider={provider} "
)
# remove the result, mutating in place
del results[del_idx]
Expand Down
23 changes: 23 additions & 0 deletions api/catalog/api/utils/check_dead_links/provider_status_mappings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
"""Per-provider HTTP status mappings for link availability."""

from collections import defaultdict
from dataclasses import dataclass


@dataclass
class StatusMapping:
unknown: tuple[int] = (429, 403)
live: tuple[int] = (200,)


provider_status_mappings = defaultdict(
StatusMapping,
thingiverse=StatusMapping(
# https://github.com/WordPress/openverse/issues/900
unknown=(429,),
),
flickr=StatusMapping(
# https://github.com/WordPress/openverse/issues/1200
unknown=(429,),
),
)
18 changes: 10 additions & 8 deletions api/test/unit/utils/check_dead_links_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import aiohttp
import pook
import pytest

from catalog.api.utils.check_dead_links import HEADERS, check_dead_links

Expand All @@ -11,7 +12,7 @@
@pook.on
def test_sends_user_agent(wrapped_client_session: mock.AsyncMock):
query_hash = "test_sends_user_agent"
results = [object() for _ in range(40)]
results = [{"provider": "best_provider_ever"} for _ in range(40)]
image_urls = [f"https://example.org/{i}" for i in range(len(results))]
start_slice = 0

Expand Down Expand Up @@ -40,7 +41,7 @@ def test_handles_timeout():
3 seconds.
"""
query_hash = "test_handles_timeout"
results = [{"identifier": i} for i in range(1)]
results = [{"identifier": i, "provider": "best_provider_ever"} for i in range(1)]
image_urls = [f"https://example.org/{i}" for i in range(len(results))]
start_slice = 0

Expand All @@ -58,10 +59,12 @@ def raise_timeout_error(*args, **kwargs):
assert len(results) == 0


def test_thingiverse_403_considered_dead():
query_hash = "test_thingiverse_403_considered_dead"
@pytest.mark.parametrize("provider", ("thingiverse", "flickr"))
def test_403_considered_dead(provider):
query_hash = f"test_{provider}_403_considered_dead"
other_provider = "fake_other_provider"
results = [
{"identifier": i, "provider": "thingiverse" if i % 2 else "flickr"}
{"identifier": i, "provider": provider if i % 2 else other_provider}
for i in range(4)
]
image_urls = [f"https://example.org/{i}" for i in range(len(results))]
Expand All @@ -78,6 +81,5 @@ def test_thingiverse_403_considered_dead():

assert head_mock.calls == len(results)

# All the "thingiverse" results should have been filtered out
# whereas the flickr results should be left
assert all([r["provider"] == "flickr" for r in results])
# All the provider's results should be filtered out, leaving only the "other" provider
assert all([r["provider"] == other_provider for r in results])
4 changes: 4 additions & 0 deletions docker-compose.overrides.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
services:
es:
# Memory limit for ES, as it tends to be a memory hoarder
mem_limit: 4294967296
4 changes: 0 additions & 4 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -220,10 +220,6 @@ services:
nofile:
soft: 65536
hard: 65536
# Memory limit for ES, as it tends to be a memory hoarder
# Set this value to an empty string to remove the limit
# https://docs.docker.com/compose/compose-file/compose-file-v2/#cpu-and-other-resources
mem_limit: ${ES_MEM_LIMIT:-4294967296} # 4 GiB in bytes
volumes:
- es-data:/usr/share/elasticsearch/data

Expand Down
5 changes: 5 additions & 0 deletions justfile
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ IS_PROD := env_var_or_default("PROD", env_var_or_default("IS_PROD", ""))
# `PROD_ENV` can be "ingestion_server" or "catalog"
PROD_ENV := env_var_or_default("PROD_ENV", "")
IS_CI := env_var_or_default("CI", "")
DC_USER := env_var_or_default("DC_USER", "opener")
ENABLE_DC_OVERRIDES := env_var_or_default("OPENVERSE_ENABLE_DC_OVERRIDES", "true")

# Show all available recipes, also recurses inside nested justfiles
@_default:
Expand Down Expand Up @@ -115,6 +117,9 @@ DOCKER_FILE := "-f " + (
else { "docker-compose.yml" }
}
else { "docker-compose.yml" }
) + (
if ENABLE_DC_OVERRIDES == "true" { " -f docker-compose.overrides.yml" }
else { "" }
)
EXEC_DEFAULTS := if IS_CI == "" { "" } else { "-T" }

Expand Down

0 comments on commit 6cdfb59

Please sign in to comment.