Skip to content
This repository has been archived by the owner on Feb 22, 2023. It is now read-only.

Add User-Agent header to all outbound requests coming from the API #877

Merged
merged 15 commits into from
Aug 17, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions api/catalog/api/utils/validate_images.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import logging
import time

from django.conf import settings

import django_redis
import grequests
from decouple import config
Expand All @@ -12,6 +14,9 @@


CACHE_PREFIX = "valid:"
HEADERS = {
"User-Agent": settings.OUTBOUND_USER_AGENT_TEMPLATE.format(purpose="LinkValidation")
}


def _get_cached_statuses(redis, image_urls):
Expand Down Expand Up @@ -50,8 +55,10 @@ def validate_images(query_hash, start_slice, results, image_urls):
to_verify[url] = idx
logger.debug(f"len(to_verify)={len(to_verify)}")
reqs = (
grequests.head(u, allow_redirects=False, timeout=2, verify=False)
for u in to_verify.keys()
grequests.head(
url, headers=HEADERS, allow_redirects=False, timeout=2, verify=False
)
for url in to_verify.keys()
)
verified = grequests.map(reqs, exception_handler=_validation_failure)
# Cache newly verified image statuses.
Expand Down
20 changes: 16 additions & 4 deletions api/catalog/api/utils/watermark.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,18 @@
import logging
import os
from enum import Flag, auto
from io import BytesIO
from textwrap import wrap

from django.conf import settings

import piexif
import requests
from PIL import Image, ImageDraw, ImageFont
from sentry_sdk import capture_exception


parent_logger = logging.getLogger(__name__)


BREAKPOINT_DIMENSION = 400 # 400px
Expand All @@ -14,6 +21,9 @@

FRAME_COLOR = "#fff" # White frame
TEXT_COLOR = "#000" # Black text
HEADERS = {
"User-Agent": settings.OUTBOUND_USER_AGENT_TEMPLATE.format(purpose="Watermark")
}


class Dimension(Flag):
Expand Down Expand Up @@ -143,9 +153,9 @@ def _open_image(url):
:param url: the URL from where to read the image
:return: the PIL image object with the EXIF data
"""

logger = parent_logger.getChild("_open_image")
try:
response = requests.get(url)
response = requests.get(url, headers=HEADERS)
img_bytes = BytesIO(response.content)
img = Image.open(img_bytes)
# Preserve EXIF metadata
Expand All @@ -154,8 +164,10 @@ def _open_image(url):
else:
exif = None
return img, exif
except requests.exceptions.RequestException:
print("Error loading image data")
except requests.exceptions.RequestException as e:
capture_exception(e)
logger.error(f"Error loading image data: {e}")
return None, None


def _print_attribution_on_image(img, image_info):
Expand Down
4 changes: 3 additions & 1 deletion api/catalog/api/utils/waveform.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,15 @@
import subprocess
from typing import List

from django.conf import settings

import requests


parent_logger = logging.getLogger(__name__)

TMP_DIR = pathlib.Path("/tmp").resolve()
UA_STRING = "OpenverseWaveform/0.0 (https://wordpress.org/openverse)"
UA_STRING = settings.OUTBOUND_USER_AGENT_TEMPLATE.format(purpose="Waveform")


def ext_from_url(url):
Expand Down
11 changes: 8 additions & 3 deletions api/catalog/api/views/image_views.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import io

from django.conf import settings
from django.http.response import FileResponse, Http404, HttpResponse
from django.http.response import FileResponse, HttpResponse
from django.utils.decorators import method_decorator
from rest_framework.decorators import action
from rest_framework.exceptions import NotFound
from rest_framework.response import Response

import piexif
Expand Down Expand Up @@ -60,6 +61,10 @@ class ImageViewSet(MediaViewSet):

serializer_class = ImageSerializer

OEMBED_HEADERS = {
"User-Agent": settings.OUTBOUND_USER_AGENT_TEMPLATE.format(purpose="OEmbed"),
}

# Extra actions

@action(
Expand All @@ -81,7 +86,7 @@ def oembed(self, request, *_, **__):
except Image.DoesNotExist:
return get_api_exception("Could not find image.", 404)
if not (image.height and image.width):
image_file = requests.get(image.url)
image_file = requests.get(image.url, headers=self.OEMBED_HEADERS)
width, height = PILImage.open(io.BytesIO(image_file.content)).size
context |= {
"width": width,
Expand Down Expand Up @@ -110,7 +115,7 @@ def thumbnail(self, request, *_, **__):
@action(detail=True, url_path="watermark", url_name="watermark")
def watermark(self, request, *_, **__):
if not settings.WATERMARK_ENABLED:
raise Http404 # watermark feature is disabled
raise NotFound("The watermark feature is currently disabled.")

params = WatermarkRequestSerializer(data=request.query_params)
params.is_valid(raise_exception=True)
Expand Down
41 changes: 25 additions & 16 deletions api/catalog/api/views/media_views.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,5 @@
import json
import logging as log
from http.client import RemoteDisconnected
from urllib.error import HTTPError
import logging
from urllib.parse import urlencode
from urllib.request import Request, urlopen

from django.conf import settings
from django.http.response import HttpResponse
Expand All @@ -13,6 +9,7 @@
from rest_framework.response import Response
from rest_framework.viewsets import ReadOnlyModelViewSet

import requests
from sentry_sdk import capture_exception

from catalog.api.controllers import search_controller
Expand All @@ -28,6 +25,9 @@ class UpstreamThumbnailException(APIException):
default_detail = "Could not render thumbnail due to upstream provider error."


parent_logger = logging.getLogger(__name__)


class MediaViewSet(ReadOnlyModelViewSet):
swagger_schema = CustomAutoSchema

Expand Down Expand Up @@ -171,32 +171,41 @@ def _get_user_ip(request):
ip = request.META.get("REMOTE_ADDR")
return ip

THUMBNAIL_PROXY_COMM_HEADERS = {
"User-Agent": settings.OUTBOUND_USER_AGENT_TEMPLATE.format(
purpose="ThumbnailGeneration"
)
}

@staticmethod
def _thumbnail_proxy_comm(
path: str,
params: dict,
headers: tuple[tuple[str, str]] = (),
):
) -> tuple[requests.Response, int, str]:
logger = parent_logger.getChild("_thumbnail_proxy_comm")
proxy_url = settings.THUMBNAIL_PROXY_URL
query_string = urlencode(params)
upstream_url = f"{proxy_url}/{path}?{query_string}"
log.debug(f"Image proxy upstream URL: {upstream_url}")
logger.debug(f"Image proxy upstream URL: {upstream_url}")

try:
req = Request(upstream_url)
for key, val in headers:
req.add_header(key, val)
upstream_response = urlopen(req, timeout=10)
compiled_headers = MediaViewSet.THUMBNAIL_PROXY_COMM_HEADERS | {
k: v for k, v in headers
}
upstream_response = requests.get(
upstream_url, timeout=10, headers=compiled_headers
)

res_status = upstream_response.status
res_status = upstream_response.status_code
content_type = upstream_response.headers.get("Content-Type")
log.debug(
logger.debug(
"Image proxy response "
f"status: {res_status}, content-type: {content_type}"
)

return upstream_response, res_status, content_type
except (HTTPError, RemoteDisconnected, TimeoutError) as exc:
except requests.RequestException as exc:
capture_exception(exc)
raise UpstreamThumbnailException(f"Failed to render thumbnail: {exc}")
except Exception as exc:
Expand All @@ -217,7 +226,7 @@ def _get_proxied_image(
info_res, *_ = MediaViewSet._thumbnail_proxy_comm(
"info", {"url": image_url}
)
info = json.loads(info_res.read())
info = info_res.json()
width = info["width"]

params = {
Expand All @@ -243,6 +252,6 @@ def _get_proxied_image(
"resize", params, (("Accept", accept_header),)
)
response = HttpResponse(
img_res.read(), status=res_status, content_type=content_type
img_res.content, status=res_status, content_type=content_type
)
return response
5 changes: 5 additions & 0 deletions api/catalog/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,11 @@
# The version of the API. We follow the semantic version specification.
API_VERSION = config("SEMANTIC_VERSION", default="Version not specified")

OUTBOUND_USER_AGENT_TEMPLATE = config(
"OUTBOUND_USER_AGENT_TEMPLATE",
default=f"Openverse{{purpose}}/{API_VERSION} (https://wordpress.org/openverse)",
)

# The contact email of the Openverse team
CONTACT_EMAIL = config("CONTACT_EMAIL", default="openverse@wordpress.org")

Expand Down
20 changes: 0 additions & 20 deletions api/test/audio_integration_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,6 @@
search_source_and_excluded,
search_special_chars,
stats,
thumb,
thumb_compression,
thumb_full_size,
thumb_webp,
uuid_validation,
)

Expand Down Expand Up @@ -122,10 +118,6 @@ def test_audio_stats():
stats("audio")


def test_audio_thumb(jamendo_audio_fixture):
thumb(jamendo_audio_fixture)


def test_audio_detail_without_thumb():
resp = requests.get(f"{API_URL}/v1/audio/44540200-91eb-483d-9e99-38ce86a52fb6")
assert resp.status_code == 200
Expand All @@ -141,18 +133,6 @@ def test_audio_search_without_thumb():
assert parsed["results"][0]["thumbnail"] is None


def test_audio_thumb_compression(jamendo_audio_fixture):
thumb_compression(jamendo_audio_fixture)


def test_audio_thumb_webp(jamendo_audio_fixture):
thumb_webp(jamendo_audio_fixture)


def test_audio_thumb_full_size(jamendo_audio_fixture):
thumb_full_size(jamendo_audio_fixture)


def test_audio_report(audio_fixture):
report("audio", audio_fixture)

Expand Down
1 change: 1 addition & 0 deletions api/test/factory/models/media.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ class Meta:

foreign_landing_url = Faker("globally_unique_url")
url = Faker("globally_unique_url")
thumbnail = Faker("image_url")


class IdentifierFactory(factory.SubFactory):
Expand Down
20 changes: 20 additions & 0 deletions api/test/factory/sample-audio-info.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"creator": "TheGloomWorker",
"headers": {
"Access-Control-Allow-Headers": "DNT,User-Agent,X-Requested-With,If-Modified-Since,Cache-Control,Content-Type,Range",
"Access-Control-Allow-Methods": "GET, POST, OPTIONS",
"Access-Control-Allow-Origin": "*",
"Access-Control-Expose-Headers": "Content-Length,Content-Range",
"Connection": "keep-alive",
"Content-Length": "583491",
"Content-Range": "bytes 0-583490/583491",
"Content-Type": "audio/mpeg",
"Date": "Fri, 12 Aug 2022 15:18:23 GMT",
"ETag": "5e4e7f4d-8e743",
"Last-Modified": "Thu, 20 Feb 2020 12:45:01 GMT",
"Server": "nginx/1.18.0 (Ubuntu)"
},
"license": "CC0",
"license_version": "1.0",
"title": "Birds and City Ambience.mp3"
}
Binary file added api/test/factory/sample-audio.mp3
Binary file not shown.
6 changes: 6 additions & 0 deletions api/test/factory/sample-image-info.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"creator": "andymiccone",
"license": "CC0",
"license_version": "1.0",
"title": "'I Just Love Old Music..!'"
}
Binary file added api/test/factory/sample-image.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
20 changes: 0 additions & 20 deletions api/test/image_integration_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,6 @@
search_source_and_excluded,
search_special_chars,
stats,
thumb,
thumb_compression,
thumb_full_size,
thumb_webp,
uuid_validation,
)
from urllib.parse import urlencode
Expand Down Expand Up @@ -74,22 +70,6 @@ def test_image_stats():
stats("images")


def test_image_thumb(image_fixture):
thumb(image_fixture)


def test_image_thumb_compression(image_fixture):
thumb_compression(image_fixture)


def test_image_thumb_webp(image_fixture):
thumb_webp(image_fixture)


def test_image_thumb_full_size(image_fixture):
thumb_full_size(image_fixture)


def test_audio_report(image_fixture):
report("images", image_fixture)

Expand Down
Loading