Skip to content
This repository has been archived by the owner on Aug 4, 2023. It is now read-only.

Add HEAD requests support to DelayedRequester #865

Merged
merged 8 commits into from
Nov 18, 2022
51 changes: 39 additions & 12 deletions openverse_catalog/dags/common/requester.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import logging
import time
from collections.abc import Callable
AetherUnbound marked this conversation as resolved.
Show resolved Hide resolved

import oauth2
import requests
Expand Down Expand Up @@ -30,12 +31,11 @@ class RetriesExceeded(Exception):

class DelayedRequester:
"""
Provides a method `get` that is a wrapper around `get` from the
`requests` module (i.e., it simply passes along whatever arguments it
receives). The difference is that when this class is initialized
with a non-zero `delay` parameter, it waits for at least that number
of seconds between consecutive requests. This is to avoid hitting
rate limits of APIs.
Provides methods `get` and `head` that are wrappers around the `requests`
module methods with the same name (i.e., it simply passes along whatever
arguments it receives). The difference is that when this class is initialized
with a non-zero `delay` parameter, it waits for at least that number of seconds
between consecutive requests. This is to avoid hitting rate limits of APIs.

Optional Arguments:
delay: an integer giving the minimum number of seconds to wait
Expand All @@ -50,23 +50,26 @@ def __init__(self, delay=0, headers=None):
self._last_request = 0
self.session = requests.Session()

def get(self, url, params=None, **kwargs):
def _make_request(
self, method: Callable[..., requests.models.Response], url: str, **kwargs
):
"""
Make a get request, and return the response object if it exists.
Make a request, and return the response object if it exists.

Required Arguments:

method: `requests` module request method.
url: URL to make the request as a string.
params: Dictionary of query string params
**kwargs: Optional arguments that will be passed to `requests.get`
**kwargs: Optional arguments that will be passed to the `requests`
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The docstrings params need to be updated, too: add method and remove params (or keep it if it's possible to document a **kwargs parameter).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated in 8c5d0a2.

module request.
"""
self._delay_processing()
self._last_request = time.time()
request_kwargs = kwargs or {}
if "headers" not in kwargs:
request_kwargs["headers"] = self.headers
try:
response = self.session.get(url, params=params, **request_kwargs)
response = method(url, **request_kwargs)
if response.status_code == requests.codes.ok:
logger.debug(f"Received response from url {response.url}")
elif response.status_code == requests.codes.unauthorized:
Expand All @@ -90,10 +93,34 @@ def get(self, url, params=None, **kwargs):
except Exception as e:
logger.error(f"Error with the request for URL: {url}")
logger.info(f"{type(e).__name__}: {e}")
logger.info(f"Using query parameters {params}")
if params := request_kwargs.get("params"):
logger.info(f"Using query parameters {params}")
logger.info(f'Using headers {request_kwargs.get("headers")}')
return None

def get(self, url, params=None, **kwargs):
twstokes marked this conversation as resolved.
Show resolved Hide resolved
"""
Make a GET request, and return the response object if it exists.

Required Arguments:

url: URL to make the request as a string.
params: Dictionary of query string params.
**kwargs: Optional arguments that will be passed to `requests.get`.
"""
return self._make_request(self.session.get, url, params=params, **kwargs)

def head(self, url, **kwargs):
twstokes marked this conversation as resolved.
Show resolved Hide resolved
"""
Make a HEAD request, and return the response object if it exists.

Required Arguments:

url: URL to make the request as a string.
**kwargs: Optional arguments that will be passed to `requests.head`.
"""
return self._make_request(self.session.head, url, **kwargs)

def _delay_processing(self):
wait = self._DELAY - (time.time() - self._last_request)
if wait >= 0:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ def _get_filesize(self, image_url):
"""
Get the size of the image in bytes.
"""
resp = self.delayed_requester.get(image_url)
resp = self.delayed_requester.head(image_url)
if resp:
filesize = int(resp.headers.get("Content-Length", 0))
return filesize if filesize != 0 else None
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ def _get_file_info(self, media_details):
return None, None, None, None

def _get_filesize(self, image_url):
resp = self.get_response_json(query_params={}, endpoint=image_url)
resp = self.delayed_requester.head(image_url)
if resp:
filesize = int(resp.headers.get("Content-Length", 0))
return filesize if filesize != 0 else None
Expand Down