Skip to content
This repository has been archived by the owner on Aug 4, 2023. It is now read-only.

Add HEAD requests support to DelayedRequester #865

Merged
merged 8 commits into from
Nov 18, 2022
29 changes: 18 additions & 11 deletions openverse_catalog/dags/common/requester.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,11 @@ class RetriesExceeded(Exception):

class DelayedRequester:
"""
Provides a method `get` that is a wrapper around `get` from the
`requests` module (i.e., it simply passes along whatever arguments it
receives). The difference is that when this class is initialized
with a non-zero `delay` parameter, it waits for at least that number
of seconds between consecutive requests. This is to avoid hitting
rate limits of APIs.
Provides methods `get` and `head` that are wrappers around the `requests`
module methods with the same name (i.e., it simply passes along whatever
arguments it receives). The difference is that when this class is initialized
with a non-zero `delay` parameter, it waits for at least that number of seconds
between consecutive requests. This is to avoid hitting rate limits of APIs.

Optional Arguments:
delay: an integer giving the minimum number of seconds to wait
Expand All @@ -50,23 +49,24 @@ def __init__(self, delay=0, headers=None):
self._last_request = 0
self.session = requests.Session()

def get(self, url, params=None, **kwargs):
def _make_request(self, method, url, **kwargs):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
def _make_request(self, method, url, **kwargs):
def _make_request(self, method: Literal['get', 'head'], url: str, **kwargs):

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👋 Hi @obulat - thanks for the suggestion. Could you help me understand this literal type constraint better? I was under the impression that the calls to _make_request were passing methods from the Requests module and not string values, but I'm probably missing something.

To experiment I changed the Literal params to something nonsensical (instead of get and head) and just lint and just test passed when I was expecting them to fail.

Also, if you're suggesting that we make the Requests module calls only within _make_request based on the string passed then that makes sense, but I wanted to get confirmation. Thank you!

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Gah, that's right! This is actually of type callable, rather than Literal. I think the way it is presently is fine, but it might be worth adding the generic callable type annotation (trying to match the call signatures of both .head and .get will be a head-ache 😄)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you for stepping in, @AetherUnbound, and sorry I couldn't reply sooner, @twstokes.

"""
Make a get request, and return the response object if it exists.
Make a request, and return the response object if it exists.

Required Arguments:

url: URL to make the request as a string.
params: Dictionary of query string params
**kwargs: Optional arguments that will be passed to `requests.get`
**kwargs: Optional arguments that will be passed to the `requests`
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The docstrings params need to be updated, too: add method and remove params (or keep it if it's possible to document a **kwargs parameter).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated in 8c5d0a2.

module request
"""
self._delay_processing()
self._last_request = time.time()
request_kwargs = kwargs or {}
if "headers" not in kwargs:
request_kwargs["headers"] = self.headers
try:
response = self.session.get(url, params=params, **request_kwargs)
response = method(url, **request_kwargs)
if response.status_code == requests.codes.ok:
logger.debug(f"Received response from url {response.url}")
elif response.status_code == requests.codes.unauthorized:
Expand All @@ -90,10 +90,17 @@ def get(self, url, params=None, **kwargs):
except Exception as e:
logger.error(f"Error with the request for URL: {url}")
logger.info(f"{type(e).__name__}: {e}")
logger.info(f"Using query parameters {params}")
if params := request_kwargs.get("params"):
logger.info(f"Using query parameters {params}")
logger.info(f'Using headers {request_kwargs.get("headers")}')
return None

def get(self, url, params=None, **kwargs):
twstokes marked this conversation as resolved.
Show resolved Hide resolved
self._make_request(self.session.get, url, params=params, **kwargs)

def head(self, url, **kwargs):
twstokes marked this conversation as resolved.
Show resolved Hide resolved
self._make_request(self.session.head, url, **kwargs)
twstokes marked this conversation as resolved.
Show resolved Hide resolved

def _delay_processing(self):
wait = self._DELAY - (time.time() - self._last_request)
if wait >= 0:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ def _get_filesize(self, image_url):
"""
Get the size of the image in bytes.
"""
resp = self.delayed_requester.get(image_url)
resp = self.delayed_requester.head(image_url)
if resp:
filesize = int(resp.headers.get("Content-Length", 0))
return filesize if filesize != 0 else None
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ def _get_file_info(self, media_details):
return None, None, None, None

def _get_filesize(self, image_url):
resp = self.get_response_json(query_params={}, endpoint=image_url)
resp = self.delayed_requester.head(image_url)
if resp:
filesize = int(resp.headers.get("Content-Length", 0))
return filesize if filesize != 0 else None
Expand Down