diff --git a/openverse_catalog/dags/common/requester.py b/openverse_catalog/dags/common/requester.py index 39cc8a4a5..eda23b3e2 100644 --- a/openverse_catalog/dags/common/requester.py +++ b/openverse_catalog/dags/common/requester.py @@ -1,5 +1,6 @@ import logging import time +from collections.abc import Callable import oauth2 import requests @@ -30,12 +31,11 @@ class RetriesExceeded(Exception): class DelayedRequester: """ - Provides a method `get` that is a wrapper around `get` from the - `requests` module (i.e., it simply passes along whatever arguments it - receives). The difference is that when this class is initialized - with a non-zero `delay` parameter, it waits for at least that number - of seconds between consecutive requests. This is to avoid hitting - rate limits of APIs. + Provides methods `get` and `head` that are wrappers around the `requests` + module methods with the same name (i.e., it simply passes along whatever + arguments it receives). The difference is that when this class is initialized + with a non-zero `delay` parameter, it waits for at least that number of seconds + between consecutive requests. This is to avoid hitting rate limits of APIs. Optional Arguments: delay: an integer giving the minimum number of seconds to wait @@ -50,15 +50,18 @@ def __init__(self, delay=0, headers=None): self._last_request = 0 self.session = requests.Session() - def get(self, url, params=None, **kwargs): + def _make_request( + self, method: Callable[..., requests.models.Response], url: str, **kwargs + ): """ - Make a get request, and return the response object if it exists. + Make a request, and return the response object if it exists. Required Arguments: + method: `requests` module request method. url: URL to make the request as a string. - params: Dictionary of query string params - **kwargs: Optional arguments that will be passed to `requests.get` + **kwargs: Optional arguments that will be passed to the `requests` + module request. """ self._delay_processing() self._last_request = time.time() @@ -66,7 +69,7 @@ def get(self, url, params=None, **kwargs): if "headers" not in kwargs: request_kwargs["headers"] = self.headers try: - response = self.session.get(url, params=params, **request_kwargs) + response = method(url, **request_kwargs) if response.status_code == requests.codes.ok: logger.debug(f"Received response from url {response.url}") elif response.status_code == requests.codes.unauthorized: @@ -90,10 +93,34 @@ def get(self, url, params=None, **kwargs): except Exception as e: logger.error(f"Error with the request for URL: {url}") logger.info(f"{type(e).__name__}: {e}") - logger.info(f"Using query parameters {params}") + if params := request_kwargs.get("params"): + logger.info(f"Using query parameters {params}") logger.info(f'Using headers {request_kwargs.get("headers")}') return None + def get(self, url, params=None, **kwargs): + """ + Make a GET request, and return the response object if it exists. + + Required Arguments: + + url: URL to make the request as a string. + params: Dictionary of query string params. + **kwargs: Optional arguments that will be passed to `requests.get`. + """ + return self._make_request(self.session.get, url, params=params, **kwargs) + + def head(self, url, **kwargs): + """ + Make a HEAD request, and return the response object if it exists. + + Required Arguments: + + url: URL to make the request as a string. + **kwargs: Optional arguments that will be passed to `requests.head`. + """ + return self._make_request(self.session.head, url, **kwargs) + def _delay_processing(self): wait = self._DELAY - (time.time() - self._last_request) if wait >= 0: diff --git a/openverse_catalog/dags/providers/provider_api_scripts/stocksnap.py b/openverse_catalog/dags/providers/provider_api_scripts/stocksnap.py index 963b2d3e6..dfc0c6c53 100644 --- a/openverse_catalog/dags/providers/provider_api_scripts/stocksnap.py +++ b/openverse_catalog/dags/providers/provider_api_scripts/stocksnap.py @@ -151,7 +151,7 @@ def _get_filesize(self, image_url): """ Get the size of the image in bytes. """ - resp = self.delayed_requester.get(image_url) + resp = self.delayed_requester.head(image_url) if resp: filesize = int(resp.headers.get("Content-Length", 0)) return filesize if filesize != 0 else None diff --git a/openverse_catalog/dags/providers/provider_api_scripts/wordpress.py b/openverse_catalog/dags/providers/provider_api_scripts/wordpress.py index 63efcdf20..9e56fdde3 100644 --- a/openverse_catalog/dags/providers/provider_api_scripts/wordpress.py +++ b/openverse_catalog/dags/providers/provider_api_scripts/wordpress.py @@ -129,7 +129,7 @@ def _get_file_info(self, media_details): return None, None, None, None def _get_filesize(self, image_url): - resp = self.get_response_json(query_params={}, endpoint=image_url) + resp = self.delayed_requester.head(image_url) if resp: filesize = int(resp.headers.get("Content-Length", 0)) return filesize if filesize != 0 else None