From 367118fbfae7d9b9e2d42eac0a18ec35db088225 Mon Sep 17 00:00:00 2001 From: "Tanner W. Stokes" Date: Thu, 10 Nov 2022 10:26:15 -0700 Subject: [PATCH 1/7] Add HEAD requests support to DelayedRequester. --- openverse_catalog/dags/common/requester.py | 29 ++++++++++++++-------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/openverse_catalog/dags/common/requester.py b/openverse_catalog/dags/common/requester.py index 39cc8a4a5..40557a13e 100644 --- a/openverse_catalog/dags/common/requester.py +++ b/openverse_catalog/dags/common/requester.py @@ -30,12 +30,11 @@ class RetriesExceeded(Exception): class DelayedRequester: """ - Provides a method `get` that is a wrapper around `get` from the - `requests` module (i.e., it simply passes along whatever arguments it - receives). The difference is that when this class is initialized - with a non-zero `delay` parameter, it waits for at least that number - of seconds between consecutive requests. This is to avoid hitting - rate limits of APIs. + Provides methods `get` and `head` that are wrappers around the `requests` + module methods with the same name (i.e., it simply passes along whatever + arguments it receives). The difference is that when this class is initialized + with a non-zero `delay` parameter, it waits for at least that number of seconds + between consecutive requests. This is to avoid hitting rate limits of APIs. Optional Arguments: delay: an integer giving the minimum number of seconds to wait @@ -50,15 +49,16 @@ def __init__(self, delay=0, headers=None): self._last_request = 0 self.session = requests.Session() - def get(self, url, params=None, **kwargs): + def _make_request(self, method, url, **kwargs): """ - Make a get request, and return the response object if it exists. + Make a request, and return the response object if it exists. Required Arguments: url: URL to make the request as a string. params: Dictionary of query string params - **kwargs: Optional arguments that will be passed to `requests.get` + **kwargs: Optional arguments that will be passed to the `requests` + module request """ self._delay_processing() self._last_request = time.time() @@ -66,7 +66,7 @@ def get(self, url, params=None, **kwargs): if "headers" not in kwargs: request_kwargs["headers"] = self.headers try: - response = self.session.get(url, params=params, **request_kwargs) + response = method(url, **request_kwargs) if response.status_code == requests.codes.ok: logger.debug(f"Received response from url {response.url}") elif response.status_code == requests.codes.unauthorized: @@ -90,10 +90,17 @@ def get(self, url, params=None, **kwargs): except Exception as e: logger.error(f"Error with the request for URL: {url}") logger.info(f"{type(e).__name__}: {e}") - logger.info(f"Using query parameters {params}") + if params := request_kwargs.get("params"): + logger.info(f"Using query parameters {params}") logger.info(f'Using headers {request_kwargs.get("headers")}') return None + def get(self, url, params=None, **kwargs): + self._make_request(self.session.get, url, params=params, **kwargs) + + def head(self, url, **kwargs): + self._make_request(self.session.head, url, **kwargs) + def _delay_processing(self): wait = self._DELAY - (time.time() - self._last_request) if wait >= 0: From e4250c22a1a1b1808f3728b21f38188dd0be8a03 Mon Sep 17 00:00:00 2001 From: "Tanner W. Stokes" Date: Thu, 10 Nov 2022 11:11:57 -0700 Subject: [PATCH 2/7] Use a HEAD request for StockSnap filesize method. --- .../dags/providers/provider_api_scripts/stocksnap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openverse_catalog/dags/providers/provider_api_scripts/stocksnap.py b/openverse_catalog/dags/providers/provider_api_scripts/stocksnap.py index 963b2d3e6..dfc0c6c53 100644 --- a/openverse_catalog/dags/providers/provider_api_scripts/stocksnap.py +++ b/openverse_catalog/dags/providers/provider_api_scripts/stocksnap.py @@ -151,7 +151,7 @@ def _get_filesize(self, image_url): """ Get the size of the image in bytes. """ - resp = self.delayed_requester.get(image_url) + resp = self.delayed_requester.head(image_url) if resp: filesize = int(resp.headers.get("Content-Length", 0)) return filesize if filesize != 0 else None From f14062b935796b7a1e3bf23c557703c6ccdd0804 Mon Sep 17 00:00:00 2001 From: "Tanner W. Stokes" Date: Thu, 10 Nov 2022 11:13:45 -0700 Subject: [PATCH 3/7] Use a HEAD request for WordPress filesize method. --- .../dags/providers/provider_api_scripts/wordpress.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openverse_catalog/dags/providers/provider_api_scripts/wordpress.py b/openverse_catalog/dags/providers/provider_api_scripts/wordpress.py index 63efcdf20..9e56fdde3 100644 --- a/openverse_catalog/dags/providers/provider_api_scripts/wordpress.py +++ b/openverse_catalog/dags/providers/provider_api_scripts/wordpress.py @@ -129,7 +129,7 @@ def _get_file_info(self, media_details): return None, None, None, None def _get_filesize(self, image_url): - resp = self.get_response_json(query_params={}, endpoint=image_url) + resp = self.delayed_requester.head(image_url) if resp: filesize = int(resp.headers.get("Content-Length", 0)) return filesize if filesize != 0 else None From be9f43a751f9016b6e4c3b92022f5e74b06428f7 Mon Sep 17 00:00:00 2001 From: "Tanner W. Stokes" Date: Wed, 16 Nov 2022 13:00:15 -0500 Subject: [PATCH 4/7] Add Callable type constraint for Requests method. --- openverse_catalog/dags/common/requester.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/openverse_catalog/dags/common/requester.py b/openverse_catalog/dags/common/requester.py index 40557a13e..eb3524e65 100644 --- a/openverse_catalog/dags/common/requester.py +++ b/openverse_catalog/dags/common/requester.py @@ -1,5 +1,6 @@ import logging import time +from collections.abc import Callable import oauth2 import requests @@ -49,7 +50,9 @@ def __init__(self, delay=0, headers=None): self._last_request = 0 self.session = requests.Session() - def _make_request(self, method, url, **kwargs): + def _make_request( + self, method: Callable[..., requests.models.Response], url: str, **kwargs + ): """ Make a request, and return the response object if it exists. From 8c5d0a26a92313a93bbab4265eea2b11b0e1d26c Mon Sep 17 00:00:00 2001 From: "Tanner W. Stokes" Date: Wed, 16 Nov 2022 13:05:16 -0500 Subject: [PATCH 5/7] Update docstrings params for _make_request. --- openverse_catalog/dags/common/requester.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/openverse_catalog/dags/common/requester.py b/openverse_catalog/dags/common/requester.py index eb3524e65..9b4920e34 100644 --- a/openverse_catalog/dags/common/requester.py +++ b/openverse_catalog/dags/common/requester.py @@ -58,10 +58,10 @@ def _make_request( Required Arguments: + method: `requests` module request method. url: URL to make the request as a string. - params: Dictionary of query string params **kwargs: Optional arguments that will be passed to the `requests` - module request + module request. """ self._delay_processing() self._last_request = time.time() From af8324568861a720de423583d56565f05a6b5d27 Mon Sep 17 00:00:00 2001 From: Tanner Stokes Date: Thu, 17 Nov 2022 11:44:31 -0500 Subject: [PATCH 6/7] Update openverse_catalog/dags/common/requester.py Co-authored-by: Madison Swain-Bowden --- openverse_catalog/dags/common/requester.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/openverse_catalog/dags/common/requester.py b/openverse_catalog/dags/common/requester.py index 9b4920e34..9f3b59586 100644 --- a/openverse_catalog/dags/common/requester.py +++ b/openverse_catalog/dags/common/requester.py @@ -99,10 +99,10 @@ def _make_request( return None def get(self, url, params=None, **kwargs): - self._make_request(self.session.get, url, params=params, **kwargs) + return self._make_request(self.session.get, url, params=params, **kwargs) def head(self, url, **kwargs): - self._make_request(self.session.head, url, **kwargs) + return self._make_request(self.session.head, url, **kwargs) def _delay_processing(self): wait = self._DELAY - (time.time() - self._last_request) From cd4c71461402dd7ab392f3f2801c7118a882d0d7 Mon Sep 17 00:00:00 2001 From: "Tanner W. Stokes" Date: Thu, 17 Nov 2022 18:32:14 -0500 Subject: [PATCH 7/7] Add docstrings for GET and HEAD methods. --- openverse_catalog/dags/common/requester.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/openverse_catalog/dags/common/requester.py b/openverse_catalog/dags/common/requester.py index 9f3b59586..eda23b3e2 100644 --- a/openverse_catalog/dags/common/requester.py +++ b/openverse_catalog/dags/common/requester.py @@ -99,9 +99,26 @@ def _make_request( return None def get(self, url, params=None, **kwargs): + """ + Make a GET request, and return the response object if it exists. + + Required Arguments: + + url: URL to make the request as a string. + params: Dictionary of query string params. + **kwargs: Optional arguments that will be passed to `requests.get`. + """ return self._make_request(self.session.get, url, params=params, **kwargs) def head(self, url, **kwargs): + """ + Make a HEAD request, and return the response object if it exists. + + Required Arguments: + + url: URL to make the request as a string. + **kwargs: Optional arguments that will be passed to `requests.head`. + """ return self._make_request(self.session.head, url, **kwargs) def _delay_processing(self):