Skip to content

Commit

Permalink
requestsession tests
Browse files Browse the repository at this point in the history
  • Loading branch information
freddyheppell committed Aug 13, 2024
1 parent ccae085 commit 730b5d8
Show file tree
Hide file tree
Showing 4 changed files with 132 additions and 12 deletions.
2 changes: 2 additions & 0 deletions docs/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
- Fixed the scrape crawling not correctly recognising when duplicate URLs were encountered. Previously duplicates would be included, but only one would be used. Now, they will be correctly logged. As a result of this change, the `SCRAPE_CRAWL_VERSION` has been incremented, meaning running extraction on a scrape will cause it to be re-crawled.
- Fixed the return type annotation `LangPicker.get_root()`: it is now annotated as (`bs4.Tag` or `None`) instead of `bs4.PageElement`. This shouldn't be a breaking change, as the expected return type of this function was always a `Tag` object or `None`.
- Type of `TranslationLink.lang` changed to reflect that it can accept a string to resolve or an already resolved `Language` instance
- Fixed downloading throwing an error stating the WordPress v2 API was not supported in other error cases
- Fixed the maximum redirects permitted not being set properly, meaning the effective value was always 30

**Documentation**

Expand Down
18 changes: 6 additions & 12 deletions src/wpextract/download/requestsession.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,12 +136,6 @@ class HTTPTooManyRedirects(Exception):


def _handle_status(url: str, status_code: int, n_tries: Optional[int] = None) -> None:
if 300 <= status_code < 400:
logging.error(
f'Too many redirects (status code {status_code}) while fetching "{url}"'
)
raise HTTPTooManyRedirects

if status_code in ERROR_NAMES:
log_msg = (
f'Error {status_code} ({ERROR_NAMES[status_code]}) while fetching "{url}"'
Expand Down Expand Up @@ -237,19 +231,16 @@ def __init__(
self.s.auth = authorization
self.wait = wait
self.timeout = timeout
self._mount_retry(backoff_factor, max_redirects, max_retries)
self.s.max_redirects = max_redirects
self._mount_retry(backoff_factor, max_retries)
self.waiter = RequestWait(wait, random_wait)
self.user_agent = user_agent if user_agent is not None else DEFAULT_UA

def _mount_retry(
self, backoff_factor: float, max_redirects: int, max_retries: int
) -> None:
def _mount_retry(self, backoff_factor: float, max_retries: int) -> None:
retry = Retry(
total=max_retries,
backoff_factor=backoff_factor,
redirect=max_redirects,
status_forcelist=RETRY_AFTER_STATUS,
raise_on_redirect=False,
raise_on_status=False,
)
adapter = HTTPAdapter(max_retries=retry)
Expand Down Expand Up @@ -329,6 +320,9 @@ def do_request(
except requests.Timeout as e:
logging.error(f"Request timed out fetching {url}")
raise ConnectionTimeout from e
except requests.TooManyRedirects as e:
logging.error(f'Too many redirects while fetching "{url}"')
raise HTTPTooManyRedirects from e

# If this is an HTTP 400 due to an invalid page, raise this special error early
if response.status_code == 400 and "application/json" in response.headers.get(
Expand Down
6 changes: 6 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,9 @@ def mocked_responses():
def mocked_responses_optional():
with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
yield rsps


@pytest.fixture()
def mocked_responses_ordered():
with responses.RequestsMock(registry=responses.registries.OrderedRegistry) as rsps:
yield rsps
118 changes: 118 additions & 0 deletions tests/download/test_requestsession.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
import pytest
from responses import matchers
from wpextract.download import RequestSession
from wpextract.download.requestsession import (
HTTPError404,
HTTPError500,
HTTPTooManyRedirects,
)


def test_request_session_get(mocked_responses):
sess = RequestSession()
mocked_responses.get("https://example.org", body="Example response")

resp = sess.get("https://example.org")
assert resp.text == "Example response"


def test_request_session_post(mocked_responses):
sess = RequestSession()
mocked_responses.post(
"https://example.org",
match=[matchers.urlencoded_params_matcher({"foo": "bar"})],
body="Example response",
)

resp = sess.post("https://example.org", {"foo": "bar"})
assert resp.text == "Example response"


def test_no_retry_404_error(mocked_responses):
sess = RequestSession()
mocked_responses.get("https://example.org", status=404)

with pytest.raises(HTTPError404):
sess.get("https://example.org")

mocked_responses.assert_call_count("https://example.org", 1)


def test_no_retries(mocked_responses, caplog):
sess_no_retries = RequestSession(max_retries=0)
mocked_responses.get("https://example.org/a", status=500)

with pytest.raises(HTTPError500):
sess_no_retries.get("https://example.org/a")
mocked_responses.assert_call_count("https://example.org/a", 1)


def test_retries(mocked_responses, caplog):
sess_retries = RequestSession(max_retries=5)
mocked_responses.get("https://example.org/b", status=500)

with pytest.raises(HTTPError500):
sess_retries.get("https://example.org/b")
mocked_responses.assert_call_count("https://example.org/b", 6)


def test_timeout(mocked_responses):
sess = RequestSession(timeout=1)
mocked_responses.get(
"https://example.org",
body="Example response",
match=[matchers.request_kwargs_matcher({"timeout": 1})],
)

sess.get("https://example.org")


@pytest.fixture()
def mocked_sleep(mocker):
return mocker.patch("wpextract.download.requestsession.time.sleep")


def test_wait(mocked_responses, mocked_sleep):
sess = RequestSession(wait=1)
mocked_responses.get("https://example.org", body="Example response")

sess.get("https://example.org")

mocked_sleep.assert_called_once_with(1)


def test_rand_wait(mocked_responses, mocked_sleep):
sess = RequestSession(wait=1, random_wait=True)
mocked_responses.get("https://example.org", body="Example response")
sess.get("https://example.org")

mocked_sleep.assert_called_once()
assert mocked_sleep.call_args[0][0] >= 0.5
assert mocked_sleep.call_args[0][0] <= 1.5


def test_max_redirects(mocked_responses):
for i in range(1, 5):
mocked_responses.get(
f"https://example.org/{i}",
status=301,
headers={"Location": f"https://example.org/{i+1}"},
)

sess = RequestSession(max_redirects=3)
with pytest.raises(HTTPTooManyRedirects):
sess.get("https://example.org/1")


def test_max_redirects_within_limit(mocked_responses):
for i in range(1, 5):
mocked_responses.get(
f"https://example.org/{i}",
status=301,
headers={"Location": f"https://example.org/{i+1}"},
)
mocked_responses.get("https://example.org/5", body="Example response")
sess = RequestSession(max_redirects=5)
resp = sess.get("https://example.org/1")
assert resp.status_code == 200
assert resp.text == "Example response"

0 comments on commit 730b5d8

Please sign in to comment.