diff --git a/cdp_backend/tests/utils/test_file_utils.py b/cdp_backend/tests/utils/test_file_utils.py index 2f42a002..63188cbf 100644 --- a/cdp_backend/tests/utils/test_file_utils.py +++ b/cdp_backend/tests/utils/test_file_utils.py @@ -11,6 +11,7 @@ import imageio import pytest +import requests_mock from cdp_backend.utils import file_utils from cdp_backend.utils.file_utils import ( @@ -114,6 +115,19 @@ def test_resource_copy(tmpdir, example_video: Path) -> None: # type: ignore resource_copy(str(example_video), save_path) +def test_resource_copy_https(tmpdir, example_video: Path) -> None: # type: ignore + with requests_mock.Mocker() as mock: + example_video_file = imageio.read(example_video) + mock.get("https://example.com/example_video.mp4", body=example_video_file) + dest = tmpdir / "example_video.mp4" + saved_path = resource_copy( + "https://example.com/example_video.mp4", + dest, + ) + assert saved_path == dest + example_video_file.close() + + # Type ignore because changing tmpdir typing def test_hash_file_contents(tmpdir) -> None: # type: ignore test_file = Path(tmpdir) / "a.txt" diff --git a/cdp_backend/utils/file_utils.py b/cdp_backend/utils/file_utils.py index e1a0e56a..dbb257ff 100644 --- a/cdp_backend/utils/file_utils.py +++ b/cdp_backend/utils/file_utils.py @@ -6,6 +6,7 @@ import math import random import re +import shutil from hashlib import sha256 from pathlib import Path from uuid import uuid4 @@ -225,21 +226,27 @@ def resource_copy( # noqa: C901 ) return str(dst) - # Set custom timeout for http resources + # Common case: http(s) URI if uri.startswith("http"): # The verify=False is passed to any http URIs # It was added because it's very common for SSL certs to be bad # See: https://github.com/CouncilDataProject/cdp-scrapers/pull/85 # And: https://github.com/CouncilDataProject/seattle/runs/5957646032 - with open(dst, "wb") as open_dst: - open_dst.write( - requests.get( - uri, - verify=False, - timeout=1800, - ).content - ) + # Use stream=True to avoid downloading the entire file into memory + # See: https://github.com/CouncilDataProject/cdp-backend/issues/235 + try: + # This response must be closed after the copy is done. But using + # `with requests.get() as response` fails mypy type checking. + # See: https://requests.readthedocs.io/en/latest/user/advanced/#body-content-workflow + response = requests.get(uri, stream=True, verify=False, timeout=1800) + response.raise_for_status() + with open(dst, "wb") as open_dst: + shutil.copyfileobj( + response.raw, open_dst, length=64 * 1024 * 1024 # 64MB chunks + ) + finally: + response.close() else: # TODO: Add explicit use of GCS credentials until public read is fixed diff --git a/pyproject.toml b/pyproject.toml index bf6585a3..0e2bcd29 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -103,6 +103,7 @@ test = [ # Extras "networkx>=2.5", "pydot>=1.4", + "requests-mock>=1.10.0" ] docs = [ # Sphinx + Doc Gen + Styling