WIP: Implement sliced downloads in GSClient (#389)

* feat: Implement sliced downloads in GSClient * fix: remove unintended import changes * Mock transfer_manager. Test both worker types.
drivendataorg · Dec 29, 2023 · f65de44 · f65de44
1 parent 6bce0f9
commit f65de44
Show file tree

Hide file tree

Showing 5 changed files with 62 additions and 1 deletion.
diff --git a/HISTORY.md b/HISTORY.md
@@ -4,6 +4,7 @@
 
 - Fix `S3Client` cleanup via `Client.__del__` when `S3Client` encounters an exception during initialization. (Issue [#372](https://github.com/drivendataorg/cloudpathlib/issues/372), PR [#373](https://github.com/drivendataorg/cloudpathlib/pull/373), thanks to [@bryanwweber](https://github.com/bryanwweber))
 - Skip mtime checks during upload when force_overwrite_to_cloud is set to improve upload performance. (Issue [#379](https://github.com/drivendataorg/cloudpathlib/issues/379), PR [#380](https://github.com/drivendataorg/cloudpathlib/pull/380), thanks to [@Gilthans](https://github.com/Gilthans))
+- Implement sliced downloads in GSClient. (Issue [#387](https://github.com/drivendataorg/cloudpathlib/issues/387), PR [#389](https://github.com/drivendataorg/cloudpathlib/pull/389))
 
 ## v0.16.0 (2023-10-09)
  - Add "CloudPath" as return type on `__init__` for mypy issues. ([Issue #179](https://github.com/drivendataorg/cloudpathlib/issues/179), [PR #342](https://github.com/drivendataorg/cloudpathlib/pull/342))

diff --git a/cloudpathlib/gs/gsclient.py b/cloudpathlib/gs/gsclient.py
@@ -16,6 +16,7 @@
     from google.api_core.exceptions import NotFound
     from google.auth.exceptions import DefaultCredentialsError
     from google.cloud.storage import Client as StorageClient
+    from google.cloud.storage import transfer_manager
 
 
 except ModuleNotFoundError:
@@ -39,6 +40,7 @@ def __init__(
         file_cache_mode: Optional[Union[str, FileCacheMode]] = None,
         local_cache_dir: Optional[Union[str, os.PathLike]] = None,
         content_type_method: Optional[Callable] = mimetypes.guess_type,
+        download_chunks_concurrently_kwargs: Optional[Dict[str, Any]] = None,
     ):
         """Class constructor. Sets up a [`Storage
         Client`](https://googleapis.dev/python/storage/latest/client.html).
@@ -76,6 +78,9 @@ def __init__(
                 the `CLOUDPATHLIB_LOCAL_CACHE_DIR` environment variable.
             content_type_method (Optional[Callable]): Function to call to guess media type (mimetype) when
                 writing a file to the cloud. Defaults to `mimetypes.guess_type`. Must return a tuple (content type, content encoding).
+            download_chunks_concurrently_kwargs (Optional[Dict[str, Any]]): Keyword arguments to pass to
+                [`download_chunks_concurrently`](https://cloud.google.com/python/docs/reference/storage/latest/google.cloud.storage.transfer_manager#google_cloud_storage_transfer_manager_download_chunks_concurrently)
+                for sliced parallel downloads.
         """
         if application_credentials is None:
             application_credentials = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
@@ -92,6 +97,8 @@ def __init__(
             except DefaultCredentialsError:
                 self.client = StorageClient.create_anonymous_client()
 
+        self.download_chunks_concurrently_kwargs = download_chunks_concurrently_kwargs
+
         super().__init__(
             local_cache_dir=local_cache_dir,
             content_type_method=content_type_method,
@@ -118,7 +125,13 @@ def _download_file(self, cloud_path: GSPath, local_path: Union[str, os.PathLike]
 
         local_path = Path(local_path)
 
-        blob.download_to_filename(local_path)
+        if self.download_chunks_concurrently_kwargs is not None:
+            transfer_manager.download_chunks_concurrently(
+                blob, local_path, **self.download_chunks_concurrently_kwargs
+            )
+        else:
+            blob.download_to_filename(local_path)
+
         return local_path
 
     def _is_file_or_dir(self, cloud_path: GSPath) -> Optional[str]:

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -30,6 +30,7 @@
 from .mock_clients.mock_gs import (
     mocked_client_class_factory as mocked_gsclient_class_factory,
     DEFAULT_GS_BUCKET_NAME,
+    MockTransferManager,
 )
 from .mock_clients.mock_s3 import mocked_session_class_factory, DEFAULT_S3_BUCKET_NAME
 
@@ -184,6 +185,11 @@ def gs_rig(request, monkeypatch, assets_dir):
             "StorageClient",
             mocked_gsclient_class_factory(test_dir),
         )
+        monkeypatch.setattr(
+            cloudpathlib.gs.gsclient,
+            "transfer_manager",
+            MockTransferManager,
+        )
 
     rig = CloudProviderTestRig(
         path_class=GSPath,

diff --git a/tests/mock_clients/mock_gs.py b/tests/mock_clients/mock_gs.py
@@ -65,6 +65,21 @@ def patch(self):
         if "updated" in self.metadata:
             (self.bucket / self.name).touch()
 
+    def reload(
+        self,
+        client=None,
+        projection="noAcl",
+        if_etag_match=None,
+        if_etag_not_match=None,
+        if_generation_match=None,
+        if_generation_not_match=None,
+        if_metageneration_match=None,
+        if_metageneration_not_match=None,
+        timeout=None,
+        retry=None,
+    ):
+        pass
+
     def upload_from_filename(self, filename, content_type=None):
         data = Path(filename).read_bytes()
         path = self.bucket / self.name
@@ -153,3 +168,19 @@ def __next__(self):
     @property
     def prefixes(self):
         return self.sub_directories
+
+
+class MockTransferManager:
+    @staticmethod
+    def download_chunks_concurrently(
+        blob,
+        filename,
+        chunk_size=32 * 1024 * 1024,
+        download_kwargs=None,
+        deadline=None,
+        worker_type="process",
+        max_workers=8,
+        *,
+        crc32c_checksum=True,
+    ):
+        blob.download_to_filename(filename)
diff --git a/tests/test_gs_specific.py b/tests/test_gs_specific.py
@@ -13,3 +13,13 @@ def test_gspath_properties(path_class):
     p2 = path_class("gs://mybucket/")
     assert p2.blob == ""
     assert p2.bucket == "mybucket"
+
+
+@pytest.mark.parametrize("worker_type", ["process", "thread"])
+def test_concurrent_download(gs_rig, tmp_path, worker_type):
+    client = gs_rig.client_class(download_chunks_concurrently_kwargs={"worker_type": worker_type})
+    p = gs_rig.create_cloud_path("dir_0/file0_0.txt", client=client)
+    dl_dir = tmp_path
+    assert not (dl_dir / p.name).exists()
+    p.download_to(dl_dir)
+    assert (dl_dir / p.name).is_file()