Skip to content

Commit

Permalink
Add as_url method to each CloudPath implementation with ability to …
Browse files Browse the repository at this point in the history
…generate presigned_urls (#236)

* Add ability to set endpoint_url from AWS_ENDPOINT_URL env variable. Very useful for localstack while waiting for upstream PR from boto3: boto/boto3#2746

* add test

* rm print

* lint

* initial implementation of as_url for presigned urls

* add az implementation

* fix non-presigned urls for each cloud

* fix az presigned url

* actually test and fix urls across implementations. conditional keys

* use sdks to get presigned and public urls (gs and s3)

* use sdk to get public url as well

* rm extra commented out

* add basic implementations for new url methods for local

* black format

* rm unused import

* fix sig

* add url stuff to the mock test client

* format

* fix up tests for each specific rig

* don't use a test that requires creds in s3 specific

* lint

* rm path checks that wouldn't work on windows for local mocks

* format
  • Loading branch information
Kabir Khan authored Feb 17, 2024
1 parent ea10587 commit e362366
Show file tree
Hide file tree
Showing 16 changed files with 222 additions and 4 deletions.
30 changes: 28 additions & 2 deletions cloudpathlib/azure/azblobclient.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from datetime import datetime
from datetime import datetime, timedelta
import mimetypes
import os
from pathlib import Path, PurePosixPath
Expand All @@ -14,7 +14,13 @@

try:
from azure.core.exceptions import ResourceNotFoundError
from azure.storage.blob import BlobServiceClient, BlobProperties, ContentSettings
from azure.storage.blob import (
BlobSasPermissions,
BlobServiceClient,
BlobProperties,
ContentSettings,
generate_blob_sas,
)
except ModuleNotFoundError:
implementation_registry["azure"].dependencies_loaded = False

Expand Down Expand Up @@ -271,5 +277,25 @@ def _upload_file(

return cloud_path

def _get_public_url(self, cloud_path: AzureBlobPath) -> str:
    """Return the plain (unsigned) URL of the blob that `cloud_path` points to.

    The URL is read from the SDK blob client built for the path's container and blob.
    """
    return self.service_client.get_blob_client(
        container=cloud_path.container, blob=cloud_path.blob
    ).url

def _generate_presigned_url(
    self, cloud_path: AzureBlobPath, expire_seconds: int = 60 * 60
) -> str:
    """Return a URL for `cloud_path` carrying a read-only SAS token.

    The token grants read permission on the single blob and expires `expire_seconds`
    seconds from now. It is signed with `self.service_client.credential.account_key`,
    so the service client must expose a credential that has an `account_key` attribute.
    """
    # Local import: keeps this method independent of the module-level datetime imports.
    from datetime import timezone

    # Timezone-aware "now" in UTC; `datetime.utcnow()` is deprecated (Python 3.12+) and
    # returns a naive value.
    expiry = datetime.now(timezone.utc) + timedelta(seconds=expire_seconds)
    sas_token = generate_blob_sas(
        self.service_client.account_name,
        container_name=cloud_path.container,
        blob_name=cloud_path.blob,
        account_key=self.service_client.credential.account_key,
        permission=BlobSasPermissions(read=True),
        expiry=expiry,
    )
    # The SAS token is a query string appended to the blob's plain URL.
    return f"{self._get_public_url(cloud_path)}?{sas_token}"


AzureBlobClient.AzureBlobPath = AzureBlobClient.CloudPath # type: ignore
7 changes: 7 additions & 0 deletions cloudpathlib/azure/azblobpath.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,13 @@ def touch(self, exist_ok: bool = True):

tf.cleanup()

def as_url(self, presign: bool = False, expire_seconds: int = 60 * 60):
    """Return a URL for this path.

    With `presign=False` the client's public URL is returned; otherwise the client
    generates a presigned URL that is valid for `expire_seconds` seconds.
    """
    if not presign:
        return self.client._get_public_url(self)
    return self.client._generate_presigned_url(self, expire_seconds=expire_seconds)

def stat(self):
try:
meta = self.client._get_metadata(self)
Expand Down
10 changes: 10 additions & 0 deletions cloudpathlib/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,3 +175,13 @@ def _upload_file(
self, local_path: Union[str, os.PathLike], cloud_path: BoundedCloudPath
) -> BoundedCloudPath:
pass

@abc.abstractmethod
def _get_public_url(self, cloud_path: BoundedCloudPath) -> str:
    """Return the public (unsigned) URL for `cloud_path`.

    Abstract: every concrete client must provide its own implementation.
    """

Check warning on line 181 in cloudpathlib/client.py

View check run for this annotation

Codecov / codecov/patch

cloudpathlib/client.py#L181

Added line #L181 was not covered by tests

@abc.abstractmethod
def _generate_presigned_url(
    self, cloud_path: BoundedCloudPath, expire_seconds: int = 60 * 60
) -> str:
    """Return a presigned URL for `cloud_path` that is valid for `expire_seconds` seconds.

    Abstract: every concrete client must provide its own implementation.
    """

Check warning on line 187 in cloudpathlib/client.py

View check run for this annotation

Codecov / codecov/patch

cloudpathlib/client.py#L187

Added line #L187 was not covered by tests
6 changes: 6 additions & 0 deletions cloudpathlib/cloudpath.py
Original file line number Diff line number Diff line change
Expand Up @@ -383,6 +383,12 @@ def touch(self, exist_ok: bool = True) -> None:
"""Should be implemented using the client API to create and update modified time"""
pass

@abc.abstractmethod
def as_url(self, presign: bool = False, expire_seconds: int = 60 * 60) -> str:
    """Should be implemented using the client API.

    Implementations return the public URL for the path when `presign` is False, and a
    presigned URL that stays valid for `expire_seconds` seconds when `presign` is True.
    """

Check warning on line 390 in cloudpathlib/cloudpath.py

View check run for this annotation

Codecov / codecov/patch

cloudpathlib/cloudpath.py#L390

Added line #L390 was not covered by tests

# ====================== IMPLEMENTED FROM SCRATCH ======================
# Methods with their own implementations that work generically
def __rtruediv__(self, other: Any) -> None:
Expand Down
15 changes: 14 additions & 1 deletion cloudpathlib/gs/gsclient.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from datetime import datetime
from datetime import datetime, timedelta
import mimetypes
import os
from pathlib import Path, PurePosixPath
Expand Down Expand Up @@ -271,5 +271,18 @@ def _upload_file(self, local_path: Union[str, os.PathLike], cloud_path: GSPath)
blob.upload_from_filename(str(local_path), **extra_args)
return cloud_path

def _get_public_url(self, cloud_path: GSPath) -> str:
    """Return the public (unsigned) URL of the blob that `cloud_path` points to."""
    # `Client.bucket` only builds a local bucket handle. `Client.get_bucket` would first
    # send a bucket-metadata request, which costs a network round trip and fails for
    # credentials without bucket-level read permission, although formatting the URL
    # needs neither.
    bucket = self.client.bucket(cloud_path.bucket)
    return bucket.blob(cloud_path.blob).public_url

def _generate_presigned_url(self, cloud_path: GSPath, expire_seconds: int = 60 * 60) -> str:
    """Return a V4 signed GET URL for `cloud_path`, valid for `expire_seconds` seconds."""
    # `Client.bucket` builds the bucket handle without an API call; `Client.get_bucket`
    # would issue a bucket-metadata request that signing does not need and that fails
    # for credentials limited to object-level access.
    bucket = self.client.bucket(cloud_path.bucket)
    blob = bucket.blob(cloud_path.blob)
    return blob.generate_signed_url(
        version="v4", expiration=timedelta(seconds=expire_seconds), method="GET"
    )


GSClient.GSPath = GSClient.CloudPath # type: ignore
7 changes: 7 additions & 0 deletions cloudpathlib/gs/gspath.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,13 @@ def touch(self, exist_ok: bool = True):

tf.cleanup()

def as_url(self, presign: bool = False, expire_seconds: int = 60 * 60):
    """Return a URL for this path.

    A presigned URL valid for `expire_seconds` seconds is generated by the client when
    `presign` is True; otherwise the client's public URL is returned.
    """
    if presign:
        return self.client._generate_presigned_url(self, expire_seconds=expire_seconds)
    return self.client._get_public_url(self)

def stat(self):
meta = self.client._get_metadata(self)
if meta is None:
Expand Down
8 changes: 8 additions & 0 deletions cloudpathlib/local/localclient.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,14 @@ def _get_metadata(self, cloud_path: "LocalPath") -> Dict:
"content_type": content_type_method(str(self._cloud_path_to_local(cloud_path)))[0],
}

def _get_public_url(self, cloud_path: "LocalPath") -> str:
return cloud_path.as_uri()

Check warning on line 160 in cloudpathlib/local/localclient.py

View check run for this annotation

Codecov / codecov/patch

cloudpathlib/local/localclient.py#L160

Added line #L160 was not covered by tests

def _generate_presigned_url(
self, cloud_path: "LocalPath", expire_seconds: int = 60 * 60
) -> str:
raise NotImplementedError("Cannot generate a presigned URL for a local path.")

Check warning on line 165 in cloudpathlib/local/localclient.py

View check run for this annotation

Codecov / codecov/patch

cloudpathlib/local/localclient.py#L165

Added line #L165 was not covered by tests


_temp_dirs_to_clean: List[TemporaryDirectory] = []

Expand Down
3 changes: 3 additions & 0 deletions cloudpathlib/local/localpath.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,6 @@ def stat(self):

def touch(self, exist_ok: bool = True):
self.client._touch(self, exist_ok)

def as_url(self, presign: bool = False, expire_seconds: int = 60 * 60):
    """Return `self.as_uri()` as the URL for this local path.

    Both `presign` and `expire_seconds` are accepted for signature compatibility but are
    not used: the same URI is returned whether or not a presigned URL was requested.
    """
    uri = self.as_uri()
    return uri

Check warning on line 35 in cloudpathlib/local/localpath.py

View check run for this annotation

Codecov / codecov/patch

cloudpathlib/local/localpath.py#L35

Added line #L35 was not covered by tests
26 changes: 26 additions & 0 deletions cloudpathlib/s3/s3client.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@ def __init__(
for k in ["RequestPayer", "ExpectedBucketOwner"]
if k in self._extra_args
}
self._endpoint_url = endpoint_url

super().__init__(
local_cache_dir=local_cache_dir,
Expand Down Expand Up @@ -349,5 +350,30 @@ def _upload_file(self, local_path: Union[str, os.PathLike], cloud_path: S3Path)
obj.upload_file(str(local_path), Config=self.boto3_transfer_config, ExtraArgs=extra_args)
return cloud_path

def _get_public_url(self, cloud_path: S3Path) -> str:
    """Return the unsigned URL of the object that `cloud_path` points to.

    The URL is produced by asking a temporary client, configured with
    `botocore.UNSIGNED` and this client's endpoint URL, to "presign" a `get_object`
    request; without a signature the result is the plain object URL.
    See: https://stackoverflow.com/a/48197877
    """
    unsigned_client = self.sess.client(
        "s3",
        endpoint_url=self._endpoint_url,
        config=Config(signature_version=botocore.UNSIGNED),
    )
    request_params = {"Bucket": cloud_path.bucket, "Key": cloud_path.key}
    url: str = unsigned_client.generate_presigned_url(
        "get_object", Params=request_params, ExpiresIn=0
    )
    return url

def _generate_presigned_url(self, cloud_path: S3Path, expire_seconds: int = 60 * 60) -> str:
    """Return a presigned `get_object` URL for `cloud_path`.

    The URL is generated by `self.client` and is valid for `expire_seconds` seconds.
    """
    request_params = {"Bucket": cloud_path.bucket, "Key": cloud_path.key}
    url: str = self.client.generate_presigned_url(
        "get_object", Params=request_params, ExpiresIn=expire_seconds
    )
    return url


S3Client.S3Path = S3Client.CloudPath # type: ignore
7 changes: 7 additions & 0 deletions cloudpathlib/s3/s3path.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,13 @@ def touch(self, exist_ok: bool = True):

tf.cleanup()

def as_url(self, presign: bool = False, expire_seconds: int = 60 * 60):
    """Return a URL for this path.

    When `presign` is True the client generates a presigned URL valid for
    `expire_seconds` seconds; otherwise the client's public URL is returned.
    """
    client = self.client
    return (
        client._generate_presigned_url(self, expire_seconds=expire_seconds)
        if presign
        else client._get_public_url(self)
    )

def stat(self):
try:
meta = self.client._get_metadata(self)
Expand Down
18 changes: 18 additions & 0 deletions tests/mock_clients/mock_azureblob.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@


from azure.storage.blob import BlobProperties
from azure.storage.blob._shared.authentication import SharedKeyCredentialPolicy
from azure.core.exceptions import ResourceNotFoundError

from .utils import delete_empty_parents_up_to_root
Expand All @@ -30,6 +31,23 @@ def __init__(self, *args, **kwargs):
def from_connection_string(cls, *args, **kwargs):
return cls()

@property
def account_name(self) -> str:
    """Account name reported by the mock: the well-known Azurite development account.

    See: https://learn.microsoft.com/en-us/azure/storage/common/storage-use-azurite?tabs=visual-studio%2Cblob-storage#well-known-storage-account-and-key
    """
    azurite_account_name = "devstoreaccount1"
    return azurite_account_name

@property
def credential(self):
    """Shared-key credential policy built from the well-known Azurite account and key.

    See: https://learn.microsoft.com/en-us/azure/storage/common/storage-use-azurite?tabs=visual-studio%2Cblob-storage#well-known-storage-account-and-key
    """
    azurite_account_key = (
        "Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw=="
    )
    return SharedKeyCredentialPolicy(self.account_name, azurite_account_key)

def __del__(self):
self.tmp.cleanup()

Expand Down
12 changes: 11 additions & 1 deletion tests/mock_clients/mock_gs.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from datetime import datetime
from datetime import datetime, timedelta
from pathlib import Path, PurePosixPath
import shutil
from tempfile import TemporaryDirectory
Expand Down Expand Up @@ -38,6 +38,9 @@ def bucket(self, bucket):
def list_buckets(self):
return [DEFAULT_GS_BUCKET_NAME]

def get_bucket(self, bucket):
    """Return a `MockBucket` for `bucket`, rooted at `self.tmp_path` and bound to this client."""
    mock_bucket = MockBucket(self.tmp_path, bucket, client=self)
    return mock_bucket

return MockClient


Expand Down Expand Up @@ -106,6 +109,13 @@ def updated(self):
def content_type(self):
return self.client.metadata_cache.get(self.bucket / self.name, None)

@property
def public_url(self) -> str:
    """Fake public URL: the storage host followed directly by `self.bucket` and `self.name`."""
    # No separator is inserted between the host and `self.bucket`.
    location = f"{self.bucket}/{self.name}"
    return "https://storage.googleapis.com" + location

def generate_signed_url(self, version: str, expiration: timedelta, method: str):
    """Return a fake V4 signed URL for this blob.

    `X-Goog-Expires` reflects the requested `expiration` (in whole seconds) so that
    tests can verify the expiry is passed through; every other query value is a fixed
    placeholder. `version` and `method` are accepted but not used.
    """
    expires_in = int(expiration.total_seconds())
    return (
        f"https://storage.googleapis.com{self.bucket}/{self.name}"
        "?X-Goog-Algorithm=GOOG4-RSA-SHA256"
        "&X-Goog-Credential=TEST"
        "&X-Goog-Date=20240131T185515Z"
        f"&X-Goog-Expires={expires_in}"
        "&X-Goog-SignedHeaders=host"
        "&X-Goog-Signature=TEST"
    )


class MockBucket:
def __init__(self, name, bucket_name, client=None):
Expand Down
4 changes: 4 additions & 0 deletions tests/mock_clients/mock_s3.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,10 @@ def head_object(self, Bucket, Key, **kwargs):
else:
return {"key": Key}

def generate_presigned_url(self, op: str, Params: dict, ExpiresIn: int):
    """Return a fake presigned URL for the bucket and key given in `Params`.

    `X-Amz-Expires` reflects the requested `ExpiresIn` so that tests can verify the
    expiry is passed through; every other query value is a fixed placeholder. `op` is
    accepted but not used.
    """
    return (
        f"https://{Params['Bucket']}.s3.amazonaws.com/{Params['Key']}"
        "?X-Amz-Algorithm=AWS4-HMAC-SHA256"
        "&X-Amz-Credential=TEST%2FTEST%2Fus-east-1%2Fs3%2Faws4_request"
        "&X-Amz-Date=20240131T194721Z"
        f"&X-Amz-Expires={ExpiresIn}"
        "&X-Amz-SignedHeaders=host"
        "&X-Amz-Signature=TEST"
    )

@property
def exceptions(self):
Ex = collections.namedtuple("Ex", "NoSuchKey")
Expand Down
19 changes: 19 additions & 0 deletions tests/test_azure_specific.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import pytest

from urllib.parse import urlparse, parse_qs
from cloudpathlib import AzureBlobClient, AzureBlobPath
from cloudpathlib.exceptions import MissingCredentialsError
from cloudpathlib.local import LocalAzureBlobClient, LocalAzureBlobPath
Expand All @@ -26,3 +27,21 @@ def test_azureblobpath_nocreds(client_class, monkeypatch):
monkeypatch.delenv("AZURE_STORAGE_CONNECTION_STRING", raising=False)
with pytest.raises(MissingCredentialsError):
client_class()


def test_as_url(azure_rig):
    """Public and presigned URLs for an Azure blob path end in the file name, and the
    presigned URL carries the expected SAS query parameters."""
    p: AzureBlobPath = azure_rig.create_cloud_path("dir_0/file0_0.txt")

    public_parts = urlparse(str(p.as_url()))
    assert public_parts.path.endswith("file0_0.txt")

    parts = urlparse(p.as_url(presign=True))
    query_params = parse_qs(parts.query)
    assert parts.path.endswith("file0_0.txt")
    # Query keys the SAS token is expected to contribute.
    for sas_key in ("se", "sp", "sr", "sig"):
        assert sas_key in query_params
23 changes: 23 additions & 0 deletions tests/test_gs_specific.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import pytest

from urllib.parse import urlparse, parse_qs
from cloudpathlib import GSPath
from cloudpathlib.local import LocalGSPath

Expand All @@ -23,3 +24,25 @@ def test_concurrent_download(gs_rig, tmp_path, worker_type):
assert not (dl_dir / p.name).exists()
p.download_to(dl_dir)
assert (dl_dir / p.name).is_file()


def test_as_url(gs_rig):
    """Public and signed URLs for a GS path point at the file, and the signed URL carries
    the requested expiry plus the remaining V4 signing parameters."""
    p: GSPath = gs_rig.create_cloud_path("dir_0/file0_0.txt")

    public_url_parts = urlparse(p.as_url())
    host = public_url_parts.hostname
    assert host and host.startswith("storage.googleapis.com")
    assert public_url_parts.path.endswith("file0_0.txt")

    expire_seconds = 3600
    parts = urlparse(p.as_url(presign=True, expire_seconds=expire_seconds))
    query_params = parse_qs(parts.query)
    assert parts.path.endswith("file0_0.txt")
    assert query_params["X-Goog-Expires"] == [str(expire_seconds)]
    for signing_key in (
        "X-Goog-Algorithm",
        "X-Goog-Credential",
        "X-Goog-Date",
        "X-Goog-SignedHeaders",
        "X-Goog-Signature",
    ):
        assert signing_key in query_params
31 changes: 31 additions & 0 deletions tests/test_s3_specific.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from itertools import islice
from time import sleep

from urllib.parse import urlparse, parse_qs
import pytest

from boto3.s3.transfer import TransferConfig
Expand Down Expand Up @@ -247,3 +248,33 @@ def test_aws_endpoint_url_env(monkeypatch):
monkeypatch.setenv("AWS_ENDPOINT_URL", localstack_url)
s3_client_custom_endpoint = S3Client()
assert s3_client_custom_endpoint.client.meta.endpoint_url == localstack_url


def test_as_url_local(monkeypatch):
    """The unsigned URL uses the default S3 host, or the endpoint from `AWS_ENDPOINT_URL`
    when a client is created after that variable is set."""
    # Default client, created before the endpoint override is in place.
    assert S3Path("s3://arxiv/pdf").as_url() == "https://arxiv.s3.amazonaws.com/pdf"

    localstack_url = "http://localhost:4566"
    monkeypatch.setenv("AWS_ENDPOINT_URL", localstack_url)
    # This client is constructed only after the environment variable has been set.
    custom_endpoint_path = S3Path("s3://arxiv/pdf", client=S3Client())
    assert custom_endpoint_path.as_url() == f"{localstack_url}/arxiv/pdf"


def test_as_url_presign(s3_rig):
    """A presigned S3 URL points at the file and carries the requested expiry plus the
    remaining SigV4 query parameters."""
    p: S3Path = s3_rig.create_cloud_path("dir_0/file0_0.txt")
    expire_seconds = 3600

    parts = urlparse(p.as_url(presign=True, expire_seconds=expire_seconds))
    query_params = parse_qs(parts.query)

    assert parts.path.endswith("file0_0.txt")
    assert query_params["X-Amz-Expires"] == [str(expire_seconds)]
    for signing_key in (
        "X-Amz-Algorithm",
        "X-Amz-Credential",
        "X-Amz-Date",
        "X-Amz-SignedHeaders",
        "X-Amz-Signature",
    ):
        assert signing_key in query_params

0 comments on commit e362366

Please sign in to comment.