diff --git a/pueblo/io/universal.py b/pueblo/io/universal.py index c9d3f6d..922003a 100644 --- a/pueblo/io/universal.py +++ b/pueblo/io/universal.py @@ -41,9 +41,16 @@ def open_url(url: str) -> PathPlus: ----------- fs = Path("github://path/to/document.md", username="foobar", token="ghp_lalala", org="acme", repo="sweet-camino") """ - uri = URL(url) + uri = None + try: + uri = URL(url) + except ValueError as ex: + if "host is required for absolute urls" in str(ex): + pass + else: + raise - if uri.scheme.startswith("github+https"): + if uri and uri.scheme.startswith("github+https"): path_fragments = uri.path.split("/")[1:] path_kwargs = { "username": uri.user, @@ -60,7 +67,10 @@ def open_url(url: str) -> PathPlus: path = PathPlus(downstream_url, **path_kwargs) else: - path = PathPlus(url) + kwargs = {} + if url.startswith("s3://"): + kwargs["anon"] = True + path = PathPlus(url, **kwargs) return path diff --git a/pyproject.toml b/pyproject.toml index f17cd4a..c75d6c9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -92,8 +92,9 @@ develop = [ "validate-pyproject<0.16", ] fileio = [ - "fsspec[adlfs,dask,gcs,git,github,http,s3,smb]<2023.11", + "fsspec[abfs,dask,gcs,git,github,http,s3,smb]<2023.11", "pathlibfs<0.6", + "python-magic<0.5", "yarl<1.10", ] nlp = [ diff --git a/tests/test_io.py b/tests/test_io.py index 258d92f..7d5747e 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -1,4 +1,8 @@ # ruff: noqa: E402 +import dataclasses +import typing as t + +import magic import pytest pytest.importorskip("pathlibfs") @@ -60,3 +64,42 @@ def test_path_without_scheme_absolute(): def test_path_without_scheme_relative(): assert path_without_scheme("/bar/baz") == PathPlus("file:///bar/baz") + + +@dataclasses.dataclass +class RemoteFile: + url: str + mimetypes: t.List[str] + + +def remote_files() -> t.List[RemoteFile]: + return [ + RemoteFile( + url="https://github.com/daq-tools/skeem/raw/main/tests/testdata/basic.ods", + mimetypes=["application/vnd.oasis.opendocument.spreadsheet"], + ), + RemoteFile( + url="github://daq-tools:skeem@/tests/testdata/basic.ods", + mimetypes=["application/vnd.oasis.opendocument.spreadsheet"], + ), + RemoteFile( + url="github+https://github.com/daq-tools/skeem/raw/main/tests/testdata/basic.ods", + mimetypes=["application/vnd.oasis.opendocument.spreadsheet"], + ), + RemoteFile( + url="gs://gcp-public-data-landsat/LC08/01/001/003/LC08_L1GT_001003_20140812_20170420_01_T2/LC08_L1GT_001003_20140812_20170420_01_T2_B3.TIF", + mimetypes=["image/tiff"], + ), + RemoteFile( + url="s3://fmi-gridded-obs-daily-1km/Netcdf/Tday/tday_2023.nc", + mimetypes=["application/x-netcdf", "application/octet-stream"], + ), + ] + + +@pytest.mark.parametrize("remote_file", remote_files(), ids=[rf.url for rf in remote_files()]) +def test_to_io_remote_files(remote_file): + with to_io(remote_file.url, mode="rb") as fp: + content = fp.read(100) + mimetype = magic.from_buffer(content, mime=True) + assert mimetype in remote_file.mimetypes