From 4c6cb63b827fd72571f679d5bb15106d483f08c1 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Fri, 29 Mar 2024 14:23:10 -0600 Subject: [PATCH 01/30] adding reader_options kwargs to open_virtual_dataset --- virtualizarr/kerchunk.py | 17 +++++++++++------ virtualizarr/tests/test_xarray.py | 10 ++++++++++ virtualizarr/xarray.py | 5 +++++ 3 files changed, 26 insertions(+), 6 deletions(-) diff --git a/virtualizarr/kerchunk.py b/virtualizarr/kerchunk.py index 935cefe9..fd183261 100644 --- a/virtualizarr/kerchunk.py +++ b/virtualizarr/kerchunk.py @@ -20,7 +20,10 @@ def read_kerchunk_references_from_file( - filepath: str, filetype: Optional[str] + filepath: str, filetype: Optional[str], + reader_options: Optional[dict] = {'storage_options': {'anon': True}} + + ) -> KerchunkStoreRefs: """ Read a single legacy file and return kerchunk references to its contents. @@ -32,6 +35,9 @@ def read_kerchunk_references_from_file( filetype : str, default: None Type of file to be opened. Used to determine which kerchunk file format backend to use. If not provided will attempt to automatically infer the correct filetype from the the filepath's extension. + reader_options: dict, default {'storage_options': {'anon': True}} + Dict passed into Kerchunk file readers. Note: Each Kerchunk file reader has distinct arguments, + so ensure reader_options match selected Kerchunk reader arguments. """ if filetype is None: @@ -39,12 +45,11 @@ def read_kerchunk_references_from_file( if filetype.lower() == "netcdf3": from kerchunk.netCDF3 import NetCDF3ToZarr - refs = NetCDF3ToZarr(filepath).translate() + refs = NetCDF3ToZarr(filepath, **reader_options).translate() elif filetype.lower() == "netcdf4": from kerchunk.hdf import SingleHdf5ToZarr - - refs = SingleHdf5ToZarr(filepath).translate() + refs = SingleHdf5ToZarr(filepath, **reader_options).translate() elif filetype == "grib": # TODO Grib files should be handled as a DataTree object # see https://github.com/TomNicholas/VirtualiZarr/issues/11 @@ -52,11 +57,11 @@ def read_kerchunk_references_from_file( elif filetype.lower() == "tiff": from kerchunk.tiff import tiff_to_zarr - refs = tiff_to_zarr(filepath) + refs = tiff_to_zarr(filepath, **reader_options) elif filetype.lower() == "fits": from kerchunk.fits import process_file - refs = process_file(filepath) + refs = process_file(filepath, **reader_options) else: raise NotImplementedError(f"Unsupported file type: {filetype}") diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py index db406ada..a99f1934 100644 --- a/virtualizarr/tests/test_xarray.py +++ b/virtualizarr/tests/test_xarray.py @@ -3,6 +3,7 @@ import numpy as np import xarray as xr from xarray.core.indexes import Index +import pytest from virtualizarr import open_virtual_dataset from virtualizarr.manifests import ChunkManifest, ManifestArray @@ -268,3 +269,12 @@ def test_combine_by_coords(self, netcdf4_files): ) assert combined_vds.xindexes["time"].to_pandas_index().is_monotonic_increasing + + +pytest.importorskip("s3fs") +@pytest.mark.xfail(reason="currently should xfail for None filetype and None indexes.",run=False) +@pytest.mark.parametrize("filetype", ['netcdf4', None], ids=["netcdf4 filetype", "None filetype"]) +@pytest.mark.parametrize("indexes", [None, {}], ids=["None index", "empty dict index"]) +def test_anon_read_s3(filetype, indexes): + fpath = 's3://nex-gddp-cmip6/NEX-GDDP-CMIP6/CESM2/historical/r4i1p1f1/pr/pr_day_CESM2_historical_r4i1p1f1_gn_2010.nc' + assert open_virtual_dataset(fpath,filetype=filetype,indexes=indexes,reader_options={'storage_options': {'anon': True}}) diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index fe48ad32..479f2246 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -23,6 +23,7 @@ def open_virtual_dataset( drop_variables: Optional[List[str]] = None, indexes: Optional[Mapping[str, Index]] = None, virtual_array_class=ManifestArray, + reader_options: Optional[dict] = {'storage_options': {'anon': True}} ) -> xr.Dataset: """ Open a file or store as an xarray Dataset wrapping virtualized zarr arrays. @@ -48,12 +49,16 @@ def open_virtual_dataset( virtual_array_class Virtual array class to use to represent the references to the chunks in each on-disk array. Currently can only be ManifestArray, but once VirtualZarrArray is implemented the default should be changed to that. + reader_options: dict, default {'storage_options': {'anon': True}} + Dict passed into Kerchunk file readers. Note: Each Kerchunk file reader has distinct arguments, + so ensure reader_options match selected Kerchunk reader arguments. """ # this is the only place we actually always need to use kerchunk directly vds_refs = kerchunk.read_kerchunk_references_from_file( filepath=filepath, filetype=filetype, + reader_options=reader_options, ) if indexes is None: From ba5ac6d7a88d45e7176aabaddccbfd2b1d64a1d0 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 30 Apr 2024 19:04:50 +0000 Subject: [PATCH 02/30] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- virtualizarr/kerchunk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virtualizarr/kerchunk.py b/virtualizarr/kerchunk.py index 2ad3fb1e..842deb7e 100644 --- a/virtualizarr/kerchunk.py +++ b/virtualizarr/kerchunk.py @@ -61,7 +61,7 @@ def read_kerchunk_references_from_file( if filetype.name.lower() == "netcdf3": from kerchunk.netCDF3 import NetCDF3ToZarr - + refs = NetCDF3ToZarr(filepath, inline_threshold=0, **reader_options).translate() elif filetype.name.lower() == "netcdf4": from kerchunk.hdf import SingleHdf5ToZarr From ea3091423c91aed53c038b87ada1cc584a9de9f7 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Tue, 30 Apr 2024 13:06:58 -0600 Subject: [PATCH 03/30] fix typing --- virtualizarr/kerchunk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virtualizarr/kerchunk.py b/virtualizarr/kerchunk.py index 842deb7e..ead66a5b 100644 --- a/virtualizarr/kerchunk.py +++ b/virtualizarr/kerchunk.py @@ -75,7 +75,7 @@ def read_kerchunk_references_from_file( from kerchunk.tiff import tiff_to_zarr refs = tiff_to_zarr(filepath, inline_threshold=0, **reader_options) - elif filetype.lower() == "fits": + elif filetype.name.lower() == "fits": from kerchunk.fits import process_file refs = process_file(filepath, inline_threshold=0, **reader_options) From 448800bcb9ad2ff44fae9d37c01d7084d7fe940c Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Wed, 1 May 2024 16:47:27 -0600 Subject: [PATCH 04/30] modifies _automatically_determine_filetype to open file with fsspec to allows for reading of cloud storage --- virtualizarr/kerchunk.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/virtualizarr/kerchunk.py b/virtualizarr/kerchunk.py index ead66a5b..5223b9b5 100644 --- a/virtualizarr/kerchunk.py +++ b/virtualizarr/kerchunk.py @@ -54,7 +54,7 @@ def read_kerchunk_references_from_file( """ if filetype is None: - filetype = _automatically_determine_filetype(filepath) + filetype = _automatically_determine_filetype(filepath, reader_options) # if filetype is user defined, convert to FileType filetype = FileType(filetype) @@ -86,20 +86,23 @@ def read_kerchunk_references_from_file( return refs -def _automatically_determine_filetype(filepath: str) -> FileType: +def _automatically_determine_filetype(filepath: str, reader_options: Optional[dict] = {'storage_options': {'anon': True}}) -> FileType: file_extension = Path(filepath).suffix + # opens file with fsspec - can be on cloud storage or local + import fsspec + fsspec_of = fsspec.open(filepath, mode="rb",**reader_options).open() + if file_extension == ".nc": # based off of: https://github.com/TomNicholas/VirtualiZarr/pull/43#discussion_r1543415167 - with open(filepath, 'rb') as f: - magic = f.read() + magic = fsspec_of.read() + if magic[0:3] == b"CDF": filetype = FileType.netcdf3 elif magic[1:4] == b"HDF": filetype = FileType.netcdf4 else: raise ValueError(".nc file does not appear to be NETCDF3 OR NETCDF4") - elif file_extension == ".zarr": # TODO we could imagine opening an existing zarr store, concatenating it, and writing a new virtual one... raise NotImplementedError() @@ -112,6 +115,7 @@ def _automatically_determine_filetype(filepath: str) -> FileType: else: raise NotImplementedError(f"Unrecognised file extension: {file_extension}") + fsspec_of.close() return filetype From 8c5dff7e1a77ca0cec833c2660a0b16d7e57100c Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Wed, 1 May 2024 17:42:07 -0600 Subject: [PATCH 05/30] using UPath to get file protocol and open with fsspec --- pyproject.toml | 1 + virtualizarr/kerchunk.py | 27 +++++++++++++++++++-------- virtualizarr/xarray.py | 10 +++++++--- 3 files changed, 27 insertions(+), 11 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 7bde54f5..fdf81371 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,6 +23,7 @@ dynamic = ["version"] dependencies = [ "xarray@git+https://github.com/TomNicholas/xarray.git@concat-avoid-index-auto-creation#egg=xarray", "kerchunk==0.2.2", + "universal-pathlib", "h5netcdf", "pydantic", "numpy", diff --git a/virtualizarr/kerchunk.py b/virtualizarr/kerchunk.py index 5223b9b5..09f49b5e 100644 --- a/virtualizarr/kerchunk.py +++ b/virtualizarr/kerchunk.py @@ -36,7 +36,7 @@ class FileType(AutoName): def read_kerchunk_references_from_file( filepath: str, filetype: Optional[FileType], - reader_options: Optional[dict] = {'storage_options': {'anon': True}} + reader_options: Optional[dict] = {} ) -> KerchunkStoreRefs: """ Read a single legacy file and return kerchunk references to its contents. @@ -48,8 +48,8 @@ def read_kerchunk_references_from_file( filetype : FileType, default: None Type of file to be opened. Used to determine which kerchunk file format backend to use. If not provided will attempt to automatically infer the correct filetype from the the filepath's extension. - reader_options: dict, default {'storage_options': {'anon': True}} - Dict passed into Kerchunk file readers. Note: Each Kerchunk file reader has distinct arguments, + reader_options: dict, default {} + Dict passed into Kerchunk file readers. ex: {'storage_options': {'anon': True}}. Note: Each Kerchunk file reader has distinct arguments, so ensure reader_options match selected Kerchunk reader arguments. """ @@ -86,16 +86,27 @@ def read_kerchunk_references_from_file( return refs -def _automatically_determine_filetype(filepath: str, reader_options: Optional[dict] = {'storage_options': {'anon': True}}) -> FileType: +def _automatically_determine_filetype(filepath: str, reader_options: Optional[dict]={}) -> FileType: file_extension = Path(filepath).suffix # opens file with fsspec - can be on cloud storage or local - import fsspec - fsspec_of = fsspec.open(filepath, mode="rb",**reader_options).open() + from upath import UPath + universal_filepath = UPath(filepath) + protocol = universal_filepath.protocol + + # why does UPath give an empty string for a local file protocol :( + if protocol == '': + fpath = open(filepath, 'rb') + + elif protocol in ["s3"]: + import fsspec + fpath = fsspec.filesystem(protocol).open(filepath, **reader_options) + else: + raise NotImplementedError("Only local and s3 file protocols are currently supported") if file_extension == ".nc": # based off of: https://github.com/TomNicholas/VirtualiZarr/pull/43#discussion_r1543415167 - magic = fsspec_of.read() + magic = fpath.read() if magic[0:3] == b"CDF": filetype = FileType.netcdf3 @@ -115,7 +126,7 @@ def _automatically_determine_filetype(filepath: str, reader_options: Optional[di else: raise NotImplementedError(f"Unrecognised file extension: {file_extension}") - fsspec_of.close() + fpath.close() return filetype diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index af1f9c34..b449b6e5 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -25,7 +25,7 @@ def open_virtual_dataset( loadable_variables: Optional[Iterable[str]] = None, indexes: Optional[Mapping[str, Index]] = None, virtual_array_class=ManifestArray, - reader_options: Optional[dict] = {'storage_options': {'anon': True}} + reader_options: Optional[dict] = {}, ) -> xr.Dataset: """ Open a file or store as an xarray Dataset wrapping virtualized zarr arrays. @@ -54,7 +54,7 @@ def open_virtual_dataset( virtual_array_class Virtual array class to use to represent the references to the chunks in each on-disk array. Currently can only be ManifestArray, but once VirtualZarrArray is implemented the default should be changed to that. - reader_options: dict, default {'storage_options': {'anon': True}} + reader_options: dict, default {} Dict passed into Kerchunk file readers. Note: Each Kerchunk file reader has distinct arguments, so ensure reader_options match selected Kerchunk reader arguments. @@ -64,6 +64,7 @@ def open_virtual_dataset( An xarray Dataset containing instances of virtual_array_cls for each variable, or normal lazily indexed arrays for each variable in loadable_variables. """ + if drop_variables is None: drop_variables = [] elif isinstance(drop_variables, str): @@ -98,7 +99,10 @@ def open_virtual_dataset( # TODO we are reading a bunch of stuff we know we won't need here, e.g. all of the data variables... # TODO it would also be nice if we could somehow consolidate this with the reading of the kerchunk references # TODO really we probably want a dedicated xarray backend that iterates over all variables only once - ds = xr.open_dataset(filepath, drop_variables=drop_variables) + import fsspec + + fsspec_of = fsspec.open(filepath, mode="rb",**reader_options).open() + ds = xr.open_dataset(fsspec_of, drop_variables=drop_variables) if indexes is None: # add default indexes by reading data from file From 6cd77ce869df8b5c49bb4f169c6f4d932065958b Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Thu, 2 May 2024 15:40:23 -0600 Subject: [PATCH 06/30] tests passing locally. Reading over s3/local w+w/o indexes & guessing filetypes --- virtualizarr/kerchunk.py | 21 ++++------------- virtualizarr/tests/test_kerchunk.py | 4 ++-- virtualizarr/tests/test_xarray.py | 5 +++-- virtualizarr/utils.py | 35 +++++++++++++++++++++++++++++ virtualizarr/xarray.py | 10 +++++---- 5 files changed, 50 insertions(+), 25 deletions(-) create mode 100644 virtualizarr/utils.py diff --git a/virtualizarr/kerchunk.py b/virtualizarr/kerchunk.py index 09f49b5e..b32af248 100644 --- a/virtualizarr/kerchunk.py +++ b/virtualizarr/kerchunk.py @@ -5,6 +5,7 @@ import xarray as xr from virtualizarr.zarr import ZArray, ZAttrs +from virtualizarr.utils import _fsspec_openfile_from_filepath # Distinguishing these via type hints makes it a lot easier to mentally keep track of what the opaque kerchunk "reference dicts" actually mean # (idea from https://kobzol.github.io/rust/python/2023/05/20/writing-python-like-its-rust.html) @@ -54,7 +55,7 @@ def read_kerchunk_references_from_file( """ if filetype is None: - filetype = _automatically_determine_filetype(filepath, reader_options) + filetype = _automatically_determine_filetype(filepath=filepath, reader_options=reader_options) # if filetype is user defined, convert to FileType filetype = FileType(filetype) @@ -86,23 +87,9 @@ def read_kerchunk_references_from_file( return refs -def _automatically_determine_filetype(filepath: str, reader_options: Optional[dict]={}) -> FileType: +def _automatically_determine_filetype(*,filepath: str, reader_options: Optional[dict]={}) -> FileType: file_extension = Path(filepath).suffix - - # opens file with fsspec - can be on cloud storage or local - from upath import UPath - universal_filepath = UPath(filepath) - protocol = universal_filepath.protocol - - # why does UPath give an empty string for a local file protocol :( - if protocol == '': - fpath = open(filepath, 'rb') - - elif protocol in ["s3"]: - import fsspec - fpath = fsspec.filesystem(protocol).open(filepath, **reader_options) - else: - raise NotImplementedError("Only local and s3 file protocols are currently supported") + fpath = _fsspec_openfile_from_filepath(filepath=filepath,reader_options=reader_options) if file_extension == ".nc": # based off of: https://github.com/TomNicholas/VirtualiZarr/pull/43#discussion_r1543415167 diff --git a/virtualizarr/tests/test_kerchunk.py b/virtualizarr/tests/test_kerchunk.py index 7a2ffb88..e53c4a41 100644 --- a/virtualizarr/tests/test_kerchunk.py +++ b/virtualizarr/tests/test_kerchunk.py @@ -159,8 +159,8 @@ def test_automatically_determine_filetype_netcdf3_netcdf4(): ds.to_netcdf(netcdf3_file_path, engine="scipy", format="NETCDF3_CLASSIC") ds.to_netcdf(netcdf4_file_path, engine="h5netcdf") - assert FileType("netcdf3") == _automatically_determine_filetype(netcdf3_file_path) - assert FileType("netcdf4") == _automatically_determine_filetype(netcdf4_file_path) + assert FileType("netcdf3") == _automatically_determine_filetype(filepath = netcdf3_file_path) + assert FileType("netcdf4") == _automatically_determine_filetype(filepath = netcdf4_file_path) diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py index aba3b1ae..15a7b70b 100644 --- a/virtualizarr/tests/test_xarray.py +++ b/virtualizarr/tests/test_xarray.py @@ -273,11 +273,10 @@ def test_combine_by_coords(self, netcdf4_files): pytest.importorskip("s3fs") -@pytest.mark.xfail(reason="currently should xfail for None filetype and None indexes.",run=False) @pytest.mark.parametrize("filetype", ['netcdf4', None], ids=["netcdf4 filetype", "None filetype"]) @pytest.mark.parametrize("indexes", [None, {}], ids=["None index", "empty dict index"]) def test_anon_read_s3(filetype, indexes): - fpath = 's3://nex-gddp-cmip6/NEX-GDDP-CMIP6/CESM2/historical/r4i1p1f1/pr/pr_day_CESM2_historical_r4i1p1f1_gn_2010.nc' + fpath = 's3://carbonplan-share/virtualizarr/local.nc' assert open_virtual_dataset(fpath, filetype=filetype, indexes=indexes, reader_options={'storage_options': {'anon': True}}) @@ -293,6 +292,8 @@ def test_loadable_variables(self, netcdf4_file): assert isinstance(vds[name].data, ManifestArray), name full_ds = xr.open_dataset(netcdf4_file) + for name in full_ds.variables: + if name in vars_to_load: xrt.assert_identical(vds.variables[name], full_ds.variables[name]) diff --git a/virtualizarr/utils.py b/virtualizarr/utils.py new file mode 100644 index 00000000..380dd3c2 --- /dev/null +++ b/virtualizarr/utils.py @@ -0,0 +1,35 @@ +from typing import Optional, Union + +# TODO: importing fsspec and s3fs to get typing. Is there a better way incase these are optional deps? +from s3fs.core import S3File +from fsspec.implementations.local import LocalFileOpener + + +def _fsspec_openfile_from_filepath(*, filepath: str, reader_options: Optional[dict]) -> Union[S3File, LocalFileOpener]: + """Utility function to facilitate reading remote file paths using fsspec. + + :param filepath: Input filepath + :type filepath: str + :param reader_options: Dict containing options to pass to fsspec file reader + :type reader_options: Optional[dict] + :rtype: Union[S3File, LocalFileOpener] + """ + import fsspec + from upath import UPath + + universal_filepath = UPath(filepath) + protocol = universal_filepath.protocol + + # why does UPath give an empty string for a local file protocol :( + if protocol == '': + + fpath = fsspec.open(filepath, 'rb').open() + + elif protocol in ["s3"]: + + fpath = fsspec.filesystem(protocol).open(filepath, **reader_options) + + else: + raise NotImplementedError("Only local and s3 file protocols are currently supported") + + return fpath diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index b449b6e5..a1235ab6 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -8,6 +8,7 @@ from xarray.core.variable import IndexVariable import virtualizarr.kerchunk as kerchunk +from virtualizarr.utils import _fsspec_openfile_from_filepath from virtualizarr.kerchunk import KerchunkStoreRefs, FileType from virtualizarr.manifests import ChunkManifest, ManifestArray @@ -99,10 +100,11 @@ def open_virtual_dataset( # TODO we are reading a bunch of stuff we know we won't need here, e.g. all of the data variables... # TODO it would also be nice if we could somehow consolidate this with the reading of the kerchunk references # TODO really we probably want a dedicated xarray backend that iterates over all variables only once - import fsspec - fsspec_of = fsspec.open(filepath, mode="rb",**reader_options).open() - ds = xr.open_dataset(fsspec_of, drop_variables=drop_variables) + + fpath = _fsspec_openfile_from_filepath(filepath=filepath,reader_options=reader_options) + + ds = xr.open_dataset(fpath, drop_variables=drop_variables) if indexes is None: # add default indexes by reading data from file @@ -118,6 +120,7 @@ def open_virtual_dataset( # if we only read the indexes we can just close the file right away as nothing is lazy if loadable_vars == {}: ds.close() + else: loadable_vars = {} indexes = {} @@ -134,7 +137,6 @@ def open_virtual_dataset( ) # TODO we should probably also use vds.set_close() to tell xarray how to close the file we opened - return vds From ed3d0f4a2bd4140ff5b37e6fd4e8dcf11a9c4253 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Thu, 2 May 2024 15:52:03 -0600 Subject: [PATCH 07/30] add s3fs to test --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index fdf81371..7115ef83 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,6 +41,7 @@ test = [ "scipy", "pooch", "ruff", + "s3sf" ] From beec724b75dd68ae8a17e783aab1188f81b7a1e9 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Thu, 2 May 2024 15:53:07 -0600 Subject: [PATCH 08/30] typing school 101 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 7115ef83..0c029e03 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,7 +41,7 @@ test = [ "scipy", "pooch", "ruff", - "s3sf" + "s3fs" ] From e6698414367aa666bf91b2b2944f663c4e363aca Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Thu, 2 May 2024 15:56:19 -0600 Subject: [PATCH 09/30] anon --- virtualizarr/tests/test_xarray.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py index 15a7b70b..c27e2d88 100644 --- a/virtualizarr/tests/test_xarray.py +++ b/virtualizarr/tests/test_xarray.py @@ -277,7 +277,7 @@ def test_combine_by_coords(self, netcdf4_files): @pytest.mark.parametrize("indexes", [None, {}], ids=["None index", "empty dict index"]) def test_anon_read_s3(filetype, indexes): fpath = 's3://carbonplan-share/virtualizarr/local.nc' - assert open_virtual_dataset(fpath, filetype=filetype, indexes=indexes, reader_options={'storage_options': {'anon': True}}) + assert open_virtual_dataset(fpath, filetype=filetype, indexes=indexes, reader_options={'storage_options': {'anon': True},'remote_options': {'anon': True}}) class TestLoadVirtualDataset: From 09f89a6549ef33a145094aa94ca1aa0b122d2e8c Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Thu, 2 May 2024 17:13:45 -0600 Subject: [PATCH 10/30] tying --- virtualizarr/utils.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/virtualizarr/utils.py b/virtualizarr/utils.py index 380dd3c2..179e5bce 100644 --- a/virtualizarr/utils.py +++ b/virtualizarr/utils.py @@ -25,9 +25,14 @@ def _fsspec_openfile_from_filepath(*, filepath: str, reader_options: Optional[di fpath = fsspec.open(filepath, 'rb').open() + elif protocol in ["s3"]: + if not bool(reader_options): + storage_options = {} # type: dict + else: + storage_options = reader_options.get('storage_options') #type: ignore - fpath = fsspec.filesystem(protocol).open(filepath, **reader_options) + fpath = fsspec.filesystem(protocol, **storage_options).open(filepath) else: raise NotImplementedError("Only local and s3 file protocols are currently supported") From e4db860d0c8bcf9a94084f010d72ca95a8b34ea3 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Thu, 2 May 2024 17:16:05 -0600 Subject: [PATCH 11/30] test_anon update --- virtualizarr/tests/test_xarray.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py index c27e2d88..15a7b70b 100644 --- a/virtualizarr/tests/test_xarray.py +++ b/virtualizarr/tests/test_xarray.py @@ -277,7 +277,7 @@ def test_combine_by_coords(self, netcdf4_files): @pytest.mark.parametrize("indexes", [None, {}], ids=["None index", "empty dict index"]) def test_anon_read_s3(filetype, indexes): fpath = 's3://carbonplan-share/virtualizarr/local.nc' - assert open_virtual_dataset(fpath, filetype=filetype, indexes=indexes, reader_options={'storage_options': {'anon': True},'remote_options': {'anon': True}}) + assert open_virtual_dataset(fpath, filetype=filetype, indexes=indexes, reader_options={'storage_options': {'anon': True}}) class TestLoadVirtualDataset: From ba8b1e3e045bd0b2892c9d5abab7a5164efe63bf Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Thu, 2 May 2024 17:21:18 -0600 Subject: [PATCH 12/30] anon failing --- virtualizarr/tests/test_xarray.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py index 15a7b70b..41079787 100644 --- a/virtualizarr/tests/test_xarray.py +++ b/virtualizarr/tests/test_xarray.py @@ -277,7 +277,7 @@ def test_combine_by_coords(self, netcdf4_files): @pytest.mark.parametrize("indexes", [None, {}], ids=["None index", "empty dict index"]) def test_anon_read_s3(filetype, indexes): fpath = 's3://carbonplan-share/virtualizarr/local.nc' - assert open_virtual_dataset(fpath, filetype=filetype, indexes=indexes, reader_options={'storage_options': {'anon': True}}) + assert open_virtual_dataset(fpath, filetype=filetype, indexes=indexes, reader_options={'storage_options': {'anon': 'true'}}) class TestLoadVirtualDataset: From b12d32c588752f91ad452d5a0810558811103aa5 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Thu, 2 May 2024 17:38:05 -0600 Subject: [PATCH 13/30] double down on storage_options --- virtualizarr/tests/test_xarray.py | 3 +-- virtualizarr/utils.py | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py index 41079787..433f2c5d 100644 --- a/virtualizarr/tests/test_xarray.py +++ b/virtualizarr/tests/test_xarray.py @@ -277,8 +277,7 @@ def test_combine_by_coords(self, netcdf4_files): @pytest.mark.parametrize("indexes", [None, {}], ids=["None index", "empty dict index"]) def test_anon_read_s3(filetype, indexes): fpath = 's3://carbonplan-share/virtualizarr/local.nc' - assert open_virtual_dataset(fpath, filetype=filetype, indexes=indexes, reader_options={'storage_options': {'anon': 'true'}}) - + assert open_virtual_dataset(fpath, filetype=filetype, indexes=indexes, reader_options={'storage_options': {'anon': True}}) class TestLoadVirtualDataset: def test_loadable_variables(self, netcdf4_file): diff --git a/virtualizarr/utils.py b/virtualizarr/utils.py index 179e5bce..fd17b51d 100644 --- a/virtualizarr/utils.py +++ b/virtualizarr/utils.py @@ -31,8 +31,7 @@ def _fsspec_openfile_from_filepath(*, filepath: str, reader_options: Optional[di storage_options = {} # type: dict else: storage_options = reader_options.get('storage_options') #type: ignore - - fpath = fsspec.filesystem(protocol, **storage_options).open(filepath) + fpath = fsspec.filesystem(protocol, **storage_options).open(filepath,**storage_options) else: raise NotImplementedError("Only local and s3 file protocols are currently supported") From f9478b958fb45b6b669e03b6c1fea46d283836f2 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Fri, 3 May 2024 08:32:21 -0600 Subject: [PATCH 14/30] fsspec nit --- virtualizarr/utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/virtualizarr/utils.py b/virtualizarr/utils.py index fd17b51d..f39a33e7 100644 --- a/virtualizarr/utils.py +++ b/virtualizarr/utils.py @@ -25,13 +25,14 @@ def _fsspec_openfile_from_filepath(*, filepath: str, reader_options: Optional[di fpath = fsspec.open(filepath, 'rb').open() - + elif protocol in ["s3"]: if not bool(reader_options): storage_options = {} # type: dict else: storage_options = reader_options.get('storage_options') #type: ignore - fpath = fsspec.filesystem(protocol, **storage_options).open(filepath,**storage_options) + target_options = {"anon":True} + fpath = fsspec.filesystem(protocol).open(filepath,target_options = {"anon":True}) else: raise NotImplementedError("Only local and s3 file protocols are currently supported") From 6958b591d70d16bf814868f53388eaae13f65221 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 3 May 2024 14:32:38 +0000 Subject: [PATCH 15/30] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- virtualizarr/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virtualizarr/utils.py b/virtualizarr/utils.py index f39a33e7..1ad935c2 100644 --- a/virtualizarr/utils.py +++ b/virtualizarr/utils.py @@ -25,7 +25,7 @@ def _fsspec_openfile_from_filepath(*, filepath: str, reader_options: Optional[di fpath = fsspec.open(filepath, 'rb').open() - + elif protocol in ["s3"]: if not bool(reader_options): storage_options = {} # type: dict From aefa22dab1fcbd4bf3d4ce0b7a796ac92de7137b Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Fri, 3 May 2024 12:35:38 -0600 Subject: [PATCH 16/30] seting s3 defaults as empty to try to appease the cruel boto3 gods --- virtualizarr/tests/test_xarray.py | 5 +++++ virtualizarr/utils.py | 13 +++++++++---- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py index 433f2c5d..8d7908d4 100644 --- a/virtualizarr/tests/test_xarray.py +++ b/virtualizarr/tests/test_xarray.py @@ -272,6 +272,11 @@ def test_combine_by_coords(self, netcdf4_files): assert combined_vds.xindexes["time"].to_pandas_index().is_monotonic_increasing + + + + + pytest.importorskip("s3fs") @pytest.mark.parametrize("filetype", ['netcdf4', None], ids=["netcdf4 filetype", "None filetype"]) @pytest.mark.parametrize("indexes", [None, {}], ids=["None index", "empty dict index"]) diff --git a/virtualizarr/utils.py b/virtualizarr/utils.py index f39a33e7..b40a726a 100644 --- a/virtualizarr/utils.py +++ b/virtualizarr/utils.py @@ -21,18 +21,23 @@ def _fsspec_openfile_from_filepath(*, filepath: str, reader_options: Optional[di protocol = universal_filepath.protocol # why does UPath give an empty string for a local file protocol :( + # import pdb; pdb.set_trace() + if protocol == '': fpath = fsspec.open(filepath, 'rb').open() - elif protocol in ["s3"]: + s3_anon_defaults = {'key':'', 'secret':'', 'anon':True} if not bool(reader_options): - storage_options = {} # type: dict + storage_options = s3_anon_defaults + else: storage_options = reader_options.get('storage_options') #type: ignore - target_options = {"anon":True} - fpath = fsspec.filesystem(protocol).open(filepath,target_options = {"anon":True}) + # using dict merge operator to add in defaults if keys are not specified + storage_options = s3_anon_defaults | storage_options + + fpath = fsspec.filesystem(protocol, **storage_options).open(filepath) else: raise NotImplementedError("Only local and s3 file protocols are currently supported") From d10897806da2e29f0e634239efc466cda859b296 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Fri, 3 May 2024 12:41:18 -0600 Subject: [PATCH 17/30] added fpath to SingleHDF5ToZarr --- virtualizarr/kerchunk.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/virtualizarr/kerchunk.py b/virtualizarr/kerchunk.py index b32af248..9e1a02d5 100644 --- a/virtualizarr/kerchunk.py +++ b/virtualizarr/kerchunk.py @@ -60,6 +60,8 @@ def read_kerchunk_references_from_file( # if filetype is user defined, convert to FileType filetype = FileType(filetype) + fpath = _fsspec_openfile_from_filepath(filepath=filepath,reader_options=reader_options) + if filetype.name.lower() == "netcdf3": from kerchunk.netCDF3 import NetCDF3ToZarr @@ -67,7 +69,7 @@ def read_kerchunk_references_from_file( elif filetype.name.lower() == "netcdf4": from kerchunk.hdf import SingleHdf5ToZarr - refs = SingleHdf5ToZarr(filepath, inline_threshold=0, **reader_options).translate() + refs = SingleHdf5ToZarr(fpath, inline_threshold=0).translate() elif filetype.name.lower() == "grib": # TODO Grib files should be handled as a DataTree object # see https://github.com/TomNicholas/VirtualiZarr/issues/11 From 5cc5ecdcacd3f632b698025156dc4bfbc0505c7d Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Fri, 3 May 2024 12:52:53 -0600 Subject: [PATCH 18/30] hardcode in empty storage opts for s3 --- virtualizarr/kerchunk.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/virtualizarr/kerchunk.py b/virtualizarr/kerchunk.py index 9e1a02d5..ab665786 100644 --- a/virtualizarr/kerchunk.py +++ b/virtualizarr/kerchunk.py @@ -60,8 +60,6 @@ def read_kerchunk_references_from_file( # if filetype is user defined, convert to FileType filetype = FileType(filetype) - fpath = _fsspec_openfile_from_filepath(filepath=filepath,reader_options=reader_options) - if filetype.name.lower() == "netcdf3": from kerchunk.netCDF3 import NetCDF3ToZarr @@ -69,7 +67,7 @@ def read_kerchunk_references_from_file( elif filetype.name.lower() == "netcdf4": from kerchunk.hdf import SingleHdf5ToZarr - refs = SingleHdf5ToZarr(fpath, inline_threshold=0).translate() + refs = SingleHdf5ToZarr(filepath, inline_threshold=0, storage_options={'key':'', 'secret':'', 'anon':True}).translate() elif filetype.name.lower() == "grib": # TODO Grib files should be handled as a DataTree object # see https://github.com/TomNicholas/VirtualiZarr/issues/11 From 3509a1f30172734adfd2dd87d1bcdeb35df69bc2 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Fri, 3 May 2024 12:58:17 -0600 Subject: [PATCH 19/30] hardcode default + unpack test --- virtualizarr/kerchunk.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/virtualizarr/kerchunk.py b/virtualizarr/kerchunk.py index ab665786..a9789fbd 100644 --- a/virtualizarr/kerchunk.py +++ b/virtualizarr/kerchunk.py @@ -66,8 +66,8 @@ def read_kerchunk_references_from_file( refs = NetCDF3ToZarr(filepath, inline_threshold=0, **reader_options).translate() elif filetype.name.lower() == "netcdf4": from kerchunk.hdf import SingleHdf5ToZarr - - refs = SingleHdf5ToZarr(filepath, inline_threshold=0, storage_options={'key':'', 'secret':'', 'anon':True}).translate() + reader_options = {'storage_options':{'key':'', 'secret':'', 'anon':True}} + refs = SingleHdf5ToZarr(filepath, inline_threshold=0, **reader_options).translate() elif filetype.name.lower() == "grib": # TODO Grib files should be handled as a DataTree object # see https://github.com/TomNicholas/VirtualiZarr/issues/11 From 80cf22b140a13195208a4aa78034bb13cc3b57d9 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Fri, 3 May 2024 13:04:08 -0600 Subject: [PATCH 20/30] changed reader_options defaults --- virtualizarr/kerchunk.py | 8 ++++---- virtualizarr/tests/test_xarray.py | 3 ++- virtualizarr/utils.py | 5 +++-- virtualizarr/xarray.py | 4 ++-- 4 files changed, 11 insertions(+), 9 deletions(-) diff --git a/virtualizarr/kerchunk.py b/virtualizarr/kerchunk.py index a9789fbd..7fa28bda 100644 --- a/virtualizarr/kerchunk.py +++ b/virtualizarr/kerchunk.py @@ -37,7 +37,8 @@ class FileType(AutoName): def read_kerchunk_references_from_file( filepath: str, filetype: Optional[FileType], - reader_options: Optional[dict] = {} + reader_options: Optional[dict] = {'storage_options':{'key':'', 'secret':'', 'anon':True}} + ) -> KerchunkStoreRefs: """ Read a single legacy file and return kerchunk references to its contents. @@ -49,8 +50,8 @@ def read_kerchunk_references_from_file( filetype : FileType, default: None Type of file to be opened. Used to determine which kerchunk file format backend to use. If not provided will attempt to automatically infer the correct filetype from the the filepath's extension. - reader_options: dict, default {} - Dict passed into Kerchunk file readers. ex: {'storage_options': {'anon': True}}. Note: Each Kerchunk file reader has distinct arguments, + reader_options: dict, default {'storage_options':{'key':'', 'secret':'', 'anon':True}} + Dict passed into Kerchunk file readers. Note: Each Kerchunk file reader has distinct arguments, so ensure reader_options match selected Kerchunk reader arguments. """ @@ -66,7 +67,6 @@ def read_kerchunk_references_from_file( refs = NetCDF3ToZarr(filepath, inline_threshold=0, **reader_options).translate() elif filetype.name.lower() == "netcdf4": from kerchunk.hdf import SingleHdf5ToZarr - reader_options = {'storage_options':{'key':'', 'secret':'', 'anon':True}} refs = SingleHdf5ToZarr(filepath, inline_threshold=0, **reader_options).translate() elif filetype.name.lower() == "grib": # TODO Grib files should be handled as a DataTree object diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py index 8d7908d4..6d4ee2d9 100644 --- a/virtualizarr/tests/test_xarray.py +++ b/virtualizarr/tests/test_xarray.py @@ -281,8 +281,9 @@ def test_combine_by_coords(self, netcdf4_files): @pytest.mark.parametrize("filetype", ['netcdf4', None], ids=["netcdf4 filetype", "None filetype"]) @pytest.mark.parametrize("indexes", [None, {}], ids=["None index", "empty dict index"]) def test_anon_read_s3(filetype, indexes): + #TODO: Switch away from this s3 url after minIO is implemented. fpath = 's3://carbonplan-share/virtualizarr/local.nc' - assert open_virtual_dataset(fpath, filetype=filetype, indexes=indexes, reader_options={'storage_options': {'anon': True}}) + assert open_virtual_dataset(fpath, filetype=filetype, indexes=indexes) class TestLoadVirtualDataset: def test_loadable_variables(self, netcdf4_file): diff --git a/virtualizarr/utils.py b/virtualizarr/utils.py index b40a726a..c0bf21b0 100644 --- a/virtualizarr/utils.py +++ b/virtualizarr/utils.py @@ -5,12 +5,12 @@ from fsspec.implementations.local import LocalFileOpener -def _fsspec_openfile_from_filepath(*, filepath: str, reader_options: Optional[dict]) -> Union[S3File, LocalFileOpener]: +def _fsspec_openfile_from_filepath(*, filepath: str, reader_options: Optional[dict] = {'storage_options':{'key':'', 'secret':'', 'anon':True}}) -> Union[S3File, LocalFileOpener]: """Utility function to facilitate reading remote file paths using fsspec. :param filepath: Input filepath :type filepath: str - :param reader_options: Dict containing options to pass to fsspec file reader + :param reader_options: Dict containing options to pass to fsspec file reader. Default: {'storage_options':{'key':'', 'secret':'', 'anon':True}} :type reader_options: Optional[dict] :rtype: Union[S3File, LocalFileOpener] """ @@ -34,6 +34,7 @@ def _fsspec_openfile_from_filepath(*, filepath: str, reader_options: Optional[di else: storage_options = reader_options.get('storage_options') #type: ignore + # using dict merge operator to add in defaults if keys are not specified storage_options = s3_anon_defaults | storage_options diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index 1ca151af..237edcba 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -28,7 +28,7 @@ def open_virtual_dataset( loadable_variables: Optional[Iterable[str]] = None, indexes: Optional[Mapping[str, Index]] = None, virtual_array_class=ManifestArray, - reader_options: Optional[dict] = {}, + reader_options: Optional[dict] = {'storage_options':{'key':'', 'secret':'', 'anon':True}}, ) -> xr.Dataset: """ Open a file or store as an xarray Dataset wrapping virtualized zarr arrays. @@ -57,7 +57,7 @@ def open_virtual_dataset( virtual_array_class Virtual array class to use to represent the references to the chunks in each on-disk array. Currently can only be ManifestArray, but once VirtualZarrArray is implemented the default should be changed to that. - reader_options: dict, default {} + reader_options: dict, default {'storage_options':{'key':'', 'secret':'', 'anon':True}} Dict passed into Kerchunk file readers. Note: Each Kerchunk file reader has distinct arguments, so ensure reader_options match selected Kerchunk reader arguments. From 0235f51df61ea00e8b0f7d9421d2f1063ab01e16 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Fri, 3 May 2024 15:08:32 -0600 Subject: [PATCH 21/30] updated docs install --- ci/doc.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/doc.yml b/ci/doc.yml index c2f72e7c..25267e06 100644 --- a/ci/doc.yml +++ b/ci/doc.yml @@ -13,4 +13,4 @@ dependencies: - "sphinx_design" - "sphinx_togglebutton" - "sphinx-autodoc-typehints" - - -e .. + - -e "..[test]" \ No newline at end of file From 1e9e2fec46f2df2a89eeb00b1898d37d359de845 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 3 May 2024 21:09:16 +0000 Subject: [PATCH 22/30] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- ci/doc.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/doc.yml b/ci/doc.yml index 25267e06..1fffa7ee 100644 --- a/ci/doc.yml +++ b/ci/doc.yml @@ -13,4 +13,4 @@ dependencies: - "sphinx_design" - "sphinx_togglebutton" - "sphinx-autodoc-typehints" - - -e "..[test]" \ No newline at end of file + - -e "..[test]" From 55031f92d59efcaf72dcc597b447bedbd6126c17 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Mon, 6 May 2024 12:46:37 -0600 Subject: [PATCH 23/30] changed docstring type in utils to numpy style --- virtualizarr/tests/test_xarray.py | 1 + virtualizarr/utils.py | 30 ++++++++++++++++++++---------- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py index 6d4ee2d9..219b46eb 100644 --- a/virtualizarr/tests/test_xarray.py +++ b/virtualizarr/tests/test_xarray.py @@ -43,6 +43,7 @@ def test_wrapping(): class TestEquals: # regression test for GH29 https://github.com/TomNicholas/VirtualiZarr/issues/29 def test_equals(self): + chunks = (5, 10) shape = (5, 20) zarray = ZArray( diff --git a/virtualizarr/utils.py b/virtualizarr/utils.py index c0bf21b0..676fa350 100644 --- a/virtualizarr/utils.py +++ b/virtualizarr/utils.py @@ -6,23 +6,33 @@ def _fsspec_openfile_from_filepath(*, filepath: str, reader_options: Optional[dict] = {'storage_options':{'key':'', 'secret':'', 'anon':True}}) -> Union[S3File, LocalFileOpener]: - """Utility function to facilitate reading remote file paths using fsspec. - - :param filepath: Input filepath - :type filepath: str - :param reader_options: Dict containing options to pass to fsspec file reader. Default: {'storage_options':{'key':'', 'secret':'', 'anon':True}} - :type reader_options: Optional[dict] - :rtype: Union[S3File, LocalFileOpener] + """Converts input filepath to fsspec openfile object. + + Parameters + ---------- + filepath : str + Input filepath + reader_options : _type_, optional + Dict containing kwargs to pass to file opener, by default {'storage_options':{'key':'', 'secret':'', 'anon':True}} + + Returns + ------- + Union[S3File, LocalFileOpener] + Either S3File or LocalFileOpener, depending on which protocol was supplied. + + Raises + ------ + NotImplementedError + Raises a Not Implemented Error if filepath protocol is not supported. """ + + import fsspec from upath import UPath universal_filepath = UPath(filepath) protocol = universal_filepath.protocol - # why does UPath give an empty string for a local file protocol :( - # import pdb; pdb.set_trace() - if protocol == '': fpath = fsspec.open(filepath, 'rb').open() From 6a3d7bebcc7df5fc57e2c730a42ad678514fb7ba Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Wed, 8 May 2024 16:09:46 -0600 Subject: [PATCH 24/30] added TYPE_CHECKING for fsspec and s3fs mypy type hints --- virtualizarr/utils.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/virtualizarr/utils.py b/virtualizarr/utils.py index 676fa350..d9c8dc90 100644 --- a/virtualizarr/utils.py +++ b/virtualizarr/utils.py @@ -1,8 +1,9 @@ -from typing import Optional, Union +from typing import Optional, Union, TYPE_CHECKING + +if TYPE_CHECKING: + from fsspec.implementations.local import LocalFileOpener + from s3fs.core import S3File -# TODO: importing fsspec and s3fs to get typing. Is there a better way incase these are optional deps? -from s3fs.core import S3File -from fsspec.implementations.local import LocalFileOpener def _fsspec_openfile_from_filepath(*, filepath: str, reader_options: Optional[dict] = {'storage_options':{'key':'', 'secret':'', 'anon':True}}) -> Union[S3File, LocalFileOpener]: From 83b3c4b3adab00d55072158127043391771037a3 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Wed, 8 May 2024 16:31:15 -0600 Subject: [PATCH 25/30] fixed TYPE_CHECKING import --- virtualizarr/tests/test_xarray.py | 7 ++++++- virtualizarr/utils.py | 2 ++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py index 05b62488..5e815590 100644 --- a/virtualizarr/tests/test_xarray.py +++ b/virtualizarr/tests/test_xarray.py @@ -277,9 +277,14 @@ def test_combine_by_coords(self, netcdf4_files): ) @pytest.mark.parametrize("indexes", [None, {}], ids=["None index", "empty dict index"]) def test_anon_read_s3(filetype, indexes): + """Parameterized tests for empty vs supplied indexes and filetypes.""" # TODO: Switch away from this s3 url after minIO is implemented. fpath = "s3://carbonplan-share/virtualizarr/local.nc" - assert open_virtual_dataset(fpath, filetype=filetype, indexes=indexes) + vds = open_virtual_dataset(fpath, filetype=filetype, indexes=indexes) + + assert vds.dims == {"time": 2920, "lat": 25, "lon": 53} + for var in vds.variables: + assert isinstance(vds[var].data, ManifestArray), var class TestLoadVirtualDataset: diff --git a/virtualizarr/utils.py b/virtualizarr/utils.py index 2fbb44f9..b6257676 100644 --- a/virtualizarr/utils.py +++ b/virtualizarr/utils.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from typing import TYPE_CHECKING, Optional, Union if TYPE_CHECKING: From a143cf4d0b90ab744914607f94ea66054f4fed67 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Thu, 9 May 2024 09:39:12 -0600 Subject: [PATCH 26/30] pinned xarray to latest commit on github --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 2f49c286..1ab8500b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,7 +21,7 @@ classifiers = [ requires-python = ">=3.9" dynamic = ["version"] dependencies = [ - "xarray@git+https://github.com/TomNicholas/xarray.git@concat-avoid-index-auto-creation#egg=xarray", + "xarray@git+https://github.com/pydata/xarray", "kerchunk==0.2.2", "universal-pathlib", "h5netcdf", From 3a29b410f9af7526e6c491e85b29751070767441 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Mon, 13 May 2024 13:07:09 -0600 Subject: [PATCH 27/30] re-add upath --- pyproject.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 260826aa..074bbd7b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,6 +29,8 @@ dependencies = [ "numpy", "ujson", "packaging", + "universal-pathlib" + ] [project.optional-dependencies] From 13fc295e83b4942e89436df7a67dd2ce175edafd Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Mon, 13 May 2024 19:50:31 -0600 Subject: [PATCH 28/30] merged w/ main --- virtualizarr/kerchunk.py | 5 ++--- virtualizarr/utils.py | 6 +++--- virtualizarr/xarray.py | 1 + 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/virtualizarr/kerchunk.py b/virtualizarr/kerchunk.py index e9f1b2a7..f24d5f3e 100644 --- a/virtualizarr/kerchunk.py +++ b/virtualizarr/kerchunk.py @@ -1,5 +1,5 @@ from pathlib import Path -from typing import NewType, cast +from typing import NewType, Optional, cast import ujson # type: ignore import xarray as xr @@ -40,11 +40,10 @@ class FileType(AutoName): def read_kerchunk_references_from_file( filepath: str, - filepath: str, filetype: FileType | None + filetype: FileType | None, reader_options: Optional[dict] = { "storage_options": {"key": "", "secret": "", "anon": True} }, - ) -> KerchunkStoreRefs: """ Read a single legacy file and return kerchunk references to its contents. diff --git a/virtualizarr/utils.py b/virtualizarr/utils.py index b6257676..6ba71057 100644 --- a/virtualizarr/utils.py +++ b/virtualizarr/utils.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Optional, Union +from typing import TYPE_CHECKING, Optional if TYPE_CHECKING: from fsspec.implementations.local import LocalFileOpener @@ -13,7 +13,7 @@ def _fsspec_openfile_from_filepath( reader_options: Optional[dict] = { "storage_options": {"key": "", "secret": "", "anon": True} }, -) -> Union[S3File, LocalFileOpener]: +) -> S3File | LocalFileOpener: """Converts input filepath to fsspec openfile object. Parameters @@ -25,7 +25,7 @@ def _fsspec_openfile_from_filepath( Returns ------- - Union[S3File, LocalFileOpener] + S3File | LocalFileOpener Either S3File or LocalFileOpener, depending on which protocol was supplied. Raises diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index 7e498552..41d99854 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -2,6 +2,7 @@ from pathlib import Path from typing import ( Literal, + Optional, overload, ) From 4f766d9d2f21bf171767506bddaa41ddc6e818de Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Tue, 14 May 2024 10:05:47 -0600 Subject: [PATCH 29/30] =?UTF-8?q?=C3=A5dds=20section=20to=20usage?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/usage.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/docs/usage.md b/docs/usage.md index 4fc5411a..f08b3465 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -27,6 +27,7 @@ vds = open_virtual_dataset('air.nc') (Notice we did not have to explicitly indicate the file format, as {py:func}`open_virtual_dataset ` will attempt to automatically infer it.) + ```{note} In future we would like for it to be possible to just use `xr.open_dataset`, e.g. @@ -61,6 +62,17 @@ Attributes: These {py:class}`ManifestArray ` objects are each a virtual reference to some data in the `air.nc` netCDF file, with the references stored in the form of "Chunk Manifests". +### Opening remote files + +To open remote files as virtual datasets pass the `reader_options` options, e.g. + +```python + +aws_credentials = {"key": "", "secret": ""} +vds = open_virtual_dataset("s3://fake-bucket/file.nc", reader_options={'storage_options':aws_credentials}) + +``` + ## Chunk Manifests In the Zarr model N-dimensional arrays are stored as a series of compressed chunks, each labelled by a chunk key which indicates its position in the array. Whilst conventionally each of these Zarr chunks are a separate compressed binary file stored within a Zarr Store, there is no reason why these chunks could not actually already exist as part of another file (e.g. a netCDF file), and be loaded by reading a specific byte range from this pre-existing file. From e6f047ff57822aa88bfee168c9058a4d341fa8d4 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Tue, 14 May 2024 10:12:04 -0600 Subject: [PATCH 30/30] Minor formatting nit of code example in docs --- docs/usage.md | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index f08b3465..baaf5b12 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -67,10 +67,8 @@ These {py:class}`ManifestArray ` objects a To open remote files as virtual datasets pass the `reader_options` options, e.g. ```python - -aws_credentials = {"key": "", "secret": ""} -vds = open_virtual_dataset("s3://fake-bucket/file.nc", reader_options={'storage_options':aws_credentials}) - +aws_credentials = {"key": ..., "secret": ...} +vds = open_virtual_dataset("s3://some-bucket/file.nc", reader_options={'storage_options': aws_credentials}) ``` ## Chunk Manifests