Add utility for opening remote files with fsspec (#9797)
* Add utility for opening remote files with fsspec

* Apply Joe's suggestions from code review

Co-authored-by: Joe Hamman <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Lint

* Add what's new entry

* Type hint

* Make mypy happy

---------

Co-authored-by: Joe Hamman <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
3 people authored Nov 20, 2024
1 parent 339ed93 commit 077276a
Showing 4 changed files with 44 additions and 1 deletion.
3 changes: 3 additions & 0 deletions doc/whats-new.rst
@@ -57,6 +57,9 @@ New Features
- Optimize :py:meth:`DataArray.polyfit` and :py:meth:`Dataset.polyfit` with dask, when used with
arrays with more than two dimensions.
(:issue:`5629`). By `Deepak Cherian <https://github.com/dcherian>`_.
- Support for directly opening remote files as string paths (for example, ``s3://bucket/data.nc``)
with ``fsspec`` when using the ``h5netcdf`` engine (:issue:`9723`, :pull:`9797`).
By `James Bourbeau <https://github.com/jrbourbeau>`_.
- Re-implement the :py:mod:`ufuncs` module, which now dynamically dispatches to the
underlying array's backend. Provides better support for certain wrapped array types
like ``jax.numpy.ndarray``. (:issue:`7848`, :pull:`9776`).
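In practice, the remote-file entry above enables calls like the following. A minimal usage sketch, assuming fsspec and h5netcdf are installed; the bucket name and the ``anon`` credential option are hypothetical placeholders:

    import xarray as xr

    # A remote path given as a plain string is opened through fsspec when
    # the h5netcdf engine is selected; storage_options is forwarded to
    # fsspec. "my-bucket" and anon=True are hypothetical placeholders.
    ds = xr.open_dataset(
        "s3://my-bucket/data.nc",
        engine="h5netcdf",
        storage_options={"anon": True},
    )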
9 changes: 9 additions & 0 deletions xarray/backends/common.py
@@ -181,6 +181,15 @@ def _normalize_path_list(
return _normalize_path_list(paths)


def _open_remote_file(file, mode, storage_options=None):
    """Open a remote file with fsspec and return a file-like object."""
    import fsspec

    # Resolve the URL to a concrete filesystem and normalized path(s),
    # then hand back an open file-like object for the first path.
    fs, _, paths = fsspec.get_fs_token_paths(
        file, mode=mode, storage_options=storage_options
    )
    return fs.open(paths[0], mode=mode)


def _encode_variable_name(name):
    if name is None:
        name = NONE_VAR_NAME
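For context, ``fsspec.get_fs_token_paths`` resolves a URL into a concrete filesystem object, a cache token, and a normalized path list, and ``fs.open`` then yields a file-like object that h5py can read from. A rough sketch of the same call outside the helper, with a hypothetical URL:

    import fsspec

    # fs is the resolved filesystem (e.g. an S3 filesystem for s3:// URLs);
    # paths is a list because get_fs_token_paths can expand globs.
    fs, _, paths = fsspec.get_fs_token_paths("s3://my-bucket/data.nc", mode="rb")
    f = fs.open(paths[0], mode="rb")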
12 changes: 11 additions & 1 deletion xarray/backends/h5netcdf_.py
@@ -13,6 +13,7 @@
BackendEntrypoint,
WritableCFDataStore,
_normalize_path,
_open_remote_file,
datatree_from_dict_with_io_cleanup,
find_root_and_group,
)
@@ -149,9 +150,16 @@ def open(
decode_vlen_strings=True,
driver=None,
driver_kwds=None,
storage_options: dict[str, Any] | None = None,
):
import h5netcdf

# Remote string paths (with no explicit h5py driver requested) are
# opened as fsspec file-like objects; h5py needs binary mode, so "r"
# maps to "rb".
if isinstance(filename, str) and is_remote_uri(filename) and driver is None:
    mode_ = "rb" if mode == "r" else mode
    filename = _open_remote_file(
        filename, mode=mode_, storage_options=storage_options
    )

if isinstance(filename, bytes):
    raise ValueError(
        "can't open netCDF4/HDF5 as bytes "
@@ -161,7 +169,7 @@
magic_number = read_magic_number_from_file(filename)
if not magic_number.startswith(b"\211HDF\r\n\032\n"):
    raise ValueError(
-        f"{magic_number} is not the signature of a valid netCDF4 file"
+        f"{magic_number!r} is not the signature of a valid netCDF4 file"
    )

if format not in [None, "NETCDF4"]:
@@ -425,6 +433,7 @@ def open_dataset(
decode_vlen_strings=True,
driver=None,
driver_kwds=None,
storage_options: dict[str, Any] | None = None,
) -> Dataset:
filename_or_obj = _normalize_path(filename_or_obj)
store = H5NetCDFStore.open(
@@ -437,6 +446,7 @@
decode_vlen_strings=decode_vlen_strings,
driver=driver,
driver_kwds=driver_kwds,
storage_options=storage_options,
)

store_entrypoint = StoreBackendEntrypoint()
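Note that the fsspec path is only taken for string paths that look like remote URIs, and only when no explicit h5py ``driver`` was requested. A small sketch of the guard condition, assuming xarray's ``is_remote_uri`` helper (it matches ``scheme://``-style paths):

    from xarray.core.utils import is_remote_uri

    # Remote-style schemes match; plain local paths do not, so local
    # files keep going through the regular code path.
    assert is_remote_uri("s3://my-bucket/data.nc")
    assert is_remote_uri("https://example.com/data.nc")
    assert not is_remote_uri("/tmp/data.nc")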
21 changes: 21 additions & 0 deletions xarray/tests/test_backends.py
@@ -6489,3 +6489,24 @@ def test_zarr_safe_chunk_region(tmp_path):
    chunk = ds.isel(region)
    chunk = chunk.chunk()
    chunk.chunk().to_zarr(store, region=region)


@requires_h5netcdf
@requires_fsspec
def test_h5netcdf_storage_options() -> None:
    with create_tmp_files(2, allow_cleanup_failure=ON_WINDOWS) as (f1, f2):
        ds1 = create_test_data()
        ds1.to_netcdf(f1, engine="h5netcdf")

        ds2 = create_test_data()
        ds2.to_netcdf(f2, engine="h5netcdf")

        files = [f"file://{f}" for f in [f1, f2]]
        ds = xr.open_mfdataset(
            files,
            engine="h5netcdf",
            concat_dim="time",
            combine="nested",
            storage_options={"skip_instance_cache": False},
        )
        assert_identical(xr.concat([ds1, ds2], dim="time"), ds)
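The test exercises the new code path without any network access: ``file://`` URLs count as remote URIs as far as ``is_remote_uri`` is concerned, but fsspec resolves them to the local filesystem, so ``storage_options`` still flows through ``open_mfdataset`` to fsspec.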
