
zarr-python v3 compatibility #516

Draft · wants to merge 39 commits into base: main (changes shown are from 2 commits)

Commits (39)
39722e7
Save progress for next week
mpiannucci Oct 4, 2024
d3c7e37
Bump zarr python version
mpiannucci Oct 5, 2024
25d7d14
Get some tests working others failing
mpiannucci Oct 5, 2024
ffe5f9d
get through single hdf to zarr
mpiannucci Oct 8, 2024
5aef233
Save progress
mpiannucci Oct 8, 2024
b9323d2
Cleanup, almost working with hdf
mpiannucci Oct 9, 2024
0f17119
Closer...
mpiannucci Oct 9, 2024
5c8806b
Updating tests
mpiannucci Oct 9, 2024
80fedcd
reorganize
mpiannucci Oct 10, 2024
1f69a0b
Save progress
mpiannucci Oct 10, 2024
d556e52
Refactor to clean things up
mpiannucci Oct 10, 2024
b27e64c
Fix circular import
mpiannucci Oct 10, 2024
41d6e8e
Iterate
mpiannucci Oct 10, 2024
7ade1a6
Change zarr dep
mpiannucci Oct 10, 2024
492ddee
More conversion
mpiannucci Oct 10, 2024
6e5741c
Specify zarr version
mpiannucci Oct 15, 2024
c0316ac
Working remote hdf tests
mpiannucci Oct 23, 2024
59bd36c
Working grib impl
mpiannucci Oct 23, 2024
187ced2
Add back commented out code
mpiannucci Oct 23, 2024
690ed21
Make grib codec a compressor since its bytes to array
mpiannucci Oct 23, 2024
5019b15
Switch back
mpiannucci Oct 23, 2024
d96cf46
Add first pass at grib zarr 3 codec
mpiannucci Oct 26, 2024
cbcb720
Fix typing
mpiannucci Oct 29, 2024
b88655f
Fix some broken tests; use async filesystem wrapper
moradology Nov 6, 2024
73eaf33
Implement zarr3 compatibility for grib
moradology Nov 20, 2024
3757199
Use zarr3 stores directly; avoid use of internal fs
moradology Nov 21, 2024
9444ff8
Merge pull request #4 from moradology/fix/zarr3-grib-tests
mpiannucci Nov 26, 2024
d8848ce
Forward
mpiannucci Nov 26, 2024
1fa294e
More
mpiannucci Nov 26, 2024
543178d
Figure out async wrapper
mpiannucci Nov 26, 2024
96b56cd
Closer on hdf5
mpiannucci Nov 26, 2024
0808b05
netcdf but failing
mpiannucci Nov 26, 2024
aef006e
grib passing
mpiannucci Nov 26, 2024
d9bf0dd
Fix inline test
mpiannucci Nov 26, 2024
884fc68
More
mpiannucci Nov 26, 2024
1145f45
standardize compressor name
mpiannucci Nov 27, 2024
94ec479
Fix one more hdf test
mpiannucci Nov 27, 2024
a9693d1
Small tweaks
mpiannucci Nov 27, 2024
7e9112a
Hide fsspec import where necessary
mpiannucci Nov 27, 2024
51 changes: 2 additions & 49 deletions kerchunk/hdf.py
@@ -1,7 +1,7 @@
import base64
import io
import logging
from typing import Union, BinaryIO, Any, cast
from typing import Union, BinaryIO
from packaging.version import Version

import fsspec.core
@@ -11,7 +11,7 @@
import numcodecs

from .codecs import FillStringsCodec
from .utils import _encode_for_JSON
from .utils import _encode_for_JSON, encode_fill_value

try:
import h5py
@@ -22,12 +22,6 @@
"for more details."
)

# try:
# from zarr.meta import encode_fill_value
# except ModuleNotFoundError:
# # https://github.com/zarr-developers/zarr-python/issues/2021
# from zarr.v2.meta import encode_fill_value

lggr = logging.getLogger("h5-to-zarr")
_HIDDEN_ATTRS = { # from h5netcdf.attrs
"REFERENCE_LIST",
@@ -504,7 +498,6 @@ def _translator(
lggr.debug(f"Created Zarr array: {za}")
self._transfer_attrs(h5obj, za)

# za.attrs["_ARRAY_DIMENSIONS"] = adims
lggr.debug(f"_ARRAY_DIMENSIONS = {adims}")

if "data" in kwargs:
@@ -705,43 +698,3 @@ def _is_netcdf_variable(dataset: h5py.Dataset):
def has_visititems_links():
return hasattr(h5py.Group, "visititems_links")


def encode_fill_value(v: Any, dtype: np.dtype, object_codec: Any = None) -> Any:
# early out
if v is None:
return v
if dtype.kind == "V" and dtype.hasobject:
if object_codec is None:
raise ValueError("missing object_codec for object array")
v = object_codec.encode(v)
v = str(base64.standard_b64encode(v), "ascii")
return v
if dtype.kind == "f":
if np.isnan(v):
return "NaN"
elif np.isposinf(v):
return "Infinity"
elif np.isneginf(v):
return "-Infinity"
else:
return float(v)
elif dtype.kind in "ui":
return int(v)
elif dtype.kind == "b":
return bool(v)
elif dtype.kind in "c":
c = cast(np.complex128, np.dtype(complex).type())
v = (
encode_fill_value(v.real, c.real.dtype, object_codec),
encode_fill_value(v.imag, c.imag.dtype, object_codec),
)
return v
elif dtype.kind in "SV":
v = str(base64.standard_b64encode(v), "ascii")
return v
elif dtype.kind == "U":
return v
elif dtype.kind in "mM":
return int(v.view("i8"))
else:
return v
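
For context on the deletion above: encode_fill_value moves verbatim into kerchunk/utils.py (see the utils.py hunk below), because zarr-python 3 no longer ships zarr.meta, which is where kerchunk previously imported it from (the commented-out block removed near the top of this file, zarr-developers/zarr-python#2021). A minimal sketch of the fallback that block was reaching for; the PR itself simply imports the kerchunk.utils copy unconditionally:

```python
try:
    # zarr-python 2.x location; zarr.meta was removed in zarr 3
    # (see zarr-developers/zarr-python#2021)
    from zarr.meta import encode_fill_value
except ModuleNotFoundError:
    # the copy this PR adds to kerchunk.utils
    from kerchunk.utils import encode_fill_value
```
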
13 changes: 10 additions & 3 deletions kerchunk/netCDF3.py
@@ -1,4 +1,5 @@
from functools import reduce
from packaging.version import Version
from operator import mul

import numpy as np
@@ -167,7 +168,13 @@ def translate(self):
import zarr

out = self.out
z = zarr.open(out, mode="w", zarr_format=2)
if Version(zarr.__version__) < Version("3.0.0.a0"):
store = zarr.storage.KVStore(out)
z = zarr.group(store=store, overwrite=True)
else:
store = zarr.storage.MemoryStore(mode="a", store_dict=out)
z = zarr.open(store, mode="w", zarr_format=2)

for dim, var in self.variables.items():
if dim in self.chunks:
shape = self.chunks[dim][-1]
@@ -197,7 +204,7 @@ def translate(self):
dtype=var.data.dtype,
fill_value=fill,
chunks=shape,
compression=None,
compressor=None,
)
part = ".".join(["0"] * len(shape)) or "0"
k = f"{dim}/{part}"
@@ -251,7 +258,7 @@ def translate(self):
dtype=base,
fill_value=fill,
chunks=(1,) + dtype.shape,
compression=None,
compressor=None,
)
arr.attrs.update(
{
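
The translate() change above gates group creation on the installed zarr-python version and renames the create_dataset keyword from compression= to compressor=. A minimal sketch of the same gate, mirroring the calls in the diff; the zarr 3 branch uses the pre-release MemoryStore signature this PR targets, which may differ in later zarr releases:

```python
from packaging.version import Version
import zarr

out = {}  # in-memory mapping that will hold the generated references

if Version(zarr.__version__) < Version("3.0.0.a0"):
    # zarr-python 2.x: wrap the plain dict in a KVStore and build a group on it
    store = zarr.storage.KVStore(out)
    z = zarr.group(store=store, overwrite=True)
else:
    # zarr-python 3.x pre-release, as used in this PR: a MemoryStore over the
    # same dict, opened explicitly as a zarr v2 hierarchy
    store = zarr.storage.MemoryStore(mode="a", store_dict=out)
    z = zarr.open(store, mode="w", zarr_format=2)

# arrays are then created with compressor=None rather than compression=None
```
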
16 changes: 11 additions & 5 deletions kerchunk/tests/test_hdf.py
@@ -1,5 +1,6 @@
import asyncio
from typing import Any
import fsspec
import json
import os.path as osp

import fsspec.implementations
@@ -23,25 +24,29 @@ async def list_dir(store, path):
[x async for x in store.list_dir(path)]


def create_store(test_dict: dict):
def create_store(test_dict: dict, remote_options: Any = None):
if Version(zarr.__version__) < Version("3.0.0.a0"):
return fsspec.get_mapper(
"reference://", fo=test_dict, remote_protocol="s3", remote_options=so
"reference://", fo=test_dict, remote_protocol="s3", remote_options=remote_options
)
else:
fs = fsspec.implementations.reference.ReferenceFileSystem(fo=test_dict)
fs = fsspec.implementations.reference.ReferenceFileSystem(fo=test_dict, remote_options=remote_options)
return zarr.storage.RemoteStore(fs, mode="r")


def test_single():
"""Test creating references for a single HDF file"""
url = "s3://noaa-nwm-retro-v2.0-pds/full_physics/2017/201704010000.CHRTOUT_DOMAIN1.comp"
#url = "s3://noaa-nwm-retro-v2.0-pds/full_physics/2017/201704010000.CHRTOUT_DOMAIN1.comp"
url = "s3://noaa-nos-ofs-pds/ngofs2/netcdf/202410/ngofs2.t03z.20241001.2ds.f020.nc"
so = dict(anon=True, default_fill_cache=False, default_cache_type="none")

with fsspec.open(url, **so) as f:
h5chunks = SingleHdf5ToZarr(f, url, storage_options=so)
test_dict = h5chunks.translate()

with open("test_dict.json", "w") as f:
json.dump(test_dict, f)

store = create_store(test_dict)

ds = xr.open_dataset(store, engine="zarr", zarr_format=2, backend_kwargs=dict(consolidated=False))
@@ -97,6 +102,7 @@ def test_multizarr(generate_mzz):
"""Test creating a combined reference file with MultiZarrToZarr"""
mzz = generate_mzz
test_dict = mzz.translate()

store = create_store(test_dict)
ds = xr.open_dataset(store, engine="zarr", zarr_format=2, backend_kwargs=dict(consolidated=False))

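
The create_store helper above keys its branch on Version("3.0.0.a0") rather than "3.0.0" so that zarr 3 alpha and beta builds already take the new RemoteStore path; packaging orders pre-releases before the final release. A quick illustration:

```python
>>> from packaging.version import Version
>>> Version("2.18.3") < Version("3.0.0.a0")
True
>>> Version("3.0.0a1") < Version("3.0.0.a0")   # later pre-releases take the zarr 3 branch
False
>>> Version("3.0.0") < Version("3.0.0.a0")     # and so does the final release
False
```
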
20 changes: 18 additions & 2 deletions kerchunk/tests/test_netcdf.py
@@ -1,4 +1,5 @@
import os
from typing import Any


import fsspec
@@ -7,6 +8,8 @@
import pytest
from kerchunk import netCDF3

import zarr

xr = pytest.importorskip("xarray")


@@ -24,16 +27,29 @@
)


def create_store(test_dict: dict, remote_options: Any = None):
if Version(zarr.__version__) < Version("3.0.0.a0"):
return fsspec.get_mapper(
"reference://", fo=test_dict, remote_protocol="s3", remote_options=remote_options
)
else:
fs = fsspec.implementations.reference.ReferenceFileSystem(fo=test_dict, remote_options=remote_options)
return zarr.storage.RemoteStore(fs, mode="r")


def test_one(m):
m.pipe("data.nc3", bdata)
h = netCDF3.netcdf_recording_file("memory://data.nc3")
out = h.translate()

store = create_store(out, remote_options={"remote_protocol": "memory"})

ds = xr.open_dataset(
"reference://",
store,
engine="zarr",
backend_kwargs={
"consolidated": False,
"storage_options": {"fo": out, "remote_protocol": "memory"},
"zarr_format": "2",
},
)
assert (ds.data == data).all()
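
Because the added and removed lines are interleaved above, here is the before/after of test_one spelled out: the old call opened "reference://" directly and passed the reference dict through storage_options, while the new call hands xarray a ready-made store from create_store. A sketch only; `xr`, `out`, and `create_store` are the names from the test module above:

```python
# before: zarr 2 only, references passed via fsspec storage_options
ds = xr.open_dataset(
    "reference://",
    engine="zarr",
    backend_kwargs={
        "consolidated": False,
        "storage_options": {"fo": out, "remote_protocol": "memory"},
    },
)

# after: version-agnostic, the store is built up front
store = create_store(out, remote_options={"remote_protocol": "memory"})
ds = xr.open_dataset(
    store,
    engine="zarr",
    backend_kwargs={"consolidated": False, "zarr_format": "2"},
)
```
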
44 changes: 44 additions & 0 deletions kerchunk/utils.py
@@ -1,11 +1,13 @@
import base64
import copy
import itertools
from typing import Any, cast
import warnings

import ujson

import fsspec
import numpy as np
import zarr


@@ -134,6 +136,48 @@ def _encode_for_JSON(store):
return store



def encode_fill_value(v: Any, dtype: np.dtype, object_codec: Any = None) -> Any:
# early out
if v is None:
return v
if dtype.kind == "V" and dtype.hasobject:
if object_codec is None:
raise ValueError("missing object_codec for object array")
v = object_codec.encode(v)
v = str(base64.standard_b64encode(v), "ascii")
return v
if dtype.kind == "f":
if np.isnan(v):
return "NaN"
elif np.isposinf(v):
return "Infinity"
elif np.isneginf(v):
return "-Infinity"
else:
return float(v)
elif dtype.kind in "ui":
return int(v)
elif dtype.kind == "b":
return bool(v)
elif dtype.kind in "c":
c = cast(np.complex128, np.dtype(complex).type())
v = (
encode_fill_value(v.real, c.real.dtype, object_codec),
encode_fill_value(v.imag, c.imag.dtype, object_codec),
)
return v
elif dtype.kind in "SV":
v = str(base64.standard_b64encode(v), "ascii")
return v
elif dtype.kind == "U":
return v
elif dtype.kind in "mM":
return int(v.view("i8"))
else:
return v


def do_inline(store, threshold, remote_options=None, remote_protocol=None):
"""Replace short chunks with the value of that chunk and inline metadata

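
encode_fill_value (added above, previously vendored from zarr.meta in zarr 2) turns a fill value into something JSON-serializable for the reference metadata. A few representative conversions, following the branches in the function body and assuming the PR branch is importable as kerchunk.utils:

```python
>>> import numpy as np
>>> from kerchunk.utils import encode_fill_value
>>> encode_fill_value(np.float32("nan"), np.dtype("f4"))
'NaN'
>>> encode_fill_value(np.float64(1.5), np.dtype("f8"))
1.5
>>> encode_fill_value(np.int16(-9999), np.dtype("i2"))
-9999
>>> encode_fill_value(b"ab", np.dtype("S2"))      # bytes are base64-encoded
'YWI='
>>> encode_fill_value(np.datetime64("1970-01-02"), np.dtype("M8[D]"))
1
>>> encode_fill_value(None, np.dtype("f8")) is None
True
```
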