Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Write manifests to zarr store #45

Merged
merged 18 commits into from
May 1, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion virtualizarr/manifests/manifest.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import itertools
import json
import re
from typing import Any, Iterable, Iterator, List, Mapping, Tuple, Union, cast

Expand Down Expand Up @@ -115,7 +116,8 @@

def to_zarr_json(self, filepath: str) -> None:
    """
    Write a ChunkManifest to a Zarr manifest.json file.

    The content written is whatever ``self.dict()`` returns, serialized as
    JSON with a 4-space indent. An existing file at ``filepath`` is
    overwritten (the file is opened in ``"w"`` mode).

    Parameters
    ----------
    filepath : str
        Location of the manifest.json file to write.
    """
    # NOTE: the former `raise NotImplementedError()` placeholder is gone; it
    # preceded the implementation and made the write below unreachable.
    with open(filepath, "w") as json_file:
        json.dump(self.dict(), json_file, indent=4, separators=(", ", ": "))

Check warning on line 120 in virtualizarr/manifests/manifest.py

View check run for this annotation

Codecov / codecov/patch

virtualizarr/manifests/manifest.py#L119-L120

Added lines #L119 - L120 were not covered by tests

@classmethod
def from_kerchunk_chunk_dict(cls, kerchunk_chunk_dict) -> "ChunkManifest":
Expand Down
5 changes: 2 additions & 3 deletions virtualizarr/xarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import virtualizarr.kerchunk as kerchunk
from virtualizarr.kerchunk import KerchunkStoreRefs
from virtualizarr.manifests import ChunkManifest, ManifestArray
from virtualizarr.zarr import dataset_to_zarr


class ManifestBackendArray(ManifestArray, BackendArray):
Expand Down Expand Up @@ -157,9 +158,7 @@
----------
filepath : str, default: None
"""
raise NotImplementedError(
"No point in writing out these virtual arrays to Zarr until at least one Zarr reader can actually read them."
)
dataset_to_zarr(self.ds, storepath)

Check warning on line 161 in virtualizarr/xarray.py

View check run for this annotation

Codecov / codecov/patch

virtualizarr/xarray.py#L161

Added line #L161 was not covered by tests

@overload
def to_kerchunk(self, filepath: None, format: Literal["dict"]) -> KerchunkStoreRefs:
Expand Down
82 changes: 78 additions & 4 deletions virtualizarr/zarr.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
import json
from pathlib import Path
from typing import Any, Literal, NewType, Optional, Tuple, Union

import numpy as np
import ujson # type: ignore
import xarray as xr
from pydantic import BaseModel, ConfigDict, field_validator

# TODO replace these with classes imported directly from Zarr? (i.e. Zarr Object Models)
Expand Down Expand Up @@ -82,11 +85,18 @@
zarr_format=int(decoded_arr_refs_zarray["zarr_format"]),
)

def dict(self) -> dict[str, Any]:
    """
    Return this object's contents as a plain dictionary.

    The ``"dtype"`` entry is replaced by the string produced by
    ``encode_dtype`` (e.g. ``'<i4'``), so the result no longer holds a
    numpy dtype object under that key.

    Returns
    -------
    dict[str, Any]
        Mapping built from ``dict(self)`` with ``"dtype"`` string-encoded.
    """
    # Inside a method body the name `dict` resolves to the builtin, not to this
    # method: class-scope names are not visible from function scopes.
    # presumably iterating `self` yields (field, value) pairs — TODO confirm
    zarray_dict = dict(self)
    zarray_dict["dtype"] = encode_dtype(zarray_dict["dtype"])
    return zarray_dict

def to_kerchunk_json(self) -> str:
    """Return the dictionary form of this object (``self.dict()``) serialized to a JSON string via ujson."""
    zarray_as_dict = self.dict()
    return ujson.dumps(zarray_as_dict)


def encode_dtype(dtype: np.dtype) -> str:
    """
    Return the array-protocol typestring (e.g. ``'<i4'``) for a numpy dtype.

    Parameters
    ----------
    dtype : np.dtype
        The dtype to encode. Anything accepted by ``np.dtype()`` (such as the
        string ``"<i4"``) is coerced first.

    Returns
    -------
    str
        ``dtype.str``, i.e. byte-order character, type code and item size.
    """
    # `dtype.str` equals the former `dtype.descr[0][1]` for non-structured
    # dtypes, but does not silently report only the first field's type code
    # when handed a structured dtype (it yields an opaque '|V<itemsize>').
    return np.dtype(dtype).str


def ceildiv(a: int, b: int) -> int:
Expand All @@ -96,3 +106,67 @@
See https://stackoverflow.com/questions/14822184/is-there-a-ceiling-equivalent-of-operator-in-python
"""
return -(a // -b)


def _write_json(path: Path, obj: Any) -> None:
    """Serialize ``obj`` as 4-space-indented JSON to ``path``, overwriting any existing file."""
    with open(path, "w") as json_file:
        json.dump(obj, json_file, indent=4, separators=(", ", ": "))


def dataset_to_zarr(ds: xr.Dataset, storepath: str) -> None:
    """
    Write an xarray dataset whose variables wrap ManifestArrays to a Zarr store, writing chunk references into manifest.json files.

    Not very useful until some implementation of a Zarr reader can actually read these manifest.json files.
    See https://github.com/zarr-developers/zarr-specs/issues/287

    The store is laid out as one directory per variable, each holding a
    ``manifest.json``, a ``.zarray`` and a ``.zattrs`` file, plus group-level
    ``.zattrs``, ``.zgroup`` and a consolidated ``.zmetadata`` at the top.

    Parameters
    ----------
    ds : xr.Dataset
        Dataset whose variables must all wrap ManifestArrays.
    storepath : str
        Directory to create for the store. It must not already exist.

    Raises
    ------
    TypeError
        If any variable wraps something other than a ManifestArray. This is
        checked before anything is written, so no partial store is left behind.
    FileExistsError
        If ``storepath`` (or a per-variable directory inside it) already exists.
    """

    # Imported here rather than at module level, as in the original code.
    from virtualizarr.manifests import ManifestArray

    # Validate every variable up front so a failure cannot leave an incomplete
    # store on disk.
    for name, var in ds.variables.items():
        if not isinstance(var.data, ManifestArray):
            raise TypeError(
                "Only xarray objects wrapping ManifestArrays can be written to zarr using this method, "
                f"but variable {name} wraps an array of type {type(var.data)}"
            )

    _storepath = Path(storepath)
    _storepath.mkdir(exist_ok=False)

    # TODO should technically loop over groups in a tree but a dataset corresponds to only one group
    # TODO does this mean we need a group kwarg?

    # write top-level .zattrs and .zgroup
    group_attrs = dict(ds.attrs)
    group_meta = {"zarr_format": 2}
    _write_json(_storepath / ".zattrs", group_attrs)
    _write_json(_storepath / ".zgroup", group_meta)

    # In the consolidated format every metadata document, including the
    # group-level .zattrs/.zgroup, lives under the "metadata" key.
    consolidated_metadata: dict = {
        "metadata": {".zattrs": group_attrs, ".zgroup": group_meta},
    }

    for name, var in ds.variables.items():
        marr = var.data
        array_dir = _storepath / name
        array_dir.mkdir(exist_ok=False)

        # write the chunk references into a manifest.json file
        # (to_zarr_json is annotated as taking a str path)
        marr.manifest.to_zarr_json(str(array_dir / "manifest.json"))

        # write each .zarray
        zarray = marr.zarray.dict()
        _write_json(array_dir / ".zarray", zarray)

        # write each .zattrs, recording the dimension names alongside the attrs
        zattrs = var.attrs.copy()
        zattrs["_ARRAY_DIMENSIONS"] = list(var.dims)
        _write_json(array_dir / ".zattrs", zattrs)

        # record this info to include in the overall .zmetadata
        consolidated_metadata["metadata"][f"{name}/.zarray"] = zarray
        consolidated_metadata["metadata"][f"{name}/.zattrs"] = zattrs

    # write store-level .zmetadata
    consolidated_metadata["zarr_consolidated_format"] = 1
    _write_json(_storepath / ".zmetadata", consolidated_metadata)

Check warning on line 172 in virtualizarr/zarr.py

View check run for this annotation

Codecov / codecov/patch

virtualizarr/zarr.py#L170-L172

Added lines #L170 - L172 were not covered by tests
Loading