Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
…into refactor/store-mode
  • Loading branch information
jhamman committed Nov 11, 2024
2 parents 62939b6 + 2fa0082 commit 2296c3f
Show file tree
Hide file tree
Showing 10 changed files with 776 additions and 22 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/releases.yml
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ jobs:
with:
name: releases
path: dist
- uses: pypa/gh-action-pypi-publish@v1.11.0
- uses: pypa/gh-action-pypi-publish@v1.12.2
with:
user: __token__
password: ${{ secrets.pypi_password }}
Expand Down
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ default_language_version:
python: python3
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.7.2
rev: v0.7.3
hooks:
- id: ruff
args: ["--fix", "--show-fixes"]
Expand Down
135 changes: 135 additions & 0 deletions src/zarr/core/_info.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
import dataclasses
import textwrap
from typing import Any, Literal

import numcodecs.abc
import numpy as np

from zarr.abc.codec import Codec
from zarr.core.metadata.v3 import DataType


@dataclasses.dataclass(kw_only=True)
class GroupInfo:
"""
Visual summary for a Group.
Note that this method and its properties is not part of
Zarr's public API.
"""

_name: str
_type: Literal["Group"] = "Group"
_zarr_format: Literal[2, 3]
_read_only: bool
_store_type: str
_count_members: int | None = None
_count_arrays: int | None = None
_count_groups: int | None = None

def __repr__(self) -> str:
template = textwrap.dedent("""\
Name : {_name}
Type : {_type}
Zarr format : {_zarr_format}
Read-only : {_read_only}
Store type : {_store_type}""")

if self._count_members is not None:
template += "\nNo. members : {_count_members}"
if self._count_arrays is not None:
template += "\nNo. arrays : {_count_arrays}"
if self._count_groups is not None:
template += "\nNo. groups : {_count_groups}"
return template.format(**dataclasses.asdict(self))


def human_readable_size(size: int) -> str:
    """
    Format a byte count using binary (power-of-1024) unit suffixes.

    Values below 1024 are returned as the plain number. Larger values are
    scaled to K, M, G, T or P and shown with one decimal place; anything
    at or above 2**50 is expressed in P.
    """
    if size < 2**10:
        return f"{size}"

    # (exponent of the unit, suffix); each unit covers sizes below the next unit up.
    for exponent, suffix in ((10, "K"), (20, "M"), (30, "G"), (40, "T")):
        if size < 2 ** (exponent + 10):
            return f"{size / float(2**exponent):.1f}{suffix}"

    return f"{size / float(2**50):.1f}P"


def byte_info(size: int) -> str:
    """
    Describe a byte count for display.

    Counts below 1024 are shown as the bare number; larger counts are shown
    as the exact number followed by the ``human_readable_size`` form in
    parentheses.
    """
    exact = str(size)
    return exact if size < 2**10 else f"{exact} ({human_readable_size(size)})"


@dataclasses.dataclass(kw_only=True)
class ArrayInfo:
"""
Visual summary for an Array.
Note that this method and its properties is not part of
Zarr's public API.
"""

_type: Literal["Array"] = "Array"
_zarr_format: Literal[2, 3]
_data_type: np.dtype[Any] | DataType
_shape: tuple[int, ...]
_chunk_shape: tuple[int, ...] | None = None
_order: Literal["C", "F"]
_read_only: bool
_store_type: str
_compressor: numcodecs.abc.Codec | None = None
_filters: tuple[numcodecs.abc.Codec, ...] | None = None
_codecs: list[Codec] | None = None
_count_bytes: int | None = None
_count_bytes_stored: int | None = None
_count_chunks_initialized: int | None = None

def __repr__(self) -> str:
template = textwrap.dedent("""\
Type : {_type}
Zarr format : {_zarr_format}
Data type : {_data_type}
Shape : {_shape}
Chunk shape : {_chunk_shape}
Order : {_order}
Read-only : {_read_only}
Store type : {_store_type}""")

kwargs = dataclasses.asdict(self)
if self._chunk_shape is None:
# for non-regular chunk grids
kwargs["chunk_shape"] = "<variable>"
if self._compressor is not None:
template += "\nCompressor : {_compressor}"

if self._filters is not None:
template += "\nFilters : {_filters}"

if self._codecs is not None:
template += "\nCodecs : {_codecs}"

if self._count_bytes is not None:
template += "\nNo. bytes : {_count_bytes}"
kwargs["_count_bytes"] = byte_info(self._count_bytes)

if self._count_bytes_stored is not None:
template += "\nNo. bytes stored : {_count_bytes_stored}"
kwargs["_count_stored"] = byte_info(self._count_bytes_stored)

if (
self._count_bytes is not None
and self._count_bytes_stored is not None
and self._count_bytes_stored > 0
):
template += "\nStorage ratio : {_storage_ratio}"
kwargs["_storage_ratio"] = f"{self._count_bytes / self._count_bytes_stored:.1f}"

if self._count_chunks_initialized is not None:
template += "\nChunks Initialized : {_count_chunks_initialized}"
return template.format(**kwargs)
115 changes: 110 additions & 5 deletions src/zarr/core/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from zarr.abc.store import Store, set_or_delete
from zarr.codecs import _get_default_array_bytes_codec
from zarr.codecs._v2 import V2Codec
from zarr.core._info import ArrayInfo
from zarr.core.attributes import Attributes
from zarr.core.buffer import (
BufferPrototype,
Expand Down Expand Up @@ -1332,9 +1333,65 @@ async def update_attributes(self, new_attributes: dict[str, JSON]) -> Self:
def __repr__(self) -> str:
    """Return a one-line summary showing the store path, shape and dtype."""
    return f"<AsyncArray {self.store_path} shape={self.shape} dtype={self.dtype}>"

async def info(self) -> None:
@property
def info(self) -> Any:
    """
    Return the statically known information for an array.

    Returns
    -------
    ArrayInfo

    See Also
    --------
    AsyncArray.info_complete
        All information about an array, including dynamic information
        like the number of bytes and chunks written.
    """
    return self._info()

async def info_complete(self) -> Any:
    """
    Gather information that must be computed asynchronously and build the array summary.

    Awaits ``nchunks_initialized`` and passes the result to ``_info``
    through its ``extra`` argument.

    Returns
    -------
    ArrayInfo
    """
    # TODO: get the size of the object from the store.
    # NOTE(review): the key below has no leading underscore, unlike the
    # ``ArrayInfo`` field ``_count_chunks_initialized`` -- confirm that
    # ``_info`` maps it onto that field.
    extra = {
        "count_chunks_initialized": await self.nchunks_initialized(),
        # count_bytes_stored isn't yet implemented.
    }
    return self._info(extra=extra)

raise NotImplementedError

def _info(self, extra: dict[str, int] | None = None) -> Any:
    """
    Build the ``ArrayInfo`` summary for this array.

    Parameters
    ----------
    extra : dict[str, int], optional
        Additional ``ArrayInfo`` fields to include, e.g. the number of
        initialized chunks. Keys may be given with or without the leading
        underscore used by the ``ArrayInfo`` field names.

    Returns
    -------
    ArrayInfo

    Raises
    ------
    NotImplementedError
        If a Zarr format 3 array uses a chunk grid other than
        ``RegularChunkGrid``.
    """
    kwargs: dict[str, Any] = {}
    if self.metadata.zarr_format == 2:
        assert isinstance(self.metadata, ArrayV2Metadata)
        if self.metadata.compressor is not None:
            kwargs["_compressor"] = self.metadata.compressor
        if self.metadata.filters is not None:
            kwargs["_filters"] = self.metadata.filters
        kwargs["_data_type"] = self.metadata.dtype
        kwargs["_chunk_shape"] = self.metadata.chunks
    else:
        kwargs["_codecs"] = self.metadata.codecs
        kwargs["_data_type"] = self.metadata.data_type
        # Only regular chunk grids expose a single chunk shape.
        chunk_grid = self.metadata.chunk_grid
        if not isinstance(chunk_grid, RegularChunkGrid):
            raise NotImplementedError(
                f"'info' is not yet implemented for chunk grids of type {type(chunk_grid)}"
            )
        kwargs["_chunk_shape"] = chunk_grid.chunk_shape

    # Forward caller-supplied values. Callers use names such as
    # "count_chunks_initialized", whereas the ArrayInfo fields carry a
    # leading underscore, so normalize the key before passing it on.
    for key, value in (extra or {}).items():
        field_name = key if key.startswith("_") else f"_{key}"
        kwargs[field_name] = value

    return ArrayInfo(
        _zarr_format=self.metadata.zarr_format,
        _shape=self.shape,
        _order=self.order,
        _read_only=self.read_only,
        _store_type=type(self.store_path.store).__name__,
        _count_bytes=self.dtype.itemsize * self.size,
        **kwargs,
    )


# TODO: Array can be a frozen data class again once property setters (e.g. shape) are removed
@dataclass(frozen=False)
Expand Down Expand Up @@ -3099,10 +3156,58 @@ def update_attributes(self, new_attributes: dict[str, JSON]) -> Array:
def __repr__(self) -> str:
    """Return a one-line summary showing the store path, shape and dtype."""
    return f"<Array {self.store_path} shape={self.shape} dtype={self.dtype}>"

def info(self) -> None:
return sync(
self._async_array.info(),
)
@property
def info(self) -> Any:
    """
    Return the statically known information for an array.

    Delegates to the ``info`` property of the wrapped async array.

    Returns
    -------
    ArrayInfo

    See Also
    --------
    Array.info_complete
        All information about an array, including dynamic information
        like the number of bytes and chunks written.

    Examples
    --------
    >>> arr = zarr.create(shape=(10,), chunks=(2,), dtype="float32")
    >>> arr.info
    Type : Array
    Zarr format : 3
    Data type : DataType.float32
    Shape : (10,)
    Chunk shape : (2,)
    Order : C
    Read-only : False
    Store type : MemoryStore
    Codecs : [BytesCodec(endian=<Endian.little: 'little'>)]
    No. bytes : 40
    """
    return self._async_array.info

def info_complete(self) -> Any:
    """
    Returns all the information about an array, including information from the Store.

    In addition to the statically known information like ``shape`` and ``zarr_format``,
    this includes additional information like the size of the array in bytes and
    the number of chunks written.

    Note that this method will need to read metadata from the store.

    Returns
    -------
    ArrayInfo

    See Also
    --------
    Array.info
        The statically known subset of metadata about an array.
    """
    # Run the async implementation through ``sync`` and return its result.
    return sync(self._async_array.info_complete())


async def chunks_initialized(
Expand Down
Loading

0 comments on commit 2296c3f

Please sign in to comment.