diff --git a/.github/workflows/python-ci-single.yml b/.github/workflows/python-ci-single.yml index 408ed20e8e..2def6608a2 100644 --- a/.github/workflows/python-ci-single.yml +++ b/.github/workflows/python-ci-single.yml @@ -104,7 +104,7 @@ jobs: # key: libtiledbsoma-build-dist-${{ inputs.os }}-${{ inputs.python_version }}-${{ hashFiles('libtiledbsoma', 'scripts/bld') }} - name: Install tiledbsoma - run: pip -v install -e apis/python[dev] -C "--build-option=--no-tiledb-deprecated" + run: pip -v install -e apis/python[all] -C "--build-option=--no-tiledb-deprecated" env: CC: ${{ inputs.cc }} CXX: ${{ inputs.cxx }} diff --git a/.github/workflows/r-python-interop-testing.yml b/.github/workflows/r-python-interop-testing.yml index 2b48ee1c1f..8f1f9ae703 100644 --- a/.github/workflows/r-python-interop-testing.yml +++ b/.github/workflows/r-python-interop-testing.yml @@ -100,7 +100,7 @@ jobs: cache-dependency-path: ./apis/python/setup.py - name: Install tiledbsoma - run: pip -v install -e apis/python[dev] -C "--build-option=--no-tiledb-deprecated " + run: pip -v install -e apis/python[all] -C "--build-option=--no-tiledb-deprecated " - name: Show Python package versions run: | diff --git a/apis/python/requirements_spatial.txt b/apis/python/requirements_spatial.txt index 1f9fd21769..45c3593a70 100644 --- a/apis/python/requirements_spatial.txt +++ b/apis/python/requirements_spatial.txt @@ -1,3 +1,4 @@ tifffile pillow spatialdata +xarray>=2024.05.0 diff --git a/apis/python/src/tiledbsoma/experimental/_xarray_backend.py b/apis/python/src/tiledbsoma/experimental/_xarray_backend.py new file mode 100644 index 0000000000..02fb184269 --- /dev/null +++ b/apis/python/src/tiledbsoma/experimental/_xarray_backend.py @@ -0,0 +1,208 @@ +# Copyright (c) 2024 The Chan Zuckerberg Initiative Foundation +# Copyright (c) 2024 TileDB, Inc +# +# Licensed under the MIT License. +from typing import Any, Mapping, Optional, Tuple, Union + +import numpy as np +from somacore import options +from xarray import Variable +from xarray.backends import AbstractDataStore, BackendArray +from xarray.core.indexing import ( + BasicIndexer, + ExplicitIndexer, + IndexingSupport, + LazilyIndexedArray, + OuterIndexer, + VectorizedIndexer, + explicit_indexing_adapter, +) +from xarray.core.utils import Frozen + +from .. import DenseNDArray +from ..options._soma_tiledb_context import SOMATileDBContext + + +class DenseNDArrayWrapper(BackendArray): # type: ignore + """Wraps an SOMA DenseNDArray for xarray variable/DataArray support. + + Note: This class does not open or close the SOMA array. The array must be already + opened for reading to be used with this wrapper. + """ + + def __init__( + self, + array: DenseNDArray, + ): + self._array = array + self._dtype: np.typing.DTypeLike = self._array.schema.field( + "soma_data" + ).type.to_pandas_dtype() + + def _raw_indexing_method( + self, key: Tuple[Union[slice, int], ...] + ) -> np.typing.NDArray[Any]: + """Returns a numpy array containing data from a requested tuple.""" + assert len(key) == len(self.shape) # Should be verified by xarray. + + # Compute the expected Xarray output shape. + output_shape = tuple( + len(range(dim_size)[index]) # type: ignore + for dim_size, index in zip(self.shape, key) + if not np.isscalar(index) + ) + + def update_int_index(index: int, dim_size: int) -> int: + """Convert xarray integer index to a SOMA-compatible integer index. + + - Convert negative indices to appropriate positive position. + """ + if not -dim_size <= index < dim_size: + raise IndexError( + f"Index {index} out of bounds for dimension with size {dim_size}." + ) + return index if index >= 0 else index + dim_size - 1 + + def update_slice_index(index: slice, dim_size: int) -> slice: + """Convert xarray slice index to a SOMA-compatible slice index. + + - Throw error for slice step != 1. + - Convert negative indices to appropriate positive position. + - Convert upper index to be inclusive (SOMA/TileDB) instead of + exclusive (numpy/xarray). + """ + if index.step not in (1, None): + raise ValueError("Slice steps are not supported.") + _index = range(dim_size)[index] # Convert negative values to positive. + return slice(_index.start, _index.stop - 1) + + key = tuple( + ( + update_slice_index(index, dim_size) + if isinstance(index, slice) + else update_int_index(index, dim_size) + ) + for index, dim_size in zip(key, self.shape) + ) + + # Read the data from SOMA, convert to numpy, and reshape. + result = self._array.read(key).to_numpy() + return result.reshape(output_shape) # type: ignore + + def __getitem__(self, key: ExplicitIndexer) -> np.typing.NDArray[Any]: + """Returns data from SOMA DenseNDArray using xarray-style indexing.""" + + # If any of the slices have steps, convert the key to a vectorized + # indexer and the slice to a numpy array. + def has_step(x: Any) -> bool: + return isinstance(x, slice) and x.step not in (1, None) + + if any(has_step(index) for index in key.tuple): + + key_tuple = tuple( + ( + np.arange(index.start, index.stop, index.step) + if has_step(index) + else index + ) + for index, dim_size in zip(key.tuple, self.shape) + ) + + key = ( + OuterIndexer(key_tuple) + if isinstance(key, BasicIndexer) or isinstance(key, OuterIndexer) + else VectorizedIndexer(key_tuple) + ) + + # Use xarray indexing adapter to further partition indicies. + return explicit_indexing_adapter( # type: ignore + key, + self.shape, + IndexingSupport.BASIC, + self._raw_indexing_method, + ) + + @property + def dtype(self) -> np.typing.DTypeLike: + """Numpy dtype of the array data.""" + return self._dtype + + @property + def shape(self) -> Tuple[int, ...]: + """Shape of the wrapped SOMA DenseNDArray.""" + return self._array.shape + + +class DenseNDArrayDatastore(AbstractDataStore): # type: ignore + """Xarray datastore for reading a SOMA DenseNDArray. + + This class can be used to read an Xarray ``Dataset`` using the + ``xarray.Dataset.load_store`` class method. The SOMA DenseNDArray will be opened + when this class is initialized and closed by the ``close`` method. It will always + create a ``Dataset`` with exactly one ``DataArray``. + + Args: + uri: The URI of the :class:`DenseNDarray` to open. + variable_name: Name to use for the ``DataArray`` holding this + :class:`DenseNDArray`. Defaults to ``soma_data``. + dim_names: Name to use for the dimensions. Defaults to the + :class:`DenseNDArray` dimension names (e.g. ``("soma_dim0", "soma_dim1")``). + context: The Context value to use when opening the object. Defaults to ``None``. + platform_config: Platform configuration options specific to this open operation. + Defaults to ``None``. + """ + + __slots__ = ( + "_soma_array", + "_attrs", + "_dim_names", + "_variable_name", + ) + + def __init__( + self, + uri: str, + *, + variable_name: str = "soma_data", + dim_names: Optional[str] = None, + attrs: Optional[Mapping[str, Any]] = None, + context: Optional[SOMATileDBContext] = None, + platform_config: Optional[options.PlatformConfig] = None, + ): + """Initialize and open the data store.""" + self._soma_array = DenseNDArray.open( + uri, + mode="r", + context=context, + platform_config=platform_config, + ) + self._variable_name = variable_name + self._dim_names = ( + tuple(f"soma_dim_{enum}" for enum in range(self._soma_array.ndim)) + if dim_names is None + else dim_names + ) + self._attrs = Frozen(dict()) if attrs is None else Frozen(attrs) + + def close(self) -> None: + """Close the data store.""" + self._soma_array.close() + + def load(self) -> Tuple[Frozen[str, Any], Frozen[str, Any]]: + """Returns a dictionary of ``xarray.Variable`` objects and a dictionary of + ``xarray.Attribute`` objects. + """ + variables = Frozen( + { + self._variable_name: Variable( + self._dim_names, + LazilyIndexedArray(DenseNDArrayWrapper(self._soma_array)), + attrs={ + key: val + for key, val in self._soma_array.metadata.items() + if not key.startswith("soma_") + }, + ) + } + ) + return variables, self._attrs diff --git a/apis/python/tests/test_basic_xarray_io.py b/apis/python/tests/test_basic_xarray_io.py new file mode 100644 index 0000000000..4d79aa1f62 --- /dev/null +++ b/apis/python/tests/test_basic_xarray_io.py @@ -0,0 +1,197 @@ +from urllib.parse import urljoin + +import numpy as np +import pyarrow as pa +import pytest + +import tiledbsoma as soma + +soma_xarray = pytest.importorskip("tiledbsoma.experimental._xarray_backend") +xr = pytest.importorskip("xarray") + + +@pytest.fixture(scope="module") +def sample_1d_dense_array(tmp_path_factory): + """Create a sample DenseNDArray with data and return open for reading.""" + baseuri = tmp_path_factory.mktemp("basic_xarray_io").as_uri() + array_uri = urljoin(baseuri, "sample_1d_dense_array") + array = soma.DenseNDArray.create(array_uri, type=pa.uint8(), shape=(8,)) + data = pa.Tensor.from_numpy(np.arange(8, dtype=np.uint8)) + array.write((None,), data) + array.close() + array = soma.DenseNDArray.open(array_uri) + return array + + +@pytest.fixture(scope="module") +def sample_soma_3d_dense_array(tmp_path_factory): + """Create a sample DenseNDArray with data and return open for reading.""" + baseuri = tmp_path_factory.mktemp("basic_xarray_io").as_uri() + array_uri = urljoin(baseuri, "sample_3d_dense_array") + array = soma.DenseNDArray.create(array_uri, type=pa.uint8(), shape=(8, 2, 4)) + data = pa.Tensor.from_numpy(np.reshape(np.arange(64, dtype=np.uint8), (8, 2, 4))) + array.write((None,), data) + array.close() + array = soma.DenseNDArray.open(array_uri) + return array + + +class TestDenseNDVariable1D: + + @pytest.fixture(scope="class") + def xr_soma_variable(self, sample_1d_dense_array): + array_wrapper = soma_xarray.DenseNDArrayWrapper(sample_1d_dense_array) + assert isinstance(array_wrapper, xr.backends.BackendArray) + return xr.Variable( + ("xdim",), xr.core.indexing.LazilyIndexedArray(array_wrapper) + ) + + @pytest.fixture(scope="class") + def xr_numpy_variable(self): + data = np.arange(8, dtype=np.uint8) + return xr.Variable(("xdim",), data) + + def test_variable_basics(self, xr_soma_variable): + assert xr_soma_variable.shape == (8,) + assert xr_soma_variable.dims == ("xdim",) + assert isinstance(xr_soma_variable._data, xr.core.indexing.LazilyIndexedArray) + + def test_getitem_all(self, xr_soma_variable, xr_numpy_variable): + expected = xr_numpy_variable[...] + + actual = xr_soma_variable[...] + assert actual.shape == (8,) + np.testing.assert_equal(actual.data, expected.data) + + actual = xr_soma_variable[:] + assert actual.shape == (8,) + np.testing.assert_equal(actual.data, expected.data) + + @pytest.mark.parametrize( + "key", + [ + (2,), + (-1,), + (slice(1, 2),), + (slice(None, None),), + (slice(1, -1),), + (slice(0, 4, 2),), + ([0, 3],), + ([0, -3],), + ], + ) + def test_getitem(self, xr_soma_variable, xr_numpy_variable, key): + actual = xr_soma_variable[key] + expected = xr_numpy_variable[key] + + assert actual.dims == expected.dims + assert actual.shape == expected.shape + + np.testing.assert_equal(actual.data, expected.data) + + +class TestDenseNDVariable3D: + + @pytest.fixture(scope="class") + def xr_soma_variable(self, sample_soma_3d_dense_array): + array_wrapper = soma_xarray.DenseNDArrayWrapper(sample_soma_3d_dense_array) + assert isinstance(array_wrapper, xr.backends.BackendArray) + return xr.Variable( + ("xdim", "ydim", "zdim"), xr.core.indexing.LazilyIndexedArray(array_wrapper) + ) + + @pytest.fixture(scope="class") + def xr_numpy_variable(self): + data = np.reshape(np.arange(64, dtype=np.uint8), (8, 2, 4)) + return xr.Variable(("xdim", "ydim", "zdim"), data) + + def test_variable_basics(self, xr_soma_variable): + assert xr_soma_variable.shape == (8, 2, 4) + assert xr_soma_variable.dims == ("xdim", "ydim", "zdim") + assert isinstance(xr_soma_variable._data, xr.core.indexing.LazilyIndexedArray) + + def test_getitem_all(self, xr_soma_variable, xr_numpy_variable): + expected = xr_numpy_variable[...] + + actual = xr_soma_variable[...] + assert actual.shape == (8, 2, 4) + np.testing.assert_equal(actual.data, expected.data) + + actual = xr_soma_variable[:] + assert actual.shape == (8, 2, 4) + np.testing.assert_equal(actual.data, expected.data) + + actual = xr_soma_variable[:, :, :] + assert actual.shape == (8, 2, 4) + np.testing.assert_equal(actual.data, expected.data) + + @pytest.mark.parametrize( + "key", + [ + (2, 1, 3), + (0, 0, slice(1, 2)), + ( + 2, + 1, + slice( + None, + ), + ), + (0, slice(None, None), 1), + ([1, 3], 1, 1), + ([1, 3], 1, [0, 2]), + (-1, -1, -1), + (0, 1, slice(0, 4, 2)), + ([0, 3, 4], [0, 1, 0], [1, 3, 2]), + (slice(0, 5, 2), [0, 1, 0], [1, 3, 2]), + (..., slice(0, 3)), + (slice(1, 2), ...), + ], + ) + def test_getitem(self, xr_soma_variable, xr_numpy_variable, key): + actual = xr_soma_variable[key] + expected = xr_numpy_variable[key] + + assert actual.dims == expected.dims + assert actual.shape == expected.shape + + np.testing.assert_equal(actual.data, expected.data) + + +def test_1d_xarray_dataset(sample_1d_dense_array): + datastore = soma_xarray.DenseNDArrayDatastore( + sample_1d_dense_array.uri, attrs={"test1": 1, "test2": 2} + ) + ds = xr.Dataset.load_store(datastore) + assert isinstance(ds, xr.Dataset) + assert len(ds.attrs) == 2 + assert ds.attrs == {"test1": 1, "test2": 2} + + assert len(ds.data_vars) == 1 + assert "soma_data" in ds + + data = ds["soma_data"] + assert isinstance(data, xr.DataArray) + assert data.dims == ("soma_dim_0",) + assert len(data.attrs) == 0 + + expected = np.arange(8, dtype=np.uint8) + np.testing.assert_equal(data.data, expected) + + +def test_3d_xarray_dataset(sample_soma_3d_dense_array): + datastore = soma_xarray.DenseNDArrayDatastore(sample_soma_3d_dense_array.uri) + ds = xr.Dataset.load_store(datastore) + assert isinstance(ds, xr.Dataset) + assert len(ds.attrs) == 0 + + assert len(ds.data_vars) == 1 + assert "soma_data" in ds + + data = ds["soma_data"] + assert isinstance(data, xr.DataArray) + assert data.dims == ("soma_dim_0", "soma_dim_1", "soma_dim_2") + assert len(data.attrs) == 0 + + expected = np.reshape(np.arange(64, dtype=np.uint8), (8, 2, 4)) + np.testing.assert_equal(data.data, expected)