Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Backport release-1.7] [python] tiledbsoma.io.update_matrix #2130

Merged
merged 1 commit into from
Feb 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions apis/python/src/tiledbsoma/io/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
register_h5ads,
to_anndata,
to_h5ad,
update_matrix,
update_obs,
update_var,
)
Expand All @@ -28,6 +29,7 @@
"register_anndatas",
"to_anndata",
"to_h5ad",
"update_matrix",
"update_obs",
"update_var",
)
90 changes: 90 additions & 0 deletions apis/python/src/tiledbsoma/io/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -1637,6 +1637,96 @@ def _update_dataframe(
)


def update_matrix(
    soma_ndarray: Union[SparseNDArray, DenseNDArray],
    new_data: Union[Matrix, h5py.Dataset],
    *,
    context: Optional[SOMATileDBContext] = None,
    platform_config: Optional[PlatformConfig] = None,
) -> None:
    """
    Writes ``new_data`` into a ``SparseNDArray`` or ``DenseNDArray`` that the
    caller has already opened for write. The intended use-case is replacing
    updated numerical values in place. No shape validation is performed here:
    the caller must ensure the shape of the data being written matches that of
    the data already stored.

    Example:

        with tiledbsoma.Experiment.open(uri, "w") as exp:
            tiledbsoma.io.update_matrix(
                exp.ms["RNA"].X["data"],
                adata.X,
            )

    Args:
        soma_ndarray: a ``SparseNDArray`` or ``DenseNDArray`` already opened
            for write.

        new_data: For a sparse ``soma_ndarray``, a Scipy CSR/CSC matrix or
            AnnData ``SparseDataset``. For a dense ``soma_ndarray``, a NumPy
            NDArray.

        context: Optional :class:`SOMATileDBContext` containing storage
            parameters, etc.

        platform_config: Platform-specific options used to update this array,
            provided in the form
            ``{"tiledb": {"create": {"dataframe_dim_zstd_level": 7}}}``

    Returns:
        None

    Raises:
        TypeError: if ``soma_ndarray`` is neither a ``DenseNDArray`` nor a
            ``SparseNDArray``.

    Lifecycle:
        Experimental.
    """

    # Developer note -- why shape/bounding box is not (and cannot be) checked:
    #
    # * The TileDB-SOMA "shape" may be huge x huge to leave "room for growth";
    #   it does not track the user-level "shape" and is not intended to.
    # * The TileDB-SOMA bounding box is meant to track the user-level "shape",
    #   but it is not thread-safe and may be incorrect. Please see
    #   https://github.com/single-cell-data/TileDB-SOMA/issues/1969
    #   https://github.com/single-cell-data/TileDB-SOMA/issues/1971

    start_stamp = _util.get_start_stamp()
    logging.log_io(
        f"Writing {soma_ndarray.uri}",
        f"START UPDATING {soma_ndarray.uri}",
    )

    write_params = IngestionParams("write", None)

    # Reject unsupported array types before doing any further work.
    if not isinstance(soma_ndarray, (DenseNDArray, SparseNDArray)):
        raise TypeError(f"unknown array type {type(soma_ndarray)}")

    # Keyword arguments common to the dense and sparse writers.
    shared_kwargs = {
        "tiledb_create_options": TileDBCreateOptions.from_platform_config(
            platform_config
        ),
        "context": context,
        "ingestion_params": write_params,
    }

    if isinstance(soma_ndarray, DenseNDArray):
        _write_matrix_to_denseNDArray(soma_ndarray, new_data, **shared_kwargs)
    else:  # SOMASparseNDArray
        # Rows and columns are written at their existing positions, hence the
        # identity mappings sized from the incoming data.
        _write_matrix_to_sparseNDArray(
            soma_ndarray,
            new_data,
            **shared_kwargs,
            axis_0_mapping=AxisIDMapping.identity(new_data.shape[0]),
            axis_1_mapping=AxisIDMapping.identity(new_data.shape[1]),
        )

    logging.log_io(
        f"Wrote {soma_ndarray.uri}",
        _util.format_elapsed(start_stamp, f"FINISH UPDATING {soma_ndarray.uri}"),
    )


def add_X_layer(
exp: Experiment,
measurement_name: str,
Expand Down
83 changes: 83 additions & 0 deletions apis/python/tests/test_update_matrix.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
import tempfile
from pathlib import Path

import anndata
import pytest

import tiledbsoma
import tiledbsoma.io

HERE = Path(__file__).parent


@pytest.fixture
def h5ad_file(request):
    """Return the path to the ``pbmc3k_processed.h5ad`` test-data file."""
    return HERE.parent.joinpath("testdata/pbmc3k_processed.h5ad")


@pytest.fixture
def adata(h5ad_file):
    """Load the H5AD file supplied by the ``h5ad_file`` fixture as an AnnData object."""
    loaded = anndata.read_h5ad(h5ad_file)
    return loaded


def test_update_matrix_X(adata):
    """
    Ingest ``adata``, overwrite ``X["data"]`` with ``adata.X + 1`` via
    ``tiledbsoma.io.update_matrix``, and check that the coordinates are
    unchanged while the stored values differ.
    """
    # Number of stored cells expected in X["data"] both before and after the update.
    expected_count = 4848644

    # Use the context manager so the temporary directory is removed
    # deterministically, even when an assertion fails.
    with tempfile.TemporaryDirectory() as output_path:
        tiledbsoma.io.from_anndata(output_path, adata, measurement_name="RNA")

        with tiledbsoma.Experiment.open(output_path) as exp:
            old = exp.ms["RNA"].X["data"].read().tables().concat()

        assert len(old["soma_dim_0"]) == expected_count
        assert len(old["soma_dim_1"]) == expected_count
        assert len(old["soma_data"]) == expected_count

        with tiledbsoma.Experiment.open(output_path, "w") as exp:
            tiledbsoma.io.update_matrix(
                exp.ms["RNA"].X["data"],
                adata.X + 1,
            )

        with tiledbsoma.Experiment.open(output_path) as exp:
            new = exp.ms["RNA"].X["data"].read().tables().concat()

        assert len(new["soma_dim_0"]) == expected_count
        assert len(new["soma_dim_1"]) == expected_count
        assert len(new["soma_data"]) == expected_count

        # Same coordinates, different values.
        assert old["soma_dim_0"] == new["soma_dim_0"]
        assert old["soma_dim_1"] == new["soma_dim_1"]
        assert old["soma_data"] != new["soma_data"]


def test_update_matrix_obsm(adata):
    """
    Ingest ``adata``, overwrite ``obsm["X_pca"]`` with ``adata.obsm["X_pca"] + 1``
    via ``tiledbsoma.io.update_matrix``, and check that the coordinates are
    unchanged while the stored values differ.
    """
    # Number of stored cells expected in obsm["X_pca"] both before and after the update.
    expected_count = 131900

    # Use the context manager so the temporary directory is removed
    # deterministically, even when an assertion fails.
    with tempfile.TemporaryDirectory() as output_path:
        tiledbsoma.io.from_anndata(output_path, adata, measurement_name="RNA")

        with tiledbsoma.Experiment.open(output_path) as exp:
            old = exp.ms["RNA"].obsm["X_pca"].read().tables().concat()

        assert len(old["soma_dim_0"]) == expected_count
        assert len(old["soma_dim_1"]) == expected_count
        assert len(old["soma_data"]) == expected_count

        with tiledbsoma.Experiment.open(output_path, "w") as exp:
            tiledbsoma.io.update_matrix(
                exp.ms["RNA"].obsm["X_pca"],
                adata.obsm["X_pca"] + 1,
            )

        with tiledbsoma.Experiment.open(output_path) as exp:
            new = exp.ms["RNA"].obsm["X_pca"].read().tables().concat()

        assert len(new["soma_dim_0"]) == expected_count
        assert len(new["soma_dim_1"]) == expected_count
        assert len(new["soma_data"]) == expected_count

        # Same coordinates, different values.
        assert old["soma_dim_0"] == new["soma_dim_0"]
        assert old["soma_dim_1"] == new["soma_dim_1"]
        assert old["soma_data"] != new["soma_data"]
Loading