Skip to content

Commit

Permalink
[python] Leverage bounding-box feature for obsm/varm outgest robustne…
Browse files Browse the repository at this point in the history
…ss (#1650)

* temp

* robustness

* extract method for obsm/varm outgest

* complete rebase to main

* more unit-test cases

* remove R debugs

* robustness

* complete rebase to main

* [python] Leverage bounding-box feature for obsm/varm outgest robustness

* test data for holey obsm

* unit-test cases

* on-line help improvements
  • Loading branch information
johnkerl authored and nguyenv committed Sep 11, 2023
1 parent 6efcd85 commit 0687b16
Show file tree
Hide file tree
Showing 3 changed files with 89 additions and 0 deletions.
25 changes: 25 additions & 0 deletions apis/python/testdata/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,3 +60,28 @@ ann.obsp = None
ann.X = scipy.sparse.csc_matrix(ann.X)
ann.write_h5ad('pbmc-small-x-csc.h5ad')
```

# Sparsely occupied `obsm`

For testing https://github.com/single-cell-data/TileDB-SOMA/pull/1650, we force a zero into an
`obsm` matrix so that, when it is stored as sparse, its `nnz` is less than `num_rows * num_cols`:

```
>>> import anndata as ad
>>> adata = ad.read_h5ad('pbmc3k.h5ad')
>>> adata.obsm
AxisArrays with keys: X_pca, X_tsne, X_umap, X_draw_graph_fr
>>> o = adata.obsm["X_pca"]
>>> o.shape
(2638, 50)
>>> o[0][0] = 0
>>> adata.obsm["X_pca"] = o
>>> adata.write_h5ad('pbmc3k-with-obsm-zero.h5ad')
```
Binary file added apis/python/testdata/pbmc3k-with-obsm-zero.h5ad
Binary file not shown.
64 changes: 64 additions & 0 deletions apis/python/tests/test_basic_anndata_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,13 @@ def h5ad_file_extended(request):
return input_path


@pytest.fixture
def h5ad_file_with_obsm_holes(request):
    """Return the path of the H5AD test file with zeroes in an obsm matrix.

    Because of those zeroes, the matrix's nnz is not num_rows * num_cols.
    """
    return HERE.parent / "testdata/pbmc3k-with-obsm-zero.h5ad"


@pytest.fixture
def h5ad_file_uns_string_array(request):
# This has uns["louvain_colors"] with dtype.char == "U"
Expand Down Expand Up @@ -641,3 +648,60 @@ def test_obs_with_categorical_int_nan_enumeration(
tiledbsoma.io.from_h5ad(
output_path, h5ad_file_categorical_int_nan, measurement_name="RNA"
)


def test_export_obsm_with_holes(h5ad_file_with_obsm_holes, tmp_path):
    """Check obsm outgest to AnnData with and without bounding-box metadata.

    The input is ingested with ``tiledbsoma.io.from_anndata``; the test then
    verifies that:

    * the bounding-box metadata keys are present on the ``X_pca`` array and
      outgest yields the original shape;
    * once those keys are deleted, outgest without hints raises
      ``tiledbsoma.SOMAError``;
    * outgest with ``obsm_varm_width_hints`` yields the original shape again.
    """
    adata = anndata.read_h5ad(h5ad_file_with_obsm_holes.as_posix())

    # This data file is prepared such that obsm["X_pca"] has shape (2638, 50)
    # but its [0][0] element is a 0, so when it's stored as sparse, its nnz
    # is not 2638*50=131900.
    assert adata.obsm["X_pca"].shape == (2638, 50)

    output_path = tmp_path.as_posix()
    tiledbsoma.io.from_anndata(output_path, adata, "RNA")

    # Metadata keys that together make up the bounding box.
    bounding_box_keys = (
        "soma_dim_0_domain_lower",
        "soma_dim_0_domain_upper",
        "soma_dim_1_domain_lower",
        "soma_dim_1_domain_upper",
    )

    # Context managers ensure each Experiment handle is closed rather than
    # leaked when the experiment is re-opened below.
    with tiledbsoma.Experiment.open(output_path) as exp:
        obsm_uri = exp.ms["RNA"].obsm["X_pca"].uri

        # Verify the bounding box on the SOMA SparseNDArray
        with tiledb.open(obsm_uri) as so:
            assert so.meta["soma_dim_0_domain_lower"] == 0
            assert so.meta["soma_dim_0_domain_upper"] == 2637
            assert so.meta["soma_dim_1_domain_lower"] == 0
            assert so.meta["soma_dim_1_domain_upper"] == 49

        # With the bounding box present, all is well for outgest to AnnData format.
        try1 = tiledbsoma.io.to_anndata(exp, "RNA")
        assert try1.obsm["X_pca"].shape == (2638, 50)

        # Now remove the bounding box to simulate reading older data that lacks a bounding box.
        with tiledb.open(obsm_uri, "w") as so:
            for key in bounding_box_keys:
                del so.meta[key]

    # Re-open to simulate opening afresh a bounding-box-free array.
    with tiledbsoma.Experiment.open(output_path) as exp:
        with tiledb.open(exp.ms["RNA"].obsm["X_pca"].uri) as so:
            for key in bounding_box_keys:
                with pytest.raises(KeyError):
                    so.meta[key]
            assert so.meta["soma_object_type"] == "SOMASparseNDArray"

        # Now try the remaining options for outgest.
        with pytest.raises(tiledbsoma.SOMAError):
            tiledbsoma.io.to_anndata(exp, "RNA")

        try3 = tiledbsoma.io.to_anndata(
            exp, "RNA", obsm_varm_width_hints={"obsm": {"X_pca": 50}}
        )
        assert try3.obsm["X_pca"].shape == (2638, 50)

0 comments on commit 0687b16

Please sign in to comment.