Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement SOMA-level dimension-slicing #102

Merged
merged 2 commits into from
May 20, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions apis/python/src/tiledbsc/annotation_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,19 @@ def keys(self) -> List[str]:
"""
return self.get_attr_names()

# ----------------------------------------------------------------
def dim_select(self, ids):
    """
    Reads rows of the dataframe stored at `self.uri`, keyed by `obs_ids` (for `obs`) or
    `var_ids` (for `var`).

    :param ids: dimension values to select, or `None` to read the entire dataframe.
    :return: whatever indexing the opened array's `.df` accessor yields for that selection.
    """
    # `None` means "no subselection", which is spelled as a full slice.
    selector = slice(None) if ids is None else ids
    with tiledb.open(self.uri) as A:  # TODO: with self.open
        return A.df[selector]

# ----------------------------------------------------------------
def from_dataframe(self, dataframe: pd.DataFrame, extent: int) -> None:
"""
Expand Down
45 changes: 29 additions & 16 deletions apis/python/src/tiledbsc/annotation_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,25 +31,17 @@ def __init__(
self.dim_name = dim_name

# ----------------------------------------------------------------
def from_anndata(self, matrix, dim_values):
def dim_select(self, ids):
"""
Populates an array in the obsm/ or varm/ subgroup for a SOMA object.

:param matrix: anndata.obsm['foo'], anndata.varm['foo'], or anndata.raw.varm['foo'].
:param dim_values: anndata.obs_names, anndata.var_names, or anndata.raw.var_names.
Selects a slice out of the array with specified `obs_ids` (for `obsm` elements) or
`var_ids` (for `varm` elements). If `ids` is `None`, the entire array is returned.
"""

if self.verbose:
s = util.get_start_stamp()
print(f"{self.indent}START WRITING {self.uri}")

if isinstance(matrix, pd.DataFrame):
self._from_pandas_dataframe(matrix, dim_values)
if ids is None:
with tiledb.open(self.uri) as A: # TODO: with self.open
return A.df[:]
else:
self._numpy_ndarray_or_scipy_sparse_csr_matrix(matrix, dim_values)

if self.verbose:
print(util.format_elapsed(s, f"{self.indent}FINISH WRITING {self.uri}"))
with tiledb.open(self.uri) as A: # TODO: with self.open
return A.df[ids]

# ----------------------------------------------------------------
def shape(self):
Expand All @@ -67,6 +59,27 @@ def shape(self):
num_cols = A.schema.nattr
return (num_rows, num_cols)

# ----------------------------------------------------------------
def from_anndata(self, matrix, dim_values):
    """
    Writes one obsm/ or varm/ element of a SOMA object, dispatching on the type of `matrix`.

    A `pd.DataFrame` is handed to `_from_pandas_dataframe`; anything else is handed to
    `_numpy_ndarray_or_scipy_sparse_csr_matrix`. When `self.verbose` is set, start and
    finish messages (the latter with elapsed time) are printed around the write.

    :param matrix: anndata.obsm['foo'], anndata.varm['foo'], or anndata.raw.varm['foo'].
    :param dim_values: anndata.obs_names, anndata.var_names, or anndata.raw.var_names.
    """
    if self.verbose:
        start_stamp = util.get_start_stamp()
        print(f"{self.indent}START WRITING {self.uri}")

    # Pick the writer first, then invoke it with the same arguments either way.
    writer = (
        self._from_pandas_dataframe
        if isinstance(matrix, pd.DataFrame)
        else self._numpy_ndarray_or_scipy_sparse_csr_matrix
    )
    writer(matrix, dim_values)

    if self.verbose:
        finish_message = f"{self.indent}FINISH WRITING {self.uri}"
        print(util.format_elapsed(start_stamp, finish_message))

# ----------------------------------------------------------------
def _numpy_ndarray_or_scipy_sparse_csr_matrix(self, matrix, dim_values):
# We do not have column names for anndata-provenance annotation matrices.
Expand Down
13 changes: 13 additions & 0 deletions apis/python/src/tiledbsc/annotation_pairwise_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,19 @@ def shape(self):
# * https://github.com/TileDB-Inc/TileDB-Py/pull/1055
return A.df[:].shape # nnz x 3 -- id_i, id_j, and value

# ----------------------------------------------------------------
def dim_select(self, ids):
    """
    Reads cells of the pairwise array stored at `self.uri`, applying the same selection to
    both dimensions: `obs_ids` for `obsp` elements, `var_ids` for `varp` elements.

    :param ids: dimension values to select on both dimensions, or `None` to read the
        entire array.
    :return: whatever indexing the opened array's `.df` accessor yields for that selection.
    """
    # `None` means "no subselection", which is spelled as a full slice on each dimension.
    selector = slice(None) if ids is None else ids
    with tiledb.open(self.uri) as A:  # TODO: with self.open
        return A.df[selector, selector]

# ----------------------------------------------------------------
def from_anndata(self, matrix, dim_values):
"""
Expand Down
19 changes: 19 additions & 0 deletions apis/python/src/tiledbsc/assay_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,25 @@ def __init__(
# We don't have a .shape() method since X is sparse. One should
# instead use the row-counts for the soma's obs and var.

# ----------------------------------------------------------------
def dim_select(self, obs_ids, var_ids):
    """
    Reads cells of the matrix stored at `self.uri`, optionally restricted by `obs_ids`
    (first dimension) and/or `var_ids` (second dimension).

    :param obs_ids: values to select along the first dimension, or `None` for no restriction.
    :param var_ids: values to select along the second dimension, or `None` for no restriction.
    :return: whatever indexing the opened array's `.df` accessor yields; with both
        arguments `None` this is the entire matrix.
    """
    # Each dimension is resolved independently: `None` becomes a full slice.
    obs_selector = slice(None) if obs_ids is None else obs_ids
    var_selector = slice(None) if var_ids is None else var_ids
    with tiledb.open(self.uri) as A:
        return A.df[obs_selector, var_selector]

# ----------------------------------------------------------------
def from_matrix(self, matrix, row_names, col_names) -> None:
"""
Expand Down
170 changes: 170 additions & 0 deletions apis/python/tests/test_dim_select.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
import anndata
import tiledb
import tiledbsc

import pytest
import tempfile
import os
from pathlib import Path

# Directory containing this test module; test-data paths below are built relative to it.
HERE = Path(__file__).parent


@pytest.fixture
def h5ad_file(request):
    """
    Path to the `pbmc-small.h5ad` input file, located relative to this test module.

    Tests in this file rely on specific values from this particular input data file.
    """
    return HERE.parent / "anndata/pbmc-small.h5ad"


@pytest.fixture
def adata(h5ad_file):
    """Loads the file named by the `h5ad_file` fixture via `anndata.read_h5ad`."""
    loaded = anndata.read_h5ad(h5ad_file)
    return loaded


def test_dim_select(adata):
    """
    Ingests the `adata` fixture into a SOMA under a temporary directory, then checks
    `dim_select` on `obs`, `var`, two `obsm` elements, and `X.data`, both with explicit
    ID lists and with `None` (no subselection).
    """

    # Use the temporary directory as a context manager so that it is removed even when
    # one of the assertions below fails.
    with tempfile.TemporaryDirectory() as output_path:

        # Ingest
        soma = tiledbsc.SOMA(output_path, verbose=True)
        soma.from_anndata(adata)

        assert soma.obs.ids() == [
            b"AAATTCGAATCACG",
            b"AAGCAAGAGCTTAG",
            b"AAGCGACTTTGACG",
            b"AATGCGTGGACGGA",
            b"AATGTTGACAGTCA",
            b"ACAGGTACTGGTGT",
            b"ACCAGTGAATACCG",
            b"ACGTGATGCCATGA",
            b"ACTCGCACGAAAGT",
            b"AGAGATGATCTCGC",
            b"AGATATACCCGTAA",
            b"AGGTCATGAGTGTC",
            b"AGTCAGACTGCACA",
            b"AGTCTTACTTCGGA",
            b"ATAAGTTGGTACGT",
            b"ATACCACTCTAAGC",
            b"ATAGGAGAAACAGA",
            b"ATCATCTGACACCA",
            b"ATGCCAGAACGACT",
            b"ATTACCTGCCTTAT",
            b"ATTCAGCTCATTGG",
            b"ATTGCACTTGCTTT",
            b"ATTGTAGATTCCCG",
            b"CATATAGACTAAGC",
            b"CATCAGGATGCACA",
            b"CATCATACGGAGCA",
            b"CATGAGACACGGGA",
            b"CATGCGCTAGTCAC",
            b"CATGGCCTGTGCAT",
            b"CATTACACCAACTG",
            b"CCATCCGATTCGCC",
            b"CCCAACTGCAATCG",
            b"CCTATAACGAGACG",
            b"CGGCACGAACTCAG",
            b"CGTAGCCTGTATGC",
            b"CTAAACCTCTGACA",
            b"CTAAACCTGTGCAT",
            b"CTAACGGAACCGAT",
            b"CTAGGTGATGGTTG",
            b"CTGCCAACAGGAGC",
            b"CTTCATGACCGAAT",
            b"CTTGATTGATCTTC",
            b"GAACCTGATGAACC",
            b"GACATTCTCCACCT",
            b"GACGCTCTCTCTCG",
            b"GAGTTGTGGTAGCT",
            b"GATAGAGAAGGGTG",
            b"GATAGAGATCACGA",
            b"GATATAACACGCAT",
            b"GCACTAGACCTTTA",
            b"GCAGCTCTGTTTCT",
            b"GCGCACGACTTTAC",
            b"GCGCATCTTGCTCC",
            b"GCGTAAACACGGTT",
            b"GCTCCATGAGAAGT",
            b"GGAACACTTCAGAC",
            b"GGCATATGCTTATC",
            b"GGCATATGGGGAGT",
            b"GGCCGATGTACTCT",
            b"GGGTAACTCTAGTG",
            b"GGTGGAGATTACTC",
            b"GTAAGCACTCATTC",
            b"GTCATACTTCGCCT",
            b"GTTGACGATATCGG",
            b"TACAATGATGCTAG",
            b"TACATCACGCTAAC",
            b"TACGCCACTCCGAA",
            b"TACTCTGAATCGAC",
            b"TAGGGACTGAACTC",
            b"TCCACTCTGAGCTT",
            b"TCTGATACACGTGT",
            b"TGACTGGATTCTCA",
            b"TGAGCTGAATGCTG",
            b"TGGTATCTAAACAG",
            b"TTACCATGAATCGC",
            b"TTACGTACGTTCAG",
            b"TTGAGGACTACGCA",
            b"TTGCATTGAGCTAC",
            b"TTGGTACTGAATCC",
            b"TTTAGCTGTACTCT",
        ]

        assert soma.var.ids() == [
            b"AKR1C3",
            b"CA2",
            b"CD1C",
            b"GNLY",
            b"HLA-DPB1",
            b"HLA-DQA1",
            b"IGLL5",
            b"MYL9",
            b"PARVB",
            b"PF4",
            b"PGRMC1",
            b"PPBP",
            b"RP11-290F20.3",
            b"RUFY1",
            b"S100A8",
            b"S100A9",
            b"SDPR",
            b"TREML1",
            b"TUBB1",
            b"VDAC3",
        ]

        # obs: two selected rows, then the full dataframe.
        df = soma.obs.dim_select([b"AAGCGACTTTGACG", b"AATGCGTGGACGGA"])
        assert df.shape == (2, 7)
        assert df.at["AAGCGACTTTGACG", "groups"] == "g1"
        assert df.at["AATGCGTGGACGGA", "nFeature_RNA"] == 73
        #                 orig.ident  nCount_RNA  nFeature_RNA  RNA_snn_res.0.8  letter.idents  groups  RNA_snn_res.1
        # obs_id
        # AAGCGACTTTGACG           0       443.0            77                1              1      g1              1
        # AATGCGTGGACGGA           0       389.0            73                1              1      g1              1
        assert soma.obs.dim_select(None).shape == (80, 7)

        # var: two selected rows, then the full dataframe.
        df = soma.var.dim_select([b"AKR1C3", b"MYL9"])
        assert df.shape == (2, 5)
        assert df.at["AKR1C3", "vst.variable"] == 1
        assert df.at["MYL9", "vst.variable"] == 1
        assert soma.var.dim_select(None).shape == (20, 5)

        assert sorted(soma.obsm.keys()) == sorted(["X_tsne", "X_pca"])

        # obsm: same two obs IDs against each annotation matrix.
        df = soma.obsm["X_tsne"].dim_select([b"AAGCGACTTTGACG", b"AATGCGTGGACGGA"])
        assert df.shape == (2, 3)

        df = soma.obsm["X_pca"].dim_select([b"AAGCGACTTTGACG", b"AATGCGTGGACGGA"])
        assert df.shape == (2, 20)

        # X: every combination of restricted / unrestricted obs and var dimensions.
        assert soma.X.data.dim_select([b"AAGCGACTTTGACG"], [b"AKR1C3"]).shape == (1, 3)
        assert soma.X.data.dim_select(None, [b"AKR1C3"]).shape == (80, 3)
        assert soma.X.data.dim_select([b"AAGCGACTTTGACG"], None).shape == (20, 3)
        assert soma.X.data.dim_select(None, None).shape == (1600, 3)