Skip to content

Commit

Permalink
SOMA-component dataframe/schema accessors (#104)
Browse files Browse the repository at this point in the history
* matrix.df() accessors

* unit-test cases

* soma.foo.schema() -> soma.foo.tiledb_array_schema()
  • Loading branch information
johnkerl authored May 20, 2022
1 parent f88be47 commit 89923c1
Show file tree
Hide file tree
Showing 6 changed files with 130 additions and 13 deletions.
12 changes: 10 additions & 2 deletions apis/python/src/tiledbsc/annotation_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,9 @@ def ids(self) -> List[str]:
def keys(self) -> List[str]:
"""
Returns the column names for the `obs` or `var` dataframe. For obs and var, `.keys()` is a
keystroke-saver for the more general array-schema accessor `get_attr_names`.
keystroke-saver for the more general array-schema accessor `attr_names`.
"""
return self.get_attr_names()
return self.attr_names()

# ----------------------------------------------------------------
def dim_select(self, ids):
Expand All @@ -78,6 +78,14 @@ def dim_select(self, ids):
with tiledb.open(self.uri) as A: # TODO: with self.open
return A.df[ids]

# ----------------------------------------------------------------
def df(self, ids=None) -> pd.DataFrame:
    """
    Shorthand for `.dim_select()`: `ids` is forwarded unchanged and the result of
    `.dim_select()` is returned as-is. When `ids` are given they are used to subselect;
    when omitted (`None`), the entire dataframe is returned.
    """
    selected = self.dim_select(ids)
    return selected

# ----------------------------------------------------------------
# TODO: this is a v1 for prototype/demo timeframe -- needs expanding.
def attribute_filter(self, query_string, col_names_to_keep):
Expand Down
8 changes: 8 additions & 0 deletions apis/python/src/tiledbsc/annotation_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,14 @@ def dim_select(self, ids):
with tiledb.open(self.uri) as A: # TODO: with self.open
return A.df[ids]

# ----------------------------------------------------------------
def df(self, ids=None) -> pd.DataFrame:
    """
    Shorthand for `.dim_select()`: `ids` is forwarded unchanged and the result of
    `.dim_select()` is returned as-is. When `ids` are given they are used to subselect;
    when omitted (`None`), the entire dataframe is returned.
    """
    selected = self.dim_select(ids)
    return selected

# ----------------------------------------------------------------
def shape(self):
"""
Expand Down
8 changes: 8 additions & 0 deletions apis/python/src/tiledbsc/annotation_pairwise_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,14 @@ def dim_select(self, ids):
with tiledb.open(self.uri) as A: # TODO: with self.open
return A.df[ids, ids]

# ----------------------------------------------------------------
def df(self, ids=None) -> pd.DataFrame:
    """
    Shorthand for `.dim_select()`: `ids` is forwarded unchanged and the result of
    `.dim_select()` is returned as-is. When `ids` are given they are used to subselect;
    when omitted (`None`), the entire dataframe is returned.
    """
    selected = self.dim_select(ids)
    return selected

# ----------------------------------------------------------------
def from_anndata(self, matrix, dim_values):
"""
Expand Down
9 changes: 9 additions & 0 deletions apis/python/src/tiledbsc/assay_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import scipy
import numpy as np
import pandas as pd

from typing import Optional
import time
Expand Down Expand Up @@ -63,6 +64,14 @@ def dim_select(self, obs_ids, var_ids):
else:
return A.df[obs_ids, var_ids]

# ----------------------------------------------------------------
def df(self, obs_ids=None, var_ids=None) -> pd.DataFrame:
    """
    Shorthand for `.dim_select()`: `obs_ids` and `var_ids` are forwarded unchanged and
    the result of `.dim_select()` is returned as-is. Whichever of the two is given is
    used to subselect; when neither is given, the entire dataframe is returned.
    """
    selected = self.dim_select(obs_ids, var_ids)
    return selected

# ----------------------------------------------------------------
def from_matrix(self, matrix, row_names, col_names) -> None:
"""
Expand Down
30 changes: 25 additions & 5 deletions apis/python/src/tiledbsc/tiledb_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,14 @@
from .tiledb_object import TileDBObject
from .tiledb_group import TileDBGroup

from typing import Optional, List
from typing import Optional, List, Dict


class TileDBArray(TileDBObject):
"""
Wraps arrays from TileDB-Py by retaining a URI, verbose flag, etc.
Also serves as an abstraction layer to hide TileDB-specific details from the API, unless
requested.
"""

def __init__(
Expand Down Expand Up @@ -37,29 +39,47 @@ def open(self):
A = tiledb.open(self.uri)
return A

def schema(self):
def tiledb_array_schema(self):
"""
Returns the TileDB array schema.
"""
with tiledb.open(self.uri) as A:
return A.schema

def get_dim_names(self) -> List[str]:
def dim_names(self) -> List[str]:
"""
Reads the dimension names from the schema: for example, ['obs_id', 'var_id'].
"""
with tiledb.open(self.uri) as A:
return [A.schema.domain.dim(i).name for i in range(A.schema.domain.ndim)]

def get_attr_names(self) -> List[str]:
def dim_names_to_types(self) -> Dict[str, object]:
    """
    Returns a dict mapping from dimension name to dimension type, with keys in the
    order the dimensions appear in the schema's domain.

    The values are the `dtype` objects reported by the TileDB schema for each dimension
    (NumPy dtypes in TileDB-Py), not strings; apply `str()` to a value if a type name
    is wanted.
    """
    with tiledb.open(self.uri) as A:
        dom = A.schema.domain
        # Look each dimension up once rather than once for the name and once for the dtype.
        dims = [dom.dim(i) for i in range(dom.ndim)]
        return {dim.name: dim.dtype for dim in dims}

def attr_names(self) -> List[str]:
"""
Reads the attribute names from the schema: for example, the list of column names in a dataframe.
"""
with tiledb.open(self.uri) as A:
return [A.schema.attr(i).name for i in range(A.schema.nattr)]

def attr_names_to_types(self) -> Dict[str, object]:
    """
    Returns a dict mapping from attribute name to attribute type, with keys in the
    order the attributes appear in the schema.

    The values are the `dtype` objects reported by the TileDB schema for each attribute
    (NumPy dtypes in TileDB-Py), not strings; apply `str()` to a value if a type name
    is wanted.
    """
    with tiledb.open(self.uri) as A:
        schema = A.schema
        # Look each attribute up once rather than once for the name and once for the dtype.
        attrs = [schema.attr(i) for i in range(schema.nattr)]
        return {attr.name: attr.dtype for attr in attrs}

def has_attr_name(self, attr_name: str) -> bool:
"""
Returns true if the array has the specified attribute name, false otherwise.
"""
return attr_name in self.get_attr_names()
return attr_name in self.attr_names()
76 changes: 70 additions & 6 deletions apis/python/tests/test_soma_group_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import tiledb
import tiledbsc

import numpy as np

import pytest
import tempfile
import os
Expand Down Expand Up @@ -48,9 +50,19 @@ def test_soma_group_indexing(h5ad_file):
["uns", "varm", "X", "raw", "obsp", "varp", "var", "obsm", "obs"]
)
assert set(soma.X.get_member_names()) == set(["data"])
assert soma.X.data.get_dim_names() == ["obs_id", "var_id"]
assert soma.X.data.dim_names() == ["obs_id", "var_id"]

assert soma.obs.get_dim_names() == ["obs_id"]
assert soma.obs.dim_names() == ["obs_id"]
assert soma.obs.dim_name == "obs_id"
assert soma.obs.keys() == [
"orig.ident",
"nCount_RNA",
"nFeature_RNA",
"RNA_snn_res.0.8",
"letter.idents",
"groups",
"RNA_snn_res.1",
]
assert set(soma.obs.ids()) == set(
[
b"AAATTCGAATCACG",
Expand Down Expand Up @@ -135,8 +147,27 @@ def test_soma_group_indexing(h5ad_file):
b"TTTAGCTGTACTCT",
]
)
assert soma.obs.df().shape == (80, 7)
assert soma.obs.df(["AAGCAAGAGCTTAG", "TTGGTACTGAATCC"]).shape == (2, 7)
assert list(soma.obs.df().dtypes) == [
np.dtype("int32"),
np.dtype("float64"),
np.dtype("int32"),
np.dtype("int32"),
np.dtype("int32"),
np.dtype("O"),
np.dtype("int32"),
]

assert soma.var.get_dim_names() == ["var_id"]
assert soma.var.dim_names() == ["var_id"]
assert soma.obs.dim_name == "obs_id"
assert soma.var.keys() == [
"vst.mean",
"vst.variance",
"vst.variance.expected",
"vst.variance.standardized",
"vst.variable",
]
assert set(soma.var.ids()) == set(
[
b"AKR1C3",
Expand All @@ -161,22 +192,55 @@ def test_soma_group_indexing(h5ad_file):
b"VDAC3",
]
)
assert soma.var.shape() == (20, 5)
assert soma.var.df(["RUFY1", "AKR1C3"]).shape == (2, 5)
assert list(soma.var.df().dtypes) == [
np.dtype("float64"),
np.dtype("float64"),
np.dtype("float64"),
np.dtype("float64"),
np.dtype("int32"),
]

assert set(soma.obsm.get_member_names()) == set(["X_pca", "X_tsne"])
assert set(soma.obsm.keys()) == set(["X_pca", "X_tsne"])
assert isinstance(soma.obsm["X_pca"], tiledbsc.AnnotationMatrix)
assert soma.obsm["nonesuch"] is None
assert soma.obsm["X_pca"].get_dim_names() == ["obs_id"]
assert soma.obsm["X_pca"].dim_names() == ["obs_id"]
assert soma.obsm["X_pca"].df().shape == (80, 20)
assert list(soma.obsm["X_pca"].df().dtypes) == [
np.dtype("O"),
np.dtype("float64"),
np.dtype("float64"),
np.dtype("float64"),
np.dtype("float64"),
np.dtype("float64"),
np.dtype("float64"),
np.dtype("float64"),
np.dtype("float64"),
np.dtype("float64"),
np.dtype("float64"),
np.dtype("float64"),
np.dtype("float64"),
np.dtype("float64"),
np.dtype("float64"),
np.dtype("float64"),
np.dtype("float64"),
np.dtype("float64"),
np.dtype("float64"),
np.dtype("float64"),
]

assert set(soma.varm.get_member_names()) == set(["PCs"])
assert isinstance(soma.varm["PCs"], tiledbsc.AnnotationMatrix)
assert soma.varm["nonesuch"] is None
assert soma.varm.get_member_names() == ["PCs"]
assert soma.varm["PCs"].get_dim_names() == ["var_id"]
assert soma.varm["PCs"].dim_names() == ["var_id"]

assert set(soma.obsp.get_member_names()) == set(["distances"])
assert isinstance(soma.obsp["distances"], tiledbsc.AnnotationPairwiseMatrix)
assert soma.varp["nonesuch"] is None
assert soma.obsp["distances"].get_dim_names() == ["obs_id_i", "obs_id_j"]
assert soma.obsp["distances"].dim_names() == ["obs_id_i", "obs_id_j"]

assert set(soma.uns.get_member_names()) == set(["neighbors"])
assert isinstance(soma.uns["neighbors"], tiledbsc.UnsGroup)
Expand Down

0 comments on commit 89923c1

Please sign in to comment.