From 9c3c83929dc239d12fd82f87c5859942054db126 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Fri, 20 May 2022 11:41:02 -0400 Subject: [PATCH 1/3] matrix.df() accessors --- apis/python/src/tiledbsc/annotation_dataframe.py | 8 ++++++++ apis/python/src/tiledbsc/annotation_matrix.py | 8 ++++++++ apis/python/src/tiledbsc/annotation_pairwise_matrix.py | 8 ++++++++ apis/python/src/tiledbsc/assay_matrix.py | 9 +++++++++ 4 files changed, 33 insertions(+) diff --git a/apis/python/src/tiledbsc/annotation_dataframe.py b/apis/python/src/tiledbsc/annotation_dataframe.py index bdd4cec683..7dd46a5846 100644 --- a/apis/python/src/tiledbsc/annotation_dataframe.py +++ b/apis/python/src/tiledbsc/annotation_dataframe.py @@ -78,6 +78,14 @@ def dim_select(self, ids): with tiledb.open(self.uri) as A: # TODO: with self.open return A.df[ids] + # ---------------------------------------------------------------- + def df(self, ids=None) -> pd.DataFrame: + """ + Keystroke-saving alias for `.dim_select()`. If `ids` are provided, they're used + to subselect; if not, the entire dataframe is returned. + """ + return self.dim_select(ids) + # ---------------------------------------------------------------- # TODO: this is a v1 for prototype/demo timeframe -- needs expanding. def attribute_filter(self, query_string, col_names_to_keep): diff --git a/apis/python/src/tiledbsc/annotation_matrix.py b/apis/python/src/tiledbsc/annotation_matrix.py index 36c17bdbf4..0a21497051 100644 --- a/apis/python/src/tiledbsc/annotation_matrix.py +++ b/apis/python/src/tiledbsc/annotation_matrix.py @@ -43,6 +43,14 @@ def dim_select(self, ids): with tiledb.open(self.uri) as A: # TODO: with self.open return A.df[ids] + # ---------------------------------------------------------------- + def df(self, ids=None) -> pd.DataFrame: + """ + Keystroke-saving alias for `.dim_select()`. If `ids` are provided, they're used + to subselect; if not, the entire dataframe is returned. + """ + return self.dim_select(ids) + # ---------------------------------------------------------------- def shape(self): """ diff --git a/apis/python/src/tiledbsc/annotation_pairwise_matrix.py b/apis/python/src/tiledbsc/annotation_pairwise_matrix.py index 2637398083..94b4dbd94b 100644 --- a/apis/python/src/tiledbsc/annotation_pairwise_matrix.py +++ b/apis/python/src/tiledbsc/annotation_pairwise_matrix.py @@ -60,6 +60,14 @@ def dim_select(self, ids): with tiledb.open(self.uri) as A: # TODO: with self.open return A.df[ids, ids] + # ---------------------------------------------------------------- + def df(self, ids=None) -> pd.DataFrame: + """ + Keystroke-saving alias for `.dim_select()`. If `ids` are provided, they're used + to subselect; if not, the entire dataframe is returned. + """ + return self.dim_select(ids) + # ---------------------------------------------------------------- def from_anndata(self, matrix, dim_values): """ diff --git a/apis/python/src/tiledbsc/assay_matrix.py b/apis/python/src/tiledbsc/assay_matrix.py index 02d0c44a85..5ab2af228b 100644 --- a/apis/python/src/tiledbsc/assay_matrix.py +++ b/apis/python/src/tiledbsc/assay_matrix.py @@ -7,6 +7,7 @@ import scipy import numpy as np +import pandas as pd from typing import Optional import time @@ -63,6 +64,14 @@ def dim_select(self, obs_ids, var_ids): else: return A.df[obs_ids, var_ids] + # ---------------------------------------------------------------- + def df(self, obs_ids=None, var_ids=None) -> pd.DataFrame: + """ + Keystroke-saving alias for `.dim_select()`. If either of `obs_ids` or `var_ids` + are provided, they're used to subselect; if not, the entire dataframe is returned. + """ + return self.dim_select(obs_ids, var_ids) + # ---------------------------------------------------------------- def from_matrix(self, matrix, row_names, col_names) -> None: """ From aac677c0259c7c63b39cf7350caf9a3c1852dea9 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Fri, 20 May 2022 12:02:46 -0400 Subject: [PATCH 2/3] unit-test cases --- apis/python/src/tiledbsc/assay_matrix.py | 7 ++ apis/python/tests/test_soma_group_indexing.py | 64 +++++++++++++++++++ 2 files changed, 71 insertions(+) diff --git a/apis/python/src/tiledbsc/assay_matrix.py b/apis/python/src/tiledbsc/assay_matrix.py index 5ab2af228b..b50a24214b 100644 --- a/apis/python/src/tiledbsc/assay_matrix.py +++ b/apis/python/src/tiledbsc/assay_matrix.py @@ -45,6 +45,13 @@ def __init__( # We don't have a .shape() method since X is sparse. One should # instead use the row-counts for the soma's obs and var. + # ---------------------------------------------------------------- + def dim_names(self): + """ + Keystroke-saving accessor for `row_dim_name` and `col_dim_name`. + """ + return (self.row_dim_name, self.col_dim_name) + # ---------------------------------------------------------------- def dim_select(self, obs_ids, var_ids): """ diff --git a/apis/python/tests/test_soma_group_indexing.py b/apis/python/tests/test_soma_group_indexing.py index 8fd73b5876..793bd35b6d 100644 --- a/apis/python/tests/test_soma_group_indexing.py +++ b/apis/python/tests/test_soma_group_indexing.py @@ -2,6 +2,8 @@ import tiledb import tiledbsc +import numpy as np + import pytest import tempfile import os @@ -51,6 +53,16 @@ def test_soma_group_indexing(h5ad_file): assert soma.X.data.get_dim_names() == ["obs_id", "var_id"] assert soma.obs.get_dim_names() == ["obs_id"] + assert soma.obs.dim_name == "obs_id" + assert soma.obs.keys() == [ + "orig.ident", + "nCount_RNA", + "nFeature_RNA", + "RNA_snn_res.0.8", + "letter.idents", + "groups", + "RNA_snn_res.1", + ] assert set(soma.obs.ids()) == set( [ b"AAATTCGAATCACG", @@ -135,8 +147,27 @@ def test_soma_group_indexing(h5ad_file): b"TTTAGCTGTACTCT", ] ) + assert soma.obs.df().shape == (80, 7) + assert soma.obs.df(["AAGCAAGAGCTTAG", "TTGGTACTGAATCC"]).shape == (2, 7) + assert list(soma.obs.df().dtypes) == [ + np.dtype("int32"), + np.dtype("float64"), + np.dtype("int32"), + np.dtype("int32"), + np.dtype("int32"), + np.dtype("O"), + np.dtype("int32"), + ] assert soma.var.get_dim_names() == ["var_id"] + assert soma.obs.dim_name == "obs_id" + assert soma.var.keys() == [ + "vst.mean", + "vst.variance", + "vst.variance.expected", + "vst.variance.standardized", + "vst.variable", + ] assert set(soma.var.ids()) == set( [ b"AKR1C3", @@ -161,11 +192,44 @@ def test_soma_group_indexing(h5ad_file): b"VDAC3", ] ) + assert soma.var.shape() == (20, 5) + assert soma.var.df(["RUFY1", "AKR1C3"]).shape == (2, 5) + assert list(soma.var.df().dtypes) == [ + np.dtype("float64"), + np.dtype("float64"), + np.dtype("float64"), + np.dtype("float64"), + np.dtype("int32"), + ] assert set(soma.obsm.get_member_names()) == set(["X_pca", "X_tsne"]) + assert set(soma.obsm.keys()) == set(["X_pca", "X_tsne"]) assert isinstance(soma.obsm["X_pca"], tiledbsc.AnnotationMatrix) assert soma.obsm["nonesuch"] is None assert soma.obsm["X_pca"].get_dim_names() == ["obs_id"] + assert soma.obsm["X_pca"].df().shape == (80, 20) + assert list(soma.obsm["X_pca"].df().dtypes) == [ + np.dtype("O"), + np.dtype("float64"), + np.dtype("float64"), + np.dtype("float64"), + np.dtype("float64"), + np.dtype("float64"), + np.dtype("float64"), + np.dtype("float64"), + np.dtype("float64"), + np.dtype("float64"), + np.dtype("float64"), + np.dtype("float64"), + np.dtype("float64"), + np.dtype("float64"), + np.dtype("float64"), + np.dtype("float64"), + np.dtype("float64"), + np.dtype("float64"), + np.dtype("float64"), + np.dtype("float64"), + ] assert set(soma.varm.get_member_names()) == set(["PCs"]) assert isinstance(soma.varm["PCs"], tiledbsc.AnnotationMatrix) From 30f8d254ce67abf7d15328c1e17ebc00a88ed1e6 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Fri, 20 May 2022 14:31:37 -0400 Subject: [PATCH 3/3] soma.foo.schema() -> soma.foo.tiledb_array_schema() --- .../src/tiledbsc/annotation_dataframe.py | 4 +-- apis/python/src/tiledbsc/assay_matrix.py | 7 ----- apis/python/src/tiledbsc/tiledb_array.py | 30 +++++++++++++++---- apis/python/tests/test_soma_group_indexing.py | 12 ++++---- 4 files changed, 33 insertions(+), 20 deletions(-) diff --git a/apis/python/src/tiledbsc/annotation_dataframe.py b/apis/python/src/tiledbsc/annotation_dataframe.py index 7dd46a5846..e742d0eecc 100644 --- a/apis/python/src/tiledbsc/annotation_dataframe.py +++ b/apis/python/src/tiledbsc/annotation_dataframe.py @@ -61,9 +61,9 @@ def ids(self) -> List[str]: def keys(self) -> List[str]: """ Returns the column names for the `obs` or `var` dataframe. For obs and varp, `.keys()` is a - keystroke-saver for the more general array-schema accessor `get_attr_names`. + keystroke-saver for the more general array-schema accessor `attr_names`. """ - return self.get_attr_names() + return self.attr_names() # ---------------------------------------------------------------- def dim_select(self, ids): diff --git a/apis/python/src/tiledbsc/assay_matrix.py b/apis/python/src/tiledbsc/assay_matrix.py index b50a24214b..5ab2af228b 100644 --- a/apis/python/src/tiledbsc/assay_matrix.py +++ b/apis/python/src/tiledbsc/assay_matrix.py @@ -45,13 +45,6 @@ def __init__( # We don't have a .shape() method since X is sparse. One should # instead use the row-counts for the soma's obs and var. - # ---------------------------------------------------------------- - def dim_names(self): - """ - Keystroke-saving accessor for `row_dim_name` and `col_dim_name`. - """ - return (self.row_dim_name, self.col_dim_name) - # ---------------------------------------------------------------- def dim_select(self, obs_ids, var_ids): """ diff --git a/apis/python/src/tiledbsc/tiledb_array.py b/apis/python/src/tiledbsc/tiledb_array.py index f03a41e19a..608fad7df9 100644 --- a/apis/python/src/tiledbsc/tiledb_array.py +++ b/apis/python/src/tiledbsc/tiledb_array.py @@ -3,12 +3,14 @@ from .tiledb_object import TileDBObject from .tiledb_group import TileDBGroup -from typing import Optional, List +from typing import Optional, List, Dict class TileDBArray(TileDBObject): """ Wraps arrays from TileDB-Py by retaining a URI, verbose flag, etc. + Also serves as an abstraction layer to hide TileDB-specific details from the API, unless + requested. """ def __init__( @@ -37,29 +39,47 @@ def open(self): A = tiledb.open(self.uri) return A - def schema(self): + def tiledb_array_schema(self): """ Returns the TileDB array schema. """ with tiledb.open(self.uri) as A: return A.schema - def get_dim_names(self) -> List[str]: + def dim_names(self) -> List[str]: """ Reads the dimension names from the schema: for example, ['obs_id', 'var_id']. """ with tiledb.open(self.uri) as A: return [A.schema.domain.dim(i).name for i in range(A.schema.domain.ndim)] - def get_attr_names(self) -> List[str]: + def dim_names_to_types(self) -> Dict[str, str]: + """ + Returns a dict mapping from dimension name to dimension type. + """ + with tiledb.open(self.uri) as A: + dom = A.schema.domain + return {dom.dim(i).name: dom.dim(i).dtype for i in range(dom.ndim)} + + def attr_names(self) -> List[str]: """ Reads the attribute names from the schema: for example, the list of column names in a dataframe. """ with tiledb.open(self.uri) as A: return [A.schema.attr(i).name for i in range(A.schema.nattr)] + def attr_names_to_types(self) -> Dict[str, str]: + """ + Returns a dict mapping from attribute name to attribute type. + """ + with tiledb.open(self.uri) as A: + schema = A.schema + return { + schema.attr(i).name: schema.attr(i).dtype for i in range(schema.nattr) + } + def has_attr_name(self, attr_name: str) -> bool: """ Returns true if the array has the specified attribute name, false otherwise. """ - return attr_name in self.get_attr_names() + return attr_name in self.attr_names() diff --git a/apis/python/tests/test_soma_group_indexing.py b/apis/python/tests/test_soma_group_indexing.py index 793bd35b6d..9bcaffdd37 100644 --- a/apis/python/tests/test_soma_group_indexing.py +++ b/apis/python/tests/test_soma_group_indexing.py @@ -50,9 +50,9 @@ def test_soma_group_indexing(h5ad_file): ["uns", "varm", "X", "raw", "obsp", "varp", "var", "obsm", "obs"] ) assert set(soma.X.get_member_names()) == set(["data"]) - assert soma.X.data.get_dim_names() == ["obs_id", "var_id"] + assert soma.X.data.dim_names() == ["obs_id", "var_id"] - assert soma.obs.get_dim_names() == ["obs_id"] + assert soma.obs.dim_names() == ["obs_id"] assert soma.obs.dim_name == "obs_id" assert soma.obs.keys() == [ "orig.ident", @@ -159,7 +159,7 @@ def test_soma_group_indexing(h5ad_file): np.dtype("int32"), ] - assert soma.var.get_dim_names() == ["var_id"] + assert soma.var.dim_names() == ["var_id"] assert soma.obs.dim_name == "obs_id" assert soma.var.keys() == [ "vst.mean", @@ -206,7 +206,7 @@ def test_soma_group_indexing(h5ad_file): assert set(soma.obsm.keys()) == set(["X_pca", "X_tsne"]) assert isinstance(soma.obsm["X_pca"], tiledbsc.AnnotationMatrix) assert soma.obsm["nonesuch"] is None - assert soma.obsm["X_pca"].get_dim_names() == ["obs_id"] + assert soma.obsm["X_pca"].dim_names() == ["obs_id"] assert soma.obsm["X_pca"].df().shape == (80, 20) assert list(soma.obsm["X_pca"].df().dtypes) == [ np.dtype("O"), @@ -235,12 +235,12 @@ def test_soma_group_indexing(h5ad_file): assert isinstance(soma.varm["PCs"], tiledbsc.AnnotationMatrix) assert soma.varm["nonesuch"] is None assert soma.varm.get_member_names() == ["PCs"] - assert soma.varm["PCs"].get_dim_names() == ["var_id"] + assert soma.varm["PCs"].dim_names() == ["var_id"] assert set(soma.obsp.get_member_names()) == set(["distances"]) assert isinstance(soma.obsp["distances"], tiledbsc.AnnotationPairwiseMatrix) assert soma.varp["nonesuch"] is None - assert soma.obsp["distances"].get_dim_names() == ["obs_id_i", "obs_id_j"] + assert soma.obsp["distances"].dim_names() == ["obs_id_i", "obs_id_j"] assert set(soma.uns.get_member_names()) == set(["neighbors"]) assert isinstance(soma.uns["neighbors"], tiledbsc.UnsGroup)