temp

single-cell-data · Jun 9, 2022 · 43cbdad · 43cbdad
1 parent e603353
commit 43cbdad
Show file tree

Hide file tree

Showing 15 changed files with 837 additions and 21 deletions.
diff --git a/_quarto.yml b/_quarto.yml
@@ -67,6 +67,8 @@ website:
                   text: "Inspecting SOMA schemas"
                 - href: "apis/python/examples/soma-collection-reconnaissance.md"
                   text: "SOMA-collection reconnaissance"
+                - href: "apis/python/examples/soma-slice-query.md"
+                  text: "SOMA slice query"
 
             - section: "Python API"
               contents:

diff --git a/apis/python/anndata/subset-soma-01.h5ad b/apis/python/anndata/subset-soma-01.h5ad
diff --git a/apis/python/anndata/subset-soma-02.h5ad b/apis/python/anndata/subset-soma-02.h5ad
diff --git a/apis/python/anndata/subset-soma-03.h5ad b/apis/python/anndata/subset-soma-03.h5ad
diff --git a/apis/python/anndata/subset-soma-04.h5ad b/apis/python/anndata/subset-soma-04.h5ad
diff --git a/apis/python/examples/soma-slice-query.md b/apis/python/examples/soma-slice-query.md
@@ -0,0 +1,133 @@
+Here we show an example of doing a _slice query_ across a `SOMACollection`.
+
+## Populate the collection
+
+Here we use a few small sample files included in this repository.
+
+```
+import tiledbsc
+import tiledbsc.io
+import os
+import shutil
+
+# Start from a clean slate: remove any collection directory left over from a previous run.
+soco_path = './soco-attribute-filter'
+if os.path.exists(soco_path):
+    shutil.rmtree(soco_path)
+
+# Open the collection at that path and create it if it does not exist yet.
+soco = tiledbsc.SOMACollection(soco_path)
+if not soco.exists():
+    soco._create()
+
+# Each (name, h5ad) pair becomes one SOMA stored under the collection's directory: the H5AD file
+# is read into the SOMA, and the SOMA is then added to the collection.
+for name, h5ad in [
+    ('subset-soma-01', './anndata/subset-soma-01.h5ad'),
+    ('subset-soma-02', './anndata/subset-soma-02.h5ad'),
+    ('subset-soma-03', './anndata/subset-soma-03.h5ad'),
+    ('subset-soma-04', './anndata/subset-soma-04.h5ad'),
+]:
+    soma_path = os.path.join(soco_path, name)
+    soma = tiledbsc.SOMA(soma_path)
+    tiledbsc.io.from_h5ad(soma, h5ad)
+    soco.add(soma)
+```
+
+## Do the slice query
+
+```
+import tiledbsc
+import os
+import shutil
+from typing import List, Dict
+
+# ----------------------------------------------------------------
+def soco_attribute_filter_prototype(
+    soco: tiledbsc.SOMACollection,
+    obs_attr_names: List[str],
+    obs_query_string: str,
+    var_attr_names: List[str],
+    var_query_string: str,
+) -> None:
+    """
+    Runs the same attribute-filter (slice) query against every SOMA in the collection,
+    concatenates the non-empty per-SOMA slices, and writes the combined result out twice:
+    as an H5AD file (`slice-query-results.h5ad`) and as a SOMA (`slice-query-results`).
+
+    * `obs_attr_names` / `var_attr_names`: the column names the query strings refer to. A SOMA
+      is skipped when `has_attr_names` reports false for its `obs` or `var`.
+    * `obs_query_string` / `var_query_string`: the query strings handed to each SOMA's
+      `attribute_filter`, e.g. `'tissue == "blood"'`.
+
+    Prints "Empty slice" and writes nothing when the concatenated result is `None`.
+    """
+
+    soma_slices = []
+    for soma in soco:
+        # E.g. querying for 'cell_type == "blood"' but this SOMA doesn't have a cell_type column in
+        # its obs at all.
+        if not soma.obs.has_attr_names(obs_attr_names):
+            continue
+        # E.g. querying for 'feature_name == "MT-CO3"' but this SOMA doesn't have a feature_name
+        # column in its var at all.
+        if not soma.var.has_attr_names(var_attr_names):
+            continue
+
+        soma_slice = soma.attribute_filter(obs_query_string, var_query_string)
+        # Identity check, not `!= None`: the latter dispatches to the object's own comparison
+        # operator, which need not return a plain bool.
+        if soma_slice is not None:
+            print("...", soma.name, soma_slice.ann.X.shape)
+            soma_slices.append(soma_slice)
+
+    result_soma_slice = tiledbsc.SOMASlice.concat(soma_slices)
+    if result_soma_slice is None:
+        print("Empty slice")
+    else:
+        output_file_name = "slice-query-results.h5ad"
+        a = result_soma_slice.to_anndata()
+        a.write_h5ad(output_file_name)
+        print("Wrote", output_file_name)
+
+        # Remove any earlier output first so the SOMA is written into a fresh directory.
+        output_soma_path = "slice-query-results"
+        if os.path.exists(output_soma_path):
+            shutil.rmtree(output_soma_path)
+        tiledbsc.SOMA.from_soma_slice(result_soma_slice, output_soma_path)
+        print("Wrote", output_soma_path)
+
+# ----------------------------------------------------------------
+# Reopen the collection populated in the previous section and query it for obs rows whose
+# `tissue` is "blood" and var rows whose `feature_name` is "MT-CO3". The attribute-name lists
+# name the columns that each query string refers to.
+soco_path = './soco-attribute-filter'
+soco = tiledbsc.SOMACollection(soco_path)
+soco_attribute_filter_prototype(
+    soco,
+    obs_attr_names=["tissue"],
+    obs_query_string='tissue == "blood"',
+    var_attr_names=["feature_name"],
+    var_query_string='feature_name == "MT-CO3"',
+)
+```
+
+## Examine the results
+
+```
+$ peek-soma slice-query-results
+>>> soma.obs.df()
+                 assay_ontology_term_id cell_type_ontology_term_id development_stage_ontology_term_id disease_ontology_term_id ethnicity_ontology_term_id  ... ethnicity      organism   sex tissue is_primary_data
+obs_id                                                                                                                                                     ...
+AAACCCAAGACGGTTG            EFO:0009922                 CL:0000814                     HsapDv:0000143            MONDO:0100096                    unknown  ...   unknown  Homo sapiens  male  blood               1
+AAACCCACACAATGTC            EFO:0009922                 CL:0000814                     HsapDv:0000143            MONDO:0100096                    unknown  ...   unknown  Homo sapiens  male  blood               1
+AAACCCACACCCAATA            EFO:0009922                 CL:0000236                     HsapDv:0000143            MONDO:0100096                    unknown  ...   unknown  Homo sapiens  male  blood               1
+AAACCCACACGTACTA            EFO:0009922                 CL:0000763                     HsapDv:0000143            MONDO:0100096                    unknown  ...   unknown  Homo sapiens  male  blood               1
+AAACCCACACTTCTCG            EFO:0009922                 CL:0000763                     HsapDv:0000143            MONDO:0100096                    unknown  ...   unknown  Homo sapiens  male  blood               1
+...                                 ...                        ...                                ...                      ...                        ...  ...       ...           ...   ...    ...             ...
+ACTATCTGTACTCGTA            EFO:0009922                 CL:0000233                     HsapDv:0000143            MONDO:0100096                    unknown  ...   unknown  Homo sapiens  male  blood               1
+ACTATCTGTATTAAGG            EFO:0009922                 CL:0000814                     HsapDv:0000143            MONDO:0100096                    unknown  ...   unknown  Homo sapiens  male  blood               1
+ACTATCTTCTAGACCA            EFO:0009922                 CL:0000763                     HsapDv:0000143            MONDO:0100096                    unknown  ...   unknown  Homo sapiens  male  blood               1
+ACTATGGAGGATATGT            EFO:0009922                 CL:0000763                     HsapDv:0000143            MONDO:0100096                    unknown  ...   unknown  Homo sapiens  male  blood               1
+ACTATGGGTGGAGAAA            EFO:0009922                 CL:0000763                     HsapDv:0000143            MONDO:0100096                    unknown  ...   unknown  Homo sapiens  male  blood               1
+
+[400 rows x 17 columns]
+>>> soma.var.df()
+Empty DataFrame
+Columns: []
+Index: [ENSG00000198938]
+>>> soma.X.data.df()
+                                  value
+obs_id           var_id
+AAACCCAAGACGGTTG ENSG00000198938  190.0
+AAACCCACACAATGTC ENSG00000198938  118.0
+AAACCCACACCCAATA ENSG00000198938  151.0
+AAACCCACACGTACTA ENSG00000198938   29.0
+AAACCCACACTTCTCG ENSG00000198938  139.0
+...                                 ...
+ACTATCTGTACTCGTA ENSG00000198938   23.0
+ACTATCTGTATTAAGG ENSG00000198938   48.0
+ACTATCTTCTAGACCA ENSG00000198938  124.0
+ACTATGGAGGATATGT ENSG00000198938   78.0
+ACTATGGGTGGAGAAA ENSG00000198938   37.0
+
+[399 rows x 1 columns]
+```
diff --git a/apis/python/src/tiledbsc/__init__.py b/apis/python/src/tiledbsc/__init__.py
@@ -1,6 +1,8 @@
 from .soma_collection import SOMACollection
 from .soma import SOMA
 from .soma_options import SOMAOptions
+from .soma_collection import SOMACollection
+from .soma_slice import SOMASlice
 
 from .tiledb_object import TileDBObject
 from .tiledb_array import TileDBArray

diff --git a/apis/python/src/tiledbsc/annotation_dataframe.py b/apis/python/src/tiledbsc/annotation_dataframe.py
@@ -7,7 +7,7 @@
 import pandas as pd
 import numpy as np
 
-from typing import Optional, Tuple, List
+from typing import Optional, Tuple, List, Set
 
 
 class AnnotationDataFrame(TileDBArray):
@@ -84,6 +84,13 @@ def keys(self) -> List[str]:
         """
         return self.attr_names()
 
+    # ----------------------------------------------------------------
+    def keyset(self) -> Set[str]:
+        """
+        Returns the same names as `.keys()`, collected into a set rather than a list.
+        """
+        key_names = self.keys()
+        return set(key_names)
+
     # ----------------------------------------------------------------
     def dim_select(self, ids):
         """
@@ -118,17 +125,21 @@ def df(self, ids=None) -> pd.DataFrame:
         return self.dim_select(ids)
 
     # ----------------------------------------------------------------
-    # TODO: this is a v1 for prototype/demo timeframe -- needs expanding.
-    def attribute_filter(self, query_string, col_names_to_keep):
+    def attribute_filter(self, query_string, col_names_to_keep=None):
         """
         Selects from obs/var using a TileDB-Py `QueryCondition` string such as
-        `cell_type == "blood"`. Returns None if the slice is empty.
+        `cell_type == "blood"`.
+        If `col_names_to_keep` is `None`, returns all columns in the dataframe.
+        Returns None if the slice is empty.
         This is a v1 implementation for the prototype/demo timeframe.
         """
         with self._open() as A:
             qc = tiledb.QueryCondition(query_string)
             slice_query = A.query(attr_cond=qc)
-            slice_df = slice_query.df[:][col_names_to_keep]
+            if col_names_to_keep is None:
+                slice_df = slice_query.df[:][:]
+            else:
+                slice_df = slice_query.df[:][col_names_to_keep]
             nobs = len(slice_df)
             if nobs == 0:
                 return None
@@ -149,7 +160,7 @@ def _ascii_to_unicode_dataframe_readback(self, df):
         return df
 
     # ----------------------------------------------------------------
-    def from_dataframe(self, dataframe: pd.DataFrame, extent: int) -> None:
+    def from_dataframe(self, dataframe: pd.DataFrame, extent: int = 2048) -> None:
         """
         Populates the `obs` or `var` subgroup for a SOMA object.
 

diff --git a/apis/python/src/tiledbsc/assay_matrix.py b/apis/python/src/tiledbsc/assay_matrix.py
@@ -597,19 +597,9 @@ def to_csr_matrix(self, row_labels, col_labels):
             s = util.get_start_stamp()
             print(f"{self._indent}START  read {self.uri}")
 
-        df = self.df()
-
-        retval = util.X_and_ids_to_sparse_matrix(
-            df,
-            self.row_dim_name,
-            self.col_dim_name,
-            self.attr_name,
-            row_labels,
-            col_labels,
-            return_as="csr",
-        )
+        csr = self.csr()
 
         if self._verbose:
             print(util.format_elapsed(s, f"{self._indent}FINISH read {self.uri}"))
 
-        return retval
+        return csr
diff --git a/apis/python/src/tiledbsc/soma.py b/apis/python/src/tiledbsc/soma.py
@@ -1,5 +1,5 @@
 import os
-from typing import Optional, Union, Dict
+from typing import Optional, Union, List, Dict
 
 import anndata as ad
 import numpy as np
@@ -13,6 +13,7 @@
 from tiledbsc import util_ann
 
 from .soma_options import SOMAOptions
+from .soma_slice import SOMASlice
 from .tiledb_group import TileDBGroup
 from .assay_matrix_group import AssayMatrixGroup
 from .annotation_dataframe import AnnotationDataFrame
@@ -84,9 +85,9 @@ def __init__(
             ctx=ctx,
         )
 
-        X_uri = os.path.join(self.uri, "X")
         obs_uri = os.path.join(self.uri, "obs")
         var_uri = os.path.join(self.uri, "var")
+        X_uri = os.path.join(self.uri, "X")
         obsm_uri = os.path.join(self.uri, "obsm")
         varm_uri = os.path.join(self.uri, "varm")
         obsp_uri = os.path.join(self.uri, "obsp")
@@ -174,3 +175,136 @@ def var_keys(self):
         An alias for `soma.var.ids()`.
         """
         return self.var.ids()
+
+    # ----------------------------------------------------------------
+    def dim_slice(self, slice_obs_ids, slice_var_ids) -> Optional[SOMASlice]:
+        """
+        Subselects the SOMA's obs, var, and X/data using the specified obs_ids and var_ids.
+        Using a value of `None` for obs_ids means use all obs_ids, and likewise for var_ids.
+        At least one of the two must be non-`None`.
+        Returns a `SOMASlice`, or `None` for empty slice.
+        """
+
+        # Identity checks rather than `!= None`: for array-like ID collections `!=` can be
+        # evaluated elementwise, which does not yield a single truth value.
+        assert slice_obs_ids is not None or slice_var_ids is not None
+
+        if slice_obs_ids is None:
+            # Try the var slice first to see if that produces zero results -- if so we don't need to
+            # load the obs.
+            slice_var_df = self.var.dim_select(slice_var_ids)
+            if slice_var_df.shape[0] == 0:
+                return None
+            slice_obs_df = self.obs.dim_select(slice_obs_ids)
+            if slice_obs_df.shape[0] == 0:
+                return None
+
+        else:
+            # Try the obs slice first to see if that produces zero results -- if so we don't need to
+            # load the var. This covers both the case where var_ids is `None` (all var) and the
+            # case where both ID lists were given.
+            slice_obs_df = self.obs.dim_select(slice_obs_ids)
+            if slice_obs_df.shape[0] == 0:
+                return None
+            slice_var_df = self.var.dim_select(slice_var_ids)
+            if slice_var_df.shape[0] == 0:
+                return None
+
+        return self._assemble_soma_slice(
+            slice_obs_ids, slice_var_ids, slice_obs_df, slice_var_df
+        )
+
+    # ----------------------------------------------------------------
+    def attribute_filter(
+        self,
+        obs_query_string: Optional[str],
+        var_query_string: Optional[str],
+    ) -> Optional[SOMASlice]:
+        """
+        Subselects the SOMA's obs, var, and X/data using the specified queries on obs and var.
+        Queries use the TileDB-Py `QueryCondition` API.
+        A query string of `None` means no filtering on that side: all of obs (or var) is used.
+        Returns a `SOMASlice`, or `None` if either query matches no rows.
+        """
+
+        # E.g. querying for 'cell_type == "blood"' and this SOMA does have a cell_type column in its
+        # obs, but no rows with cell_type == "blood".
+        if obs_query_string is None:
+            slice_obs_df = self.obs.df()
+        else:
+            slice_obs_df = self.obs.attribute_filter(obs_query_string)
+            if slice_obs_df is None:
+                return None
+
+        # E.g. querying for 'feature_name == "MT-CO3"' and this SOMA does have a feature_name column
+        # in its var, but no rows with feature_name == "MT-CO3".
+        if var_query_string is None:
+            slice_var_df = self.var.df()
+        else:
+            slice_var_df = self.var.attribute_filter(var_query_string)
+            if slice_var_df is None:
+                return None
+
+        # The dataframe indices carry the selected IDs; they drive the X/data selection.
+        slice_obs_ids = list(slice_obs_df.index)
+        slice_var_ids = list(slice_var_df.index)
+
+        return self._assemble_soma_slice(
+            slice_obs_ids, slice_var_ids, slice_obs_df, slice_var_df
+        )
+
+    # ----------------------------------------------------------------
+    def _assemble_soma_slice(
+        self,
+        slice_obs_ids,
+        slice_var_ids,
+        slice_obs_df,
+        slice_var_df,
+    ) -> SOMASlice:
+        """
+        Internal helper: selects the X/data cells for the given obs and var IDs and bundles them
+        with the already-computed obs and var dataframes into a `SOMASlice`.
+        """
+
+        return SOMASlice(
+            X=self.X.data.dim_select(slice_obs_ids, slice_var_ids),
+            obs=slice_obs_df,
+            var=slice_var_df,
+        )
+
+    # ----------------------------------------------------------------
+    @classmethod
+    def from_soma_slice(
+        cls,
+        soma_slice: SOMASlice,
+        uri: str,
+        name=None,
+        soma_options: Optional[SOMAOptions] = None,
+        verbose: Optional[bool] = True,
+        config: Optional[tiledb.Config] = None,
+        ctx: Optional[tiledb.Ctx] = None,
+        parent: Optional[TileDBGroup] = None,  # E.g. a SOMA collection
+    ):
+        """
+        Builds a new `SOMA` at `uri` from an in-memory `SOMASlice` and returns it.
+
+        All arguments other than `soma_slice` are passed straight through to the `SOMA`
+        constructor. The slice's `obs` and `var` dataframes are written first; its `X` is then
+        added using the obs and var IDs read back from what was just written.
+        """
+
+        constructor_args = dict(
+            uri=uri,
+            name=name,
+            soma_options=soma_options,
+            verbose=verbose,
+            config=config,
+            ctx=ctx,
+            parent=parent,
+        )
+        new_soma = cls(**constructor_args)
+
+        new_soma._create()
+        new_soma.obs.from_dataframe(soma_slice.obs)
+        new_soma.var.from_dataframe(soma_slice.var)
+        new_soma.X.add_layer_from_matrix_and_dim_values(
+            soma_slice.X,
+            new_soma.obs.ids(),
+            new_soma.var.ids(),
+        )
+
+        return new_soma