Skip to content

Commit

Permalink
temp
Browse files Browse the repository at this point in the history
  • Loading branch information
johnkerl committed Jun 9, 2022
1 parent e603353 commit 43cbdad
Show file tree
Hide file tree
Showing 15 changed files with 837 additions and 21 deletions.
2 changes: 2 additions & 0 deletions _quarto.yml
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@ website:
text: "Inspecting SOMA schemas"
- href: "apis/python/examples/soma-collection-reconnaissance.md"
text: "SOMA-collection reconnaissance"
- href: "apis/python/examples/soma-slice-query.md"
text: "SOMA slice query"

- section: "Python API"
contents:
Expand Down
Binary file added apis/python/anndata/subset-soma-01.h5ad
Binary file not shown.
Binary file added apis/python/anndata/subset-soma-02.h5ad
Binary file not shown.
Binary file added apis/python/anndata/subset-soma-03.h5ad
Binary file not shown.
Binary file added apis/python/anndata/subset-soma-04.h5ad
Binary file not shown.
133 changes: 133 additions & 0 deletions apis/python/examples/soma-slice-query.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
Here we show an example of doing a _slice query_ across a `SOMACollection`.

## Populate the collection

Here we use a few small sample files included in this repository.

```
import tiledbsc
import tiledbsc.io
import os
import shutil
soco_path = './soco-attribute-filter'

# Remove any collection left over from a previous run so the example starts clean.
if os.path.exists(soco_path):
    shutil.rmtree(soco_path)

# Create the (now absent) collection on storage.
soco = tiledbsc.SOMACollection(soco_path)
if not soco.exists():
    soco._create()

# Ingest each sample H5AD file into its own SOMA stored under the collection
# path, then register that SOMA with the collection.
for name, h5ad in [
    ('subset-soma-01', './anndata/subset-soma-01.h5ad'),
    ('subset-soma-02', './anndata/subset-soma-02.h5ad'),
    ('subset-soma-03', './anndata/subset-soma-03.h5ad'),
    ('subset-soma-04', './anndata/subset-soma-04.h5ad'),
]:
    soma_path = os.path.join(soco_path, name)
    soma = tiledbsc.SOMA(soma_path)
    tiledbsc.io.from_h5ad(soma, h5ad)
    soco.add(soma)
```

## Do the slice query

```
import tiledbsc
import os
import shutil
from typing import List, Dict
# ----------------------------------------------------------------
def soco_attribute_filter_prototype(
    soco: tiledbsc.SOMACollection,
    obs_attr_names: List[str],
    obs_query_string: str,
    var_attr_names: List[str],
    var_query_string: str,
) -> None:
    """
    Runs the same obs/var attribute filter against every SOMA in `soco`,
    concatenates the per-SOMA slices, and writes the combined result both as
    an H5AD file (`slice-query-results.h5ad`) and as a SOMA
    (`slice-query-results`). Prints "Empty slice" if nothing matched.

    `obs_attr_names` and `var_attr_names` are the column names referenced by
    `obs_query_string` and `var_query_string`, respectively; SOMAs lacking any
    of them are skipped.
    """
    soma_slices = []
    for soma in soco:
        # E.g. querying for 'cell_type == "blood"' but this SOMA doesn't have a cell_type column in
        # its obs at all.
        if not soma.obs.has_attr_names(obs_attr_names):
            continue
        # E.g. querying for 'feature_name == "MT-CO3"' but this SOMA doesn't have a feature_name
        # column in its var at all.
        if not soma.var.has_attr_names(var_attr_names):
            continue

        # The per-SOMA filter returns None when the SOMA has the columns but no matching rows.
        soma_slice = soma.attribute_filter(obs_query_string, var_query_string)
        if soma_slice is not None:
            print("...", soma.name, soma_slice.ann.X.shape)
            soma_slices.append(soma_slice)

    result_soma_slice = tiledbsc.SOMASlice.concat(soma_slices)
    if result_soma_slice is None:
        print("Empty slice")
    else:
        # Write the combined slice as an H5AD file.
        output_file_name = "slice-query-results.h5ad"
        a = result_soma_slice.to_anndata()
        a.write_h5ad(output_file_name)
        print("Wrote", output_file_name)

        # Write the combined slice as a SOMA, replacing any output from a previous run.
        output_soma_path = "slice-query-results"
        if os.path.exists(output_soma_path):
            shutil.rmtree(output_soma_path)
        tiledbsc.SOMA.from_soma_slice(result_soma_slice, output_soma_path)
        print("Wrote", output_soma_path)


# ----------------------------------------------------------------
soco_path = './soco-attribute-filter'
soco = tiledbsc.SOMACollection(soco_path)
soco_attribute_filter_prototype(
    soco,
    obs_attr_names=["tissue"],
    obs_query_string='tissue == "blood"',
    var_attr_names=["feature_name"],
    var_query_string='feature_name == "MT-CO3"',
)
```

## Examine the results

```
$ peek-soma slice-query-results
>>> soma.obs.df()
assay_ontology_term_id cell_type_ontology_term_id development_stage_ontology_term_id disease_ontology_term_id ethnicity_ontology_term_id ... ethnicity organism sex tissue is_primary_data
obs_id ...
AAACCCAAGACGGTTG EFO:0009922 CL:0000814 HsapDv:0000143 MONDO:0100096 unknown ... unknown Homo sapiens male blood 1
AAACCCACACAATGTC EFO:0009922 CL:0000814 HsapDv:0000143 MONDO:0100096 unknown ... unknown Homo sapiens male blood 1
AAACCCACACCCAATA EFO:0009922 CL:0000236 HsapDv:0000143 MONDO:0100096 unknown ... unknown Homo sapiens male blood 1
AAACCCACACGTACTA EFO:0009922 CL:0000763 HsapDv:0000143 MONDO:0100096 unknown ... unknown Homo sapiens male blood 1
AAACCCACACTTCTCG EFO:0009922 CL:0000763 HsapDv:0000143 MONDO:0100096 unknown ... unknown Homo sapiens male blood 1
... ... ... ... ... ... ... ... ... ... ... ...
ACTATCTGTACTCGTA EFO:0009922 CL:0000233 HsapDv:0000143 MONDO:0100096 unknown ... unknown Homo sapiens male blood 1
ACTATCTGTATTAAGG EFO:0009922 CL:0000814 HsapDv:0000143 MONDO:0100096 unknown ... unknown Homo sapiens male blood 1
ACTATCTTCTAGACCA EFO:0009922 CL:0000763 HsapDv:0000143 MONDO:0100096 unknown ... unknown Homo sapiens male blood 1
ACTATGGAGGATATGT EFO:0009922 CL:0000763 HsapDv:0000143 MONDO:0100096 unknown ... unknown Homo sapiens male blood 1
ACTATGGGTGGAGAAA EFO:0009922 CL:0000763 HsapDv:0000143 MONDO:0100096 unknown ... unknown Homo sapiens male blood 1
[400 rows x 17 columns]
>>> soma.var.df()
Empty DataFrame
Columns: []
Index: [ENSG00000198938]
>>> soma.X.data.df()
value
obs_id var_id
AAACCCAAGACGGTTG ENSG00000198938 190.0
AAACCCACACAATGTC ENSG00000198938 118.0
AAACCCACACCCAATA ENSG00000198938 151.0
AAACCCACACGTACTA ENSG00000198938 29.0
AAACCCACACTTCTCG ENSG00000198938 139.0
... ...
ACTATCTGTACTCGTA ENSG00000198938 23.0
ACTATCTGTATTAAGG ENSG00000198938 48.0
ACTATCTTCTAGACCA ENSG00000198938 124.0
ACTATGGAGGATATGT ENSG00000198938 78.0
ACTATGGGTGGAGAAA ENSG00000198938 37.0
[399 rows x 1 columns]
```
2 changes: 2 additions & 0 deletions apis/python/src/tiledbsc/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from .soma_collection import SOMACollection
from .soma import SOMA
from .soma_options import SOMAOptions
from .soma_collection import SOMACollection
from .soma_slice import SOMASlice

from .tiledb_object import TileDBObject
from .tiledb_array import TileDBArray
Expand Down
23 changes: 17 additions & 6 deletions apis/python/src/tiledbsc/annotation_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import pandas as pd
import numpy as np

from typing import Optional, Tuple, List
from typing import Optional, Tuple, List, Set


class AnnotationDataFrame(TileDBArray):
Expand Down Expand Up @@ -84,6 +84,13 @@ def keys(self) -> List[str]:
"""
return self.attr_names()

# ----------------------------------------------------------------
def keyset(self) -> Set[str]:
    """
    Returns the names reported by `.keys()`, collected into a set.
    """
    names = self.keys()
    return set(names)

# ----------------------------------------------------------------
def dim_select(self, ids):
"""
Expand Down Expand Up @@ -118,17 +125,21 @@ def df(self, ids=None) -> pd.DataFrame:
return self.dim_select(ids)

# ----------------------------------------------------------------
# TODO: this is a v1 for prototype/demo timeframe -- needs expanding.
def attribute_filter(self, query_string, col_names_to_keep):
def attribute_filter(self, query_string, col_names_to_keep=None):
"""
Selects from obs/var using a TileDB-Py `QueryCondition` string such as
`cell_type == "blood"`. Returns None if the slice is empty.
`cell_type == "blood"`.
If `col_names_to_keep` is `None`, returns all column names in the dataframe.
Returns None if the slice is empty.
This is a v1 implementation for the prototype/demo timeframe.
"""
with self._open() as A:
qc = tiledb.QueryCondition(query_string)
slice_query = A.query(attr_cond=qc)
slice_df = slice_query.df[:][col_names_to_keep]
if col_names_to_keep is None:
slice_df = slice_query.df[:][:]
else:
slice_df = slice_query.df[:][col_names_to_keep]
nobs = len(slice_df)
if nobs == 0:
return None
Expand All @@ -149,7 +160,7 @@ def _ascii_to_unicode_dataframe_readback(self, df):
return df

# ----------------------------------------------------------------
def from_dataframe(self, dataframe: pd.DataFrame, extent: int) -> None:
def from_dataframe(self, dataframe: pd.DataFrame, extent: int = 2048) -> None:
"""
Populates the `obs` or `var` subgroup for a SOMA object.
Expand Down
14 changes: 2 additions & 12 deletions apis/python/src/tiledbsc/assay_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -597,19 +597,9 @@ def to_csr_matrix(self, row_labels, col_labels):
s = util.get_start_stamp()
print(f"{self._indent}START read {self.uri}")

df = self.df()

retval = util.X_and_ids_to_sparse_matrix(
df,
self.row_dim_name,
self.col_dim_name,
self.attr_name,
row_labels,
col_labels,
return_as="csr",
)
csr = self.csr()

if self._verbose:
print(util.format_elapsed(s, f"{self._indent}FINISH read {self.uri}"))

return retval
return csr
138 changes: 136 additions & 2 deletions apis/python/src/tiledbsc/soma.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import os
from typing import Optional, Union, Dict
from typing import Optional, Union, List, Dict

import anndata as ad
import numpy as np
Expand All @@ -13,6 +13,7 @@
from tiledbsc import util_ann

from .soma_options import SOMAOptions
from .soma_slice import SOMASlice
from .tiledb_group import TileDBGroup
from .assay_matrix_group import AssayMatrixGroup
from .annotation_dataframe import AnnotationDataFrame
Expand Down Expand Up @@ -84,9 +85,9 @@ def __init__(
ctx=ctx,
)

X_uri = os.path.join(self.uri, "X")
obs_uri = os.path.join(self.uri, "obs")
var_uri = os.path.join(self.uri, "var")
X_uri = os.path.join(self.uri, "X")
obsm_uri = os.path.join(self.uri, "obsm")
varm_uri = os.path.join(self.uri, "varm")
obsp_uri = os.path.join(self.uri, "obsp")
Expand Down Expand Up @@ -174,3 +175,136 @@ def var_keys(self):
An alias for `soma.var.ids()`.
"""
return self.var.ids()

# ----------------------------------------------------------------
def dim_slice(self, slice_obs_ids, slice_var_ids) -> Optional["SOMASlice"]:
    """
    Subselects the SOMA's obs, var, and X/data using the specified obs_ids and var_ids.
    Using a value of `None` for obs_ids means use all obs_ids, and likewise for var_ids;
    at least one of the two must be non-`None`.
    Returns `None` for empty slice.
    """

    # Identity test rather than `!= None`: for array-like ids (e.g. numpy arrays) `!= None`
    # compares elementwise, and taking the truth value of that result raises.
    assert slice_obs_ids is not None or slice_var_ids is not None

    if slice_obs_ids is None:
        # Try the var slice first to see if that produces zero results -- if so we don't need to
        # load the obs.
        slice_var_df = self.var.dim_select(slice_var_ids)
        if slice_var_df.shape[0] == 0:
            return None
        slice_obs_df = self.obs.dim_select(slice_obs_ids)
        if slice_obs_df.shape[0] == 0:
            return None

    else:
        # Try the obs slice first to see if that produces zero results -- if so we don't need to
        # load the var. This covers both "all var_ids" and "both id lists given".
        slice_obs_df = self.obs.dim_select(slice_obs_ids)
        if slice_obs_df.shape[0] == 0:
            return None
        slice_var_df = self.var.dim_select(slice_var_ids)
        if slice_var_df.shape[0] == 0:
            return None

    return self._assemble_soma_slice(
        slice_obs_ids, slice_var_ids, slice_obs_df, slice_var_df
    )

# ----------------------------------------------------------------
def attribute_filter(
    self,
    obs_query_string: Optional[str],
    var_query_string: Optional[str],
) -> Optional["SOMASlice"]:
    """
    Subselects the SOMA's obs, var, and X/data using the specified queries on obs and var.
    Queries use the TileDB-Py `QueryCondition` API.
    Passing `None` for either query string means "no filter" on that dataframe, i.e. all of
    its rows are used.
    Returns `None` if either query matches no rows.
    """

    if obs_query_string is None:
        slice_obs_df = self.obs.df()
    else:
        # E.g. querying for 'cell_type == "blood"' and this SOMA does have a cell_type column in
        # its obs, but no rows with cell_type == "blood".
        slice_obs_df = self.obs.attribute_filter(obs_query_string)
        if slice_obs_df is None:
            return None

    if var_query_string is None:
        slice_var_df = self.var.df()
    else:
        # E.g. querying for 'feature_name == "MT-CO3"' and this SOMA does have a feature_name
        # column in its var, but no rows with feature_name == "MT-CO3".
        slice_var_df = self.var.attribute_filter(var_query_string)
        if slice_var_df is None:
            return None

    # The dataframe indexes carry the obs_ids/var_ids used to select from X/data.
    slice_obs_ids = list(slice_obs_df.index)
    slice_var_ids = list(slice_var_df.index)

    return self._assemble_soma_slice(
        slice_obs_ids, slice_var_ids, slice_obs_df, slice_var_df
    )

# ----------------------------------------------------------------
def _assemble_soma_slice(
    self,
    slice_obs_ids,
    slice_var_ids,
    slice_obs_df,
    slice_var_df,
) -> SOMASlice:
    """
    Internal helper: selects the X/data cells for the given obs_ids and var_ids and bundles
    them with the already-selected obs and var dataframes into a `SOMASlice`.
    """

    x_data = self.X.data
    selected_x = x_data.dim_select(slice_obs_ids, slice_var_ids)

    return SOMASlice(X=selected_x, obs=slice_obs_df, var=slice_var_df)

# ----------------------------------------------------------------
@classmethod
def from_soma_slice(
    cls,
    soma_slice: SOMASlice,
    uri: str,
    name=None,
    soma_options: Optional[SOMAOptions] = None,
    verbose: Optional[bool] = True,
    config: Optional[tiledb.Config] = None,
    ctx: Optional[tiledb.Ctx] = None,
    parent: Optional[TileDBGroup] = None,  # E.g. a SOMA collection
):
    """
    Alternate constructor: creates a new `SOMA` at `uri` and populates its obs, var, and X
    from the in-memory `soma_slice`. All other arguments are passed through to the `SOMA`
    constructor. Returns the new `SOMA`.
    """

    new_soma = cls(
        uri=uri,
        name=name,
        soma_options=soma_options,
        verbose=verbose,
        config=config,
        ctx=ctx,
        parent=parent,
    )
    new_soma._create()

    # Write the annotation dataframes first: the X layer below is labeled with the ids read
    # back from the stored obs and var.
    new_soma.obs.from_dataframe(soma_slice.obs)
    new_soma.var.from_dataframe(soma_slice.var)

    x_matrix = soma_slice.X
    obs_ids = new_soma.obs.ids()
    var_ids = new_soma.var.ids()
    new_soma.X.add_layer_from_matrix_and_dim_values(x_matrix, obs_ids, var_ids)

    return new_soma
Loading

0 comments on commit 43cbdad

Please sign in to comment.