From 879e34138ecf4f5fd5969048fb9ebbfd72dba583 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Fri, 26 Aug 2022 09:16:20 -0400 Subject: [PATCH] Support return_arrow for various queries (#256) * Support return_arrow for various queries * Unit-test cases --- .../src/tiledbsc/annotation_dataframe.py | 107 ++++++++++++++---- apis/python/src/tiledbsc/annotation_matrix.py | 26 ++++- apis/python/src/tiledbsc/assay_matrix.py | 31 +++-- apis/python/src/tiledbsc/soma.py | 80 +++++++++---- apis/python/src/tiledbsc/soma_collection.py | 2 + apis/python/src/tiledbsc/soma_slice.py | 66 +++++++++-- apis/python/tests/test_soco_slice_query.py | 56 +++++---- 7 files changed, 275 insertions(+), 93 deletions(-) diff --git a/apis/python/src/tiledbsc/annotation_dataframe.py b/apis/python/src/tiledbsc/annotation_dataframe.py index 802cb76cb3..81f6b89e19 100644 --- a/apis/python/src/tiledbsc/annotation_dataframe.py +++ b/apis/python/src/tiledbsc/annotation_dataframe.py @@ -1,8 +1,9 @@ from concurrent.futures import ThreadPoolExecutor -from typing import Optional, Sequence, Set, Tuple +from typing import Optional, Sequence, Set, Tuple, Union import numpy as np import pandas as pd +import pyarrow as pa import tiledb import tiledbsc.util as util @@ -109,15 +110,19 @@ def keyset(self) -> Set[str]: # ---------------------------------------------------------------- def dim_select( - self, ids: Optional[Ids], attrs: Optional[Sequence[str]] = None - ) -> pd.DataFrame: + self, + ids: Optional[Ids], + attrs: Optional[Sequence[str]] = None, + *, + return_arrow: bool = False, + ) -> Union[pd.DataFrame, pa.Table]: """ Selects a slice out of the dataframe with specified `obs_ids` (for `obs`) or `var_ids` (for `var`). If `ids` is `None`, the entire dataframe is returned. Similarly, if `attrs` are provided, they're used for the query; else, all attributes are returned. 
""" with self._open("r") as A: - query = A.query(attrs=attrs) + query = A.query(return_arrow=return_arrow, attrs=attrs) if ids is None: df = query.df[:] else: @@ -132,24 +137,32 @@ def dim_select( # so the set_index is already done for us. # # However if the data was written somehow else (e.g. by tiledbscr-r) then we do. - if isinstance(df.index, pd.RangeIndex) and self.dim_name in df.columns: - df.set_index(self.dim_name, inplace=True) + if not return_arrow: + if isinstance(df.index, pd.RangeIndex) and self.dim_name in df.columns: + df.set_index(self.dim_name, inplace=True) # TODO: when UTF-8 attributes are queryable using TileDB-Py's QueryCondition API we can remove this. # This is the 'decode on read' part of our logic; in from_dataframe we have the 'encode on write' part. # Context: https://github.com/single-cell-data/TileDB-SingleCell/issues/99. - return self._ascii_to_unicode_dataframe_readback(df) + if return_arrow: + return self._ascii_to_unicode_arrow_readback(df) + else: + return self._ascii_to_unicode_pandas_readback(df) # ---------------------------------------------------------------- def df( - self, ids: Optional[Ids] = None, attrs: Optional[Sequence[str]] = None - ) -> pd.DataFrame: + self, + ids: Optional[Ids] = None, + attrs: Optional[Sequence[str]] = None, + *, + return_arrow: bool = False, + ) -> Union[pd.DataFrame, pa.Table]: """ Keystroke-saving alias for `.dim_select()`. If `ids` are provided, they're used to subselect; if not, the entire dataframe is returned. If `attrs` are provided, they're used for the query; else, all attributes are returned. 
""" - return self.dim_select(ids, attrs) + return self.dim_select(ids, attrs, return_arrow=return_arrow) # ---------------------------------------------------------------- def query( @@ -157,7 +170,9 @@ def query( query_string: Optional[str], ids: Optional[Ids] = None, attrs: Optional[Sequence[str]] = None, - ) -> pd.DataFrame: + *, + return_arrow: bool = False, + ) -> Union[pd.DataFrame, pa.Table]: """ Selects from obs/var using a TileDB-Py `QueryCondition` string such as `cell_type == "blood"`. If `attrs` is `None`, returns all column names in the dataframe; use `[]` for @@ -165,38 +180,46 @@ def query( included in `attrs` if `attrs` is not `None`. Returns `None` if the slice is empty. """ if query_string is None: - return self.dim_select(ids) + return self.dim_select(ids, return_arrow=return_arrow) with self._open() as A: qc = tiledb.QueryCondition(query_string) if attrs is None: - slice_query = A.query(attr_cond=qc) + slice_query = A.query(attr_cond=qc, return_arrow=return_arrow) if ids is None: slice_df = slice_query.df[:] else: slice_df = slice_query.df[ids] else: - slice_query = A.query(attr_cond=qc, attrs=attrs) + slice_query = A.query( + attr_cond=qc, attrs=attrs, return_arrow=return_arrow + ) if ids is None: slice_df = slice_query.df[:] else: slice_df = slice_query.df[ids] # This is the 'decode on read' part of our logic; in dim_select we have the 'encode on write' part. # Context: https://github.com/single-cell-data/TileDB-SingleCell/issues/99. 
- return self._ascii_to_unicode_dataframe_readback(slice_df) + if return_arrow: + return self._ascii_to_unicode_arrow_readback(slice_df) + else: + return self._ascii_to_unicode_pandas_readback(slice_df) # ---------------------------------------------------------------- - def _ascii_to_unicode_series_readback( + def _ascii_to_unicode_pandas_series_readback( self, field_name: str, series: pd.Series ) -> Tuple[str, bool, Optional[pd.Series]]: + """ + Helper method for `_ascii_to_unicode_pandas_readback` + """ if len(series) > 0 and type(series[0]) == bytes: return (field_name, True, series.map(lambda e: e.decode())) else: return (field_name, False, None) - def _ascii_to_unicode_dataframe_readback(self, df: pd.DataFrame) -> pd.DataFrame: + def _ascii_to_unicode_pandas_readback(self, df: pd.DataFrame) -> pd.DataFrame: """ - Implements the 'decode on read' partof our logic as noted in `dim_select()`. + Implements the 'decode on read' part of our logic as noted in `dim_select()`. """ futures = [] # Empirically we find this has a bit of a speed-up. 
Presumably that's because of some NumPy @@ -204,7 +227,7 @@ def _ascii_to_unicode_dataframe_readback(self, df: pd.DataFrame) -> pd.DataFrame with ThreadPoolExecutor() as executor: for k in df: future = executor.submit( - self._ascii_to_unicode_series_readback, k, df[k] + self._ascii_to_unicode_pandas_series_readback, k, df[k] ) futures.append(future) @@ -215,6 +238,52 @@ def _ascii_to_unicode_dataframe_readback(self, df: pd.DataFrame) -> pd.DataFrame return df + # ---------------------------------------------------------------- + def _ascii_to_unicode_arrow_series_readback( + self, array_number: int, series: Union[pa.Array, pa.ChunkedArray] + ) -> Tuple[int, bool, Optional[Union[pa.Array, pa.ChunkedArray]]]: + """ + Helper method for `_ascii_to_unicode_arrow_readback` + """ + # pyarrow's way of handling 'bytes' + if len(series) > 0 and ( + type(series[0]) == pa.LargeBinaryScalar + or type(series[0]) == pa.LargeStringScalar + ): + return (array_number, True, series.cast(pa.string())) + else: + return (array_number, False, None) + + def _ascii_to_unicode_arrow_readback(self, df: pa.Table) -> pa.Table: + """ + Implements the 'decode on read' part of our logic as noted in `dim_select()`. + """ + + array_names = df.column_names + futures = [] + # Empirically we find this doesn't have much of a speed-up. Presumably that's because of + # PyArrow Python code holding the GIL. Nonetheless, experiments show it isn't slower so + # we'll keep the ThreadPoolExecutor logic, which will only get faster pending (hypothetical) + # future PyArrow C++ work. 
+ + with ThreadPoolExecutor() as executor: + for array_number in range(df.num_columns): + future = executor.submit( + self._ascii_to_unicode_arrow_series_readback, + array_number, + df[array_number], + ) + futures.append(future) + + new_arrays = [None] * df.num_columns + for future in futures: + array_number, modified, new_array = future.result() + if modified: + new_arrays[array_number] = new_array + else: + new_arrays[array_number] = df[array_number] + return pa.Table.from_arrays(new_arrays, names=array_names) + # ---------------------------------------------------------------- def from_dataframe(self, dataframe: pd.DataFrame, extent: int = 2048) -> None: """ diff --git a/apis/python/src/tiledbsc/annotation_matrix.py b/apis/python/src/tiledbsc/annotation_matrix.py index 06abaa00b9..325c26754a 100644 --- a/apis/python/src/tiledbsc/annotation_matrix.py +++ b/apis/python/src/tiledbsc/annotation_matrix.py @@ -2,6 +2,7 @@ import numpy as np import pandas as pd +import pyarrow as pa import tiledb import tiledbsc.util as util @@ -66,27 +67,40 @@ def shape(self) -> Tuple[int, int]: return (num_rows, num_cols) # ---------------------------------------------------------------- - def dim_select(self, ids: Optional[Ids]) -> pd.DataFrame: + def dim_select( + self, + ids: Optional[Ids] = None, + *, + return_arrow: bool = False, + ) -> Union[pd.DataFrame, pa.Table]: """ Selects a slice out of the array with specified `obs_ids` (for `obsm` elements) or `var_ids` (for `varm` elements). If `ids` is `None`, the entire array is returned. 
""" if ids is None: with self._open() as A: - df = A.df[:] + query = A.query(return_arrow=return_arrow) + df = query.df[:] else: with self._open() as A: - df = A.df[ids] - df.set_index(self.dim_name, inplace=True) + query = A.query(return_arrow=return_arrow) + df = query.df[ids] + if not return_arrow: + df.set_index(self.dim_name, inplace=True) return df # ---------------------------------------------------------------- - def df(self, ids: Optional[Ids] = None) -> pd.DataFrame: + def df( + self, + ids: Optional[Ids] = None, + *, + return_arrow: bool = False, + ) -> Union[pd.DataFrame, pa.Table]: """ Keystroke-saving alias for `.dim_select()`. If `ids` are provided, they're used to subselect; if not, the entire dataframe is returned. """ - return self.dim_select(ids) + return self.dim_select(ids, return_arrow=return_arrow) # ---------------------------------------------------------------- def from_matrix_and_dim_values( diff --git a/apis/python/src/tiledbsc/assay_matrix.py b/apis/python/src/tiledbsc/assay_matrix.py index 1dcab49e63..ab41fbf520 100644 --- a/apis/python/src/tiledbsc/assay_matrix.py +++ b/apis/python/src/tiledbsc/assay_matrix.py @@ -5,6 +5,7 @@ import numpy as np import pandas as pd +import pyarrow as pa import scipy.sparse as sp import tiledb @@ -77,36 +78,46 @@ def shape(self) -> Tuple[int, int]: # ---------------------------------------------------------------- def dim_select( - self, obs_ids: Optional[Ids], var_ids: Optional[Ids] - ) -> pd.DataFrame: + self, + obs_ids: Optional[Ids], + var_ids: Optional[Ids], + *, + return_arrow: bool = False, + ) -> Union[pd.DataFrame, pa.Table]: """ Selects a slice out of the matrix with specified `obs_ids` and/or `var_ids`. Either or both of the ID lists may be `None`, meaning, do not subselect along that dimension. If both ID lists are `None`, the entire matrix is returned. 
""" with tiledb.open(self.uri, ctx=self._ctx) as A: + query = A.query(return_arrow=return_arrow) if obs_ids is None: if var_ids is None: - df = A.df[:, :] + df = query.df[:, :] else: - df = A.df[:, var_ids] + df = query.df[:, var_ids] else: if var_ids is None: - df = A.df[obs_ids, :] + df = query.df[obs_ids, :] else: - df = A.df[obs_ids, var_ids] - df.set_index([self.row_dim_name, self.col_dim_name], inplace=True) + df = query.df[obs_ids, var_ids] + if not return_arrow: + df.set_index([self.row_dim_name, self.col_dim_name], inplace=True) return df # ---------------------------------------------------------------- def df( - self, obs_ids: Optional[Ids] = None, var_ids: Optional[Ids] = None - ) -> pd.DataFrame: + self, + obs_ids: Optional[Ids] = None, + var_ids: Optional[Ids] = None, + *, + return_arrow: bool = False, + ) -> Union[pd.DataFrame, pa.Table]: """ Keystroke-saving alias for `.dim_select()`. If either of `obs_ids` or `var_ids` are provided, they're used to subselect; if not, the entire dataframe is returned. 
""" - return self.dim_select(obs_ids, var_ids) + return self.dim_select(obs_ids, var_ids, return_arrow=return_arrow) # ---------------------------------------------------------------- def csr( diff --git a/apis/python/src/tiledbsc/soma.py b/apis/python/src/tiledbsc/soma.py index 920db5892b..4155e8339f 100644 --- a/apis/python/src/tiledbsc/soma.py +++ b/apis/python/src/tiledbsc/soma.py @@ -3,9 +3,10 @@ import os from collections import Counter from concurrent.futures import ThreadPoolExecutor -from typing import List, Optional, Sequence, Tuple +from typing import List, Optional, Sequence, Tuple, Union import pandas as pd +import pyarrow as pa import tiledb from .annotation_dataframe import AnnotationDataFrame @@ -244,7 +245,11 @@ def _get_obs_or_var_value_counts( # ---------------------------------------------------------------- def dim_slice( - self, obs_ids: Optional[Ids], var_ids: Optional[Ids] + self, + obs_ids: Optional[Ids], + var_ids: Optional[Ids], + *, + return_arrow: bool = False, ) -> Optional[SOMASlice]: """ Subselects the SOMA's obs, var, and X/data using the specified obs_ids and var_ids. @@ -255,28 +260,28 @@ def dim_slice( if obs_ids is None: # Try the var slice first to see if that produces zero results -- if so we don't need to # load the obs. - slice_var_df = self.var.dim_select(var_ids) + slice_var_df = self.var.dim_select(var_ids, return_arrow=return_arrow) if slice_var_df.shape[0] == 0: return None - slice_obs_df = self.obs.dim_select(obs_ids) + slice_obs_df = self.obs.dim_select(obs_ids, return_arrow=return_arrow) if slice_obs_df.shape[0] == 0: return None elif var_ids is None: # Try the obs slice first to see if that produces zero results -- if so we don't need to # load the var. 
- slice_obs_df = self.obs.dim_select(obs_ids) + slice_obs_df = self.obs.dim_select(obs_ids, return_arrow=return_arrow) if slice_obs_df.shape[0] == 0: return None - slice_var_df = self.var.dim_select(var_ids) + slice_var_df = self.var.dim_select(var_ids, return_arrow=return_arrow) if slice_var_df.shape[0] == 0: return None else: - slice_obs_df = self.obs.dim_select(obs_ids) + slice_obs_df = self.obs.dim_select(obs_ids, return_arrow=return_arrow) if slice_obs_df.shape[0] == 0: return None - slice_var_df = self.var.dim_select(var_ids) + slice_var_df = self.var.dim_select(var_ids, return_arrow=return_arrow) if slice_var_df.shape[0] == 0: return None @@ -290,7 +295,9 @@ def dim_slice( # * obsp # * varp - return self._assemble_soma_slice(obs_ids, var_ids, slice_obs_df, slice_var_df) + return self._assemble_soma_slice( + obs_ids, var_ids, slice_obs_df, slice_var_df, return_arrow=return_arrow + ) # ---------------------------------------------------------------- def query( @@ -302,6 +309,7 @@ def query( var_attrs: Optional[Sequence[str]] = None, var_query_string: Optional[str] = None, var_ids: Optional[Ids] = None, + return_arrow: bool = False, ) -> Optional[SOMASlice]: """ Subselects the SOMA's obs, var, and X/data using the specified queries on obs and var. @@ -315,14 +323,21 @@ def query( """ slice_obs_df = self.obs.query( - query_string=obs_query_string, ids=obs_ids, attrs=obs_attrs + query_string=obs_query_string, + ids=obs_ids, + attrs=obs_attrs, + return_arrow=return_arrow, ) # E.g. querying for 'cell_type == "blood"' and this SOMA does have a cell_type column in its # obs, but no rows with cell_type == "blood". if slice_obs_df is None: return None - if len(slice_obs_df.index) == 0: - return None + if return_arrow: + if len(slice_obs_df["obs_id"]) == 0: + return None + else: + if len(slice_obs_df.index) == 0: + return None # At the tiledb multi-index level, if we're say slicing on obs_ids but not var_ids, # we'll do `A.df[obs_ids, :]`. 
We can't pass a `:` down the callstack to get there, # but we pass `None` instead. @@ -332,18 +347,30 @@ def query( # `A.df[{158 obs ids}, {all 2000 var ids}]` is non-performant while # `A.df[{158 obs ids}, :]` is performant. if obs_ids is not None or obs_query_string is not None: - obs_ids = list(slice_obs_df.index) + if return_arrow: + obs_ids = [obs_id.as_py() for obs_id in slice_obs_df["obs_id"]] + else: + obs_ids = list(slice_obs_df.index) - slice_var_df = self.var.query(var_query_string, ids=var_ids, attrs=var_attrs) + slice_var_df = self.var.query( + var_query_string, ids=var_ids, attrs=var_attrs, return_arrow=return_arrow + ) # E.g. querying for 'feature_name == "MT-CO3"' and this SOMA does have a feature_name column # in its var, but no rows with feature_name == "MT-CO3". if slice_var_df is None: return None - if len(slice_var_df.index) == 0: - return None + if return_arrow: + if len(slice_var_df["var_id"]) == 0: + return None + else: + if len(slice_var_df.index) == 0: + return None # See above comment re keeping obs_ids == None if that's what it came in as. 
if var_ids is not None or var_query_string is not None: - var_ids = list(slice_var_df.index) + if return_arrow: + var_ids = [var_id.as_py() for var_id in slice_var_df["var_id"]] + else: + var_ids = list(slice_var_df.index) # TODO: # do this here: @@ -355,7 +382,9 @@ def query( # * obsp # * varp - return self._assemble_soma_slice(obs_ids, var_ids, slice_obs_df, slice_var_df) + return self._assemble_soma_slice( + obs_ids, var_ids, slice_obs_df, slice_var_df, return_arrow=return_arrow + ) # ---------------------------------------------------------------- @classmethod @@ -369,6 +398,7 @@ def queries( var_attrs: Optional[Sequence[str]] = None, var_query_string: Optional[str] = None, var_ids: Optional[Ids] = None, + return_arrow: bool = False, max_thread_pool_workers: Optional[int] = None, ) -> List[SOMASlice]: """ @@ -416,6 +446,7 @@ def queries( var_query_string=var_query_string, obs_ids=obs_ids, var_ids=var_ids, + return_arrow=return_arrow, ) soma_slice_futures.append(soma_slice_future) @@ -433,15 +464,19 @@ def _assemble_soma_slice_aux( X: AssayMatrix, obs_ids: Optional[Ids], var_ids: Optional[Ids], - ) -> Tuple[str, pd.DataFrame]: - return (layer_name, X.dim_select(obs_ids, var_ids)) + *, + return_arrow: bool = False, + ) -> Tuple[str, Union[pd.DataFrame, pa.Table]]: + return (layer_name, X.dim_select(obs_ids, var_ids, return_arrow=return_arrow)) def _assemble_soma_slice( self, obs_ids: Optional[Ids], var_ids: Optional[Ids], - slice_obs_df: pd.DataFrame, - slice_var_df: pd.DataFrame, + slice_obs_df: Union[pd.DataFrame, pa.Table], + slice_var_df: Union[pd.DataFrame, pa.Table], + *, + return_arrow: bool = False, ) -> SOMASlice: """ An internal method for constructing a `SOMASlice` object given query results. 
@@ -462,6 +497,7 @@ def _assemble_soma_slice( X_layer, obs_ids, var_ids, + return_arrow=return_arrow, ) futures.append(future) diff --git a/apis/python/src/tiledbsc/soma_collection.py b/apis/python/src/tiledbsc/soma_collection.py index 9b61223024..2cf1566c36 100644 --- a/apis/python/src/tiledbsc/soma_collection.py +++ b/apis/python/src/tiledbsc/soma_collection.py @@ -234,6 +234,7 @@ def query( var_attrs: Optional[Sequence[str]] = None, var_query_string: Optional[str] = None, var_ids: Optional[Ids] = None, + return_arrow: bool = False, ) -> List[SOMASlice]: """ Subselects the obs, var, and X/data using the specified queries on obs and var, @@ -260,6 +261,7 @@ def query( var_attrs=var_attrs, var_query_string=var_query_string, var_ids=var_ids, + return_arrow=return_arrow, ) # ---------------------------------------------------------------- diff --git a/apis/python/src/tiledbsc/soma_slice.py b/apis/python/src/tiledbsc/soma_slice.py index 04a8bbc4b2..e33d17eaa2 100644 --- a/apis/python/src/tiledbsc/soma_slice.py +++ b/apis/python/src/tiledbsc/soma_slice.py @@ -1,9 +1,10 @@ from __future__ import annotations -from typing import Dict, Optional, Sequence, Union +from typing import Dict, List, Optional, Sequence, Union import anndata as ad import pandas as pd +import pyarrow as pa from tiledbsc import util @@ -21,9 +22,9 @@ class SOMASlice(TileDBGroup): # ---------------------------------------------------------------- def __init__( self, - X: Dict[str, Union[pd.DataFrame, Matrix]], - obs: pd.DataFrame, - var: pd.DataFrame, + X: Dict[str, Union[pd.DataFrame, pa.Table, Matrix]], + obs: Union[pd.DataFrame, pa.Table], + var: Union[pd.DataFrame, pa.Table], # TODO # obsm: Dict[str, pd.DataFrame], # varm: Dict[str, pd.DataFrame], @@ -35,8 +36,8 @@ def __init__( """ Constructs an in-memory `SOMASlice` object. This is a simple collection of obs, var, and X dataframes. 
""" - assert isinstance(obs, pd.DataFrame) - assert isinstance(var, pd.DataFrame) + assert isinstance(obs, pd.DataFrame) or isinstance(obs, pa.Table) + assert isinstance(var, pd.DataFrame) or isinstance(var, pa.Table) assert "data" in X self.obs = obs @@ -75,10 +76,20 @@ def to_anndata(self) -> ad.AnnData: X_data = self.X["data"] if isinstance(X_data, pd.DataFrame): X_dtype = X_data.dtypes["value"] + elif isinstance(X_data, pa.Table): + X_dtype = X_data["value"].type.to_pandas_dtype() else: X_dtype = X_data.dtype - ann = ad.AnnData(obs=self.obs, var=self.var, dtype=X_dtype) + obs = self.obs + var = self.var + if isinstance(obs, pa.Table): + obs = obs.to_pandas() + obs.set_index("obs_id", inplace=True) + if isinstance(var, pa.Table): + var = var.to_pandas() + var.set_index("var_id", inplace=True) + ann = ad.AnnData(obs=obs, var=var, dtype=X_dtype) # TODO: # self.obsm = obsm @@ -94,14 +105,29 @@ def to_anndata(self) -> ad.AnnData: if isinstance(data, pd.DataFrame): # Make obs_id and var_id accessible as columns. data = data.reset_index() + data = util.X_and_ids_to_sparse_matrix( data, "obs_id", # row_dim_name "var_id", # col_dim_name "value", # attr_name - self.obs.index, - self.var.index, + obs.index, + var.index, ) + + if isinstance(data, pa.Table): + data = data.to_pandas() + data.set_index(["obs_id", "var_id"], inplace=True) + + data = util.X_and_ids_to_sparse_matrix( + data, + "obs_id", # row_dim_name + "var_id", # col_dim_name + "value", # attr_name + obs.index, + var.index, + ) + # We use AnnData as our in-memory storage. For SOMAs, all X layers are arrays within the # soma.X group; for AnnData, the 'data' layer is ann.X and all the others are in # ann.layers. 
@@ -113,6 +139,13 @@ def to_anndata(self) -> ad.AnnData: return ann # ---------------------------------------------------------------- + @classmethod + def _keys_for_concat(cls, df: Union[pd.DataFrame, pa.Table, Matrix]) -> List[str]: + if isinstance(df, pa.Table): + return sorted(list(df.column_names)) + else: + return sorted(list(df.keys())) + @classmethod def concat(cls, soma_slices: Sequence[SOMASlice]) -> Optional[SOMASlice]: """ @@ -126,6 +159,9 @@ def concat(cls, soma_slices: Sequence[SOMASlice]) -> Optional[SOMASlice]: # Check column names for each dataframe-type are the same slice0 = soma_slices[0] + okeys0 = cls._keys_for_concat(slice0.obs) + vkeys0 = cls._keys_for_concat(slice0.var) + for i, slicei in enumerate(soma_slices): if i == 0: continue @@ -134,11 +170,19 @@ def concat(cls, soma_slices: Sequence[SOMASlice]) -> Optional[SOMASlice]: raise Exception( "SOMA slices to be concatenated must have all the same X attributes" ) - if sorted(list(slicei.obs.keys())) != sorted(list(slice0.obs.keys())): + for key in slice0.X.keys(): + if cls._keys_for_concat(slicei.X[key]) != cls._keys_for_concat( + slice0.X[key] + ): + raise Exception( + "SOMA slices to be concatenated must have all the same attributes within each X layer" + ) + + if cls._keys_for_concat(slicei.obs) != okeys0: raise Exception( "SOMA slices to be concatenated must have all the same obs attributes" ) - if sorted(list(slicei.var.keys())) != sorted(list(slice0.var.keys())): + if cls._keys_for_concat(slicei.var) != vkeys0: raise Exception( "SOMA slices to be concatenated must have all the same var attributes" ) diff --git a/apis/python/tests/test_soco_slice_query.py b/apis/python/tests/test_soco_slice_query.py index 3341d6fe14..c494a3a2c4 100644 --- a/apis/python/tests/test_soco_slice_query.py +++ b/apis/python/tests/test_soco_slice_query.py @@ -37,31 +37,37 @@ def test_soco_slice_query(tmp_path): var_attrs = ["feature_name"] var_query_string = 'feature_name == "MT-CO3"' - soma_slices = [] - for soma in soco: - # 
E.g. querying for 'cell_type == "blood"' but this SOMA doesn't have a cell_type column in - # its obs at all. - if not soma.obs.has_attr_names(obs_attrs): - continue - # E.g. querying for 'feature_name == "MT-CO3"' but this SOMA doesn't have a feature_name - # column in its var at all. - if not soma.var.has_attr_names(var_attrs): - continue - - soma_slice = soma.query( - obs_query_string=obs_query_string, var_query_string=var_query_string - ) - if soma_slice is not None: - soma_slices.append(soma_slice) - - result_soma_slice = tiledbsc.SOMASlice.concat(soma_slices) - assert result_soma_slice is not None - - ann = result_soma_slice.to_anndata() - - assert ann.obs.shape == (400, 17) - assert ann.var.shape == (1, 3) - assert ann.X.shape == (400, 1) + # The return_arrow=True case drives Arrow format all the way through SOMA, SOMASlice, + # obs, var, X, etc. + for return_arrow in [False, True]: + + soma_slices = [] + for soma in soco: + # E.g. querying for 'cell_type == "blood"' but this SOMA doesn't have a cell_type column in + # its obs at all. + if not soma.obs.has_attr_names(obs_attrs): + continue + # E.g. querying for 'feature_name == "MT-CO3"' but this SOMA doesn't have a feature_name + # column in its var at all. + if not soma.var.has_attr_names(var_attrs): + continue + + soma_slice = soma.query( + obs_query_string=obs_query_string, + var_query_string=var_query_string, + return_arrow=return_arrow, + ) + if soma_slice is not None: + soma_slices.append(soma_slice) + + result_soma_slice = tiledbsc.SOMASlice.concat(soma_slices) + assert result_soma_slice is not None + + ann = result_soma_slice.to_anndata() + + assert ann.obs.shape == (400, 17) + assert ann.var.shape == (1, 3) + assert ann.X.shape == (400, 1) def test_soco_slice_query_nans(tmp_path):