From 879e34138ecf4f5fd5969048fb9ebbfd72dba583 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Fri, 26 Aug 2022 09:16:20 -0400 Subject: [PATCH] Support return_arrow for various queries (#256) * Support return_arrow for various queries * Unit-test cases --- .../src/tiledbsc/annotation_dataframe.py | 107 ++++++++++++++---- apis/python/src/tiledbsc/annotation_matrix.py | 26 ++++- apis/python/src/tiledbsc/assay_matrix.py | 31 +++-- apis/python/src/tiledbsc/soma.py | 80 +++++++++---- apis/python/src/tiledbsc/soma_collection.py | 2 + apis/python/src/tiledbsc/soma_slice.py | 66 +++++++++-- apis/python/tests/test_soco_slice_query.py | 56 +++++---- 7 files changed, 275 insertions(+), 93 deletions(-) diff --git a/apis/python/src/tiledbsc/annotation_dataframe.py b/apis/python/src/tiledbsc/annotation_dataframe.py index 802cb76cb3..81f6b89e19 100644 --- a/apis/python/src/tiledbsc/annotation_dataframe.py +++ b/apis/python/src/tiledbsc/annotation_dataframe.py @@ -1,8 +1,9 @@ from concurrent.futures import ThreadPoolExecutor -from typing import Optional, Sequence, Set, Tuple +from typing import Optional, Sequence, Set, Tuple, Union import numpy as np import pandas as pd +import pyarrow as pa import tiledb import tiledbsc.util as util @@ -109,15 +110,19 @@ def keyset(self) -> Set[str]: # ---------------------------------------------------------------- def dim_select( - self, ids: Optional[Ids], attrs: Optional[Sequence[str]] = None - ) -> pd.DataFrame: + self, + ids: Optional[Ids], + attrs: Optional[Sequence[str]] = None, + *, + return_arrow: bool = False, + ) -> Union[pd.DataFrame, pa.Table]: """ Selects a slice out of the dataframe with specified `obs_ids` (for `obs`) or `var_ids` (for `var`). If `ids` is `None`, the entire dataframe is returned. Similarly, if `attrs` are provided, they're used for the query; else, all attributes are returned. 
""" with self._open("r") as A: - query = A.query(attrs=attrs) + query = A.query(return_arrow=return_arrow, attrs=attrs) if ids is None: df = query.df[:] else: @@ -132,24 +137,32 @@ def dim_select( # so the set_index is already done for us. # # However if the data was written somehow else (e.g. by tiledbscr-r) then we do. - if isinstance(df.index, pd.RangeIndex) and self.dim_name in df.columns: - df.set_index(self.dim_name, inplace=True) + if not return_arrow: + if isinstance(df.index, pd.RangeIndex) and self.dim_name in df.columns: + df.set_index(self.dim_name, inplace=True) # TODO: when UTF-8 attributes are queryable using TileDB-Py's QueryCondition API we can remove this. # This is the 'decode on read' part of our logic; in from_dataframe we have the 'encode on write' part. # Context: https://github.com/single-cell-data/TileDB-SingleCell/issues/99. - return self._ascii_to_unicode_dataframe_readback(df) + if return_arrow: + return self._ascii_to_unicode_arrow_readback(df) + else: + return self._ascii_to_unicode_pandas_readback(df) # ---------------------------------------------------------------- def df( - self, ids: Optional[Ids] = None, attrs: Optional[Sequence[str]] = None - ) -> pd.DataFrame: + self, + ids: Optional[Ids] = None, + attrs: Optional[Sequence[str]] = None, + *, + return_arrow: bool = False, + ) -> Union[pd.DataFrame, pa.Table]: """ Keystroke-saving alias for `.dim_select()`. If `ids` are provided, they're used to subselect; if not, the entire dataframe is returned. If `attrs` are provided, they're used for the query; else, all attributes are returned. 
""" - return self.dim_select(ids, attrs) + return self.dim_select(ids, attrs, return_arrow=return_arrow) # ---------------------------------------------------------------- def query( @@ -157,7 +170,9 @@ def query( query_string: Optional[str], ids: Optional[Ids] = None, attrs: Optional[Sequence[str]] = None, - ) -> pd.DataFrame: + *, + return_arrow: bool = False, + ) -> Union[pd.DataFrame, pa.Table]: """ Selects from obs/var using a TileDB-Py `QueryCondition` string such as `cell_type == "blood"`. If `attrs` is `None`, returns all column names in the dataframe; use `[]` for @@ -165,38 +180,46 @@ def query( included in `attrs` if `attrs` is not `None`. Returns `None` if the slice is empty. """ if query_string is None: - return self.dim_select(ids) + return self.dim_select(ids, return_arrow=return_arrow) with self._open() as A: qc = tiledb.QueryCondition(query_string) if attrs is None: - slice_query = A.query(attr_cond=qc) + slice_query = A.query(attr_cond=qc, return_arrow=return_arrow) if ids is None: slice_df = slice_query.df[:] else: slice_df = slice_query.df[ids] else: - slice_query = A.query(attr_cond=qc, attrs=attrs) + slice_query = A.query( + attr_cond=qc, attrs=attrs, return_arrow=return_arrow + ) if ids is None: slice_df = slice_query.df[:] else: slice_df = slice_query.df[ids] # This is the 'decode on read' part of our logic; in dim_select we have the 'encode on write' part. # Context: https://github.com/single-cell-data/TileDB-SingleCell/issues/99. 
- return self._ascii_to_unicode_dataframe_readback(slice_df) + if return_arrow: + return self._ascii_to_unicode_arrow_readback(slice_df) + else: + return self._ascii_to_unicode_pandas_readback(slice_df) # ---------------------------------------------------------------- - def _ascii_to_unicode_series_readback( + def _ascii_to_unicode_pandas_series_readback( self, field_name: str, series: pd.Series ) -> Tuple[str, bool, Optional[pd.Series]]: + """ + Helper method for `_ascii_to_unicode_pandas_readback` + """ if len(series) > 0 and type(series[0]) == bytes: return (field_name, True, series.map(lambda e: e.decode())) else: return (field_name, False, None) - def _ascii_to_unicode_dataframe_readback(self, df: pd.DataFrame) -> pd.DataFrame: + def _ascii_to_unicode_pandas_readback(self, df: pd.DataFrame) -> pd.DataFrame: """ - Implements the 'decode on read' partof our logic as noted in `dim_select()`. + Implements the 'decode on read' part of our logic as noted in `dim_select()`. """ futures = [] # Empirically we find this has a bit of a speed-up. 
Presumably that's because of some NumPy @@ -204,7 +227,7 @@ def _ascii_to_unicode_dataframe_readback(self, df: pd.DataFrame) -> pd.DataFrame with ThreadPoolExecutor() as executor: for k in df: future = executor.submit( - self._ascii_to_unicode_series_readback, k, df[k] + self._ascii_to_unicode_pandas_series_readback, k, df[k] ) futures.append(future) @@ -215,6 +238,52 @@ def _ascii_to_unicode_dataframe_readback(self, df: pd.DataFrame) -> pd.DataFrame return df + # ---------------------------------------------------------------- + def _ascii_to_unicode_arrow_series_readback( + self, array_number: int, series: Union[pa.Array, pa.ChunkedArray] + ) -> Tuple[int, bool, Optional[Union[pa.Array, pa.ChunkedArray]]]: + """ + Helper method for `_ascii_to_unicode_arrow_readback` + """ + # pyarrow's way of handling 'bytes' + if len(series) > 0 and ( + type(series[0]) == pa.LargeBinaryScalar + or type(series[0]) == pa.LargeStringScalar + ): + return (array_number, True, series.cast(pa.string())) + else: + return (array_number, False, None) + + def _ascii_to_unicode_arrow_readback(self, df: pa.Table) -> pa.Table: + """ + Implements the 'decode on read' part of our logic as noted in `dim_select()`. + """ + + array_names = df.column_names + futures = [] + # Empirically we find this doesn't have much of a speed-up. Presumably that's because of + # PyArrow Python code holding the GIL. Nonetheless, experiments show it isn't slower so + # we'll keep the ThreadPoolExecutor logic, which will only get faster pending (hypothetical) + # future PyArrow C++ work. 
+ + with ThreadPoolExecutor() as executor: + for array_number in range(df.num_columns): + future = executor.submit( + self._ascii_to_unicode_arrow_series_readback, + array_number, + df[array_number], + ) + futures.append(future) + + new_arrays = [None] * df.num_columns + for future in futures: + array_number, modified, new_array = future.result() + if modified: + new_arrays[array_number] = new_array + else: + new_arrays[array_number] = df[array_number] + return pa.Table.from_arrays(new_arrays, names=array_names) + # ---------------------------------------------------------------- def from_dataframe(self, dataframe: pd.DataFrame, extent: int = 2048) -> None: """ diff --git a/apis/python/src/tiledbsc/annotation_matrix.py b/apis/python/src/tiledbsc/annotation_matrix.py index 06abaa00b9..325c26754a 100644 --- a/apis/python/src/tiledbsc/annotation_matrix.py +++ b/apis/python/src/tiledbsc/annotation_matrix.py @@ -2,6 +2,7 @@ import numpy as np import pandas as pd +import pyarrow as pa import tiledb import tiledbsc.util as util @@ -66,27 +67,40 @@ def shape(self) -> Tuple[int, int]: return (num_rows, num_cols) # ---------------------------------------------------------------- - def dim_select(self, ids: Optional[Ids]) -> pd.DataFrame: + def dim_select( + self, + ids: Optional[Ids] = None, + *, + return_arrow: bool = False, + ) -> Union[pd.DataFrame, pa.Table]: """ Selects a slice out of the array with specified `obs_ids` (for `obsm` elements) or `var_ids` (for `varm` elements). If `ids` is `None`, the entire array is returned. 
""" if ids is None: with self._open() as A: - df = A.df[:] + query = A.query(return_arrow=return_arrow) + df = query.df[:] else: with self._open() as A: - df = A.df[ids] - df.set_index(self.dim_name, inplace=True) + query = A.query(return_arrow=return_arrow) + df = query.df[ids] + if not return_arrow: + df.set_index(self.dim_name, inplace=True) return df # ---------------------------------------------------------------- - def df(self, ids: Optional[Ids] = None) -> pd.DataFrame: + def df( + self, + ids: Optional[Ids] = None, + *, + return_arrow: bool = False, + ) -> Union[pd.DataFrame, pa.Table]: """ Keystroke-saving alias for `.dim_select()`. If `ids` are provided, they're used to subselect; if not, the entire dataframe is returned. """ - return self.dim_select(ids) + return self.dim_select(ids, return_arrow=return_arrow) # ---------------------------------------------------------------- def from_matrix_and_dim_values( diff --git a/apis/python/src/tiledbsc/assay_matrix.py b/apis/python/src/tiledbsc/assay_matrix.py index 1dcab49e63..ab41fbf520 100644 --- a/apis/python/src/tiledbsc/assay_matrix.py +++ b/apis/python/src/tiledbsc/assay_matrix.py @@ -5,6 +5,7 @@ import numpy as np import pandas as pd +import pyarrow as pa import scipy.sparse as sp import tiledb @@ -77,36 +78,46 @@ def shape(self) -> Tuple[int, int]: # ---------------------------------------------------------------- def dim_select( - self, obs_ids: Optional[Ids], var_ids: Optional[Ids] - ) -> pd.DataFrame: + self, + obs_ids: Optional[Ids], + var_ids: Optional[Ids], + *, + return_arrow: bool = False, + ) -> Union[pd.DataFrame, pa.Table]: """ Selects a slice out of the matrix with specified `obs_ids` and/or `var_ids`. Either or both of the ID lists may be `None`, meaning, do not subselect along that dimension. If both ID lists are `None`, the entire matrix is returned. 
""" with tiledb.open(self.uri, ctx=self._ctx) as A: + query = A.query(return_arrow=return_arrow) if obs_ids is None: if var_ids is None: - df = A.df[:, :] + df = query.df[:, :] else: - df = A.df[:, var_ids] + df = query.df[:, var_ids] else: if var_ids is None: - df = A.df[obs_ids, :] + df = query.df[obs_ids, :] else: - df = A.df[obs_ids, var_ids] - df.set_index([self.row_dim_name, self.col_dim_name], inplace=True) + df = query.df[obs_ids, var_ids] + if not return_arrow: + df.set_index([self.row_dim_name, self.col_dim_name], inplace=True) return df # ---------------------------------------------------------------- def df( - self, obs_ids: Optional[Ids] = None, var_ids: Optional[Ids] = None - ) -> pd.DataFrame: + self, + obs_ids: Optional[Ids] = None, + var_ids: Optional[Ids] = None, + *, + return_arrow: bool = False, + ) -> Union[pd.DataFrame, pa.Table]: """ Keystroke-saving alias for `.dim_select()`. If either of `obs_ids` or `var_ids` are provided, they're used to subselect; if not, the entire dataframe is returned. 
""" - return self.dim_select(obs_ids, var_ids) + return self.dim_select(obs_ids, var_ids, return_arrow=return_arrow) # ---------------------------------------------------------------- def csr( diff --git a/apis/python/src/tiledbsc/soma.py b/apis/python/src/tiledbsc/soma.py index 920db5892b..4155e8339f 100644 --- a/apis/python/src/tiledbsc/soma.py +++ b/apis/python/src/tiledbsc/soma.py @@ -3,9 +3,10 @@ import os from collections import Counter from concurrent.futures import ThreadPoolExecutor -from typing import List, Optional, Sequence, Tuple +from typing import List, Optional, Sequence, Tuple, Union import pandas as pd +import pyarrow as pa import tiledb from .annotation_dataframe import AnnotationDataFrame @@ -244,7 +245,11 @@ def _get_obs_or_var_value_counts( # ---------------------------------------------------------------- def dim_slice( - self, obs_ids: Optional[Ids], var_ids: Optional[Ids] + self, + obs_ids: Optional[Ids], + var_ids: Optional[Ids], + *, + return_arrow: bool = False, ) -> Optional[SOMASlice]: """ Subselects the SOMA's obs, var, and X/data using the specified obs_ids and var_ids. @@ -255,28 +260,28 @@ def dim_slice( if obs_ids is None: # Try the var slice first to see if that produces zero results -- if so we don't need to # load the obs. - slice_var_df = self.var.dim_select(var_ids) + slice_var_df = self.var.dim_select(var_ids, return_arrow=return_arrow) if slice_var_df.shape[0] == 0: return None - slice_obs_df = self.obs.dim_select(obs_ids) + slice_obs_df = self.obs.dim_select(obs_ids, return_arrow=return_arrow) if slice_obs_df.shape[0] == 0: return None elif var_ids is None: # Try the obs slice first to see if that produces zero results -- if so we don't need to # load the var. 
- slice_obs_df = self.obs.dim_select(obs_ids) + slice_obs_df = self.obs.dim_select(obs_ids, return_arrow=return_arrow) if slice_obs_df.shape[0] == 0: return None - slice_var_df = self.var.dim_select(var_ids) + slice_var_df = self.var.dim_select(var_ids, return_arrow=return_arrow) if slice_var_df.shape[0] == 0: return None else: - slice_obs_df = self.obs.dim_select(obs_ids) + slice_obs_df = self.obs.dim_select(obs_ids, return_arrow=return_arrow) if slice_obs_df.shape[0] == 0: return None - slice_var_df = self.var.dim_select(var_ids) + slice_var_df = self.var.dim_select(var_ids, return_arrow=return_arrow) if slice_var_df.shape[0] == 0: return None @@ -290,7 +295,9 @@ def dim_slice( # * obsp # * varp - return self._assemble_soma_slice(obs_ids, var_ids, slice_obs_df, slice_var_df) + return self._assemble_soma_slice( + obs_ids, var_ids, slice_obs_df, slice_var_df, return_arrow=return_arrow + ) # ---------------------------------------------------------------- def query( @@ -302,6 +309,7 @@ def query( var_attrs: Optional[Sequence[str]] = None, var_query_string: Optional[str] = None, var_ids: Optional[Ids] = None, + return_arrow: bool = False, ) -> Optional[SOMASlice]: """ Subselects the SOMA's obs, var, and X/data using the specified queries on obs and var. @@ -315,14 +323,21 @@ def query( """ slice_obs_df = self.obs.query( - query_string=obs_query_string, ids=obs_ids, attrs=obs_attrs + query_string=obs_query_string, + ids=obs_ids, + attrs=obs_attrs, + return_arrow=return_arrow, ) # E.g. querying for 'cell_type == "blood"' and this SOMA does have a cell_type column in its # obs, but no rows with cell_type == "blood". if slice_obs_df is None: return None - if len(slice_obs_df.index) == 0: - return None + if return_arrow: + if len(slice_obs_df["obs_id"]) == 0: + return None + else: + if len(slice_obs_df.index) == 0: + return None # At the tiledb multi-index level, if we're say slicing on obs_ids but not var_ids, # we'll do `A.df[obs_ids, :]`. 
We can't pass a `:` down the callstack to get there, # but we pass `None` instead. @@ -332,18 +347,30 @@ def query( # `A.df[{158 obs ids}, {all 2000 var ids}]` is non-performant while # `A.df[{158 obs ids}, :]` is performant. if obs_ids is not None or obs_query_string is not None: - obs_ids = list(slice_obs_df.index) + if return_arrow: + obs_ids = [obs_id.as_py() for obs_id in slice_obs_df["obs_id"]] + else: + obs_ids = list(slice_obs_df.index) - slice_var_df = self.var.query(var_query_string, ids=var_ids, attrs=var_attrs) + slice_var_df = self.var.query( + var_query_string, ids=var_ids, attrs=var_attrs, return_arrow=return_arrow + ) # E.g. querying for 'feature_name == "MT-CO3"' and this SOMA does have a feature_name column # in its var, but no rows with feature_name == "MT-CO3". if slice_var_df is None: return None - if len(slice_var_df.index) == 0: - return None + if return_arrow: + if len(slice_var_df["var_id"]) == 0: + return None + else: + if len(slice_var_df.index) == 0: + return None # See above comment re keeping obs_ids == None if that's what it came in as. 
if var_ids is not None or var_query_string is not None: - var_ids = list(slice_var_df.index) + if return_arrow: + var_ids = [var_id.as_py() for var_id in slice_var_df["var_id"]] + else: + var_ids = list(slice_var_df.index) # TODO: # do this here: @@ -355,7 +382,9 @@ def query( # * obsp # * varp - return self._assemble_soma_slice(obs_ids, var_ids, slice_obs_df, slice_var_df) + return self._assemble_soma_slice( + obs_ids, var_ids, slice_obs_df, slice_var_df, return_arrow=return_arrow + ) # ---------------------------------------------------------------- @classmethod @@ -369,6 +398,7 @@ def queries( var_attrs: Optional[Sequence[str]] = None, var_query_string: Optional[str] = None, var_ids: Optional[Ids] = None, + return_arrow: bool = False, max_thread_pool_workers: Optional[int] = None, ) -> List[SOMASlice]: """ @@ -416,6 +446,7 @@ def queries( var_query_string=var_query_string, obs_ids=obs_ids, var_ids=var_ids, + return_arrow=return_arrow, ) soma_slice_futures.append(soma_slice_future) @@ -433,15 +464,19 @@ def _assemble_soma_slice_aux( X: AssayMatrix, obs_ids: Optional[Ids], var_ids: Optional[Ids], - ) -> Tuple[str, pd.DataFrame]: - return (layer_name, X.dim_select(obs_ids, var_ids)) + *, + return_arrow: bool = False, + ) -> Tuple[str, Union[pd.DataFrame, pa.Table]]: + return (layer_name, X.dim_select(obs_ids, var_ids, return_arrow=return_arrow)) def _assemble_soma_slice( self, obs_ids: Optional[Ids], var_ids: Optional[Ids], - slice_obs_df: pd.DataFrame, - slice_var_df: pd.DataFrame, + slice_obs_df: Union[pd.DataFrame, pa.Table], + slice_var_df: Union[pd.DataFrame, pa.Table], + *, + return_arrow: bool = False, ) -> SOMASlice: """ An internal method for constructing a `SOMASlice` object given query results. 
@@ -462,6 +497,7 @@ def _assemble_soma_slice( X_layer, obs_ids, var_ids, + return_arrow=return_arrow, ) futures.append(future) diff --git a/apis/python/src/tiledbsc/soma_collection.py b/apis/python/src/tiledbsc/soma_collection.py index 9b61223024..2cf1566c36 100644 --- a/apis/python/src/tiledbsc/soma_collection.py +++ b/apis/python/src/tiledbsc/soma_collection.py @@ -234,6 +234,7 @@ def query( var_attrs: Optional[Sequence[str]] = None, var_query_string: Optional[str] = None, var_ids: Optional[Ids] = None, + return_arrow: bool = False, ) -> List[SOMASlice]: """ Subselects the obs, var, and X/data using the specified queries on obs and var, @@ -260,6 +261,7 @@ def query( var_attrs=var_attrs, var_query_string=var_query_string, var_ids=var_ids, + return_arrow=return_arrow, ) # ---------------------------------------------------------------- diff --git a/apis/python/src/tiledbsc/soma_slice.py b/apis/python/src/tiledbsc/soma_slice.py index 04a8bbc4b2..e33d17eaa2 100644 --- a/apis/python/src/tiledbsc/soma_slice.py +++ b/apis/python/src/tiledbsc/soma_slice.py @@ -1,9 +1,10 @@ from __future__ import annotations -from typing import Dict, Optional, Sequence, Union +from typing import Dict, List, Optional, Sequence, Union import anndata as ad import pandas as pd +import pyarrow as pa from tiledbsc import util @@ -21,9 +22,9 @@ class SOMASlice(TileDBGroup): # ---------------------------------------------------------------- def __init__( self, - X: Dict[str, Union[pd.DataFrame, Matrix]], - obs: pd.DataFrame, - var: pd.DataFrame, + X: Dict[str, Union[pd.DataFrame, pa.Table, Matrix]], + obs: Union[pd.DataFrame, pa.Table], + var: Union[pd.DataFrame, pa.Table], # TODO # obsm: Dict[str, pd.DataFrame], # varm: Dict[str, pd.DataFrame], @@ -35,8 +36,8 @@ def __init__( """ Constructs an in-memory `SOMASlice` object. This is a simple collection of obs, var, and X dataframes. 
""" - assert isinstance(obs, pd.DataFrame) - assert isinstance(var, pd.DataFrame) + assert isinstance(obs, pd.DataFrame) or isinstance(obs, pa.Table) + assert isinstance(var, pd.DataFrame) or isinstance(var, pa.Table) assert "data" in X self.obs = obs @@ -75,10 +76,20 @@ def to_anndata(self) -> ad.AnnData: X_data = self.X["data"] if isinstance(X_data, pd.DataFrame): X_dtype = X_data.dtypes["value"] + elif isinstance(X_data, pa.Table): + X_dtype = X_data["value"].type.to_pandas_dtype() else: X_dtype = X_data.dtype - ann = ad.AnnData(obs=self.obs, var=self.var, dtype=X_dtype) + obs = self.obs + var = self.var + if isinstance(obs, pa.Table): + obs = obs.to_pandas() + obs.set_index("obs_id", inplace=True) + if isinstance(var, pa.Table): + var = var.to_pandas() + var.set_index("var_id", inplace=True) + ann = ad.AnnData(obs=obs, var=var, dtype=X_dtype) # TODO: # self.obsm = obsm @@ -94,14 +105,29 @@ def to_anndata(self) -> ad.AnnData: if isinstance(data, pd.DataFrame): # Make obs_id and var_id accessible as columns. data = data.reset_index() + data = util.X_and_ids_to_sparse_matrix( data, "obs_id", # row_dim_name "var_id", # col_dim_name "value", # attr_name - self.obs.index, - self.var.index, + obs.index, + var.index, ) + + if isinstance(data, pa.Table): + data = data.to_pandas() + data.set_index(["obs_id", "var_id"], inplace=True) + + data = util.X_and_ids_to_sparse_matrix( + data, + "obs_id", # row_dim_name + "var_id", # col_dim_name + "value", # attr_name + obs.index, + var.index, + ) + # We use AnnData as our in-memory storage. For SOMAs, all X layers are arrays within the # soma.X group; for AnnData, the 'data' layer is ann.X and all the others are in # ann.layers. 
@@ -113,6 +139,13 @@ def to_anndata(self) -> ad.AnnData: return ann # ---------------------------------------------------------------- + @classmethod + def _keys_for_concat(cls, df: Union[pd.DataFrame, pa.Table, Matrix]) -> List[str]: + if isinstance(df, pa.Table): + return sorted(list(df.column_names)) + else: + return sorted(list(df.keys())) + @classmethod def concat(cls, soma_slices: Sequence[SOMASlice]) -> Optional[SOMASlice]: """ @@ -126,6 +159,9 @@ def concat(cls, soma_slices: Sequence[SOMASlice]) -> Optional[SOMASlice]: # Check column names for each dataframe-type are the same slice0 = soma_slices[0] + okeys0 = cls._keys_for_concat(slice0.obs) + vkeys0 = cls._keys_for_concat(slice0.var) + for i, slicei in enumerate(soma_slices): if i == 0: continue @@ -134,11 +170,19 @@ def concat(cls, soma_slices: Sequence[SOMASlice]) -> Optional[SOMASlice]: raise Exception( "SOMA slices to be concatenated must have all the same X attributes" ) - if sorted(list(slicei.obs.keys())) != sorted(list(slice0.obs.keys())): + for key in slice0.X.keys(): + if cls._keys_for_concat(slicei.X[key]) != cls._keys_for_concat( + slice0.X[key] + ): + raise Exception( + "SOMA slices to be concatenated must have all the same attributes within each X layer" + ) + + if cls._keys_for_concat(slicei.obs) != okeys0: raise Exception( "SOMA slices to be concatenated must have all the same obs attributes" ) - if sorted(list(slicei.var.keys())) != sorted(list(slice0.var.keys())): + if cls._keys_for_concat(slicei.var) != vkeys0: raise Exception( "SOMA slices to be concatenated must have all the same var attributes" ) diff --git a/apis/python/tests/test_soco_slice_query.py b/apis/python/tests/test_soco_slice_query.py index 3341d6fe14..c494a3a2c4 100644 --- a/apis/python/tests/test_soco_slice_query.py +++ b/apis/python/tests/test_soco_slice_query.py @@ -37,31 +37,37 @@ def test_soco_slice_query(tmp_path): var_attrs = ["feature_name"] var_query_string = 'feature_name == "MT-CO3"' - soma_slices = [] - for soma in soco: - # 
E.g. querying for 'cell_type == "blood"' but this SOMA doesn't have a cell_type column in - # its obs at all. - if not soma.obs.has_attr_names(obs_attrs): - continue - # E.g. querying for 'feature_name == "MT-CO3"' but this SOMA doesn't have a feature_name - # column in its var at all. - if not soma.var.has_attr_names(var_attrs): - continue - - soma_slice = soma.query( - obs_query_string=obs_query_string, var_query_string=var_query_string - ) - if soma_slice is not None: - soma_slices.append(soma_slice) - - result_soma_slice = tiledbsc.SOMASlice.concat(soma_slices) - assert result_soma_slice is not None - - ann = result_soma_slice.to_anndata() - - assert ann.obs.shape == (400, 17) - assert ann.var.shape == (1, 3) - assert ann.X.shape == (400, 1) + # The return_arrow=True case drives Arrow format all the way through SOMA, SOMASlice, + # obs, var, X, etc. + for return_arrow in [False, True]: + + soma_slices = [] + for soma in soco: + # E.g. querying for 'cell_type == "blood"' but this SOMA doesn't have a cell_type column in + # its obs at all. + if not soma.obs.has_attr_names(obs_attrs): + continue + # E.g. querying for 'feature_name == "MT-CO3"' but this SOMA doesn't have a feature_name + # column in its var at all. + if not soma.var.has_attr_names(var_attrs): + continue + + soma_slice = soma.query( + obs_query_string=obs_query_string, + var_query_string=var_query_string, + return_arrow=return_arrow, + ) + if soma_slice is not None: + soma_slices.append(soma_slice) + + result_soma_slice = tiledbsc.SOMASlice.concat(soma_slices) + assert result_soma_slice is not None + + ann = result_soma_slice.to_anndata() + + assert ann.obs.shape == (400, 17) + assert ann.var.shape == (1, 3) + assert ann.X.shape == (400, 1) def test_soco_slice_query_nans(tmp_path):