From 4e898373fae088fc7498010b78eca838f5a22693 Mon Sep 17 00:00:00 2001
From: John Kerl
Date: Wed, 12 Oct 2022 09:03:30 -0400
Subject: [PATCH] simplify read_as_pandas

---
 apis/python/src/tiledbsoma/soma_dataframe.py | 48 +++++++++++++++++++-
 1 file changed, 47 insertions(+), 1 deletion(-)

diff --git a/apis/python/src/tiledbsoma/soma_dataframe.py b/apis/python/src/tiledbsoma/soma_dataframe.py
index 285829a211..ff9c99b020 100644
--- a/apis/python/src/tiledbsoma/soma_dataframe.py
+++ b/apis/python/src/tiledbsoma/soma_dataframe.py
@@ -12,7 +12,7 @@
 
 # from .query_condition import QueryCondition
 from . import query_condition as qcmodule
-from . import util, util_arrow, util_tiledb
+from . import util, util_arrow
 from .logging import log_io
 from .soma_collection import SOMACollectionBase
 from .tiledb_array import TileDBArray
@@ -216,6 +216,52 @@ def read_all(
             )
         )
 
+    def read_as_pandas(
+        self,
+        *,
+        ids: Optional[Ids] = None,
+        value_filter: Optional[str] = None,
+        column_names: Optional[Sequence[str]] = None,
+        result_order: Optional[SOMAResultOrder] = None,
+        # to rename index to 'obs_id' or 'var_id', if desired, for anndata
+        id_column_name: Optional[str] = None,
+    ) -> Iterator[pd.DataFrame]:
+        """
+        Reads from SOMA storage into memory. For ``to_anndata``, as well as for any interactive use where the user wants a Pandas dataframe. Returns a generator over dataframes for batched read: each result yielded by ``read`` is converted with ``to_pandas()`` and yielded in turn. See also ``read_as_pandas_all`` for a convenience wrapper.
+
+        TODO: params-list. NOTE(review): ``ids``, ``value_filter``, ``column_names``, and ``result_order`` are passed to ``read`` unchanged; ``id_column_name`` is accepted but not referenced in this method body -- confirm whether the index rename is still intended.
+        """
+        for tbl in self.read(
+            ids=ids,
+            value_filter=value_filter,
+            column_names=column_names,
+            result_order=result_order,
+        ):
+            yield tbl.to_pandas()
+
+    def read_as_pandas_all(
+        self,
+        *,
+        ids: Optional[Ids] = None,
+        value_filter: Optional[str] = None,
+        column_names: Optional[Sequence[str]] = None,
+        result_order: Optional[SOMAResultOrder] = None,
+        # to rename index to 'obs_id' or 'var_id', if desired, for anndata
+        id_column_name: Optional[str] = None,
+    ) -> pd.DataFrame:
+        """
+        This is a convenience method around ``read_as_pandas``: every argument is forwarded to it, and all partial read results are concatenated into a single DataFrame with ``pd.concat``. Its nominal use is to simplify unit-test cases.
+        """
+        return pd.concat(
+            self.read_as_pandas(
+                ids=ids,
+                value_filter=value_filter,
+                column_names=column_names,
+                result_order=result_order,
+                id_column_name=id_column_name,
+            )
+        )
+
     def _get_is_sparse(self) -> bool:
         if self._cached_is_sparse is None: