From 1f9bb8708acec3cb903ae504549b14c5a6533974 Mon Sep 17 00:00:00 2001
From: John Kerl
Date: Mon, 3 Oct 2022 00:48:43 -0400
Subject: [PATCH] Iterating from SOMADataFrame

---
 .gitignore                                   |  1 +
 apis/python/src/tiledbsoma/soma_dataframe.py | 67 ++++++++++++++++++++
 2 files changed, 68 insertions(+)

diff --git a/.gitignore b/.gitignore
index 332070c63b..8629e6b682 100644
--- a/.gitignore
+++ b/.gitignore
@@ -52,5 +52,6 @@ apis/python/src/tiledbsoma/libtiledb.*
 apis/python/src/tiledbsoma/libtiledbsoma.*
 
 /.quarto/
+/tags
 
 /NOTES/
diff --git a/apis/python/src/tiledbsoma/soma_dataframe.py b/apis/python/src/tiledbsoma/soma_dataframe.py
index eab8621cc7..0be6a7660d 100644
--- a/apis/python/src/tiledbsoma/soma_dataframe.py
+++ b/apis/python/src/tiledbsoma/soma_dataframe.py
@@ -5,8 +5,11 @@
 import pyarrow as pa
 import tiledb
 
+import tiledbsoma.libtiledbsoma as clib
+
 from . import util, util_arrow, util_tiledb
 from .logging import log_io
+from .query_condition import QueryCondition
 from .soma_collection import SOMACollectionBase
 from .tiledb_array import TileDBArray
 from .types import Ids, NTuple, SOMAResultOrder
@@ -157,6 +160,70 @@ def is_indexed(self) -> Literal[False]:
     def get_index_column_names(self) -> Sequence[str]:
         return []
 
+    def read_using_lib_temp(
+        self,
+        *,
+        # TODO: find the right syntax to get the typechecker to accept args like ``ids=slice(0,10)``
+        # ids: Optional[Union[Sequence[int], Slice]] = None,
+        ids: Optional[Any] = None,
+        value_filter: Optional[str] = None,
+        column_names: Optional[Sequence[str]] = None,
+        result_order: Optional[str] = None,
+        # TODO: batch_size
+        # TODO: partition,
+        # TODO: platform_config,
+    ) -> Iterator[pa.Table]:
+        """
+        Temporary read path that iterates over this dataframe using the
+        ``libtiledbsoma`` ``SOMAReader`` and yields one ``pyarrow.Table`` per
+        ``read_next()`` call.
+
+        :param ids: Accepted for signature compatibility; not yet applied to the query (TODO).
+        :param value_filter: Optional filter expression. When not ``None`` it is wrapped in a
+            ``QueryCondition`` and handed to the reader.
+        :param column_names: Optional column names to read. ``None`` is passed to the reader as
+            an empty list.
+        :param result_order: Accepted for signature compatibility; not yet applied (TODO).
+
+        :return: An iterator of ``pyarrow.Table`` pieces, ending when the reader returns ``None``.
+        """
+
+        with self._tiledb_open("r") as A:
+            # The split result is not used yet.
+            # NOTE(review): the call may also validate ``column_names`` -- confirm before removing.
+            dim_names, attr_names = util_tiledb.split_column_names(
+                A.schema, column_names
+            )
+
+            query_condition = None
+            if value_filter is not None:
+                query_condition = QueryCondition(value_filter)
+
+            # As an arg to this method, `column_names` is optional-None. For the pybind11
+            # code it's optional-[].
+            lib_column_names = [] if column_names is None else column_names
+
+            sr = clib.SOMAReader(
+                self._uri,
+                name=self.name,
+                schema=A.schema,  # query_condition needs this
+                column_names=lib_column_names,
+                query_condition=query_condition,
+            )
+
+            # TODO: ids
+            # TODO: platform_config
+            # TODO: batch_size
+            # TODO: result_order
+
+            sr.submit()
+
+            # Compare against None explicitly: a pyarrow.Table with zero rows has len() == 0 and
+            # is therefore falsy, so a bare truthiness test would end the iteration early.
+            # NOTE(review): assumes read_next() returns None once the query is complete -- confirm.
+            while (arrow_table := sr.read_next()) is not None:
+                yield arrow_table  # TODO: what other post-processing
+
     def read(
         self,
         *,