From fc29a6d0b3df8779ec03792e56ea29e3e0dbca39 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Tue, 4 Oct 2022 11:16:51 -0400 Subject: [PATCH] rebase prep --- apis/python/src/tiledbsoma/soma_dataframe.py | 66 +------------------ .../src/tiledbsoma/soma_indexed_dataframe.py | 10 +-- 2 files changed, 7 insertions(+), 69 deletions(-) diff --git a/apis/python/src/tiledbsoma/soma_dataframe.py b/apis/python/src/tiledbsoma/soma_dataframe.py index 814710de48..0dd079eecb 100644 --- a/apis/python/src/tiledbsoma/soma_dataframe.py +++ b/apis/python/src/tiledbsoma/soma_dataframe.py @@ -5,11 +5,8 @@ import pyarrow as pa import tiledb -import tiledbsoma.libtiledbsoma as clib - from . import util, util_arrow, util_tiledb from .logging import log_io -from .query_condition import QueryCondition from .soma_collection import SOMACollectionBase from .tiledb_array import TileDBArray from .types import Ids, NTuple, SOMAResultOrder @@ -160,55 +157,6 @@ def is_indexed(self) -> Literal[False]: def get_index_column_names(self) -> Sequence[str]: return [] - def read_using_lib_temp( - self, - *, - # TODO: find the right syntax to get the typechecker to accept args like ``ids=slice(0,10)`` - # ids: Optional[Union[Sequence[int], Slice]] = None, - ids: Optional[Any] = None, - value_filter: Optional[str] = None, - column_names: Optional[Sequence[str]] = None, - result_order: Optional[str] = None, - # TODO: batch_size - # TODO: partition, - # TODO: platform_config, - ) -> Iterator[pa.Table]: - """ - TODO: copy the text - """ - - with self._tiledb_open("r") as A: - dim_names, attr_names = util_tiledb.split_column_names( - A.schema, column_names - ) - - query_condition = None - if value_filter is not None: - # query_condition = tiledb.QueryCondition(value_filter) - query_condition = QueryCondition(value_filter) - - # As an arg to this method, `column_names` is optional-None. For the pybind11 - # code it's optional-[]. - lib_column_names = [] if column_names is None else column_names - - sr = clib.SOMAReader( - self._uri, - name=self.name, - schema=A.schema, # query_condition needs this - column_names=lib_column_names, - query_condition=query_condition, - ) - - # TODO: platform_config - # TODO: batch_size - # TODO: result_order - - sr.submit() - - while arrow_table := sr.read_next(): - # yield util_arrow.ascii_to_unicode_pyarrow_readback(batch) - yield arrow_table # XXX what other post-processing - def read( self, *, @@ -270,12 +218,7 @@ def read( iterator = query.df[ids] for table in iterator: - # XXX COMMENT MORE - # This is the 'decode on read' part of our logic; in dim_select we have the - # 'encode on write' part. - # Context: https://github.com/single-cell-data/TileDB-SOMA/issues/99. - # - # Also: don't materialize these on read + # Don't materialize these on read # TODO: get the arrow syntax for drop # df.drop(ROWID, axis=1) yield table @@ -295,7 +238,7 @@ def read_all( # TODO: platform_config, ) -> pa.Table: """ - This is a convenience method around ``read``. It iterates the return value from ``read`` and returns a concatenation of all the table-pieces found. Its nominal use is to simply unit-test cases. + This is a convenience method around ``read``. It iterates the return value from ``read`` and returns a concatenation of all the table-pieces found. Its nominal use is to simplify unit-test cases. """ return pa.concat_tables( self.read( @@ -412,11 +355,6 @@ def read_as_pandas( for df in iterator: - # This is the 'decode on read' part of our logic; in dim_select we have the 'encode on - # write' part. - # Context: https://github.com/single-cell-data/TileDB-SOMA/issues/99. - df = df - if id_column_name is not None: df.reset_index(inplace=True) df.set_index(id_column_name, inplace=True) diff --git a/apis/python/src/tiledbsoma/soma_indexed_dataframe.py b/apis/python/src/tiledbsoma/soma_indexed_dataframe.py index 539d4e5140..e395158532 100644 --- a/apis/python/src/tiledbsoma/soma_indexed_dataframe.py +++ b/apis/python/src/tiledbsoma/soma_indexed_dataframe.py @@ -259,7 +259,7 @@ def read( iterator = query.df[ids] for table in iterator: - yield table + yield df def read_all( self, @@ -275,17 +275,17 @@ def read_all( # TODO: platform_config, ) -> pa.Table: """ - This is a convenience method around ``read``. It iterates the return value from ``read`` and returns a concatenation of all the table-pieces found. Its nominal use is to simply unit-test cases. + This is a convenience method around ``read``. It iterates the return value from ``read`` and returns a concatenation of all the record batches found. Its nominal use is to simplify unit-test cases. """ return pa.concat_tables( self.read(ids=ids, value_filter=value_filter, column_names=column_names) ) - def write(self, values: pa.Table) -> None: + def write(self, values: pa.RecordBatch) -> None: """ - Write an Arrow.Table to the persistent object. As duplicate index values are not allowed, index values already present in the object are overwritten and new index values are added. + Write an Arrow.RecordBatch to the persistent object. As duplicate index values are not allowed, index values already present in the object are overwritten and new index values are added. - :param values: An Arrow.Table containing all columns, including the index columns. The schema for the values must match the schema for the ``SOMAIndexedDataFrame``. + :param values: An Arrow.RecordBatch containing all columns, including the index columns. The schema for the values must match the schema for the ``SOMAIndexedDataFrame``. """ self._shape = None # cache-invalidate