From 70844be36d1bff98230b8edc78d1d39e07967b59 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Mon, 3 Oct 2022 15:27:51 -0400 Subject: [PATCH] rebase prep --- apis/python/src/tiledbsoma/soma_dataframe.py | 62 ++----------------- .../src/tiledbsoma/soma_indexed_dataframe.py | 14 +++-- 2 files changed, 14 insertions(+), 62 deletions(-) diff --git a/apis/python/src/tiledbsoma/soma_dataframe.py b/apis/python/src/tiledbsoma/soma_dataframe.py index 814710de48..2c24a2a881 100644 --- a/apis/python/src/tiledbsoma/soma_dataframe.py +++ b/apis/python/src/tiledbsoma/soma_dataframe.py @@ -5,11 +5,8 @@ import pyarrow as pa import tiledb -import tiledbsoma.libtiledbsoma as clib - -from . import util, util_arrow, util_tiledb +from . import util, util_arrow, util_pandas, util_tiledb from .logging import log_io -from .query_condition import QueryCondition from .soma_collection import SOMACollectionBase from .tiledb_array import TileDBArray from .types import Ids, NTuple, SOMAResultOrder @@ -160,55 +157,6 @@ def is_indexed(self) -> Literal[False]: def get_index_column_names(self) -> Sequence[str]: return [] - def read_using_lib_temp( - self, - *, - # TODO: find the right syntax to get the typechecker to accept args like ``ids=slice(0,10)`` - # ids: Optional[Union[Sequence[int], Slice]] = None, - ids: Optional[Any] = None, - value_filter: Optional[str] = None, - column_names: Optional[Sequence[str]] = None, - result_order: Optional[str] = None, - # TODO: batch_size - # TODO: partition, - # TODO: platform_config, - ) -> Iterator[pa.Table]: - """ - TODO: copy the text - """ - - with self._tiledb_open("r") as A: - dim_names, attr_names = util_tiledb.split_column_names( - A.schema, column_names - ) - - query_condition = None - if value_filter is not None: - # query_condition = tiledb.QueryCondition(value_filter) - query_condition = QueryCondition(value_filter) - - # As an arg to this method, `column_names` is optional-None. For the pybind11 - # code it's optional-[]. 
- lib_column_names = [] if column_names is None else column_names - - sr = clib.SOMAReader( - self._uri, - name=self.name, - schema=A.schema, # query_condition needs this - column_names=lib_column_names, - query_condition=query_condition, - ) - - # TODO: platform_config - # TODO: batch_size - # TODO: result_order - - sr.submit() - - while arrow_table := sr.read_next(): - # yield util_arrow.ascii_to_unicode_pyarrow_readback(batch) - yield arrow_table # XXX what other post-processing - def read( self, *, @@ -278,7 +226,7 @@ def read( # Also: don't materialize these on read # TODO: get the arrow syntax for drop # df.drop(ROWID, axis=1) - yield table + yield util_arrow.ascii_to_unicode_pyarrow_readback(table) def read_all( self, @@ -295,7 +243,7 @@ def read_all( # TODO: platform_config, ) -> pa.Table: """ - This is a convenience method around ``read``. It iterates the return value from ``read`` and returns a concatenation of all the table-pieces found. Its nominal use is to simply unit-test cases. + This is a convenience method around ``read``. It iterates the return value from ``read`` and returns a concatenation of all the table-pieces found. Its nominal use is to simplify unit-test cases. """ return pa.concat_tables( self.read( @@ -415,7 +363,7 @@ def read_as_pandas( # This is the 'decode on read' part of our logic; in dim_select we have the 'encode on # write' part. # Context: https://github.com/single-cell-data/TileDB-SOMA/issues/99. - df = df + df = util_pandas.ascii_to_unicode_pandas_readback(df) if id_column_name is not None: df.reset_index(inplace=True) @@ -529,7 +477,7 @@ def write_from_pandas( dfc = dataframe[column_name] if len(dfc) > 0 and type(dfc[0]) == str: # Force ASCII storage if string, in order to make obs/var columns queryable. 
- column_types[column_name] = "ascii" + column_types[column_name] = np.dtype("S") tiledb.from_pandas( uri=self.uri, diff --git a/apis/python/src/tiledbsoma/soma_indexed_dataframe.py b/apis/python/src/tiledbsoma/soma_indexed_dataframe.py index 539d4e5140..dd6abf8220 100644 --- a/apis/python/src/tiledbsoma/soma_indexed_dataframe.py +++ b/apis/python/src/tiledbsoma/soma_indexed_dataframe.py @@ -259,7 +259,11 @@ def read( iterator = query.df[ids] for table in iterator: - yield table + # XXX COMMENT MORE + # This is the 'decode on read' part of our logic; in dim_select we have the + # 'encode on write' part. + # Context: https://github.com/single-cell-data/TileDB-SOMA/issues/99. + yield util_arrow.ascii_to_unicode_pyarrow_readback(table) def read_all( self, @@ -275,17 +279,17 @@ def read_all( # TODO: platform_config, ) -> pa.Table: """ - This is a convenience method around ``read``. It iterates the return value from ``read`` and returns a concatenation of all the table-pieces found. Its nominal use is to simply unit-test cases. + This is a convenience method around ``read``. It iterates the return value from ``read`` and returns a concatenation of all the record batches found. Its nominal use is to simplify unit-test cases. """ return pa.concat_tables( self.read(ids=ids, value_filter=value_filter, column_names=column_names) ) - def write(self, values: pa.Table) -> None: + def write(self, values: pa.RecordBatch) -> None: """ - Write an Arrow.Table to the persistent object. As duplicate index values are not allowed, index values already present in the object are overwritten and new index values are added. + Write an Arrow.RecordBatch to the persistent object. As duplicate index values are not allowed, index values already present in the object are overwritten and new index values are added. - :param values: An Arrow.Table containing all columns, including the index columns. The schema for the values must match the schema for the ``SOMAIndexedDataFrame``. 
+ :param values: An Arrow.RecordBatch containing all columns, including the index columns. The schema for the values must match the schema for the ``SOMAIndexedDataFrame``. """ self._shape = None # cache-invalidate