From 70844be36d1bff98230b8edc78d1d39e07967b59 Mon Sep 17 00:00:00 2001
From: John Kerl <kerl.john.r@gmail.com>
Date: Mon, 3 Oct 2022 15:27:51 -0400
Subject: [PATCH] rebase prep

---
 apis/python/src/tiledbsoma/soma_dataframe.py  | 62 ++-----------------
 .../src/tiledbsoma/soma_indexed_dataframe.py  | 14 +++--
 2 files changed, 14 insertions(+), 62 deletions(-)

diff --git a/apis/python/src/tiledbsoma/soma_dataframe.py b/apis/python/src/tiledbsoma/soma_dataframe.py
index 814710de48..2c24a2a881 100644
--- a/apis/python/src/tiledbsoma/soma_dataframe.py
+++ b/apis/python/src/tiledbsoma/soma_dataframe.py
@@ -5,11 +5,8 @@
 import pyarrow as pa
 import tiledb
 
-import tiledbsoma.libtiledbsoma as clib
-
-from . import util, util_arrow, util_tiledb
+from . import util, util_arrow, util_pandas, util_tiledb
 from .logging import log_io
-from .query_condition import QueryCondition
 from .soma_collection import SOMACollectionBase
 from .tiledb_array import TileDBArray
 from .types import Ids, NTuple, SOMAResultOrder
@@ -160,55 +157,6 @@ def is_indexed(self) -> Literal[False]:
     def get_index_column_names(self) -> Sequence[str]:
         return []
 
-    def read_using_lib_temp(
-        self,
-        *,
-        # TODO: find the right syntax to get the typechecker to accept args like ``ids=slice(0,10)``
-        # ids: Optional[Union[Sequence[int], Slice]] = None,
-        ids: Optional[Any] = None,
-        value_filter: Optional[str] = None,
-        column_names: Optional[Sequence[str]] = None,
-        result_order: Optional[str] = None,
-        # TODO: batch_size
-        # TODO: partition,
-        # TODO: platform_config,
-    ) -> Iterator[pa.Table]:
-        """
-        TODO: copy the text
-        """
-
-        with self._tiledb_open("r") as A:
-            dim_names, attr_names = util_tiledb.split_column_names(
-                A.schema, column_names
-            )
-
-            query_condition = None
-            if value_filter is not None:
-                # query_condition = tiledb.QueryCondition(value_filter)
-                query_condition = QueryCondition(value_filter)
-
-            # As an arg to this method, `column_names` is optional-None. For the pybind11
-            # code it's optional-[].
-            lib_column_names = [] if column_names is None else column_names
-
-            sr = clib.SOMAReader(
-                self._uri,
-                name=self.name,
-                schema=A.schema,  # query_condition needs this
-                column_names=lib_column_names,
-                query_condition=query_condition,
-            )
-
-            # TODO: platform_config
-            # TODO: batch_size
-            # TODO: result_order
-
-            sr.submit()
-
-            while arrow_table := sr.read_next():
-                # yield util_arrow.ascii_to_unicode_pyarrow_readback(batch)
-                yield arrow_table  # XXX what other post-processing
-
     def read(
         self,
         *,
@@ -278,7 +226,7 @@ def read(
                 # Also: don't materialize these on read
                 # TODO: get the arrow syntax for drop
                 # df.drop(ROWID, axis=1)
-                yield table
+                yield util_arrow.ascii_to_unicode_pyarrow_readback(table)
 
     def read_all(
         self,
@@ -295,7 +243,7 @@ def read_all(
         # TODO: platform_config,
     ) -> pa.Table:
         """
-        This is a convenience method around ``read``. It iterates the return value from ``read`` and returns a concatenation of all the table-pieces found. Its nominal use is to simply unit-test cases.
+        This is a convenience method around ``read``. It iterates the return value from ``read`` and returns a concatenation of all the table-pieces found. Its nominal use is to simplify unit-test cases.
         """
         return pa.concat_tables(
             self.read(
@@ -415,7 +363,7 @@ def read_as_pandas(
                 # This is the 'decode on read' part of our logic; in dim_select we have the 'encode on
                 # write' part.
                 # Context: https://github.com/single-cell-data/TileDB-SOMA/issues/99.
-                df = df
+                df = util_pandas.ascii_to_unicode_pandas_readback(df)
 
                 if id_column_name is not None:
                     df.reset_index(inplace=True)
@@ -529,7 +477,7 @@ def write_from_pandas(
             dfc = dataframe[column_name]
             if len(dfc) > 0 and type(dfc[0]) == str:
                 # Force ASCII storage if string, in order to make obs/var columns queryable.
-                column_types[column_name] = "ascii"
+                column_types[column_name] = np.dtype("S")
 
         tiledb.from_pandas(
             uri=self.uri,
diff --git a/apis/python/src/tiledbsoma/soma_indexed_dataframe.py b/apis/python/src/tiledbsoma/soma_indexed_dataframe.py
index 539d4e5140..dd6abf8220 100644
--- a/apis/python/src/tiledbsoma/soma_indexed_dataframe.py
+++ b/apis/python/src/tiledbsoma/soma_indexed_dataframe.py
@@ -259,7 +259,11 @@ def read(
                 iterator = query.df[ids]
 
             for table in iterator:
-                yield table
+                # XXX COMMENT MORE
+                # This is the 'decode on read' part of our logic; in dim_select we have the
+                # 'encode on write' part.
+                # Context: https://github.com/single-cell-data/TileDB-SOMA/issues/99.
+                yield util_arrow.ascii_to_unicode_pyarrow_readback(table)
 
     def read_all(
         self,
@@ -275,17 +279,17 @@ def read_all(
         # TODO: platform_config,
     ) -> pa.Table:
         """
-        This is a convenience method around ``read``. It iterates the return value from ``read`` and returns a concatenation of all the table-pieces found. Its nominal use is to simply unit-test cases.
+        This is a convenience method around ``read``. It iterates the return value from ``read`` and returns a concatenation of all the record batches found. Its nominal use is to simplify unit-test cases.
         """
         return pa.concat_tables(
             self.read(ids=ids, value_filter=value_filter, column_names=column_names)
         )
 
-    def write(self, values: pa.Table) -> None:
+    def write(self, values: pa.RecordBatch) -> None:
         """
-        Write an Arrow.Table to the persistent object. As duplicate index values are not allowed, index values already present in the object are overwritten and new index values are added.
+        Write an Arrow.RecordBatch to the persistent object. As duplicate index values are not allowed, index values already present in the object are overwritten and new index values are added.
 
-        :param values: An Arrow.Table containing all columns, including the index columns. The schema for the values must match the schema for the ``SOMAIndexedDataFrame``.
+        :param values: An Arrow.RecordBatch containing all columns, including the index columns. The schema for the values must match the schema for the ``SOMAIndexedDataFrame``.
         """
         self._shape = None  # cache-invalidate