Skip to content

Commit

Permalink
rebase prep
Browse files Browse the repository at this point in the history
  • Loading branch information
johnkerl committed Oct 4, 2022
1 parent e70523f commit 10d0d92
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 69 deletions.
66 changes: 2 additions & 64 deletions apis/python/src/tiledbsoma/soma_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,8 @@
import pyarrow as pa
import tiledb

import tiledbsoma.libtiledbsoma as clib

from . import util, util_arrow, util_tiledb
from .logging import log_io
from .query_condition import QueryCondition
from .soma_collection import SOMACollectionBase
from .tiledb_array import TileDBArray
from .types import Ids, NTuple, SOMAResultOrder
Expand Down Expand Up @@ -160,55 +157,6 @@ def is_indexed(self) -> Literal[False]:
def get_index_column_names(self) -> Sequence[str]:
    """
    Return the names of the index columns.

    :return: Always a new, empty list.
    """
    index_column_names: Sequence[str] = []
    return index_column_names

def read_using_lib_temp(
    self,
    *,
    # TODO: find the right syntax to get the typechecker to accept args like ``ids=slice(0,10)``
    # ids: Optional[Union[Sequence[int], Slice]] = None,
    ids: Optional[Any] = None,
    value_filter: Optional[str] = None,
    column_names: Optional[Sequence[str]] = None,
    result_order: Optional[str] = None,
    # TODO: batch_size
    # TODO: partition,
    # TODO: platform_config,
) -> Iterator[pa.Table]:
    """
    Temporary read path that goes through ``clib.SOMAReader`` and yields what the reader produces.

    :param ids: Accepted for signature compatibility; not used by this implementation.
    :param value_filter: Optional filter expression. When given, it is wrapped in a ``QueryCondition`` and passed to the reader.
    :param column_names: Optional column names to read. ``None`` is passed to the reader as an empty list.
    :param result_order: Accepted for signature compatibility; not used by this implementation yet.
    :return: A generator yielding each value returned by the reader's ``read_next()``, stopping at the first falsy one.
    """
    with self._tiledb_open("r") as array:
        # The dim/attr split is not consumed below; the call itself is retained.
        _dim_names, _attr_names = util_tiledb.split_column_names(
            array.schema, column_names
        )

        # Only build a query condition when the caller supplied a filter.
        condition = QueryCondition(value_filter) if value_filter is not None else None

        # This method takes "no columns requested" as None; the pybind11 layer
        # wants it spelled as an empty list.
        requested_columns = column_names if column_names is not None else []

        reader = clib.SOMAReader(
            self._uri,
            name=self.name,
            schema=array.schema,  # the query condition needs the schema
            column_names=requested_columns,
            query_condition=condition,
        )

        # TODO: platform_config
        # TODO: batch_size
        # TODO: result_order

        reader.submit()

        # Pull pieces until the reader hands back something falsy.
        while True:
            piece = reader.read_next()
            if not piece:
                break
            # XXX what other post-processing
            yield piece

def read(
self,
*,
Expand Down Expand Up @@ -270,12 +218,7 @@ def read(
iterator = query.df[ids]

for table in iterator:
# XXX COMMENT MORE
# This is the 'decode on read' part of our logic; in dim_select we have the
# 'encode on write' part.
# Context: https://github.com/single-cell-data/TileDB-SOMA/issues/99.
#
# Also: don't materialize these on read
# Don't materialize these on read
# TODO: get the arrow syntax for drop
# df.drop(ROWID, axis=1)
yield table
Expand All @@ -295,7 +238,7 @@ def read_all(
# TODO: platform_config,
) -> pa.Table:
"""
This is a convenience method around ``read``. It iterates the return value from ``read`` and returns a concatenation of all the table-pieces found. Its nominal use is to simply unit-test cases.
This is a convenience method around ``read``. It iterates the return value from ``read`` and returns a concatenation of all the table-pieces found. Its nominal use is to simplify unit-test cases.
"""
return pa.concat_tables(
self.read(
Expand Down Expand Up @@ -412,11 +355,6 @@ def read_as_pandas(

for df in iterator:

# This is the 'decode on read' part of our logic; in dim_select we have the 'encode on
# write' part.
# Context: https://github.com/single-cell-data/TileDB-SOMA/issues/99.
df = df

if id_column_name is not None:
df.reset_index(inplace=True)
df.set_index(id_column_name, inplace=True)
Expand Down
10 changes: 5 additions & 5 deletions apis/python/src/tiledbsoma/soma_indexed_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,7 @@ def read(
iterator = query.df[ids]

for table in iterator:
yield table
yield df

def read_all(
self,
Expand All @@ -275,17 +275,17 @@ def read_all(
# TODO: platform_config,
) -> pa.Table:
"""
This is a convenience method around ``read``. It iterates the return value from ``read`` and returns a concatenation of all the table-pieces found. Its nominal use is to simply unit-test cases.
This is a convenience method around ``read``. It iterates the return value from ``read`` and returns a concatenation of all the record batches found. Its nominal use is to simplify unit-test cases.
"""
return pa.concat_tables(
self.read(ids=ids, value_filter=value_filter, column_names=column_names)
)

def write(self, values: pa.Table) -> None:
def write(self, values: pa.RecordBatch) -> None:
"""
Write an Arrow.Table to the persistent object. As duplicate index values are not allowed, index values already present in the object are overwritten and new index values are added.
Write an Arrow.RecordBatch to the persistent object. As duplicate index values are not allowed, index values already present in the object are overwritten and new index values are added.
:param values: An Arrow.Table containing all columns, including the index columns. The schema for the values must match the schema for the ``SOMAIndexedDataFrame``.
:param values: An Arrow.RecordBatch containing all columns, including the index columns. The schema for the values must match the schema for the ``SOMAIndexedDataFrame``.
"""
self._shape = None # cache-invalidate

Expand Down

0 comments on commit 10d0d92

Please sign in to comment.