From 9aa952eb997e9099ad806b6a155bbe4541c0633d Mon Sep 17 00:00:00 2001 From: John Kerl Date: Wed, 12 Oct 2022 18:54:26 -0400 Subject: [PATCH] Port the essence of #360 --- apis/python/src/tiledbsoma/soma_dataframe.py | 63 +++++++++--------- .../src/tiledbsoma/soma_indexed_dataframe.py | 66 +++++++++---------- apis/python/src/tiledbsoma/util.py | 18 ++++- 3 files changed, 81 insertions(+), 66 deletions(-) diff --git a/apis/python/src/tiledbsoma/soma_dataframe.py b/apis/python/src/tiledbsoma/soma_dataframe.py index 9f194461a2..ca86041bde 100644 --- a/apis/python/src/tiledbsoma/soma_dataframe.py +++ b/apis/python/src/tiledbsoma/soma_dataframe.py @@ -5,8 +5,12 @@ import pyarrow as pa import tiledb -from . import util_arrow, util_tiledb +# This package's pybind11 code +import tiledbsoma.libtiledbsoma as clib + +from . import util, util_arrow from .constants import SOMA_JOINID, SOMA_ROWID +from .query_condition import QueryCondition # type: ignore from .soma_collection import SOMACollectionBase from .tiledb_array import TileDBArray from .types import Ids, SOMAResultOrder @@ -150,40 +154,35 @@ def read( **Indexing**: the ``ids`` parameter will support, per dimension: a row offset (uint), a row-offset range (slice), or a list of both. """ - tiledb_result_order = util_tiledb.tiledb_result_order_from_soma_result_order( - result_order, accept=["rowid-ordered", "unordered"] - ) - with self._tiledb_open("r") as A: - dim_names, attr_names = util_tiledb.split_column_names( - A.schema, column_names + query_condition = None + if value_filter is not None: + query_condition = QueryCondition(value_filter) + + # As an arg to this method, `column_names` is optional-None. For the pybind11 + # code it's optional-[]. 
+ lib_column_names = [] if column_names is None else column_names + + sr = clib.SOMAReader( + self._uri, + name=self.__class__.__name__, + schema=A.schema, # query_condition needs this + column_names=lib_column_names, + query_condition=query_condition, ) - if value_filter is None: - query = A.query( - return_arrow=True, - return_incomplete=True, - order=tiledb_result_order, - dims=dim_names, - attrs=attr_names, - ) - else: - qc = tiledb.QueryCondition(value_filter) - query = A.query( - return_arrow=True, - return_incomplete=True, - attr_cond=qc, - order=tiledb_result_order, - dims=dim_names, - attrs=attr_names, - ) - - if ids is None: - iterator = query.df[:] - else: - iterator = query.df[ids] - for table in iterator: - yield table + if ids is not None: + # XXX TODO NEEDS TO ALWAYS BE A LIST NO MATTER WHAT + if isinstance(ids, slice): + ids = util.slice_to_list(ids) + sr.set_dim_points(SOMA_ROWID, ids) + # TODO: platform_config + # TODO: batch_size + # TODO: result_order + sr.submit() + + while arrow_table := sr.read_next(): + yield arrow_table # XXX what other post-processing def read_all( self, diff --git a/apis/python/src/tiledbsoma/soma_indexed_dataframe.py b/apis/python/src/tiledbsoma/soma_indexed_dataframe.py index 401d199ecf..d2b25d6178 100644 --- a/apis/python/src/tiledbsoma/soma_indexed_dataframe.py +++ b/apis/python/src/tiledbsoma/soma_indexed_dataframe.py @@ -5,8 +5,12 @@ import pyarrow as pa import tiledb -from . import util_arrow, util_tiledb +# This package's pybind11 code +import tiledbsoma.libtiledbsoma as clib + +from . import util, util_arrow from .constants import SOMA_JOINID +from .query_condition import QueryCondition # type: ignore from .soma_collection import SOMACollectionBase from .tiledb_array import TileDBArray from .types import Ids, SOMAResultOrder @@ -182,41 +186,37 @@ def read( **Indexing**: the ``ids`` parameter will support, per dimension: a list of values of the type of the indexed column. 
""" - tiledb_result_order = util_tiledb.tiledb_result_order_from_soma_result_order( - result_order, accept=["row-major", "column-major", "unordered"] - ) - - # TODO: more about index_column_names with self._tiledb_open("r") as A: - dim_names, attr_names = util_tiledb.split_column_names( - A.schema, column_names + query_condition = None + if value_filter is not None: + query_condition = QueryCondition(value_filter) + + # As an arg to this method, `column_names` is optional-None. For the pybind11 + # code it's optional-[]. + lib_column_names = [] if column_names is None else column_names + + sr = clib.SOMAReader( + self._uri, + name=self.__class__.__name__, + schema=A.schema, # query_condition needs this + column_names=lib_column_names, + query_condition=query_condition, ) - if value_filter is None: - query = A.query( - return_arrow=True, - return_incomplete=True, - order=tiledb_result_order, - dims=dim_names, - attrs=attr_names, - ) - else: - qc = tiledb.QueryCondition(value_filter) - query = A.query( - return_arrow=True, - return_incomplete=True, - attr_cond=qc, - order=tiledb_result_order, - dims=dim_names, - attrs=attr_names, - ) - - if ids is None: - iterator = query.df[:] - else: - iterator = query.df[ids] - for table in iterator: - yield table + if ids is not None: + # XXX TODO NEEDS TO ALWAYS BE A LIST NO MATTER WHAT + if isinstance(ids, slice): + ids = util.slice_to_list(ids) + sr.set_dim_points(A.schema.domain.dim(0).name, ids) + + # TODO: platform_config + # TODO: batch_size + # TODO: result_order + + sr.submit() + + while arrow_table := sr.read_next(): + yield arrow_table # XXX what other post-processing def read_all( self, diff --git a/apis/python/src/tiledbsoma/util.py b/apis/python/src/tiledbsoma/util.py index 3856ab7052..2d044448a8 100644 --- a/apis/python/src/tiledbsoma/util.py +++ b/apis/python/src/tiledbsoma/util.py @@ -1,7 +1,7 @@ import pathlib import time import urllib.parse -from typing import TypeVar +from typing import List, TypeVar import 
numpy as np import pandas as pd @@ -91,3 +91,19 @@ def uri_joinpath(base: str, path: str) -> str: parts[2] = parts[2] + "/" + path return urllib.parse.urlunparse(parts) + + +def slice_to_list(sl: slice) -> List[int]: + """ + TODO: COMMENT GOES HERE + """ + # xxx note tiledb doesn't support stepped slices -- only 1 (and maybe -1)? + assert isinstance(sl, slice) + step = sl.step + if step is None: + if sl.start <= sl.stop: + step = 1 + else: + step = -1 + stop = sl.stop + step + return list(range(sl.start, stop, step))