Skip to content

Commit

Permalink
Port the essence of #360
Browse files Browse the repository at this point in the history
  • Loading branch information
johnkerl committed Oct 19, 2022
1 parent d05de72 commit f5dd971
Show file tree
Hide file tree
Showing 3 changed files with 81 additions and 66 deletions.
63 changes: 31 additions & 32 deletions apis/python/src/tiledbsoma/soma_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,12 @@
import pyarrow as pa
import tiledb

from . import util_arrow, util_tiledb
# This package's pybind11 code
import tiledbsoma.libtiledbsoma as clib

from . import util, util_arrow
from .constants import SOMA_JOINID, SOMA_ROWID
from .query_condition import QueryCondition # type: ignore
from .soma_collection import SOMACollectionBase
from .tiledb_array import TileDBArray
from .types import Ids, SOMAResultOrder
Expand Down Expand Up @@ -150,40 +154,35 @@ def read(
**Indexing**: the ``ids`` parameter will support, per dimension: a row offset (uint), a row-offset range (slice), or a list of both.
"""
tiledb_result_order = util_tiledb.tiledb_result_order_from_soma_result_order(
result_order, accept=["rowid-ordered", "unordered"]
)

with self._tiledb_open("r") as A:
dim_names, attr_names = util_tiledb.split_column_names(
A.schema, column_names
query_condition = None
if value_filter is not None:
query_condition = QueryCondition(value_filter)

# As an arg to this method, `column_names` is optional-None. For the pybind11
# code it's optional-[].
lib_column_names = [] if column_names is None else column_names

sr = clib.SOMAReader(
self._uri,
name=self.__class__.__name__,
schema=A.schema, # query_condition needs this
column_names=lib_column_names,
query_condition=query_condition,
)
if value_filter is None:
query = A.query(
return_arrow=True,
return_incomplete=True,
order=tiledb_result_order,
dims=dim_names,
attrs=attr_names,
)
else:
qc = tiledb.QueryCondition(value_filter)
query = A.query(
return_arrow=True,
return_incomplete=True,
attr_cond=qc,
order=tiledb_result_order,
dims=dim_names,
attrs=attr_names,
)

if ids is None:
iterator = query.df[:]
else:
iterator = query.df[ids]

for table in iterator:
yield table
if ids is not None:
# XXX TODO NEEDS TO ALWAYS BE A LIST NO MATTER WHAT
if isinstance(ids, slice):
ids = util.slice_to_list(ids)
sr.set_dim_points(SOMA_ROWID, ids)
# TODO: platform_config
# TODO: batch_size
# TODO: result_order
sr.submit()

while arrow_table := sr.read_next():
yield arrow_table # XXX what other post-processing

def read_all(
self,
Expand Down
66 changes: 33 additions & 33 deletions apis/python/src/tiledbsoma/soma_indexed_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,12 @@
import pyarrow as pa
import tiledb

from . import util_arrow, util_tiledb
# This package's pybind11 code
import tiledbsoma.libtiledbsoma as clib

from . import util, util_arrow
from .constants import SOMA_JOINID
from .query_condition import QueryCondition # type: ignore
from .soma_collection import SOMACollectionBase
from .tiledb_array import TileDBArray
from .types import Ids, SOMAResultOrder
Expand Down Expand Up @@ -182,41 +186,37 @@ def read(
**Indexing**: the ``ids`` parameter will support, per dimension: a list of values of the type of the indexed column.
"""
tiledb_result_order = util_tiledb.tiledb_result_order_from_soma_result_order(
result_order, accept=["row-major", "column-major", "unordered"]
)

# TODO: more about index_column_names
with self._tiledb_open("r") as A:
dim_names, attr_names = util_tiledb.split_column_names(
A.schema, column_names
query_condition = None
if value_filter is not None:
query_condition = QueryCondition(value_filter)

# As an arg to this method, `column_names` is optional-None. For the pybind11
# code it's optional-[].
lib_column_names = [] if column_names is None else column_names

sr = clib.SOMAReader(
self._uri,
name=self.__class__.__name__,
schema=A.schema, # query_condition needs this
column_names=lib_column_names,
query_condition=query_condition,
)
if value_filter is None:
query = A.query(
return_arrow=True,
return_incomplete=True,
order=tiledb_result_order,
dims=dim_names,
attrs=attr_names,
)
else:
qc = tiledb.QueryCondition(value_filter)
query = A.query(
return_arrow=True,
return_incomplete=True,
attr_cond=qc,
order=tiledb_result_order,
dims=dim_names,
attrs=attr_names,
)

if ids is None:
iterator = query.df[:]
else:
iterator = query.df[ids]

for table in iterator:
yield table
if ids is not None:
# XXX TODO NEEDS TO ALWAYS BE A LIST NO MATTER WHAT
if isinstance(ids, slice):
ids = util.slice_to_list(ids)
sr.set_dim_points(A.schema.domain.dim(0).name, ids)

# TODO: platform_config
# TODO: batch_size
# TODO: result_order

sr.submit()

while arrow_table := sr.read_next():
yield arrow_table # XXX what other post-processing

def read_all(
self,
Expand Down
18 changes: 17 additions & 1 deletion apis/python/src/tiledbsoma/util.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import pathlib
import time
import urllib.parse
from typing import TypeVar
from typing import List, TypeVar

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -91,3 +91,19 @@ def uri_joinpath(base: str, path: str) -> str:
parts[2] = parts[2] + "/" + path

return urllib.parse.urlunparse(parts)


def slice_to_list(sl: slice) -> List[int]:
    """
    Expand a slice with explicit integer bounds into the list of integers it covers.

    Unlike Python's usual half-open slice convention, ``sl.stop`` is treated as
    an *inclusive* bound here: ``slice(0, 3)`` expands to ``[0, 1, 2, 3]``.

    When ``sl.step`` is ``None`` the direction is inferred from the bounds:
    ``+1`` if ``start <= stop``, otherwise ``-1`` (so ``slice(3, 0)`` expands to
    ``[3, 2, 1, 0]``). An explicit step is honored, and the result never goes
    past ``stop``.

    :param sl: slice whose ``start`` and ``stop`` are both set.
    :returns: the integers from ``start`` through ``stop`` (inclusive, when
        reachable by ``step``), in iteration order.
    :raises TypeError: if ``sl`` is not a slice, or ``start``/``stop`` is ``None``.
    :raises ValueError: if ``sl.step`` is zero.
    """
    # NOTE(review): the original author noted that tiledb may only support
    # steps of 1 (and maybe -1); larger steps are still expanded correctly here.
    if not isinstance(sl, slice):
        # Explicit raise rather than `assert`, which is stripped under `python -O`.
        raise TypeError(f"slice_to_list expects a slice; got {type(sl).__name__}")

    start, stop, step = sl.start, sl.stop, sl.step
    if start is None or stop is None:
        # Open-ended slices cannot be expanded without knowing the domain.
        raise TypeError("slice_to_list requires explicit start and stop bounds")

    if step is None:
        step = 1 if start <= stop else -1
    elif step == 0:
        raise ValueError("slice_to_list: slice step cannot be zero")

    # Make `stop` inclusive by nudging the exclusive range bound one unit in the
    # direction of travel. Nudging by `step` itself (as before) overshoots when
    # |step| > 1, e.g. slice(0, 10, 3) would wrongly include 12.
    direction = 1 if step > 0 else -1
    return list(range(start, stop + direction, step))

0 comments on commit f5dd971

Please sign in to comment.