[python] Dataframe read path #1793

Merged on Jan 19, 2024 (24 commits)
Commits
2570820  Addition Of `DataFrameWrapper`  (nguyenv, Oct 16, 2023)
defb5bc  Add Dictionary Support For ArraySchema -> ArrowSchema  (nguyenv, Oct 17, 2023)
72589f8  Refactor QueryCondition  (nguyenv, Oct 18, 2023)
f25193b  SOMADataFrame --> DataframeWrapper  (nguyenv, Oct 18, 2023)
8a87d35  Add Pybind11 Common Code  (nguyenv, Oct 19, 2023)
b920697  Reduce Usage of TileDB ArraySchema in Favor in Pyarrow Schema  (nguyenv, Oct 19, 2023)
fc449e1  Support All Enumeration Value Types in `to_arrow`  (nguyenv, Oct 19, 2023)
0b94a02  Do TileDB ArraySchema to ArrowSchema Converstion in C++  (nguyenv, Oct 19, 2023)
b1156b9  Correct `exists` Method For macos  (nguyenv, Oct 20, 2023)
4fdffd8  Add `domain` for `SOMADataFrame`  (nguyenv, Oct 20, 2023)
c52e99a  TileDB ArraySchema to ArrowSchema Should Not Hardcode String Dict  (nguyenv, Oct 20, 2023)
9e4103f  Re-add SOMAArray::schema -> TileDB ArraySchema  (nguyenv, Oct 20, 2023)
81480fa  Remove #include <tiledb/tiledb> from Array derived SOMAObjects  (nguyenv, Oct 20, 2023)
ca37d67  Clean Up NED  (nguyenv, Oct 24, 2023)
ba7aaad  Use DTypeLike instead of np.dtype[Any]  (nguyenv, Nov 4, 2023)
ff3e4b8  Create Factory in Pybind11, Bind `SOMAError`  (nguyenv, Nov 6, 2023)
de1ffee  Add `soma_object.cc` files  (nguyenv, Nov 6, 2023)
960b883  Use Explcit Size For String SOMA Type Metadata to Correct Error on macOS  (nguyenv, Nov 7, 2023)
ef577fc  Replace with TPY_ERROR_LOC where necessary  (nguyenv, Nov 7, 2023)
8aacce4  Correct MacOS Exception Handling for Getting SOMA Object Type  (nguyenv, Nov 7, 2023)
f0a96a0  Fix segfault by not spawning multiple ManagedQueries per SOMAArray  (nguyenv, Jan 11, 2024)
19bd2e2  Correct `QueryCondition`, Rename `tiledb_schema`, Remove `_set_internal`  (nguyenv, Jan 11, 2024)
9bb751d  Use tiledb_schema to get ArraySchema  (nguyenv, Jan 12, 2024)
53cca7e  Move all Python tests to same directory, Require KWs for `SOMADataFra…  (nguyenv, Jan 17, 2024)
7 changes: 0 additions & 7 deletions .github/workflows/python-ci-single.yml
@@ -122,13 +122,6 @@ jobs:
- name: Run libtiledbsoma unit tests
run: ctest --output-on-failure --test-dir build/libtiledbsoma -C Release --verbose

- name: Run pytests for C++
shell: bash
# Setting PYTHONPATH ensures the tests load the in-tree source code unde apis/python/src
# instead of copy we `pip install`ed to site-packages above. That's needed for the code
# coverage analysis to work.
run: PYTHONPATH=$(pwd)/apis/python/src python -m pytest --cov=apis/python/src --cov-report=xml libtiledbsoma/test -v --durations=20

- name: Run pytests for Python
shell: bash
# Setting PYTHONPATH ensures the tests load the in-tree source code unde apis/python/src
2 changes: 1 addition & 1 deletion Makefile
@@ -32,7 +32,7 @@ update:
.PHONY: test
test: data
ctest --test-dir build/libtiledbsoma -C Release --verbose --rerun-failed --output-on-failure
pytest apis/python/tests libtiledbsoma/test
pytest apis/python/tests

.PHONY: data
data:
12 changes: 8 additions & 4 deletions apis/python/setup.py
@@ -186,9 +186,6 @@ def run(self):
"dist_links/libtiledbsoma/external/include",
"../../build/externals/install/include",
str(libtiledbsoma_dir / "include"),
str(
"./src/tiledbsoma"
), # since pytiledbsoma.cc does #include of query_condition.cc
str(libtiledbsoma_dir.parent / "build/externals/install/include"),
str(tiledb_dir / "include"),
]
@@ -258,7 +255,14 @@ def run(self):
ext_modules=[
Pybind11Extension(
"tiledbsoma.pytiledbsoma",
["src/tiledbsoma/pytiledbsoma.cc"],
[
"src/tiledbsoma/common.cc",
"src/tiledbsoma/query_condition.cc",
"src/tiledbsoma/soma_array.cc",
"src/tiledbsoma/soma_object.cc",
"src/tiledbsoma/soma_dataframe.cc",
"src/tiledbsoma/pytiledbsoma.cc",
],
include_dirs=INC_DIRS,
library_dirs=LIB_DIRS,
libraries=["tiledbsoma"] + (["tiledb"] if os.name == "nt" else []),
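The new source list replaces the old single-translation-unit build, where `pytiledbsoma.cc` did a `#include` of the other `.cc` files, with binding files that are compiled separately. A minimal sketch of that setup.py pattern, using placeholder package and file names rather than this project's real layout:

```python
# Minimal sketch: one pybind11 extension built from several .cc files.
# "example_pkg" and all paths below are placeholders, not this repo's layout.
from pybind11.setup_helpers import Pybind11Extension, build_ext
from setuptools import setup

ext_modules = [
    Pybind11Extension(
        "example_pkg._core",                 # import path of the compiled module
        [
            "src/example_pkg/common.cc",     # shared helpers, compiled once
            "src/example_pkg/bindings.cc",   # PYBIND11_MODULE lives here
        ],
        include_dirs=["src/example_pkg/include"],
        cxx_std=17,
    ),
]

setup(
    name="example_pkg",
    ext_modules=ext_modules,
    cmdclass={"build_ext": build_ext},       # pybind11 helper picks sane flags
)
```

Listing each translation unit avoids recompiling everything when one binding file changes and keeps the include-path trick for `query_condition.cc` unnecessary.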
22 changes: 15 additions & 7 deletions apis/python/src/tiledbsoma/_collection.py
@@ -75,6 +75,7 @@ class CollectionBase( # type: ignore[misc] # __eq__ false positive

__slots__ = ("_contents", "_mutated_keys")
_wrapper_type = _tdb_handles.GroupWrapper
_reader_wrapper_type = _tdb_handles.GroupWrapper

# TODO: Implement additional creation of members on collection subclasses.
@classmethod
@@ -426,13 +427,20 @@ def __getitem__(self, key: str) -> CollectionElementType:
if entry.soma is None:
from . import _factory # Delayed binding to resolve circular import.

entry.soma = _factory._open_internal(
entry.entry.wrapper_type.open,
entry.entry.uri,
self.mode,
self.context,
self.tiledb_timestamp_ms,
)
uri = entry.entry.uri
mode = self.mode
context = self.context
timestamp = self.tiledb_timestamp_ms

try:
wrapper = _tdb_handles._open_with_clib_wrapper(
uri, mode, context, timestamp
)
entry.soma = _factory.reify_handle(wrapper)
except SOMAError:
entry.soma = _factory._open_internal(
entry.entry.wrapper_type.open, uri, mode, context, timestamp
)
# Since we just opened this object, we own it and should close it.
self._close_stack.enter_context(entry.soma)
return cast(CollectionElementType, entry.soma)
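The `__getitem__` change prefers the new C++-backed wrapper and drops back to the legacy open path only when that attempt raises `SOMAError`, presumably so that members the clib reader cannot yet open keep working through the old path. A self-contained sketch of the fallback shape, with stand-in callables instead of the real `_tdb_handles` functions:

```python
# Sketch of "try the new reader first, fall back to the legacy wrapper".
# open_clib / open_legacy are stand-ins for _open_with_clib_wrapper and the
# per-class wrapper_type.open used in the real code.
from typing import Callable, TypeVar

T = TypeVar("T")


class SOMAError(Exception):
    """Raised when the C++-backed open path cannot handle the object."""


def open_with_fallback(open_clib: Callable[[], T], open_legacy: Callable[[], T]) -> T:
    try:
        return open_clib()       # preferred: new clib-backed read handle
    except SOMAError:
        return open_legacy()     # fallback: legacy TileDB-Py based handle


def _unsupported() -> str:
    raise SOMAError("clib reader does not support this object")


handle = open_with_fallback(_unsupported, lambda: "legacy-handle")
assert handle == "legacy-handle"
```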
168 changes: 55 additions & 113 deletions apis/python/src/tiledbsoma/_dataframe.py
@@ -21,6 +21,7 @@
from ._constants import SOMA_JOINID
from ._query_condition import QueryCondition
from ._read_iters import TableReadIter
from ._tdb_handles import DataFrameWrapper
from ._tiledb_array import TileDBArray
from ._types import NPFloating, NPInteger, OpenTimestamp, Slice, is_slice_of
from .options import SOMATileDBContext
@@ -121,6 +122,8 @@
it must be ``None``.
"""

_reader_wrapper_type = DataFrameWrapper

@classmethod
def create(
cls,
@@ -261,18 +264,8 @@
Experimental.
"""
self._check_open_read()
return cast(int, self._soma_reader().nnz())

def enumeration(self, name: str) -> Tuple[Any, ...]:
"""Doc place holder.

Returns:
Tuple[Any, ...]: _description_
"""
return tuple(self._soma_reader().get_enum(name))

def column_to_enumeration(self, name: str) -> str:
return str(self._soma_reader().get_enum_label_on_attr(name))
# If it is open in read mode, then the handle is a DataFrameWrapper
return cast(DataFrameWrapper, self._handle).count

def __len__(self) -> int:
"""Returns the number of rows in the dataframe. Same as ``df.count``."""
@@ -341,25 +334,28 @@
Lifecycle:
Experimental.
"""
del batch_size, platform_config # Currently unused.
del batch_size # Currently unused.
_util.check_unpartitioned(partitions)
self._check_open_read()

schema = self._handle.schema
query_condition = None
if value_filter is not None:
query_condition = QueryCondition(value_filter)

sr = self._soma_reader(
schema=schema, # query_condition needs this
column_names=column_names,
query_condition=query_condition,
result_order=result_order,
ts = None
if self._handle._handle.timestamp is not None:
ts = (0, self._handle._handle.timestamp)

sr = clib.SOMADataFrame.open(
uri=self._handle._handle.uri,
mode=clib.OpenMode.read,
platform_config=platform_config or {},
column_names=column_names or [],
result_order=_util.to_clib_result_order(result_order),
timestamp=ts,
)

if value_filter is not None:
sr.set_condition(QueryCondition(value_filter), self._handle.schema)

self._set_reader_coords(sr, coords)

# TODO: platform_config
# TODO: batch_size

return TableReadIter(sr)
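With this rewrite, `read` opens a `clib.SOMADataFrame` directly, pushing column selection, result order, the timestamp, and the compiled value filter down to the C++ reader before coordinates are applied. A usage-level sketch of how the path is exercised; the URI, column names, and filter expression below are assumptions for illustration, not fixtures from this repository:

```python
# Usage sketch only: assumes a SOMA dataframe with "soma_joinid", "cell_type",
# and "n_genes" columns already exists at this made-up URI.
import tiledbsoma

uri = "file:///tmp/example_dataframe"

with tiledbsoma.DataFrame.open(uri) as df:
    table = df.read(
        coords=(slice(0, 999),),                    # soma_joinid in [0, 999]
        column_names=["soma_joinid", "cell_type"],  # pushed down at open time
        value_filter="n_genes > 500",               # compiled to a QueryCondition
    ).concat()                                      # materialize one Arrow table

print(table.num_rows)
```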
@@ -415,7 +411,7 @@
if not pa.types.is_dictionary(col_info.type):
raise ValueError(
"Expected dictionary type for enumerated attribute "
f"{name} but saw {col_info.type}"
f"{name} but saw {col.type}"
)

enmr = self._handle.enum(attr.name)
@@ -521,37 +517,38 @@
if self._set_reader_coord_by_numeric_slice(sr, dim_idx, dim, coord):
return True

domain = self.domain[dim_idx]

# Note: slice(None, None) matches the is_slice_of part, unless we also check the dim-type
# part.
if (is_slice_of(coord, str) or is_slice_of(coord, bytes)) and (
dim.dtype == "str" or dim.dtype == "bytes"
):
if (
is_slice_of(coord, str) or is_slice_of(coord, bytes)
) and _util.pa_types_is_string_or_bytes(dim.type):
_util.validate_slice(coord)
# Figure out which one.
dim_type: Union[Type[str], Type[bytes]] = type(dim.domain[0])
dim_type: Union[Type[str], Type[bytes]] = type(domain[0])
# A ``None`` or empty start is always equivalent to empty str/bytes.
start = coord.start or dim_type()
if coord.stop is None:
# There's no way to specify "to infinity" for strings.
# We have to get the nonempty domain and use that as the end.
_, stop = self._handle.reader.nonempty_domain()[dim_idx]
ned = self._handle.non_empty_domain()
_, stop = ned[dim_idx]
else:
stop = coord.stop
sr.set_dim_ranges_string_or_bytes(dim.name, [(start, stop)])
return True

# Note: slice(None, None) matches the is_slice_of part, unless we also check the dim-type
# part.
if is_slice_of(coord, np.datetime64) and dim.dtype.name.startswith(
"datetime64"
):
if is_slice_of(coord, np.datetime64) and pa.types.is_timestamp(dim.type):
_util.validate_slice(coord)
# These timestamp types are stored in Arrow as well as TileDB as 64-bit integers (with
# distinguishing metadata of course). For purposes of the query logic they're just
# int64.
istart = coord.start or dim.domain[0]
istart = coord.start or domain[0]
istart = int(istart.astype("int64"))
istop = coord.stop or dim.domain[1]
istop = coord.stop or domain[1]
istop = int(istop.astype("int64"))
sr.set_dim_ranges_int64(dim.name, [(istart, istop)])
return True
@@ -574,41 +571,19 @@
f"only 1D numpy arrays may be used to index; got {coord.ndim}"
)

# See libtiledbsoma.cc for more context on why we need the
# explicit type-check here.

if dim.dtype == np.int64:
sr.set_dim_points_int64(dim.name, coord)
elif dim.dtype == np.int32:
sr.set_dim_points_int32(dim.name, coord)
elif dim.dtype == np.int16:
sr.set_dim_points_int16(dim.name, coord)
elif dim.dtype == np.int8:
sr.set_dim_points_int8(dim.name, coord)

elif dim.dtype == np.uint64:
sr.set_dim_points_uint64(dim.name, coord)
elif dim.dtype == np.uint32:
sr.set_dim_points_uint32(dim.name, coord)
elif dim.dtype == np.uint16:
sr.set_dim_points_uint16(dim.name, coord)
elif dim.dtype == np.uint8:
sr.set_dim_points_uint8(dim.name, coord)

elif dim.dtype == np.float64:
sr.set_dim_points_float64(dim.name, coord)
elif dim.dtype == np.float32:
sr.set_dim_points_float32(dim.name, coord)

elif dim.dtype == "str" or dim.dtype == "bytes":
sr.set_dim_points_string_or_bytes(dim.name, coord)
try:
set_dim_points = getattr(sr, f"set_dim_points_{dim.type}")
except AttributeError:
# We have to handle this type specially below
pass
else:
set_dim_points(dim.name, coord)
return True

elif (
dim.dtype == "datetime64[s]"
or dim.dtype == "datetime64[ms]"
or dim.dtype == "datetime64[us]"
or dim.dtype == "datetime64[ns]"
):
if _util.pa_types_is_string_or_bytes(dim.type):
sr.set_dim_points_string_or_bytes(dim.name, coord)
return True
elif pa.types.is_timestamp(dim.type):
if not isinstance(coord, (tuple, list, np.ndarray)):
raise ValueError(
f"unhandled coord type {type(coord)} for index column named {dim.name}"
@@ -618,64 +593,31 @@
for e in coord
]
sr.set_dim_points_int64(dim.name, icoord)
return True

# TODO: bool

else:
raise ValueError(
f"unhandled type {dim.dtype} for index column named {dim.name}"
)

return True
raise ValueError(
f"unhandled type {dim.dtype} for index column named {dim.name}"
)

def _set_reader_coord_by_numeric_slice(
self, sr: clib.SOMAArray, dim_idx: int, dim: tiledb.Dim, coord: Slice[Any]
self, sr: clib.SOMAArray, dim_idx: int, dim: pa.Field, coord: Slice[Any]
) -> bool:
try:
lo_hi = _util.slice_to_numeric_range(coord, dim.domain)
lo_hi = _util.slice_to_numeric_range(coord, self.domain[dim_idx])
except _util.NonNumericDimensionError:
return False # We only handle numeric dimensions here.

if not lo_hi:
return True

elif dim.dtype == np.int64:
sr.set_dim_ranges_int64(dim.name, [lo_hi])
return True
elif dim.dtype == np.int32:
sr.set_dim_ranges_int32(dim.name, [lo_hi])
return True
elif dim.dtype == np.int16:
sr.set_dim_ranges_int16(dim.name, [lo_hi])
return True
elif dim.dtype == np.int8:
sr.set_dim_ranges_int8(dim.name, [lo_hi])
return True

elif dim.dtype == np.uint64:
sr.set_dim_ranges_uint64(dim.name, [lo_hi])
return True
elif dim.dtype == np.uint32:
sr.set_dim_ranges_uint32(dim.name, [lo_hi])
return True
elif dim.dtype == np.uint16:
sr.set_dim_ranges_uint16(dim.name, [lo_hi])
return True
elif dim.dtype == np.uint8:
sr.set_dim_ranges_uint8(dim.name, [lo_hi])
return True

elif dim.dtype == np.float64:
sr.set_dim_ranges_float64(dim.name, [lo_hi])
return True
elif dim.dtype == np.float32:
sr.set_dim_ranges_float32(dim.name, [lo_hi])
try:
set_dim_range = getattr(sr, f"set_dim_ranges_{dim.type}")
set_dim_range(dim.name, [lo_hi])
return True

# TODO:
# elif dim.dtype == np.bool_:

return False
except AttributeError:
return False


def _canonicalize_schema(
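Both coordinate helpers (`_set_reader_coord` and `_set_reader_coord_by_numeric_slice`) now replace long `if dim.dtype == ...` ladders with a `getattr` lookup of the type-suffixed binding, for example `set_dim_points_int64` or `set_dim_ranges_float64`, falling through to the special string and timestamp handling when no such binding exists. A self-contained sketch of that dispatch pattern; `FakeReader` is illustrative and not the real `clib.SOMAArray`:

```python
# Sketch of the dispatch pattern: look up a type-suffixed setter instead of
# walking an if/elif chain. FakeReader stands in for the pybind11-bound reader.
class FakeReader:
    def set_dim_points_int64(self, name, points):
        print(f"int64 points on {name}: {points}")

    def set_dim_ranges_float64(self, name, ranges):
        print(f"float64 ranges on {name}: {ranges}")


def set_points(reader, dim_name, arrow_type_name, points):
    """Dispatch to set_dim_points_<type>; return False if no binding exists."""
    try:
        setter = getattr(reader, f"set_dim_points_{arrow_type_name}")
    except AttributeError:
        return False    # caller falls through to string/timestamp handling
    setter(dim_name, points)
    return True


reader = FakeReader()
assert set_points(reader, "soma_joinid", "int64", [1, 2, 3]) is True
assert set_points(reader, "obs_id", "large_string", ["cell-0"]) is False
```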
3 changes: 3 additions & 0 deletions apis/python/src/tiledbsoma/_dense_nd_array.py
@@ -17,6 +17,7 @@
from . import _util
from ._common_nd_array import NDArray
from ._exception import SOMAError
from ._tdb_handles import ArrayWrapper
from ._util import dense_indices_to_shape
from .options._tiledb_create_options import TileDBCreateOptions

@@ -71,6 +72,8 @@ class DenseNDArray(NDArray, somacore.DenseNDArray):

__slots__ = ()

_reader_wrapper_type = ArrayWrapper

def read(
self,
coords: options.DenseNDCoords = (),
6 changes: 3 additions & 3 deletions apis/python/src/tiledbsoma/_factory.py
@@ -142,18 +142,18 @@ def _open_internal(
"""Lower-level open function for internal use only."""
handle = opener(uri, mode, context, timestamp)
try:
return _reify_handle(handle)
return reify_handle(handle)
except Exception:
handle.close()
raise


@typeguard_ignore
def _reify_handle(hdl: _Wrapper) -> "_tiledb_object.TileDBObject[_Wrapper]":
def reify_handle(hdl: _Wrapper) -> "_tiledb_object.TileDBObject[_Wrapper]":
"""Picks out the appropriate SOMA class for a handle and wraps it."""
typename = _read_soma_type(hdl)
cls = _type_name_to_cls(typename)
if cls._wrapper_type != type(hdl):
if type(hdl) not in (cls._wrapper_type, cls._reader_wrapper_type):
raise SOMAError(
f"cannot open {hdl.uri!r}: a {type(hdl._handle)}"
f" cannot be converted to a {typename}"
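Allowing `type(hdl)` to match either `_wrapper_type` or `_reader_wrapper_type` lets `reify_handle` accept both the legacy handle and the new read-side wrapper for the same SOMA class. A toy sketch of the idea, with stand-in classes rather than the real `_tdb_handles` types:

```python
# Toy factory: accept a handle whose type matches either declared wrapper type.
class ArrayWrapper: ...            # stand-in for the legacy write-side handle
class DataFrameWrapper: ...        # stand-in for the new read-side handle


class DataFrame:
    _wrapper_type = ArrayWrapper
    _reader_wrapper_type = DataFrameWrapper

    def __init__(self, handle):
        self._handle = handle


def reify_handle(cls, handle):
    if type(handle) not in (cls._wrapper_type, cls._reader_wrapper_type):
        raise TypeError(f"{type(handle).__name__} cannot back a {cls.__name__}")
    return cls(handle)


df_read = reify_handle(DataFrame, DataFrameWrapper())   # accepted: reader handle
df_write = reify_handle(DataFrame, ArrayWrapper())      # accepted: writer handle
```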