From 7ee0b85b219a525ff4ed75f2d8b2e708cb6bcdc8 Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Mon, 16 Oct 2023 21:48:50 -0500 Subject: [PATCH] Add Methods to `DataFrameWrapper` and `ArrayWrapper` * Take care of formatting / typing * Correct datetime domains * Get full nonempty domains for `SOMADataFrame` * Find missing open that needs to use `DataframeWrapper` --- apis/python/src/tiledbsoma/_collection.py | 8 +- apis/python/src/tiledbsoma/_dataframe.py | 34 +- apis/python/src/tiledbsoma/_tdb_handles.py | 93 ++- apis/python/src/tiledbsoma/_tiledb_array.py | 43 +- apis/python/src/tiledbsoma/_tiledb_object.py | 4 +- apis/python/src/tiledbsoma/io/ingest.py | 4 +- apis/python/src/tiledbsoma/soma_array.cc | 643 +++++++++++++++++++ apis/python/src/tiledbsoma/soma_dataframe.cc | 558 ++++++++++++++++ libtiledbsoma/src/soma/soma_array.h | 9 + libtiledbsoma/src/soma/soma_dataframe.h | 19 +- 10 files changed, 1345 insertions(+), 70 deletions(-) create mode 100644 apis/python/src/tiledbsoma/soma_array.cc create mode 100644 apis/python/src/tiledbsoma/soma_dataframe.cc diff --git a/apis/python/src/tiledbsoma/_collection.py b/apis/python/src/tiledbsoma/_collection.py index d5c45367c4..aa53e14289 100644 --- a/apis/python/src/tiledbsoma/_collection.py +++ b/apis/python/src/tiledbsoma/_collection.py @@ -33,11 +33,13 @@ from typing_extensions import Self from . import _funcs, _tdb_handles +from . import pytiledbsoma as clib from ._common_nd_array import NDArray from ._dataframe import DataFrame from ._dense_nd_array import DenseNDArray from ._exception import SOMAError, is_does_not_exist_error from ._sparse_nd_array import SparseNDArray +from ._tdb_handles import DataFrameWrapper from ._tiledb_object import AnyTileDBObject, TileDBObject from ._types import OpenTimestamp from ._util import ( @@ -47,8 +49,6 @@ ) from .options import SOMATileDBContext from .options._soma_tiledb_context import _validate_soma_tiledb_context -from ._tdb_handles import DataFrameWrapper -from . 
import pytiledbsoma as clib # A collection can hold any sub-type of TileDBObject CollectionElementType = TypeVar("CollectionElementType", bound=AnyTileDBObject) @@ -427,7 +427,9 @@ def __getitem__(self, key: str) -> CollectionElementType: raise KeyError(err_str) from None if entry.soma is None: from . import _factory # Delayed binding to resolve circular import. - + from ._tdb_handles import Wrapper + + wrapper: type[Wrapper[Any | Any | Any]] if self.mode == "r" and clib.SOMADataFrame.exists(entry.entry.uri): wrapper = DataFrameWrapper else: diff --git a/apis/python/src/tiledbsoma/_dataframe.py b/apis/python/src/tiledbsoma/_dataframe.py index 4a6451c6bf..dd5180fe5f 100644 --- a/apis/python/src/tiledbsoma/_dataframe.py +++ b/apis/python/src/tiledbsoma/_dataframe.py @@ -354,22 +354,22 @@ def read( _util.check_unpartitioned(partitions) self._check_open_read() - schema = self._handle.schema - query_condition = None if value_filter is not None: - query_condition = QueryCondition(value_filter) - - order = { + QueryCondition(value_filter) + + to_clib_result_order = { options.ResultOrder.AUTO: clib.ResultOrder.automatic, options.ResultOrder.ROW_MAJOR: clib.ResultOrder.rowmajor, options.ResultOrder.COLUMN_MAJOR: clib.ResultOrder.colmajor, "auto": clib.ResultOrder.automatic, "row-major": clib.ResultOrder.rowmajor, - "col-major": clib.ResultOrder.colmajor + "column-major": clib.ResultOrder.colmajor, } + if result_order not in to_clib_result_order: + raise ValueError(f"Invalid result_order: {result_order}") sr = self._handle._handle - sr.reset(column_names or [], "auto", order[result_order]) + sr.reset(column_names or [], "auto", to_clib_result_order[result_order]) self._set_reader_coords(sr, coords) @@ -505,7 +505,10 @@ def _set_reader_coord( # Note: slice(None, None) matches the is_slice_of part, unless we also check the dim-type # part. 
if (is_slice_of(coord, str) or is_slice_of(coord, bytes)) and ( - pa.types.is_large_string(dim.type) or pa.types.is_large_binary(dim.type) or pa.types.is_string(dim.type) or pa.types.is_binary(dim.type) + pa.types.is_large_string(dim.type) + or pa.types.is_large_binary(dim.type) + or pa.types.is_string(dim.type) + or pa.types.is_binary(dim.type) ): _util.validate_slice(coord) # Figure out which one. @@ -515,7 +518,13 @@ def _set_reader_coord( if coord.stop is None: # There's no way to specify "to infinity" for strings. # We have to get the nonempty domain and use that as the end. - _, stop = self._handle.nonempty_domain(dim.name) + ned = self._handle.nonempty_domain() + if ned is None: + raise ValueError( + "Found empty nonempty domain when setting " + "string coordinates in _set_reader_coord" + ) + _, stop = ned[dim_idx] else: stop = coord.stop sr.set_dim_ranges_string_or_bytes(dim.name, [(start, stop)]) @@ -576,7 +585,12 @@ def _set_reader_coord_by_py_seq_or_np_array( sr.set_dim_points_float64(dim.name, coord) elif pa.types.is_float32(dim.type): sr.set_dim_points_float32(dim.name, coord) - elif pa.types.is_large_string(dim.type) or pa.types.is_large_binary(dim.type) or pa.types.is_string(dim.type) or pa.types.is_binary(dim.type): + elif ( + pa.types.is_large_string(dim.type) + or pa.types.is_large_binary(dim.type) + or pa.types.is_string(dim.type) + or pa.types.is_binary(dim.type) + ): sr.set_dim_points_string_or_bytes(dim.name, coord) elif pa.types.is_timestamp(dim.type): if not isinstance(coord, (tuple, list, np.ndarray)): diff --git a/apis/python/src/tiledbsoma/_tdb_handles.py b/apis/python/src/tiledbsoma/_tdb_handles.py index 17ef971e40..1cb7cc23e8 100644 --- a/apis/python/src/tiledbsoma/_tdb_handles.py +++ b/apis/python/src/tiledbsoma/_tdb_handles.py @@ -18,25 +18,24 @@ Mapping, MutableMapping, Optional, + Tuple, Type, TypeVar, Union, ) import attrs +import pyarrow as pa import tiledb from somacore import options from typing_extensions import Literal, Self 
-import numpy as np -import pyarrow as pa +from . import pytiledbsoma as clib from ._exception import DoesNotExistError, SOMAError, is_does_not_exist_error from ._types import OpenTimestamp from .options._soma_tiledb_context import SOMATileDBContext -from . import pytiledbsoma as clib - -RawHandle = Union[tiledb.Array, tiledb.Group] +RawHandle = Union[tiledb.Array, tiledb.Group, clib.SOMADataFrame] _RawHdl_co = TypeVar("_RawHdl_co", bound=RawHandle, covariant=True) """A raw TileDB object. Covariant because Handles are immutable enough.""" @@ -200,6 +199,32 @@ def _opener( def schema(self) -> tiledb.ArraySchema: return self._handle.schema + @property + def domain(self) -> Tuple[Tuple[Any, Any], ...]: + dom = self._handle.schema.domain + return tuple(dom.dim(i).domain for i in range(dom.ndim)) + + @property + def ndim(self) -> int: + return int(self._handle.schema.domain.ndim) + + def nonempty_domain(self) -> Optional[Tuple[Tuple[Any, Any], ...]]: + try: + ned: Optional[Tuple[Tuple[Any, Any], ...]] = self._handle.nonempty_domain() + return ned + except tiledb.TileDBError as e: + raise SOMAError(e) + + @property + def attr_names(self) -> Tuple[str, ...]: + schema = self._handle.schema + return tuple(schema.attr(i).name for i in range(schema.nattr)) + + @property + def dim_names(self) -> Tuple[str, ...]: + schema = self._handle.schema + return tuple(schema.domain.dim(i).name for i in range(schema.domain.ndim)) + @attrs.define(frozen=True) class GroupEntry: @@ -237,7 +262,8 @@ def _do_initial_reads(self, reader: tiledb.Group) -> None: self.initial_contents = { o.name: GroupEntry.from_object(o) for o in reader if o.name is not None } - + + class DataFrameWrapper(Wrapper[clib.SOMADataFrame]): @classmethod def _opener( @@ -258,38 +284,63 @@ def _opener( ) @property - def schema(self) -> tiledb.ArraySchema: + def schema(self) -> pa.Schema: return self._handle.schema @property - def meta(self): - return self._handle.meta - + def meta(self) -> Dict[str, str]: + return 
dict(self._handle.meta) + @property - def domain(self): + def domain(self) -> Tuple[Tuple[Any, Any], ...]: result = [] for name in self._handle.index_column_names: dtype = self._handle.schema.field(name).type if pa.types.is_timestamp(dtype): dom = self._handle.domain(name) np_dtype = dtype.to_pandas_dtype() - tz = np.datetime_data(np_dtype)[0] result.append( - (np_dtype.type(dom[0], tz), np_dtype.type(dom[1], tz))) + ( + np_dtype.type(dom[0], dtype.unit), + np_dtype.type(dom[1], dtype.unit), + ) + ) else: result.append(self._handle.domain(name)) return tuple(result) - + @property - def ndim(self): - return self._handle.ndim - + def ndim(self) -> int: + return int(self._handle.ndim) + + def nonempty_domain(self) -> Optional[Tuple[Tuple[Any, Any], ...]]: + result = [] + for name in self._handle.index_column_names: + dtype = self._handle.schema.field(name).type + if pa.types.is_timestamp(dtype): + ned = self._handle.nonempty_domain(name) + np_dtype = dtype.to_pandas_dtype() + result.append( + ( + np_dtype.type(ned[0], dtype.unit), + np_dtype.type(ned[1], dtype.unit), + ) + ) + else: + result.append(self._handle.nonempty_domain(name)) + return None if len(result) == 0 else tuple(result) + + @property + def attr_names(self) -> Tuple[str, ...]: + result = [] + for field in self.schema: + if field.name not in self._handle.index_column_names: + result.append(field.name) + return tuple(result) + @property - def index_column_names(self): + def dim_names(self) -> Tuple[str, ...]: return tuple(self._handle.index_column_names) - - def nonempty_domain(self, name: str): - return self._handle.nonempty_domain(name) class _DictMod(enum.Enum): diff --git a/apis/python/src/tiledbsoma/_tiledb_array.py b/apis/python/src/tiledbsoma/_tiledb_array.py index ec7352240b..49416b4e8a 100644 --- a/apis/python/src/tiledbsoma/_tiledb_array.py +++ b/apis/python/src/tiledbsoma/_tiledb_array.py @@ -64,7 +64,9 @@ def schema(self) -> pa.Schema: Experimental. 
""" if isinstance(self._tiledb_array_schema(), tiledb.ArraySchema): - return tiledb_schema_to_arrow(self._tiledb_array_schema(), self.uri, self._ctx) + return tiledb_schema_to_arrow( + self._tiledb_array_schema(), self.uri, self._ctx + ) else: return self._tiledb_array_schema() @@ -78,32 +80,16 @@ def _tiledb_array_keys(self) -> Tuple[str, ...]: def _tiledb_dim_names(self) -> Tuple[str, ...]: """Reads the dimension names from the schema: for example, ['obs_id', 'var_id'].""" - schema = self._handle.schema - if isinstance(schema, tiledb.ArraySchema): - return tuple(schema.domain.dim(i).name for i in range(schema.domain.ndim)) - else: - return tuple(self._handle.index_column_names) + return self._handle.dim_names def _tiledb_attr_names(self) -> Tuple[str, ...]: """Reads the attribute names from the schema: for example, the list of column names in a dataframe. """ - schema = self._handle.schema - if isinstance(schema, tiledb.ArraySchema): - return tuple(schema.attr(i).name for i in range(schema.nattr)) - else: - result = [] - for field in schema: - if field.name not in self._handle.index_column_names: - result.append(field.name) - return tuple(result) + return self._handle.attr_names def _tiledb_domain(self) -> Tuple[Tuple[Any, Any], ...]: - schema = self._handle.schema - if isinstance(schema, tiledb.ArraySchema): - return tuple(schema.domain.dim(i).domain for i in range(0, schema.domain.ndim)) - else: - return self._handle.domain + return self._handle.domain def _soma_reader( self, @@ -146,24 +132,20 @@ def _set_reader_coords(self, sr: clib.SOMAArray, coords: Sequence[object]) -> No f"coords type {type(coords)} must be a regular sequence," " not str or bytes" ) - - schema = self._handle.schema - if isinstance(schema, tiledb.ArraySchema): - ndim = schema.domain.ndim - else: - ndim = self._handle.ndim - - if len(coords) > ndim: + + if len(coords) > self._handle.ndim: raise ValueError( f"coords ({len(coords)} elements) must be shorter than ndim" - f" ({ndim})" + f" 
({self._handle.ndim})" ) for i, coord in enumerate(coords): + schema = self._handle.schema + if isinstance(schema, tiledb.ArraySchema): dim = self._handle.schema.domain.dim(i) else: dim = self._handle.schema.field(i) - + if not self._set_reader_coord(sr, i, dim, coord): raise TypeError( f"coord type {type(coord)} for dimension {dim.name}" @@ -182,7 +164,6 @@ def _set_reader_coord( Returns: True if successful, False if unrecognized. """ - del dim_idx # Unused. if coord is None: return True # No constraint; select all in this dimension diff --git a/apis/python/src/tiledbsoma/_tiledb_object.py b/apis/python/src/tiledbsoma/_tiledb_object.py index ea12d3154c..119a1a5233 100644 --- a/apis/python/src/tiledbsoma/_tiledb_object.py +++ b/apis/python/src/tiledbsoma/_tiledb_object.py @@ -13,13 +13,13 @@ from typing_extensions import Self from . import _constants, _tdb_handles +from . import pytiledbsoma as clib from ._exception import SOMAError +from ._tdb_handles import DataFrameWrapper from ._types import OpenTimestamp from ._util import check_type, ms_to_datetime from .options import SOMATileDBContext from .options._soma_tiledb_context import _validate_soma_tiledb_context -from ._tdb_handles import DataFrameWrapper -from . import pytiledbsoma as clib _WrapperType_co = TypeVar( "_WrapperType_co", bound=_tdb_handles.AnyWrapper, covariant=True diff --git a/apis/python/src/tiledbsoma/io/ingest.py b/apis/python/src/tiledbsoma/io/ingest.py index bcda9dbb57..f991ad339a 100644 --- a/apis/python/src/tiledbsoma/io/ingest.py +++ b/apis/python/src/tiledbsoma/io/ingest.py @@ -1750,7 +1750,7 @@ def _write_matrix_to_denseNDArray( def _read_nonempty_domain(arr: TileDBArray) -> Any: try: - return arr._handle.reader.nonempty_domain() + return arr._handle.nonempty_domain() except SOMAError: # This means that we're open in write-only mode. # Reopen the array in read mode. 
@@ -1758,7 +1758,7 @@ def _read_nonempty_domain(arr: TileDBArray) -> Any: cls = type(arr) with cls.open(arr.uri, "r", platform_config=None, context=arr.context) as readarr: - return readarr._handle.reader.nonempty_domain() + return readarr._handle.nonempty_domain() def _find_sparse_chunk_size( diff --git a/apis/python/src/tiledbsoma/soma_array.cc b/apis/python/src/tiledbsoma/soma_array.cc new file mode 100644 index 0000000000..e4a9bdf616 --- /dev/null +++ b/apis/python/src/tiledbsoma/soma_array.cc @@ -0,0 +1,643 @@ +/** + * @file soma_array.cc + * + * @section LICENSE + * + * The MIT License + * + * @copyright Copyright (c) 2022 TileDB, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * @section DESCRIPTION + * + * This file defines the SOMAArray bindings. 
+ */ + +#include +#include +#include +#include +#include + +#include + +#include "query_condition.cc" + +#define DENUM(x) .value(#x, TILEDB_##x) + +using namespace tiledbsoma; + +namespace py = pybind11; +using namespace py::literals; + +namespace tiledbsoma { + +py::tuple get_enum(SOMAArray& sr, std::string attr_name){ + auto attr_to_enmrs = sr.get_attr_to_enum_mapping(); + if(attr_to_enmrs.count(attr_name) == 0) + throw TileDBSOMAError("Given attribute does not have enumeration"); + + Enumeration enmr(attr_to_enmrs.at(attr_name)); + + switch (enmr.type()) { + case TILEDB_UINT8: + return py::tuple(py::cast(enmr.as_vector())); + case TILEDB_INT8: + return py::tuple(py::cast(enmr.as_vector())); + case TILEDB_UINT16: + return py::tuple(py::cast(enmr.as_vector())); + case TILEDB_INT16: + return py::tuple(py::cast(enmr.as_vector())); + case TILEDB_UINT32: + return py::tuple(py::cast(enmr.as_vector())); + case TILEDB_INT32: + return py::tuple(py::cast(enmr.as_vector())); + case TILEDB_UINT64: + return py::tuple(py::cast(enmr.as_vector())); + case TILEDB_INT64: + return py::tuple(py::cast(enmr.as_vector())); + case TILEDB_FLOAT32: + return py::tuple(py::cast(enmr.as_vector())); + case TILEDB_FLOAT64: + return py::tuple(py::cast(enmr.as_vector())); + case TILEDB_STRING_ASCII: + case TILEDB_STRING_UTF8: + case TILEDB_CHAR: + return py::tuple(py::cast(enmr.as_vector())); + case TILEDB_BOOL: + return py::tuple(py::cast(enmr.as_vector())); + default: + throw TileDBSOMAError("Unsupported enumeration type."); + } +} + +bool get_enum_is_ordered(SOMAArray& sr, std::string attr_name){ + auto attr_to_enmrs = sr.get_attr_to_enum_mapping(); + if(attr_to_enmrs.count(attr_name) == 0) + throw TileDBSOMAError("Given attribute does not have enumeration"); + return attr_to_enmrs.at(attr_name).ordered(); +} + +/** + * @brief Convert ColumnBuffer to Arrow array. 
+ * + * @param column_buffer ColumnBuffer + * @return py::object Arrow array + */ +py::object to_array(std::shared_ptr column_buffer) { + auto pa = py::module::import("pyarrow"); + auto pa_array_import = pa.attr("Array").attr("_import_from_c"); + + auto [array, schema] = ArrowAdapter::to_arrow(column_buffer); + return pa_array_import(py::capsule(array.get()), py::capsule(schema.get())); +} + +/** + * @brief Convert ArrayBuffers to Arrow table. + * + * @param cbs ArrayBuffers + * @return py::object + */ +py::object to_table(SOMAArray& sr, std::shared_ptr array_buffers) { + auto pa = py::module::import("pyarrow"); + auto pa_table_from_arrays = pa.attr("Table").attr("from_arrays"); + auto pa_dict_from_arrays = pa.attr("DictionaryArray").attr("from_arrays"); + + py::list names; + py::list arrays; + + for (auto& name : array_buffers->names()) { + auto column = array_buffers->at(name); + names.append(name); + + if(sr.get_attr_to_enum_mapping().count(name) == 0){ + arrays.append(to_array(column)); + }else{ + arrays.append(pa_dict_from_arrays( + to_array(column), + get_enum(sr, name), + py::none(), + get_enum_is_ordered(sr, name))); + } + } + + auto pa_table = pa_table_from_arrays(arrays, names); + + return pa_table; +} + +/** + * @brief pybind11 bindings + * + */ +void init_soma_array(py::module &m) { + tiledbpy::init_query_condition(m); + + m.doc() = "SOMA acceleration library"; + + m.def("version", []() { return tiledbsoma::version::as_string(); }); + + m.def( + "config_logging", + [](const std::string& level, const std::string& logfile) { + LOG_CONFIG(level, logfile); + }, + "level"_a, + "logfile"_a = ""); + + m.def("info", &LOG_INFO, "message"_a = ""); + m.def("debug", &LOG_DEBUG, "message"_a = ""); + + m.def( + "tiledbsoma_stats_enable", + []() { tiledbsoma::stats::enable(); }, + "Enable TileDB internal statistics. Lifecycle: experimental."); + m.def( + "tiledbsoma_stats_disable", + []() { tiledbsoma::stats::disable(); }, + "Disable TileDB internal statistics. 
Lifecycle: experimental."); + m.def( + "tiledbsoma_stats_reset", + []() { tiledbsoma::stats::reset(); }, + "Reset all TileDB internal statistics to 0. Lifecycle: experimental."); + m.def( + "tiledbsoma_stats_dump", + []() { + py::print(tiledbsoma::version::as_string()); + std::string stats = tiledbsoma::stats::dump(); + py::print(stats); + }, + "Print TileDB internal statistics. Lifecycle: experimental."); + + py::class_(m, "SOMAArray") + .def( + py::init( + [](std::string_view uri, + std::string_view name, + std::optional> column_names_in, + py::object py_query_condition, + std::string_view batch_size, + ResultOrder result_order, + std::map platform_config, + std::optional> timestamp) { + // Handle optional args + std::vector column_names; + if (column_names_in) { + column_names = *column_names_in; + } + + // Handle query condition based on + // TileDB-Py::PyQuery::set_attr_cond() + QueryCondition* qc = nullptr; + if (!py_query_condition.is(py::none())) { + py::object init_pyqc = py_query_condition.attr( + "init_query_condition"); + + try { + // Column names will be updated with columns present + // in the query condition + auto new_column_names = + init_pyqc(uri, column_names, platform_config, timestamp) + .cast>(); + + // Update the column_names list if it was not empty, + // otherwise continue selecting all columns with an + // empty column_names list + if (!column_names.empty()) { + column_names = new_column_names; + } + } catch (const std::exception& e) { + throw TileDBSOMAError(e.what()); + } + + qc = py_query_condition.attr("c_obj") + .cast() + .ptr() + .get(); + } + + // Release python GIL after we're done accessing python + // objects + py::gil_scoped_release release; + + auto reader = SOMAArray::open( + OpenMode::read, + uri, + name, + platform_config, + column_names, + batch_size, + result_order, + timestamp); + + // Set query condition if present + if (qc) { + reader->set_condition(*qc); + } + + return reader; + }), + "uri"_a, + py::kw_only(), + 
"name"_a = "unnamed", + "column_names"_a = py::none(), + "query_condition"_a = py::none(), + "batch_size"_a = "auto", + "result_order"_a = ResultOrder::automatic, + "platform_config"_a = py::dict(), + "timestamp"_a = py::none()) + + .def( + "reset", + [](SOMAArray& reader, + std::optional> column_names_in, + py::object py_query_condition, + std::string_view batch_size, + ResultOrder result_order) { + // Handle optional args + std::vector column_names; + if (column_names_in) { + column_names = *column_names_in; + } + + // Handle query condition based on + // TileDB-Py::PyQuery::set_attr_cond() + QueryCondition* qc = nullptr; + if (!py_query_condition.is(py::none())) { + py::object init_pyqc = py_query_condition.attr( + "init_query_condition"); + + try { + // Convert TileDB::Config to std::unordered map for pybind11 passing + std::unordered_map cfg; + for (const auto& it : reader.ctx()->config()) { + cfg[it.first] = it.second; + } + // Column names will be updated with columns present in + // the query condition + auto new_column_names = + init_pyqc(reader.uri(), column_names, cfg, reader.timestamp()) + .cast>(); + + // Update the column_names list if it was not empty, + // otherwise continue selecting all columns with an + // empty column_names list + if (!column_names.empty()) { + column_names = new_column_names; + } + } catch (const std::exception& e) { + throw TileDBSOMAError(e.what()); + } + + qc = py_query_condition.attr("c_obj") + .cast() + .ptr() + .get(); + } + + // Release python GIL after we're done accessing python objects + py::gil_scoped_release release; + + // Reset state of the existing SOMAArray object + reader.reset(column_names, batch_size, result_order); + + // Set query condition if present + if (qc) { + reader.set_condition(*qc); + } + }, + py::kw_only(), + "column_names"_a = py::none(), + "query_condition"_a = py::none(), + "batch_size"_a = "auto", + "result_order"_a = ResultOrder::automatic) + + // After this are short functions expected to be 
invoked when the coords + // are Python list/tuple, or NumPy arrays. Arrow arrays are in this + // long if-else-if function. + .def( + "set_dim_points_arrow", + [](SOMAArray& reader, + const std::string& dim, + py::object py_arrow_array, + int partition_index, + int partition_count) { + // Create a list of array chunks + py::list array_chunks; + if (py::hasattr(py_arrow_array, "chunks")) { + array_chunks = py_arrow_array.attr("chunks") + .cast(); + } else { + array_chunks.append(py_arrow_array); + } + + for (const pybind11::handle array : array_chunks) { + ArrowSchema arrow_schema; + ArrowArray arrow_array; + uintptr_t arrow_schema_ptr = (uintptr_t)(&arrow_schema); + uintptr_t arrow_array_ptr = (uintptr_t)(&arrow_array); + + // Call array._export_to_c to get arrow array and schema + // + // If ever a NumPy array gets in here, there will be an + // exception like "AttributeError: 'numpy.ndarray' object + // has no attribute '_export_to_c'". + array.attr("_export_to_c")( + arrow_array_ptr, arrow_schema_ptr); + + auto coords = array.attr("tolist")(); + + if (!strcmp(arrow_schema.format, "l")) { + reader.set_dim_points( + dim, coords.cast>()); + } else if (!strcmp(arrow_schema.format, "i")) { + reader.set_dim_points( + dim, coords.cast>()); + } else if (!strcmp(arrow_schema.format, "s")) { + reader.set_dim_points( + dim, coords.cast>()); + } else if (!strcmp(arrow_schema.format, "c")) { + reader.set_dim_points( + dim, coords.cast>()); + } else if (!strcmp(arrow_schema.format, "L")) { + reader.set_dim_points( + dim, coords.cast>()); + } else if (!strcmp(arrow_schema.format, "I")) { + reader.set_dim_points( + dim, coords.cast>()); + } else if (!strcmp(arrow_schema.format, "S")) { + reader.set_dim_points( + dim, coords.cast>()); + } else if (!strcmp(arrow_schema.format, "C")) { + reader.set_dim_points( + dim, coords.cast>()); + } else if (!strcmp(arrow_schema.format, "f")) { + reader.set_dim_points( + dim, coords.cast>()); + } else if (!strcmp(arrow_schema.format, "g")) { 
+ reader.set_dim_points( + dim, coords.cast>()); + } else if ( + !strcmp(arrow_schema.format, "u") || + !strcmp(arrow_schema.format, "z")) { + reader.set_dim_points( + dim, coords.cast>()); + } else if ( + !strcmp(arrow_schema.format, "tss:") || + !strcmp(arrow_schema.format, "tsm:") || + !strcmp(arrow_schema.format, "tsu:") || + !strcmp(arrow_schema.format, "tsn:")) { + // convert the Arrow Array to int64 + auto pa = py::module::import("pyarrow"); + coords = array.attr("cast")(pa.attr("int64")()).attr("tolist")(); + reader.set_dim_points( + dim, coords.cast>()); + } else if ( + !strcmp(arrow_schema.format, "U") || + !strcmp(arrow_schema.format, "Z")) { + reader.set_dim_points( + dim, coords.cast>()); + } else { + throw TileDBSOMAError(fmt::format( + "[pytiledbsoma] set_dim_points: type={} not " + "supported", + arrow_schema.format)); + } + + // Release arrow schema + arrow_schema.release(&arrow_schema); + } + }, + "dim"_a, + "py_arrow_array"_a, + "partition_index"_a = 0, + "partition_count"_a = 1) + + // The following short functions are expected to be invoked when the + // coords are Python list/tuple, or NumPy arrays. Arrow arrays are in + // the long if-else-if function above. + // + // Binding overloaded methods to templated member functions requires + // more effort, see: + // https://pybind11.readthedocs.io/en/stable/classes.html#overloaded-methods + + // In an initial version of this file we had `set_dim_ranges` relying + // solely on type-overloading. This worked since we supported only int + // and string indices. In a subsequent version we are now supporting + // various NumPy/PyArrow types including float32, float64, int8, uint16, + // etc. It is an unfortunate fact that pybind11 does _not_ successfully + // disambiguate between float32 and float64, or between int8 and int64, + // etc. given that we ask it to disambiguate using not just types but + // std::vector of types or std::vector of std::pair of types. 
+ // Experiments have shown that when both float32 and float64 are + // implemented with overloaded names to be differentiated solely by + // type, pybind11 uses the _first found_. Therefore it is necessary for + // us to no longer use common overloaded names. + + .def( + "set_dim_points_string_or_bytes", + static_cast&)>( + &SOMAArray::set_dim_points)) + + .def( + "set_dim_points_float64", + static_cast&)>( + &SOMAArray::set_dim_points)) + + .def( + "set_dim_points_float32", + static_cast&)>( + &SOMAArray::set_dim_points)) + + .def( + "set_dim_points_int64", + static_cast&)>( + &SOMAArray::set_dim_points)) + + .def( + "set_dim_points_int32", + static_cast&)>( + &SOMAArray::set_dim_points)) + + .def( + "set_dim_points_int16", + static_cast&)>( + &SOMAArray::set_dim_points)) + + .def( + "set_dim_points_int8", + static_cast&)>( + &SOMAArray::set_dim_points)) + + .def( + "set_dim_points_uint64", + static_cast&)>( + &SOMAArray::set_dim_points)) + + .def( + "set_dim_points_uint32", + static_cast&)>( + &SOMAArray::set_dim_points)) + + .def( + "set_dim_points_uint16", + static_cast&)>( + &SOMAArray::set_dim_points)) + + .def( + "set_dim_points_uint8", + static_cast&)>( + &SOMAArray::set_dim_points)) + + // In an initial version of this file we had `set_dim_ranges` relying + // solely on type-overloading. This worked since we supported only int + // and string indices. In a subsequent version we are now supporting + // various NumPy/PyArrow types including float32, float64, int8, uint16, + // etc. It is an unfortunate fact that pybind11 does _not_ successfully + // disambiguate between float32 and float64, or between int8 and int64, + // etc. given that we ask it to disambiguate using not just types but + // std::vector of types or std::vector of std::pair of types. + // Experiments have shown that when both float32 and float64 are + // implemented with overloaded names to be differentiated solely by + // type, pybind11 uses the _first found_. 
Therefore it is necessary for + // us to no longer use common overloaded names. + + .def( + "set_dim_ranges_string_or_bytes", + static_cast>&)>( + &SOMAArray::set_dim_ranges)) + + .def( + "set_dim_ranges_int64", + static_cast>&)>( + &SOMAArray::set_dim_ranges)) + + .def( + "set_dim_ranges_int32", + static_cast>&)>( + &SOMAArray::set_dim_ranges)) + + .def( + "set_dim_ranges_int16", + static_cast>&)>( + &SOMAArray::set_dim_ranges)) + + .def( + "set_dim_ranges_int8", + static_cast>&)>( + &SOMAArray::set_dim_ranges)) + + .def( + "set_dim_ranges_uint64", + static_cast>&)>( + &SOMAArray::set_dim_ranges)) + + .def( + "set_dim_ranges_uint32", + static_cast>&)>( + &SOMAArray::set_dim_ranges)) + + .def( + "set_dim_ranges_uint16", + static_cast>&)>( + &SOMAArray::set_dim_ranges)) + + .def( + "set_dim_ranges_uint8", + static_cast>&)>( + &SOMAArray::set_dim_ranges)) + + .def( + "set_dim_ranges_float64", + static_cast>&)>( + &SOMAArray::set_dim_ranges)) + + .def( + "set_dim_ranges_float32", + static_cast>&)>( + &SOMAArray::set_dim_ranges)) + + .def("results_complete", &SOMAArray::results_complete) + + .def( + "read_next", + [](SOMAArray& reader) -> std::optional { + // Release python GIL before reading data + py::gil_scoped_release release; + + // Try to read more data + auto buffers = reader.read_next(); + + // If more data was read, convert it to an arrow table and + // return + if (buffers.has_value()) { + // Acquire python GIL before accessing python objects + py::gil_scoped_acquire acquire; + return to_table(reader, *buffers); + } + + // No data was read, the query is complete, return nullopt + return std::nullopt; + }) + + .def("nnz", &SOMAArray::nnz, py::call_guard()) + + .def_property_readonly("shape", &SOMAArray::shape) + + .def("get_enum", get_enum) + + .def("get_enum_is_ordered", get_enum_is_ordered) + + .def("get_enum_label_on_attr", &SOMAArray::get_enum_label_on_attr); +} +} // namespace tiledbsoma \ No newline at end of file diff --git 
a/apis/python/src/tiledbsoma/soma_dataframe.cc b/apis/python/src/tiledbsoma/soma_dataframe.cc new file mode 100644 index 0000000000..0ce9435c1d --- /dev/null +++ b/apis/python/src/tiledbsoma/soma_dataframe.cc @@ -0,0 +1,558 @@ +#include +#include +#include +#include +#include + +#include + +using namespace tiledbsoma; +namespace py = pybind11; +using namespace py::literals; + +namespace tiledbsoma { + +/** + * @brief Convert ColumnBuffer to Arrow array. + * + * @param column_buffer ColumnBuffer + * @return py::object Arrow array + */ +py::object df_to_array(std::shared_ptr column_buffer) { + auto pa = py::module::import("pyarrow"); + auto pa_array_import = pa.attr("Array").attr("_import_from_c"); + + auto [array, schema] = ArrowAdapter::to_arrow(column_buffer); + return pa_array_import(py::capsule(array.get()), py::capsule(schema.get())); +} + +/** + * @brief Convert ArrayBuffers to Arrow table. + * + * @param cbs ArrayBuffers + * @return py::object + */ +py::object df_to_table(std::shared_ptr array_buffers) { + auto pa = py::module::import("pyarrow"); + auto pa_table_from_arrays = pa.attr("Table").attr("from_arrays"); + auto pa_dict_from_arrays = pa.attr("DictionaryArray").attr("from_arrays"); + + py::list names; + py::list arrays; + + for (auto& name : array_buffers->names()) { + auto column = array_buffers->at(name); + names.append(name); + arrays.append(df_to_array(column)); + } + + return pa_table_from_arrays(arrays, names); +} + +static std::optional read_next(SOMADataFrame& dataframe){ + // Release python GIL before reading data + py::gil_scoped_release release; + + // Try to read more data + auto buffers = dataframe.read_next(); + + // If more data was read, convert it to an arrow table and return + if (buffers.has_value()) { + // Acquire python GIL before accessing python objects + py::gil_scoped_acquire acquire; + return df_to_table(*buffers); + } + + // No data was read, the query is complete, return nullopt + return std::nullopt; +} + 
+std::unordered_map _tdb_to_np_name_dtype = { + {TILEDB_INT32, "int32"}, + {TILEDB_INT64, "int64"}, + {TILEDB_FLOAT32, "float32"}, + {TILEDB_FLOAT64, "float64"}, + {TILEDB_INT8, "int8"}, + {TILEDB_UINT8, "uint8"}, + {TILEDB_INT16, "int16"}, + {TILEDB_UINT16, "uint16"}, + {TILEDB_UINT32, "uint32"}, + {TILEDB_UINT64, "uint64"}, + {TILEDB_STRING_ASCII, "S"}, + {TILEDB_STRING_UTF8, "U1"}, + {TILEDB_CHAR, "S1"}, + {TILEDB_DATETIME_YEAR, "M8[Y]"}, + {TILEDB_DATETIME_MONTH, "M8[M]"}, + {TILEDB_DATETIME_WEEK, "M8[W]"}, + {TILEDB_DATETIME_DAY, "M8[D]"}, + {TILEDB_DATETIME_HR, "M8[h]"}, + {TILEDB_DATETIME_MIN, "M8[m]"}, + {TILEDB_DATETIME_SEC, "M8[s]"}, + {TILEDB_DATETIME_MS, "M8[ms]"}, + {TILEDB_DATETIME_US, "M8[us]"}, + {TILEDB_DATETIME_NS, "M8[ns]"}, + {TILEDB_DATETIME_PS, "M8[ps]"}, + {TILEDB_DATETIME_FS, "M8[fs]"}, + {TILEDB_DATETIME_AS, "M8[as]"}, + {TILEDB_TIME_HR, "m8[h]"}, + {TILEDB_TIME_MIN, "m8[m]"}, + {TILEDB_TIME_SEC, "m8[s]"}, + {TILEDB_TIME_MS, "m8[ms]"}, + {TILEDB_TIME_US, "m8[us]"}, + {TILEDB_TIME_NS, "m8[ns]"}, + {TILEDB_TIME_PS, "m8[ps]"}, + {TILEDB_TIME_FS, "m8[fs]"}, + {TILEDB_TIME_AS, "m8[as]"}, + {TILEDB_BLOB, "byte"}, + {TILEDB_BOOL, "bool"}, +}; + +py::dtype tdb_to_np_dtype(tiledb_datatype_t type, uint32_t cell_val_num) { + if (type == TILEDB_CHAR || type == TILEDB_STRING_UTF8 || + type == TILEDB_STRING_ASCII) { + std::string base_str = (type == TILEDB_STRING_UTF8) ? 
"|U" : "|S"; + if (cell_val_num < TILEDB_VAR_NUM) + base_str += std::to_string(cell_val_num); + return py::dtype(base_str); + } + + if (cell_val_num == 1) { + if (type == TILEDB_STRING_UTF16 || type == TILEDB_STRING_UTF32) + throw std::invalid_argument("Unimplemented UTF16 or UTF32 string conversion!"); + if (type == TILEDB_STRING_UCS2 || type == TILEDB_STRING_UCS4) + throw std::invalid_argument("Unimplemented UCS2 or UCS4 string conversion!"); + + if (_tdb_to_np_name_dtype.count(type) == 1) + return py::dtype(_tdb_to_np_name_dtype[type]); + } + + if (cell_val_num == 2) { + if (type == TILEDB_FLOAT32) + return py::dtype("complex64"); + if (type == TILEDB_FLOAT64) + return py::dtype("complex128"); + } + + if (cell_val_num == TILEDB_VAR_NUM) + return tdb_to_np_dtype(type, 1); + + if (cell_val_num > 1) { + py::dtype base_dtype = tdb_to_np_dtype(type, 1); + py::tuple rec_elem = py::make_tuple("", base_dtype); + py::list rec_list; + for (size_t i = 0; i < cell_val_num; i++) + rec_list.append(rec_elem); + // note: we call the 'dtype' constructor b/c py::dtype does not accept + // list + auto np = py::module::import("numpy"); + auto np_dtype = np.attr("dtype"); + return np_dtype(rec_list); + } + + throw std::invalid_argument("tiledb datatype not understood ('" + + tiledb::impl::type_to_str(type) + + "', cell_val_num: " + std::to_string(cell_val_num) + ")"); +} + +py::dict meta(SOMADataFrame &soma_dataframe) { + py::dict results; + + for (auto const& [key, val] : soma_dataframe.get_metadata()){ + tiledb_datatype_t tdb_type = std::get(val); + uint32_t value_num = std::get(val); + const void *value = std::get(val); + + if(tdb_type == TILEDB_STRING_UTF8){ + results[py::str(key)] = py::str(std::string((const char*)value, value_num)); + }else if(tdb_type == TILEDB_STRING_ASCII){ + results[py::str(key)] = py::bytes(std::string((const char*)value, value_num)); + }else{ + py::dtype value_type = tdb_to_np_dtype(tdb_type, 1); + results[py::str(key)] = py::array(value_type, 
value_num, value); + } + } + + return results; +} + +void init_soma_dataframe(py::module &m) { + py::class_(m, "SOMADataFrame") + + .def_static("open", py::overload_cast, std::vector, ResultOrder, std::optional>>(&SOMADataFrame::open)) + .def_static("open", py::overload_cast, std::vector, ResultOrder, std::optional>>(&SOMADataFrame::open)) + .def_static("exists", &SOMADataFrame::exists) + + .def("reopen", py::overload_cast>>(&SOMADataFrame::open)) + .def("close", &SOMADataFrame::close) + .def_property_readonly("closed", [](SOMADataFrame& soma_df) -> bool { + return soma_df.is_open(); + }) + .def("reset", &SOMADataFrame::reset) + .def("type", &SOMADataFrame::type) + .def("uri", &SOMADataFrame::uri) + .def_property_readonly("schema", [](SOMADataFrame& soma_df) -> py::object { + auto pa = py::module::import("pyarrow"); + auto pa_schema_import = pa.attr("Schema").attr("_import_from_c"); + return pa_schema_import(py::capsule(ArrowAdapter::tiledb_schema_to_arrow_schema(soma_df.schema()).get())); + }) + .def_property_readonly("timestamp", &SOMADataFrame::timestamp) + .def_property_readonly("index_column_names", &SOMADataFrame::index_column_names) + .def("nonempty_domain", [](SOMADataFrame& soma_df, std::string name){ + switch (soma_df.schema()->domain().dimension(name).type()) { + case TILEDB_UINT64: { + return py::cast(soma_df.non_empty_domain(name)); + } + case TILEDB_DATETIME_YEAR: + case TILEDB_DATETIME_MONTH: + case TILEDB_DATETIME_WEEK: + case TILEDB_DATETIME_DAY: + case TILEDB_DATETIME_HR: + case TILEDB_DATETIME_MIN: + case TILEDB_DATETIME_SEC: + case TILEDB_DATETIME_MS: + case TILEDB_DATETIME_US: + case TILEDB_DATETIME_NS: + case TILEDB_DATETIME_PS: + case TILEDB_DATETIME_FS: + case TILEDB_DATETIME_AS: + case TILEDB_INT64: { + return py::cast(soma_df.non_empty_domain(name)); + } + case TILEDB_UINT32: { + return py::cast(soma_df.non_empty_domain(name)); + } + case TILEDB_INT32: { + return py::cast(soma_df.non_empty_domain(name)); + } + case TILEDB_UINT16: { + 
return py::cast(soma_df.non_empty_domain(name)); + } + case TILEDB_INT16: { + return py::cast(soma_df.non_empty_domain(name)); + } + case TILEDB_UINT8: { + return py::cast(soma_df.non_empty_domain(name)); + } + case TILEDB_INT8: { + return py::cast(soma_df.non_empty_domain(name)); + } + case TILEDB_FLOAT64: { + return py::cast(soma_df.non_empty_domain(name)); + } + case TILEDB_FLOAT32: { + return py::cast(soma_df.non_empty_domain(name)); + } + case TILEDB_STRING_UTF8: + case TILEDB_STRING_ASCII: { + return py::cast(soma_df.non_empty_domain_var(name)); + } + default: + throw std::invalid_argument("Unsupported dtype for nonempty domain."); + } + }) + .def("domain", [](SOMADataFrame& soma_df, std::string name) -> py::tuple { + auto dim = soma_df.schema()->domain().dimension(name); + switch (dim.type()) { + case TILEDB_UINT64: { + auto dom = dim.domain(); + return py::make_tuple(dom.first, dom.second); + } + case TILEDB_DATETIME_YEAR: + case TILEDB_DATETIME_MONTH: + case TILEDB_DATETIME_WEEK: + case TILEDB_DATETIME_DAY: + case TILEDB_DATETIME_HR: + case TILEDB_DATETIME_MIN: + case TILEDB_DATETIME_SEC: + case TILEDB_DATETIME_MS: + case TILEDB_DATETIME_US: + case TILEDB_DATETIME_NS: + case TILEDB_DATETIME_PS: + case TILEDB_DATETIME_FS: + case TILEDB_DATETIME_AS: + case TILEDB_INT64: { + auto dom = dim.domain(); + return py::make_tuple(dom.first, dom.second); + } + case TILEDB_UINT32: { + auto dom = dim.domain(); + return py::make_tuple(dom.first, dom.second); + } + case TILEDB_INT32: { + auto dom = dim.domain(); + return py::make_tuple(dom.first, dom.second); + } + case TILEDB_UINT16: { + auto dom = dim.domain(); + return py::make_tuple(dom.first, dom.second); + } + case TILEDB_INT16: { + auto dom = dim.domain(); + return py::make_tuple(dom.first, dom.second); + } + case TILEDB_UINT8: { + auto dom = dim.domain(); + return py::make_tuple(dom.first, dom.second); + } + case TILEDB_INT8: { + auto dom = dim.domain(); + return py::make_tuple(dom.first, dom.second); + } + case 
TILEDB_FLOAT64: { + auto dom = dim.domain(); + return py::make_tuple(dom.first, dom.second); + } + case TILEDB_FLOAT32: { + auto dom = dim.domain(); + return py::make_tuple(dom.first, dom.second); + } + case TILEDB_STRING_ASCII: { + return py::make_tuple("", ""); + } + default: + throw std::invalid_argument("Unsupported dtype for Dimension's domain"); + } + }) + .def_property_readonly("ndim", &SOMADataFrame::ndim) + .def_property_readonly("count", &SOMADataFrame::count) + .def("read_next", read_next) + + .def("set_metadata", &SOMADataFrame::set_metadata) + .def("delete_metadata", &SOMADataFrame::delete_metadata) + .def("get_metadata", + py::overload_cast(&SOMADataFrame::get_metadata)) + .def_property_readonly("meta", meta) + .def("has_metadata", &SOMADataFrame::has_metadata) + .def("metadata_num", &SOMADataFrame::metadata_num) + .def( + "set_dim_points_arrow", + [](SOMADataFrame& reader, + const std::string& dim, + py::object py_arrow_array, + int partition_index, + int partition_count) { + // Create a list of array chunks + py::list array_chunks; + if (py::hasattr(py_arrow_array, "chunks")) { + array_chunks = py_arrow_array.attr("chunks") + .cast(); + } else { + array_chunks.append(py_arrow_array); + } + + for (const pybind11::handle array : array_chunks) { + ArrowSchema arrow_schema; + ArrowArray arrow_array; + uintptr_t arrow_schema_ptr = (uintptr_t)(&arrow_schema); + uintptr_t arrow_array_ptr = (uintptr_t)(&arrow_array); + + // Call array._export_to_c to get arrow array and schema + // + // If ever a NumPy array gets in here, there will be an + // exception like "AttributeError: 'numpy.ndarray' object + // has no attribute '_export_to_c'". 
+ array.attr("_export_to_c")( + arrow_array_ptr, arrow_schema_ptr); + + auto coords = array.attr("tolist")(); + + if (!strcmp(arrow_schema.format, "l")) { + reader.set_dim_points( + dim, coords.cast>()); + } else if (!strcmp(arrow_schema.format, "i")) { + reader.set_dim_points( + dim, coords.cast>()); + } else if (!strcmp(arrow_schema.format, "s")) { + reader.set_dim_points( + dim, coords.cast>()); + } else if (!strcmp(arrow_schema.format, "c")) { + reader.set_dim_points( + dim, coords.cast>()); + } else if (!strcmp(arrow_schema.format, "L")) { + reader.set_dim_points( + dim, coords.cast>()); + } else if (!strcmp(arrow_schema.format, "I")) { + reader.set_dim_points( + dim, coords.cast>()); + } else if (!strcmp(arrow_schema.format, "S")) { + reader.set_dim_points( + dim, coords.cast>()); + } else if (!strcmp(arrow_schema.format, "C")) { + reader.set_dim_points( + dim, coords.cast>()); + } else if (!strcmp(arrow_schema.format, "f")) { + reader.set_dim_points( + dim, coords.cast>()); + } else if (!strcmp(arrow_schema.format, "g")) { + reader.set_dim_points( + dim, coords.cast>()); + } else if ( + !strcmp(arrow_schema.format, "u") || + !strcmp(arrow_schema.format, "z")) { + reader.set_dim_points( + dim, coords.cast>()); + } else if ( + !strcmp(arrow_schema.format, "tss:") || + !strcmp(arrow_schema.format, "tsm:") || + !strcmp(arrow_schema.format, "tsu:") || + !strcmp(arrow_schema.format, "tsn:")) { + // convert the Arrow Array to int64 + auto pa = py::module::import("pyarrow"); + coords = array.attr("cast")(pa.attr("int64")()).attr("tolist")(); + reader.set_dim_points( + dim, coords.cast>()); + } else if ( + !strcmp(arrow_schema.format, "U") || + !strcmp(arrow_schema.format, "Z")) { + reader.set_dim_points( + dim, coords.cast>()); + } else { + throw TileDBSOMAError(fmt::format( + "[pytiledbsoma] set_dim_points: type={} not " + "supported", + arrow_schema.format)); + } + + // Release arrow schema + arrow_schema.release(&arrow_schema); + } + }, + "dim"_a, + 
"py_arrow_array"_a, + "partition_index"_a = 0, + "partition_count"_a = 1) + + .def( + "set_dim_points_string_or_bytes", + static_cast&)>( + &SOMADataFrame::set_dim_points)) + .def( + "set_dim_points_float64", + static_cast&)>( + &SOMADataFrame::set_dim_points)) + .def( + "set_dim_points_float32", + static_cast&)>( + &SOMADataFrame::set_dim_points)) + .def( + "set_dim_points_int64", + static_cast&)>( + &SOMADataFrame::set_dim_points)) + .def( + "set_dim_points_int32", + static_cast&)>( + &SOMADataFrame::set_dim_points)) + .def( + "set_dim_points_int16", + static_cast&)>( + &SOMADataFrame::set_dim_points)) + .def( + "set_dim_points_int8", + static_cast&)>( + &SOMADataFrame::set_dim_points)) + .def( + "set_dim_points_uint64", + static_cast&)>( + &SOMADataFrame::set_dim_points)) + .def( + "set_dim_points_uint32", + static_cast&)>( + &SOMADataFrame::set_dim_points)) + .def( + "set_dim_points_uint16", + static_cast&)>( + &SOMADataFrame::set_dim_points)) + .def( + "set_dim_points_uint8", + static_cast&)>( + &SOMADataFrame::set_dim_points)) + + .def( + "set_dim_ranges_string_or_bytes", + static_cast>&)>( + &SOMADataFrame::set_dim_ranges)) + + .def( + "set_dim_ranges_int64", + static_cast>&)>( + &SOMADataFrame::set_dim_ranges)) + + .def( + "set_dim_ranges_int32", + static_cast>&)>( + &SOMADataFrame::set_dim_ranges)) + + .def( + "set_dim_ranges_int16", + static_cast>&)>( + &SOMADataFrame::set_dim_ranges)) + + .def( + "set_dim_ranges_int8", + static_cast>&)>( + &SOMADataFrame::set_dim_ranges)) + + .def( + "set_dim_ranges_uint64", + static_cast>&)>( + &SOMADataFrame::set_dim_ranges)) + + .def( + "set_dim_ranges_uint32", + static_cast>&)>( + &SOMADataFrame::set_dim_ranges)) + + .def( + "set_dim_ranges_uint16", + static_cast>&)>( + &SOMADataFrame::set_dim_ranges)) + + .def( + "set_dim_ranges_uint8", + static_cast>&)>( + &SOMADataFrame::set_dim_ranges)) + + .def( + "set_dim_ranges_float64", + static_cast>&)>( + &SOMADataFrame::set_dim_ranges)) + + .def( + 
"set_dim_ranges_float32", + static_cast>&)>( + &SOMADataFrame::set_dim_ranges)); +} +} \ No newline at end of file diff --git a/libtiledbsoma/src/soma/soma_array.h b/libtiledbsoma/src/soma/soma_array.h index 27b0d514d6..bbc733afb1 100644 --- a/libtiledbsoma/src/soma/soma_array.h +++ b/libtiledbsoma/src/soma/soma_array.h @@ -476,6 +476,15 @@ class SOMAArray { */ uint64_t ndim() const; + /** + * Retrieves the non-empty domain from the array. This is the union of the + * non-empty domains of the array fragments. + */ + template + std::pair non_empty_domain(const std::string& name) { + return arr_->non_empty_domain(name); + }; + /** * Retrieves the non-empty domain from the array on the given dimension. * This is the union of the non-empty domains of the array fragments. diff --git a/libtiledbsoma/src/soma/soma_dataframe.h b/libtiledbsoma/src/soma/soma_dataframe.h index c5f7753aca..c61728fa47 100644 --- a/libtiledbsoma/src/soma/soma_dataframe.h +++ b/libtiledbsoma/src/soma/soma_dataframe.h @@ -231,13 +231,30 @@ class SOMADataFrame : public SOMAObject { const std::vector index_column_names() const; /** - * Return the number of dimnesions SOMADataFrame. + * Return the number of dimnesions. * * @return int64_t */ int64_t ndim() const; + + /** + * Return the number of rows. + * + * @return int64_t + */ int64_t count() const; + /** + * Retrieves the non-empty domain from the array. This is the union of the + * non-empty domains of the array fragments. + * + * @return int64_t + */ + template + std::pair non_empty_domain(const std::string& name) { + return array_->non_empty_domain(name); + }; + /** * Retrieves the non-empty domain from the array on the given dimension. * This is the union of the non-empty domains of the array fragments.