diff --git a/apis/python/setup.py b/apis/python/setup.py index be0fecbcb0..3742b7a6cd 100644 --- a/apis/python/setup.py +++ b/apis/python/setup.py @@ -313,6 +313,7 @@ def run(self): "src/tiledbsoma/soma_sparse_ndarray.cc", "src/tiledbsoma/soma_group.cc", "src/tiledbsoma/soma_collection.cc", + "src/tiledbsoma/managed_query.cc", "src/tiledbsoma/pytiledbsoma.cc", ], include_dirs=INC_DIRS, diff --git a/apis/python/src/tiledbsoma/managed_query.cc b/apis/python/src/tiledbsoma/managed_query.cc new file mode 100644 index 0000000000..7de08619bc --- /dev/null +++ b/apis/python/src/tiledbsoma/managed_query.cc @@ -0,0 +1,581 @@ +/** + * @file managed_query.cc + * + * @section LICENSE + * + * The MIT License + * + * @copyright Copyright (c) 2024 TileDB, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * @section DESCRIPTION + * + * This file defines the ManagedQuery bindings. + */ + +#include +#include +#include +#include +#include + +#include + +#include "common.h" + +namespace libtiledbsomacpp { + +namespace py = pybind11; +using namespace py::literals; +using namespace tiledbsoma; + +void load_managed_query(py::module& m) { + py::class_(m, "ManagedQuery") + .def( + py::init([](SOMAArray array, + std::shared_ptr ctx, + std::string_view name) { + return ManagedQuery( + std::make_unique(array), + ctx->tiledb_ctx(), + name); + }), + py::arg("array"), + py::arg("ctx"), + py::arg("name") = "unnamed") + + .def("setup_read", &ManagedQuery::setup_read) + .def("is_empty_query", &ManagedQuery::is_empty_query) + .def("is_complete", &ManagedQuery::is_complete) + + .def("set_layout", &ManagedQuery::set_layout) + .def( + "set_condition", + [](ManagedQuery& mq, + py::object py_query_condition, + py::object py_schema) { + auto column_names = mq.column_names(); + // Handle query condition based on + // TileDB-Py::PyQuery::set_attr_cond() + QueryCondition* qc = nullptr; + if (!py_query_condition.is(py::none())) { + py::object init_pyqc = py_query_condition.attr( + "init_query_condition"); + try { + // Column names will be updated with columns present + // in the query condition + auto new_column_names = + init_pyqc(py_schema, column_names) + .cast>(); + // Update the column_names list if it was not empty, + // otherwise continue selecting all columns with an + // empty column_names list + if (!column_names.empty()) { + column_names = new_column_names; + } + } catch (const std::exception& e) { + TPY_ERROR_LOC(e.what()); + } + qc = py_query_condition.attr("c_obj") + .cast() + .ptr() + .get(); + } + mq.reset(); + mq.select_columns(column_names); + + // Release python GIL after we're done accessing python + // objects + py::gil_scoped_release release; + // Set query condition if present + if (qc) { + mq.set_condition(*qc); + } + }, + "py_query_condition"_a, + "py_schema"_a) + .def( + "select_columns", + &ManagedQuery::select_columns, + "names"_a, + "if_not_empty"_a = false) + + .def("submit_read", &ManagedQuery::submit_read) + .def( + "results", + [](ManagedQuery& mq) -> std::optional { + try { + // Release python GIL before reading data + py::gil_scoped_release release; + auto tbl = mq.results(); + // Acquire python GIL before accessing python objects + py::gil_scoped_acquire acquire; + return to_table(std::make_optional(tbl)); + } catch (const std::exception& e) { + throw TileDBSOMAError(e.what()); + } + }) + + .def( + "set_array_data", + [](ManagedQuery& mq, py::handle py_batch) { + ArrowSchema arrow_schema; + ArrowArray arrow_array; + uintptr_t arrow_schema_ptr = (uintptr_t)(&arrow_schema); + uintptr_t arrow_array_ptr = (uintptr_t)(&arrow_array); + py_batch.attr("_export_to_c")( + arrow_array_ptr, arrow_schema_ptr); + + try { + mq.set_array_data( + std::make_unique(arrow_schema), + std::make_unique(arrow_array)); + } catch (const std::exception& e) { + TPY_ERROR_LOC(e.what()); + } + + arrow_schema.release(&arrow_schema); + arrow_array.release(&arrow_array); + }) + .def( + "set_soma_data", + [](ManagedQuery& mq, py::array data) { + py::buffer_info data_info = data.request(); + mq.setup_write_column( + "soma_data", + data.size(), + (const void*)data_info.ptr, + static_cast(nullptr), + static_cast(nullptr)); + }) + .def( + "submit_write", + &ManagedQuery::submit_write, + "sort_coords"_a = false) + + .def("reset", &ManagedQuery::reset) + .def("close", &ManagedQuery::close) + + // The following short functions are expected to be invoked when the + // coords are Python list/tuple, or NumPy arrays. Arrow arrays are in + // the long if-else-if function above. + // + // Binding overloaded methods to templated member functions requires + // more effort, see: + // https://pybind11.readthedocs.io/en/stable/classes.html#overloaded-methods + + // In an initial version of this file we had `set_dim_ranges` relying + // solely on type-overloading. This worked since we supported only int + // and string indices. In a subsequent version we are now supporting + // various NumPy/PyArrow types including float32, float64, int8, uint16, + // etc. It is an unfortunate fact that pybind11 does _not_ successfully + // disambiguate between float32 and float64, or between int8 and int64, + // etc. given that we ask it to disambiguate using not just types but + // std::vector of types or std::vector of std::pair of types. + // Experiments have shown that when both float32 and float64 are + // implemented with overloaded names to be differentiated solely by + // type, pybind11 uses the _first found_. Therefore it is necessary for + // us to no longer use common overloaded names. + + .def( + "set_dim_points_string_or_bytes", + [](ManagedQuery& mq, + const std::string& dim, + const std::vector& points) { + try { + mq.select_points(dim, points); + } catch (const std::exception& e) { + throw TileDBSOMAError(e.what()); + } + }) + + .def( + "set_dim_points_double", + [](ManagedQuery& mq, + const std::string& dim, + const std::vector& points) { + try { + mq.select_points(dim, points); + } catch (const std::exception& e) { + throw TileDBSOMAError(e.what()); + } + }) + + .def( + "set_dim_points_float", + [](ManagedQuery& mq, + const std::string& dim, + const std::vector& points) { + try { + mq.select_points(dim, points); + } catch (const std::exception& e) { + throw TileDBSOMAError(e.what()); + } + }) + + .def( + "set_dim_points_int64", + [](ManagedQuery& mq, + const std::string& dim, + const std::vector& points) { + try { + mq.select_points(dim, points); + } catch (const std::exception& e) { + throw TileDBSOMAError(e.what()); + } + }) + + .def( + "set_dim_points_int32", + [](ManagedQuery& mq, + const std::string& dim, + const std::vector& points) { + try { + mq.select_points(dim, points); + } catch (const std::exception& e) { + throw TileDBSOMAError(e.what()); + } + }) + + .def( + "set_dim_points_int16", + [](ManagedQuery& mq, + const std::string& dim, + const std::vector& points) { + try { + mq.select_points(dim, points); + } catch (const std::exception& e) { + throw TileDBSOMAError(e.what()); + } + }) + + .def( + "set_dim_points_int8", + [](ManagedQuery& mq, + const std::string& dim, + const std::vector& points) { + try { + mq.select_points(dim, points); + } catch (const std::exception& e) { + throw TileDBSOMAError(e.what()); + } + }) + + .def( + "set_dim_points_uint64", + [](ManagedQuery& mq, + const std::string& dim, + const std::vector& points) { + try { + mq.select_points(dim, points); + } catch (const std::exception& e) { + throw TileDBSOMAError(e.what()); + } + }) + + .def( + "set_dim_points_uint32", + [](ManagedQuery& mq, + const std::string& dim, + const std::vector& points) { + try { + mq.select_points(dim, points); + } catch (const std::exception& e) { + throw TileDBSOMAError(e.what()); + } + }) + + .def( + "set_dim_points_uint16", + [](ManagedQuery& mq, + const std::string& dim, + const std::vector& points) { + try { + mq.select_points(dim, points); + } catch (const std::exception& e) { + throw TileDBSOMAError(e.what()); + } + }) + + .def( + "set_dim_points_uint8", + [](ManagedQuery& mq, + const std::string& dim, + const std::vector& points) { + try { + mq.select_points(dim, points); + } catch (const std::exception& e) { + throw TileDBSOMAError(e.what()); + } + }) + + // In an initial version of this file we had `set_dim_ranges` relying + // solely on type-overloading. This worked since we supported only int + // and string indices. In a subsequent version we are now supporting + // various NumPy/PyArrow types including float32, float64, int8, uint16, + // etc. It is an unfortunate fact that pybind11 does _not_ successfully + // disambiguate between float32 and float64, or between int8 and int64, + // etc. given that we ask it to disambiguate using not just types but + // std::vector of types or std::vector of std::pair of types. + // Experiments have shown that when both float32 and float64 are + // implemented with overloaded names to be differentiated solely by + // type, pybind11 uses the _first found_. Therefore it is necessary for + // us to no longer use common overloaded names. + + .def( + "set_dim_ranges_string_or_bytes", + [](ManagedQuery& mq, + const std::string& dim, + const std::vector>& ranges) { + try { + mq.select_ranges(dim, ranges); + } catch (const std::exception& e) { + throw TileDBSOMAError(e.what()); + } + }) + + .def( + "set_dim_ranges_double", + [](ManagedQuery& mq, + const std::string& dim, + const std::vector>& ranges) { + try { + mq.select_ranges(dim, ranges); + } catch (const std::exception& e) { + throw TileDBSOMAError(e.what()); + } + }) + + .def( + "set_dim_ranges_float", + [](ManagedQuery& mq, + const std::string& dim, + const std::vector>& ranges) { + try { + mq.select_ranges(dim, ranges); + } catch (const std::exception& e) { + throw TileDBSOMAError(e.what()); + } + }) + + .def( + "set_dim_ranges_int64", + [](ManagedQuery& mq, + const std::string& dim, + const std::vector>& ranges) { + try { + mq.select_ranges(dim, ranges); + } catch (const std::exception& e) { + throw TileDBSOMAError(e.what()); + } + }) + + .def( + "set_dim_ranges_int32", + [](ManagedQuery& mq, + const std::string& dim, + const std::vector>& ranges) { + try { + mq.select_ranges(dim, ranges); + } catch (const std::exception& e) { + throw TileDBSOMAError(e.what()); + } + }) + + .def( + "set_dim_ranges_int16", + [](ManagedQuery& mq, + const std::string& dim, + const std::vector>& ranges) { + try { + mq.select_ranges(dim, ranges); + } catch (const std::exception& e) { + throw TileDBSOMAError(e.what()); + } + }) + + .def( + "set_dim_ranges_int8", + [](ManagedQuery& mq, + const std::string& dim, + const std::vector>& ranges) { + try { + mq.select_ranges(dim, ranges); + } catch (const std::exception& e) { + throw TileDBSOMAError(e.what()); + } + }) + + .def( + "set_dim_ranges_uint64", + [](ManagedQuery& mq, + const std::string& dim, + const std::vector>& ranges) { + try { + mq.select_ranges(dim, ranges); + } catch (const std::exception& e) { + throw TileDBSOMAError(e.what()); + } + }) + + .def( + "set_dim_ranges_uint32", + [](ManagedQuery& mq, + const std::string& dim, + const std::vector>& ranges) { + try { + mq.select_ranges(dim, ranges); + } catch (const std::exception& e) { + throw TileDBSOMAError(e.what()); + } + }) + + .def( + "set_dim_ranges_uint16", + [](ManagedQuery& mq, + const std::string& dim, + const std::vector>& ranges) { + try { + mq.select_ranges(dim, ranges); + } catch (const std::exception& e) { + throw TileDBSOMAError(e.what()); + } + }) + + .def( + "set_dim_ranges_uint8", + [](ManagedQuery& mq, + const std::string& dim, + const std::vector>& ranges) { + try { + mq.select_ranges(dim, ranges); + } catch (const std::exception& e) { + throw TileDBSOMAError(e.what()); + } + }) + + // After this are short functions expected to be invoked when the coords + // are Python list/tuple, or NumPy arrays. Arrow arrays are in this + // long if-else-if function. + .def( + "set_dim_points_arrow", + [](ManagedQuery& mq, + const std::string& dim, + py::object py_arrow_array, + int partition_index, + int partition_count) { + // Create a list of array chunks + py::list array_chunks; + if (py::hasattr(py_arrow_array, "chunks")) { + array_chunks = py_arrow_array.attr("chunks") + .cast(); + } else { + array_chunks.append(py_arrow_array); + } + + for (const pybind11::handle array_handle : array_chunks) { + ArrowSchema arrow_schema; + ArrowArray arrow_array; + uintptr_t arrow_schema_ptr = (uintptr_t)(&arrow_schema); + uintptr_t arrow_array_ptr = (uintptr_t)(&arrow_array); + + // Call handle._export_to_c to get arrow array and schema + // + // If ever a NumPy array gets in here, there will be an + // exception like "AttributeError: 'numpy.ndarray' object + // has no attribute '_export_to_c'". + array_handle.attr("_export_to_c")( + arrow_array_ptr, arrow_schema_ptr); + + auto coords = array_handle.attr("tolist")(); + + try { + if (!strcmp(arrow_schema.format, "l")) { + mq.select_points( + dim, coords.cast>()); + } else if (!strcmp(arrow_schema.format, "i")) { + mq.select_points( + dim, coords.cast>()); + } else if (!strcmp(arrow_schema.format, "s")) { + mq.select_points( + dim, coords.cast>()); + } else if (!strcmp(arrow_schema.format, "c")) { + mq.select_points( + dim, coords.cast>()); + } else if (!strcmp(arrow_schema.format, "L")) { + mq.select_points( + dim, coords.cast>()); + } else if (!strcmp(arrow_schema.format, "I")) { + mq.select_points( + dim, coords.cast>()); + } else if (!strcmp(arrow_schema.format, "S")) { + mq.select_points( + dim, coords.cast>()); + } else if (!strcmp(arrow_schema.format, "C")) { + mq.select_points( + dim, coords.cast>()); + } else if (!strcmp(arrow_schema.format, "f")) { + mq.select_points( + dim, coords.cast>()); + } else if (!strcmp(arrow_schema.format, "g")) { + mq.select_points( + dim, coords.cast>()); + } else if ( + !strcmp(arrow_schema.format, "u") || + !strcmp(arrow_schema.format, "z")) { + mq.select_points( + dim, coords.cast>()); + } else if ( + !strcmp(arrow_schema.format, "tss:") || + !strcmp(arrow_schema.format, "tsm:") || + !strcmp(arrow_schema.format, "tsu:") || + !strcmp(arrow_schema.format, "tsn:")) { + // convert the Arrow Array to int64 + auto pa = py::module::import("pyarrow"); + coords = array_handle + .attr("cast")(pa.attr("int64")()) + .attr("tolist")(); + mq.select_points( + dim, coords.cast>()); + } else if ( + !strcmp(arrow_schema.format, "U") || + !strcmp(arrow_schema.format, "Z")) { + mq.select_points( + dim, coords.cast>()); + } else { + TPY_ERROR_LOC( + "[pytiledbsoma] set_dim_points: type={} not " + "supported" + + std::string(arrow_schema.format)); + } + } catch (const std::exception& e) { + throw TileDBSOMAError(e.what()); + } + + // Release arrow schema + arrow_schema.release(&arrow_schema); + } + }, + "dim"_a, + "py_arrow_array"_a, + "partition_index"_a = 0, + "partition_count"_a = 1); +} +} // namespace libtiledbsomacpp diff --git a/apis/python/src/tiledbsoma/pytiledbsoma.cc b/apis/python/src/tiledbsoma/pytiledbsoma.cc index 5f9daff759..154d3e52fb 100644 --- a/apis/python/src/tiledbsoma/pytiledbsoma.cc +++ b/apis/python/src/tiledbsoma/pytiledbsoma.cc @@ -28,6 +28,7 @@ void load_soma_collection(py::module&); void load_query_condition(py::module&); void load_reindexer(py::module&); void load_soma_vfs(py::module&); +void load_managed_query(py::module&); PYBIND11_MODULE(pytiledbsoma, m) { py::register_exception(m, "SOMAError"); @@ -155,6 +156,7 @@ PYBIND11_MODULE(pytiledbsoma, m) { load_query_condition(m); load_reindexer(m); load_soma_vfs(m); + load_managed_query(m); } }; // namespace libtiledbsomacpp