From ce950db11063d6b16f0251f144ac9b55fa51e75c Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Mon, 12 Feb 2024 10:48:22 -0600 Subject: [PATCH] [python] Refactor Pybind11 inheritance to reduce repetition --- apis/python/src/tiledbsoma/_dataframe.py | 4 +- apis/python/src/tiledbsoma/soma_array.cc | 150 ++++++- apis/python/src/tiledbsoma/soma_dataframe.cc | 412 +------------------ apis/python/src/tiledbsoma/soma_object.cc | 10 +- 4 files changed, 154 insertions(+), 422 deletions(-) diff --git a/apis/python/src/tiledbsoma/_dataframe.py b/apis/python/src/tiledbsoma/_dataframe.py index 4ab9b26ae0..466267da39 100644 --- a/apis/python/src/tiledbsoma/_dataframe.py +++ b/apis/python/src/tiledbsoma/_dataframe.py @@ -602,9 +602,7 @@ def _set_reader_coord_by_py_seq_or_np_array( # TODO: bool - raise ValueError( - f"unhandled type {dim.dtype} for index column named {dim.name}" - ) + raise ValueError(f"unhandled type {dim.type} for index column named {dim.name}") def _set_reader_coord_by_numeric_slice( self, sr: clib.SOMAArray, dim_idx: int, dim: pa.Field, coord: Slice[Any] diff --git a/apis/python/src/tiledbsoma/soma_array.cc b/apis/python/src/tiledbsoma/soma_array.cc index 2434c0b4ae..44dd028905 100644 --- a/apis/python/src/tiledbsoma/soma_array.cc +++ b/apis/python/src/tiledbsoma/soma_array.cc @@ -5,7 +5,7 @@ * * The MIT License * - * @copyright Copyright (c) 2022 TileDB, Inc. + * @copyright Copyright (c) 2024 TileDB, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -86,7 +86,7 @@ bool get_enum_is_ordered(SOMAArray& sr, std::string attr_name){ } void load_soma_array(py::module &m) { - py::class_(m, "SOMAArray") + py::class_(m, "SOMAArray", "SOMAObject") .def( py::init( [](std::string_view uri, @@ -185,6 +185,23 @@ void load_soma_array(py::module &m) { "column_names"_a = py::none(), "batch_size"_a = "auto", "result_order"_a = ResultOrder::automatic) + + .def("reopen", py::overload_cast>>(&SOMAArray::open)) + .def("close", &SOMAArray::close) + .def_property_readonly("closed", [](SOMAArray& reader) -> bool { + return not reader.is_open(); + }) + .def_property_readonly("mode", [](SOMAArray& reader){ + return reader.mode() == OpenMode::read ? "r" : "w"; + }) + .def_property_readonly("schema", [](SOMAArray& reader) -> py::object { + auto pa = py::module::import("pyarrow"); + auto pa_schema_import = pa.attr("Schema").attr("_import_from_c"); + return pa_schema_import(py::capsule(reader.arrow_schema().get())); + }) + .def("config", [](SOMAArray& reader) -> py::dict { + return py::cast(reader.config()); + }) // After this are short functions expected to be invoked when the coords // are Python list/tuple, or NumPy arrays. Arrow arrays are in this @@ -315,13 +332,13 @@ void load_soma_array(py::module &m) { &SOMAArray::set_dim_points)) .def( - "set_dim_points_float64", + "set_dim_points_double", static_cast&)>( &SOMAArray::set_dim_points)) .def( - "set_dim_points_float32", + "set_dim_points_float", static_cast&)>( &SOMAArray::set_dim_points)) @@ -451,14 +468,14 @@ void load_soma_array(py::module &m) { &SOMAArray::set_dim_ranges)) .def( - "set_dim_ranges_float64", + "set_dim_ranges_double", static_cast>&)>( &SOMAArray::set_dim_ranges)) .def( - "set_dim_ranges_float32", + "set_dim_ranges_float", static_cast>&)>( @@ -501,6 +518,125 @@ void load_soma_array(py::module &m) { .def("get_enum_is_ordered", get_enum_is_ordered) - .def("get_enum_label_on_attr", &SOMAArray::get_enum_label_on_attr); + .def("get_enum_label_on_attr", &SOMAArray::get_enum_label_on_attr) + + .def_property_readonly("timestamp", [](SOMAArray& reader) -> py::object { + if(!reader.timestamp().has_value()) + return py::none(); + return py::cast(reader.timestamp()->second); + }) + + .def("non_empty_domain", [](SOMAArray& reader, std::string name, py::dtype dtype){ + switch (np_to_tdb_dtype(dtype)) { + case TILEDB_UINT64: + return py::cast(reader.non_empty_domain(name)); + case TILEDB_DATETIME_YEAR: + case TILEDB_DATETIME_MONTH: + case TILEDB_DATETIME_WEEK: + case TILEDB_DATETIME_DAY: + case TILEDB_DATETIME_HR: + case TILEDB_DATETIME_MIN: + case TILEDB_DATETIME_SEC: + case TILEDB_DATETIME_MS: + case TILEDB_DATETIME_US: + case TILEDB_DATETIME_NS: + case TILEDB_DATETIME_PS: + case TILEDB_DATETIME_FS: + case TILEDB_DATETIME_AS: + case TILEDB_INT64: + return py::cast(reader.non_empty_domain(name)); + case TILEDB_UINT32: + return py::cast(reader.non_empty_domain(name)); + case TILEDB_INT32: + return py::cast(reader.non_empty_domain(name)); + case TILEDB_UINT16: + return py::cast(reader.non_empty_domain(name)); + case TILEDB_INT16: + return py::cast(reader.non_empty_domain(name)); + case TILEDB_UINT8: + return py::cast(reader.non_empty_domain(name)); + case TILEDB_INT8: + return py::cast(reader.non_empty_domain(name)); + case TILEDB_FLOAT64: + return py::cast(reader.non_empty_domain(name)); + case TILEDB_FLOAT32: + return py::cast(reader.non_empty_domain(name)); + case TILEDB_STRING_UTF8: + case TILEDB_STRING_ASCII: + return py::cast(reader.non_empty_domain_var(name)); + default: + throw TileDBSOMAError("Unsupported dtype for nonempty domain."); + } + }) + .def("domain", [](SOMAArray& reader, std::string name, py::dtype dtype) { + switch (np_to_tdb_dtype(dtype)) { + case TILEDB_UINT64: + return py::cast(reader.domain(name)); + case TILEDB_DATETIME_YEAR: + case TILEDB_DATETIME_MONTH: + case TILEDB_DATETIME_WEEK: + case TILEDB_DATETIME_DAY: + case TILEDB_DATETIME_HR: + case TILEDB_DATETIME_MIN: + case TILEDB_DATETIME_SEC: + case TILEDB_DATETIME_MS: + case TILEDB_DATETIME_US: + case TILEDB_DATETIME_NS: + case TILEDB_DATETIME_PS: + case TILEDB_DATETIME_FS: + case TILEDB_DATETIME_AS: + case TILEDB_INT64: + return py::cast(reader.domain(name)); + case TILEDB_UINT32: + return py::cast(reader.domain(name)); + case TILEDB_INT32: + return py::cast(reader.domain(name)); + case TILEDB_UINT16: + return py::cast(reader.domain(name)); + case TILEDB_INT16: + return py::cast(reader.domain(name)); + case TILEDB_UINT8: + return py::cast(reader.domain(name)); + case TILEDB_INT8: + return py::cast(reader.domain(name)); + case TILEDB_FLOAT64: + return py::cast(reader.domain(name)); + case TILEDB_FLOAT32: + return py::cast(reader.domain(name)); + case TILEDB_STRING_UTF8: + case TILEDB_STRING_ASCII: { + std::pair str_domain; + return py::cast(std::make_pair("", "")); + } + default: + throw TileDBSOMAError("Unsupported dtype for Dimension's domain"); + } + }) + + .def("set_metadata", &SOMAArray::set_metadata) + .def("delete_metadata", &SOMAArray::delete_metadata) + .def("get_metadata", + py::overload_cast(&SOMAArray::get_metadata)) + .def_property_readonly("meta", [](SOMAArray&soma_dataframe) -> py::dict { + py::dict results; + + for (auto const& [key, val] : soma_dataframe.get_metadata()){ + tiledb_datatype_t tdb_type = std::get(val); + uint32_t value_num = std::get(val); + const void *value = std::get(val); + + if(tdb_type == TILEDB_STRING_UTF8){ + results[py::str(key)] = py::str(std::string((const char*)value, value_num)); + }else if(tdb_type == TILEDB_STRING_ASCII){ + results[py::str(key)] = py::bytes(std::string((const char*)value, value_num)); + }else{ + py::dtype value_type = tdb_to_np_dtype(tdb_type, 1); + results[py::str(key)] = py::array(value_type, value_num, value); + } + } + return results; + }) + .def("has_metadata", &SOMAArray::has_metadata) + .def("metadata_num", &SOMAArray::metadata_num); } } // namespace tiledbsoma \ No newline at end of file diff --git a/apis/python/src/tiledbsoma/soma_dataframe.cc b/apis/python/src/tiledbsoma/soma_dataframe.cc index 9a330c6a41..ce431612a9 100644 --- a/apis/python/src/tiledbsoma/soma_dataframe.cc +++ b/apis/python/src/tiledbsoma/soma_dataframe.cc @@ -5,7 +5,7 @@ * * The MIT License * - * @copyright Copyright (c) 2023 TileDB, Inc. + * @copyright Copyright (c) 2024 TileDB, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -47,7 +47,7 @@ using namespace py::literals; using namespace tiledbsoma; void load_soma_dataframe(py::module &m) { - py::class_(m, "SOMADataFrame") + py::class_(m, "SOMADataFrame") .def_static( "open", @@ -67,411 +67,7 @@ void load_soma_dataframe(py::module &m) { "timestamp"_a = py::none()) .def_static("exists", &SOMADataFrame::exists) - .def("reopen", py::overload_cast>>(&SOMADataFrame::open)) - .def("close", &SOMADataFrame::close) - .def_property_readonly("closed", [](SOMADataFrame& soma_df) -> bool { - return not soma_df.is_open(); - }) - .def("reset", &SOMADataFrame::reset) - .def("set_condition", - [](SOMADataFrame& reader, - py::object py_query_condition, - py::object pa_schema){ - auto column_names = reader.column_names(); - // Handle query condition based on - // TileDB-Py::PyQuery::set_attr_cond() - QueryCondition* qc = nullptr; - if (!py_query_condition.is(py::none())) { - py::object init_pyqc = py_query_condition.attr( - "init_query_condition"); - try { - // Column names will be updated with columns present - // in the query condition - auto new_column_names = - init_pyqc(pa_schema, column_names) - .cast>(); - // Update the column_names list if it was not empty, - // otherwise continue selecting all columns with an - // empty column_names list - if (!column_names.empty()) { - column_names = new_column_names; - } - } catch (const std::exception& e) { - throw TileDBSOMAError(e.what()); - } - qc = py_query_condition.attr("c_obj") - .cast() - .ptr() - .get(); - reader.reset(column_names); - - // Release python GIL after we're done accessing python - // objects - py::gil_scoped_release release; - // Set query condition if present - if (qc) { - reader.set_condition(*qc); - } - } - }, - "py_query_condition"_a, - "py_schema"_a) - .def_property_readonly("type", &SOMADataFrame::type) - .def_property_readonly("uri", &SOMADataFrame::uri) - .def_property_readonly("mode", [](SOMADataFrame& soma_df){ - return soma_df.mode() == OpenMode::read ? "r" : "w"; - }) - .def_property_readonly("schema", [](SOMADataFrame& soma_df) -> py::object { - auto pa = py::module::import("pyarrow"); - auto pa_schema_import = pa.attr("Schema").attr("_import_from_c"); - return pa_schema_import(py::capsule(soma_df.schema().get())); - }) - .def("config", [](SOMADataFrame& soma_df) -> py::dict { - return py::cast(soma_df.config()); - }) - .def_property_readonly("timestamp", [](SOMADataFrame& soma_df) -> py::object { - if(!soma_df.timestamp().has_value()) - return py::none(); - return py::cast(soma_df.timestamp()->second); - }) .def_property_readonly("index_column_names", &SOMADataFrame::index_column_names) - .def("non_empty_domain", [](SOMADataFrame& soma_df, std::string name, py::dtype dtype){ - switch (np_to_tdb_dtype(dtype)) { - case TILEDB_UINT64: - return py::cast(soma_df.non_empty_domain(name)); - case TILEDB_DATETIME_YEAR: - case TILEDB_DATETIME_MONTH: - case TILEDB_DATETIME_WEEK: - case TILEDB_DATETIME_DAY: - case TILEDB_DATETIME_HR: - case TILEDB_DATETIME_MIN: - case TILEDB_DATETIME_SEC: - case TILEDB_DATETIME_MS: - case TILEDB_DATETIME_US: - case TILEDB_DATETIME_NS: - case TILEDB_DATETIME_PS: - case TILEDB_DATETIME_FS: - case TILEDB_DATETIME_AS: - case TILEDB_INT64: - return py::cast(soma_df.non_empty_domain(name)); - case TILEDB_UINT32: - return py::cast(soma_df.non_empty_domain(name)); - case TILEDB_INT32: - return py::cast(soma_df.non_empty_domain(name)); - case TILEDB_UINT16: - return py::cast(soma_df.non_empty_domain(name)); - case TILEDB_INT16: - return py::cast(soma_df.non_empty_domain(name)); - case TILEDB_UINT8: - return py::cast(soma_df.non_empty_domain(name)); - case TILEDB_INT8: - return py::cast(soma_df.non_empty_domain(name)); - case TILEDB_FLOAT64: - return py::cast(soma_df.non_empty_domain(name)); - case TILEDB_FLOAT32: - return py::cast(soma_df.non_empty_domain(name)); - case TILEDB_STRING_UTF8: - case TILEDB_STRING_ASCII: - return py::cast(soma_df.non_empty_domain_var(name)); - default: - throw TileDBSOMAError("Unsupported dtype for nonempty domain."); - } - }) - .def("domain", [](SOMADataFrame& soma_df, std::string name, py::dtype dtype) { - switch (np_to_tdb_dtype(dtype)) { - case TILEDB_UINT64: - return py::cast(soma_df.domain(name)); - case TILEDB_DATETIME_YEAR: - case TILEDB_DATETIME_MONTH: - case TILEDB_DATETIME_WEEK: - case TILEDB_DATETIME_DAY: - case TILEDB_DATETIME_HR: - case TILEDB_DATETIME_MIN: - case TILEDB_DATETIME_SEC: - case TILEDB_DATETIME_MS: - case TILEDB_DATETIME_US: - case TILEDB_DATETIME_NS: - case TILEDB_DATETIME_PS: - case TILEDB_DATETIME_FS: - case TILEDB_DATETIME_AS: - case TILEDB_INT64: - return py::cast(soma_df.domain(name)); - case TILEDB_UINT32: - return py::cast(soma_df.domain(name)); - case TILEDB_INT32: - return py::cast(soma_df.domain(name)); - case TILEDB_UINT16: - return py::cast(soma_df.domain(name)); - case TILEDB_INT16: - return py::cast(soma_df.domain(name)); - case TILEDB_UINT8: - return py::cast(soma_df.domain(name)); - case TILEDB_INT8: - return py::cast(soma_df.domain(name)); - case TILEDB_FLOAT64: - return py::cast(soma_df.domain(name)); - case TILEDB_FLOAT32: - return py::cast(soma_df.domain(name)); - case TILEDB_STRING_UTF8: - case TILEDB_STRING_ASCII: { - std::pair str_domain; - return py::cast(std::make_pair("", "")); - } - default: - throw TileDBSOMAError("Unsupported dtype for Dimension's domain"); - } - }) - .def_property_readonly("count", &SOMADataFrame::count) - .def("read_next", [](SOMADataFrame& dataframe){ - // Release GIL when reading data - py::gil_scoped_release release; - auto buffers = dataframe.read_next(); - py::gil_scoped_acquire acquire; - - return to_table(buffers); - }) - .def("set_metadata", &SOMADataFrame::set_metadata) - .def("delete_metadata", &SOMADataFrame::delete_metadata) - .def("get_metadata", - py::overload_cast(&SOMADataFrame::get_metadata)) - .def_property_readonly("meta", [](SOMADataFrame&soma_dataframe) -> py::dict { - py::dict results; - - for (auto const& [key, val] : soma_dataframe.get_metadata()){ - tiledb_datatype_t tdb_type = std::get(val); - uint32_t value_num = std::get(val); - const void *value = std::get(val); - - if(tdb_type == TILEDB_STRING_UTF8){ - results[py::str(key)] = py::str(std::string((const char*)value, value_num)); - }else if(tdb_type == TILEDB_STRING_ASCII){ - results[py::str(key)] = py::bytes(std::string((const char*)value, value_num)); - }else{ - py::dtype value_type = tdb_to_np_dtype(tdb_type, 1); - results[py::str(key)] = py::array(value_type, value_num, value); - } - } - return results; - }) - .def("has_metadata", &SOMADataFrame::has_metadata) - .def("metadata_num", &SOMADataFrame::metadata_num) - .def( - "set_dim_points_arrow", - [](SOMADataFrame& reader, - const std::string& dim, - py::object py_arrow_array, - int partition_index, - int partition_count) { - // Create a list of array chunks - py::list array_chunks; - if (py::hasattr(py_arrow_array, "chunks")) { - array_chunks = py_arrow_array.attr("chunks") - .cast(); - } else { - array_chunks.append(py_arrow_array); - } - - for (const pybind11::handle array : array_chunks) { - ArrowSchema arrow_schema; - ArrowArray arrow_array; - uintptr_t arrow_schema_ptr = (uintptr_t)(&arrow_schema); - uintptr_t arrow_array_ptr = (uintptr_t)(&arrow_array); - - // Call array._export_to_c to get arrow array and schema - // - // If ever a NumPy array gets in here, there will be an - // exception like "AttributeError: 'numpy.ndarray' object - // has no attribute '_export_to_c'". - array.attr("_export_to_c")( - arrow_array_ptr, arrow_schema_ptr); - - auto coords = array.attr("tolist")(); - - if (!strcmp(arrow_schema.format, "l")) { - reader.set_dim_points( - dim, coords.cast>()); - } else if (!strcmp(arrow_schema.format, "i")) { - reader.set_dim_points( - dim, coords.cast>()); - } else if (!strcmp(arrow_schema.format, "s")) { - reader.set_dim_points( - dim, coords.cast>()); - } else if (!strcmp(arrow_schema.format, "c")) { - reader.set_dim_points( - dim, coords.cast>()); - } else if (!strcmp(arrow_schema.format, "L")) { - reader.set_dim_points( - dim, coords.cast>()); - } else if (!strcmp(arrow_schema.format, "I")) { - reader.set_dim_points( - dim, coords.cast>()); - } else if (!strcmp(arrow_schema.format, "S")) { - reader.set_dim_points( - dim, coords.cast>()); - } else if (!strcmp(arrow_schema.format, "C")) { - reader.set_dim_points( - dim, coords.cast>()); - } else if (!strcmp(arrow_schema.format, "f")) { - reader.set_dim_points( - dim, coords.cast>()); - } else if (!strcmp(arrow_schema.format, "g")) { - reader.set_dim_points( - dim, coords.cast>()); - } else if ( - !strcmp(arrow_schema.format, "u") || - !strcmp(arrow_schema.format, "z")) { - reader.set_dim_points( - dim, coords.cast>()); - } else if ( - !strcmp(arrow_schema.format, "tss:") || - !strcmp(arrow_schema.format, "tsm:") || - !strcmp(arrow_schema.format, "tsu:") || - !strcmp(arrow_schema.format, "tsn:")) { - // convert the Arrow Array to int64 - auto pa = py::module::import("pyarrow"); - coords = array.attr("cast")(pa.attr("int64")()).attr("tolist")(); - reader.set_dim_points( - dim, coords.cast>()); - } else if ( - !strcmp(arrow_schema.format, "U") || - !strcmp(arrow_schema.format, "Z")) { - reader.set_dim_points( - dim, coords.cast>()); - } else { - throw TileDBSOMAError( - "[pytiledbsoma] set_dim_points: type={} not " - "supported" + - std::string(arrow_schema.format)); - } - - // Release arrow schema - arrow_schema.release(&arrow_schema); - } - }, - "dim"_a, - "py_arrow_array"_a, - "partition_index"_a = 0, - "partition_count"_a = 1) - .def( - "set_dim_points_string_or_bytes", - static_cast&)>( - &SOMADataFrame::set_dim_points)) - .def( - "set_dim_points_double", - static_cast&)>( - &SOMADataFrame::set_dim_points)) - .def( - "set_dim_points_float", - static_cast&)>( - &SOMADataFrame::set_dim_points)) - .def( - "set_dim_points_int64", - static_cast&)>( - &SOMADataFrame::set_dim_points)) - .def( - "set_dim_points_int32", - static_cast&)>( - &SOMADataFrame::set_dim_points)) - .def( - "set_dim_points_int16", - static_cast&)>( - &SOMADataFrame::set_dim_points)) - .def( - "set_dim_points_int8", - static_cast&)>( - &SOMADataFrame::set_dim_points)) - .def( - "set_dim_points_uint64", - static_cast&)>( - &SOMADataFrame::set_dim_points)) - .def( - "set_dim_points_uint32", - static_cast&)>( - &SOMADataFrame::set_dim_points)) - .def( - "set_dim_points_uint16", - static_cast&)>( - &SOMADataFrame::set_dim_points)) - .def( - "set_dim_points_uint8", - static_cast&)>( - &SOMADataFrame::set_dim_points)) - .def( - "set_dim_ranges_string_or_bytes", - static_cast>&)>( - &SOMADataFrame::set_dim_ranges)) - .def( - "set_dim_ranges_int64", - static_cast>&)>( - &SOMADataFrame::set_dim_ranges)) - .def( - "set_dim_ranges_int32", - static_cast>&)>( - &SOMADataFrame::set_dim_ranges)) - .def( - "set_dim_ranges_int16", - static_cast>&)>( - &SOMADataFrame::set_dim_ranges)) - .def( - "set_dim_ranges_int8", - static_cast>&)>( - &SOMADataFrame::set_dim_ranges)) - .def( - "set_dim_ranges_uint64", - static_cast>&)>( - &SOMADataFrame::set_dim_ranges)) - .def( - "set_dim_ranges_uint32", - static_cast>&)>( - &SOMADataFrame::set_dim_ranges)) - .def( - "set_dim_ranges_uint16", - static_cast>&)>( - &SOMADataFrame::set_dim_ranges)) - .def( - "set_dim_ranges_uint8", - static_cast>&)>( - &SOMADataFrame::set_dim_ranges)) - .def( - "set_dim_ranges_double", - static_cast>&)>( - &SOMADataFrame::set_dim_ranges)) - .def( - "set_dim_ranges_float", - static_cast>&)>( - &SOMADataFrame::set_dim_ranges)); - } + .def_property_readonly("count", &SOMADataFrame::count); +} } \ No newline at end of file diff --git a/apis/python/src/tiledbsoma/soma_object.cc b/apis/python/src/tiledbsoma/soma_object.cc index 6192961817..328de38024 100644 --- a/apis/python/src/tiledbsoma/soma_object.cc +++ b/apis/python/src/tiledbsoma/soma_object.cc @@ -5,7 +5,7 @@ * * The MIT License * - * @copyright Copyright (c) 2023 TileDB, Inc. + * @copyright Copyright (c) 2024 TileDB, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -64,7 +64,9 @@ void load_soma_object(py::module &m) { } catch(...){ TPY_ERROR_LOC("SOMAObject not handled in Python API yet."); - } - }); -} + }}) + + .def_property_readonly("type", &SOMAObject::type); + }; } +