diff --git a/apis/python/setup.py b/apis/python/setup.py index 00504d2ced..ece776b0bb 100644 --- a/apis/python/setup.py +++ b/apis/python/setup.py @@ -254,7 +254,7 @@ def run(self): CXX_FLAGS.append(f'-Wl,-rpath,{str(tiledb_dir / "lib")}') if sys.platform == "darwin": - CXX_FLAGS.append("-mmacosx-version-min=11.0") + CXX_FLAGS.append("-mmacosx-version-min=13.3") if os.name == "posix" and sys.platform != "darwin": LIB_DIRS.append(str(tiledbsoma_dir / "lib" / "x86_64-linux-gnu")) diff --git a/libtiledbsoma/src/CMakeLists.txt b/libtiledbsoma/src/CMakeLists.txt index c896a8b018..834be63bd1 100644 --- a/libtiledbsoma/src/CMakeLists.txt +++ b/libtiledbsoma/src/CMakeLists.txt @@ -33,6 +33,8 @@ add_library(TILEDB_SOMA_OBJECTS OBJECT ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_array.cc ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_group.cc ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_object.cc + ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_column.cc + ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_dimension.cc ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_collection.cc ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_experiment.cc ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_measurement.cc @@ -206,6 +208,8 @@ install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/soma/column_buffer.h ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_array.h ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_group.h + ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_column.h + ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_dimension.h ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_collection.h ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_dataframe.h ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_dense_ndarray.h diff --git a/libtiledbsoma/src/soma/enums.h b/libtiledbsoma/src/soma/enums.h index bf804110fd..209d827c87 100644 --- a/libtiledbsoma/src/soma/enums.h +++ b/libtiledbsoma/src/soma/enums.h @@ -43,4 +43,18 @@ enum class ResultOrder { automatic = 0, rowmajor, colmajor }; /** Defines whether the SOMAGroup URI is absolute or relative */ enum class URIType { automatic = 0, absolute, relative }; +typedef enum { + SOMA_COLUMN_DIMENSION 
= 0, + SOMA_COLUMN_ATTRIBUTE = 1, + SOMA_COLUMN_GEOMETRY = 2 +} soma_column_datatype_t; + +// This enables some code deduplication between core domain, core current +// domain, and core non-empty domain. +enum class Domainish { + kind_core_domain = 0, + kind_core_current_domain = 1, + kind_non_empty_domain = 2 +}; + #endif // SOMA_ENUMS \ No newline at end of file diff --git a/libtiledbsoma/src/soma/soma_array.h b/libtiledbsoma/src/soma/soma_array.h index 8acb504594..d143f594db 100644 --- a/libtiledbsoma/src/soma/soma_array.h +++ b/libtiledbsoma/src/soma/soma_array.h @@ -90,14 +90,6 @@ using namespace tiledb; using StatusAndReason = std::pair; -// This enables some code deduplication between core domain, core current -// domain, and core non-empty domain. -enum class Domainish { - kind_core_domain = 0, - kind_core_current_domain = 1, - kind_non_empty_domain = 2 -}; - class SOMAArray : public SOMAObject { public: friend class ManagedQuery; diff --git a/libtiledbsoma/src/soma/soma_column.cc b/libtiledbsoma/src/soma/soma_column.cc new file mode 100644 index 0000000000..d55888ff79 --- /dev/null +++ b/libtiledbsoma/src/soma/soma_column.cc @@ -0,0 +1,81 @@ +/** + * @file soma_column.cc + * + * @section LICENSE + * + * The MIT License + * + * @copyright Copyright (c) 2024 TileDB, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * @section DESCRIPTION + * + * This file defines the SOMAColumn class. + */ + +#include "soma_column.h" + +namespace tiledbsoma { + +template <> +std::pair SOMAColumn::core_domain_slot() + const { + return std::pair("", ""); +} + +template <> +std::pair +SOMAColumn::core_current_domain_slot( + const SOMAContext& ctx, Array& array) const { + // Here is an intersection of a few oddities: + // + // * Core domain for string dims must be a nullptr pair; it cannot + // be + // anything else. + // * TileDB-Py shows this by using an empty-string pair, which we + // imitate. + // * Core current domain for string dims must _not_ be a nullptr + // pair. + // * In TileDB-SOMA, unless the user specifies otherwise, we use "" + // for + // min and "\x7f" for max. (We could use "\xff" but that causes + // display problems in Python.) + // + // To work with all these factors, if the current domain is the + // default + // "" to "\x7f", return an empty-string pair just as we do for + // domain. (There was some pre-1.15 software using "\xff" and it's + // super-cheap to check for that as well.) 
+ try { + std::pair + current_domain = std::any_cast>( + _core_current_domain_slot(ctx, array)); + + if (current_domain.first == "" && (current_domain.second == "\x7f" || + current_domain.second == "\xff")) { + return std::pair("", ""); + } + + return current_domain; + } catch (const std::exception& e) { + throw TileDBSOMAError(e.what()); + } +} +} // namespace tiledbsoma \ No newline at end of file diff --git a/libtiledbsoma/src/soma/soma_column.h b/libtiledbsoma/src/soma/soma_column.h new file mode 100644 index 0000000000..4b2f513819 --- /dev/null +++ b/libtiledbsoma/src/soma/soma_column.h @@ -0,0 +1,515 @@ +/** + * @file soma_column.h + * + * @section LICENSE + * + * The MIT License + * + * @copyright Copyright (c) 2024 TileDB, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * @section DESCRIPTION + * + * This file defines the SOMAColumn class. 
SOMAColumn is an abstraction over + * TileDB dimensions, attributes and combinations of them. It is designed to add + * indexing capabilities to any datatype utilizing native TileDB dimensions + * without exposing the internal indexing to the end user. + */ + +#ifndef SOMA_COLUMN_H +#define SOMA_COLUMN_H + +#include +#include +#include +#include +#include +#include + +#include "enums.h" +#include "managed_query.h" +#include "nanoarrow/nanoarrow.hpp" +#include "utils/common.h" + +namespace tiledbsoma { +using namespace tiledb; + +class SOMAColumn { + public: + //=================================================================== + //= public non-static + //=================================================================== + SOMAColumn() = default; + SOMAColumn(const SOMAColumn&) = default; + SOMAColumn(SOMAColumn&&) = default; + SOMAColumn& operator=(const SOMAColumn&) = default; + SOMAColumn& operator=(SOMAColumn&&) = default; + + virtual ~SOMAColumn() = default; + + /** + * Get the SOMAColumn name as defined in schema. + */ + virtual std::string name() const = 0; + + /** + * If true, this column is used as index. + * + * @remark SOMAColumns used as indexes should define at least one TileDB + * Dimension + */ + virtual bool isIndexColumn() const = 0; + + /** + * Get the TileDB Dimensions defined by the SOMAColumn object, if any. + */ + virtual std::optional> tiledb_dimensions() = 0; + + /** + * Get the TileDB Attributes defined by the SOMAColumn object, if any. + */ + virtual std::optional> tiledb_attributes() = 0; + + /** + * Get the TileDB Enumerations used by the SOMAColumn object, if any. + */ + virtual std::optional> tiledb_enumerations() = 0; + + /** + * Get the SOMAColumn type. Each subclass should define its own type. + */ + virtual soma_column_datatype_t type() const = 0; + + /** + * Get the datatype of the TileDB Dimensions if any. All dimensions must + * have the same type. 
+ */ + virtual std::optional domain_type() const = 0; + + /** + * Get the datatype of the TileDB Attributes if any. All attributes must + * have the same type. + */ + virtual std::optional data_type() const = 0; + + /** + * @brief Select column names to query (dim and attr). If the + * `if_not_empty` parameter is `true`, the column will be selected iff the + * list of selected columns is not empty. This prevents a `select_columns` call + * from changing an empty list (all columns) to a subset of columns. + * + * @param query the ManagedQuery object to modify + * @param if_not_empty Prevent changing an "empty" selection of all columns + */ + virtual void select_columns( + const std::unique_ptr& query, + bool if_not_empty = false) const = 0; + + /** + * Get the domain kind of the SOMAColumn as an ArrowArray for use with + * R/Python API. + * + * @param ctx + * @param array + * @param which_kind + */ + virtual ArrowArray* arrow_domain_slot( + const SOMAContext& ctx, + Array& array, + enum Domainish which_kind) const = 0; + + /** + * Get the SOMAColumn encoded as an ArrowSchema for use with R/Python API. + * + * @param ctx + * @param array + */ + virtual ArrowSchema* arrow_schema_slot( + const SOMAContext& ctx, Array& array) = 0; + + /** + * Get the domain kind of the SOMAColumn. + * + * @tparam T + * @param ctx + * @param array + * @param which_kind + */ + template + std::pair domain_slot( + const SOMAContext& ctx, Array& array, enum Domainish which_kind) const { + switch (which_kind) { + case Domainish::kind_core_domain: + return core_domain_slot(); + case Domainish::kind_core_current_domain: + return core_current_domain_slot(ctx, array); + case Domainish::kind_non_empty_domain: + return non_empty_domain_slot(array); + default: + throw std::runtime_error( + "internal coding error in SOMAColumn::domain_slot: " + "unknown kind"); + } + } + + /** + * Set the current domain of this SOMAColumn. 
+ * + * @tparam T + * @param rectangle The current domain rectangle to modify. + * @param domain A vector of the n-dimensional domain in the form + * [dim_0_min, dim_1_min, ..., dim_n_max] + */ + template + void set_current_domain_slot( + NDRectangle& rectangle, const std::vector& domain) const { + if (!isIndexColumn()) { + throw TileDBSOMAError(std::format( + "[SOMAColumn][set_current_domain_slot] Column with name {} is " + "not an index column", + name())); + } + + if (domain.size() % 2 != 0) { + throw TileDBSOMAError(std::format( + "[SOMAColumn][set_current_domain_slot] Provided domain for " + "column {} has missing values", + name())); + } + + std::vector transformed_domain; + size_t dim_count = domain.size() / 2; + for (size_t i = 0; i < dim_count; ++i) { + transformed_domain.push_back(std::make_any>( + std::array({domain[i], domain[i + dim_count]}))); + } + + try { + _set_current_domain_slot(rectangle, transformed_domain); + } catch (const std::exception& e) { + throw TileDBSOMAError(std::format( + "[SOMAColumn][set_current_domain_slot] Failed on \"{}\" with " + "error \"{}\"", + name(), + e.what())); + } + } + + /** + * Set the multi-type current domain of this SOMAColumn. + * + * @tparam T + * @param rectangle The current domain rectangle to modify. 
+ * @param domain A vector holding std::arrays with 2 elements each [min, + * max], cast to std::any + */ + void set_current_domain_slot( + NDRectangle& rectangle, const std::vector& domain) const { + if (!isIndexColumn()) { + throw TileDBSOMAError(std::format( + "[SOMAColumn][set_current_domain_slot] Column with name {} is " + "not an index column", + name())); + } + + try { + _set_current_domain_slot(rectangle, domain); + } catch (const std::exception& e) { + throw TileDBSOMAError(std::format( + "[SOMAColumn][set_current_domain_slot] Failed on \"{}\" with " + "error \"{}\"", + name(), + e.what())); + } + } + + /** + * Test if the multi-type current domain of this SOMAColumn can be set with + * the supplied new current domain. + * + * @tparam T + * @param rectangle The current domain rectangle to check against, if any. + * @param domain A vector holding std::arrays with 2 elements each [min, + * max], cast to std::any + */ + std::pair can_set_current_domain_slot( + std::optional& rectangle, + const std::vector& domain) const { + if (!isIndexColumn()) { + throw TileDBSOMAError(std::format( + "[SOMAColumn][can_set_current_domain_slot] Column with name {} " + "is not an index column", + name())); + } + + try { + return _can_set_current_domain_slot(rectangle, domain); + } catch (const std::exception& e) { + throw TileDBSOMAError(std::format( + "[SOMAColumn][can_set_current_domain_slot] Failed on \"{}\" " + "with error \"{}\"", + name(), + e.what())); + } + } + + /** + * @brief Set the dimension slice using one point + * + * @note Partitioning is not supported + * + * @tparam T + * @param query + * @param ctx + * @param point + */ + template + void set_dim_point( + const std::unique_ptr& query, + const SOMAContext& ctx, + const T& point) const { + if (!isIndexColumn()) { + throw TileDBSOMAError(std::format( + "[SOMAColumn] Column with name {} is not an index column", + name())); + } + + T points[] = {point}; + + try { + this->_set_dim_points( + query, + ctx, 
std::make_any>(std::span(points))); + } catch (const std::exception& e) { + throw TileDBSOMAError(std::format( + "[SOMAColumn][set_dim_point] Failed on \"{}\" with error " + "\"{}\"", + name(), + e.what())); + } + } + + /** + * @brief Set the dimension slice using multiple points + * + * @note Partitioning is not supported + * + * @tparam T + * @param query + * @param ctx + * @param points + */ + template + void set_dim_points( + const std::unique_ptr& query, + const SOMAContext& ctx, + std::span points) const { + if (!isIndexColumn()) { + throw TileDBSOMAError(std::format( + "[SOMAColumn] Column with name {} is not an index column", + name())); + } + + try { + this->_set_dim_points( + query, ctx, std::make_any>(points)); + } catch (const std::exception& e) { + throw TileDBSOMAError(std::format( + "[SOMAColumn][set_dim_points] Failed on \"{}\" with error " + "\"{}\"", + name(), + e.what())); + } + } + + /** + * @brief Set the dimension slice using multiple ranges + * + * @note Partitioning is not supported + * + * @tparam T + * @param query + * @param ranges + */ + template + void set_dim_ranges( + const std::unique_ptr& query, + const SOMAContext& ctx, + const std::vector>& ranges) const { + if (!isIndexColumn()) { + throw TileDBSOMAError(std::format( + "[SOMAColumn] Column with name {} is not an index column", + name())); + } + + try { + this->_set_dim_ranges( + query, + ctx, + std::make_any>>(ranges)); + } catch (const std::exception& e) { + throw TileDBSOMAError(std::format( + "[SOMAColumn][set_dim_ranges] Failed on \"{}\" with error " + "\"{}\"", + name(), + e.what())); + } + } + + /** + * Returns the core domain of this column. 
+ * + * o For arrays with core current-domain support: + * - soma domain is core current domain + * - soma maxdomain is core domain + * o For arrays without core current-domain support: + * - soma domain is core domain + * - soma maxdomain is core domain + * - core current domain is not accessed at the soma level + * + * @tparam T Domain datatype + * @return Pair of [lower, upper] inclusive bounds. + */ + template + std::pair core_domain_slot() const { + try { + return std::any_cast>(_core_domain_slot()); + } catch (const std::exception& e) { + throw TileDBSOMAError(std::format( + "[SOMAColumn][core_domain_slot] Failed on \"{}\" with error " + "\"{}\"", + name(), + e.what())); + } + } + + /** + * Retrieves the non-empty domain from the array. This is the union of the + * non-empty domains of the array fragments. Returns (0, 0) for empty + * domains. + */ + template + std::pair non_empty_domain_slot(Array& array) const { + try { + return std::any_cast>( + _non_empty_domain_slot(array)); + } catch (const std::exception& e) { + throw TileDBSOMAError(std::format( + "[SOMAColumn][non_empty_domain_slot] Failed on \"{}\" with " + "error \"{}\"", + name(), + e.what())); + } + } + + /** + * Returns the core current domain of this column. + * + * o For arrays with core current-domain support: + * - soma domain is core current domain + * - soma maxdomain is core domain + * o For arrays without core current-domain support: + * - soma domain is core domain + * - soma maxdomain is core domain + * - core current domain is not accessed at the soma level + * + * @tparam T Domain datatype + * @return Pair of [lower, upper] inclusive bounds. 
+ */ + template + std::pair core_current_domain_slot( + const SOMAContext& ctx, Array& array) const { + try { + return std::any_cast>( + _core_current_domain_slot(ctx, array)); + } catch (const std::exception& e) { + throw TileDBSOMAError(std::format( + "[SOMAColumn][core_current_domain_slot] Failed on \"{}\" with " + "error \"{}\"", + name(), + e.what())); + } + } + + /** + * Returns the core current domain of this column from the supplied + * NDRectangle. + * + * o For arrays with core current-domain support: + * - soma domain is core current domain + * - soma maxdomain is core domain + * o For arrays without core current-domain support: + * - soma domain is core domain + * - soma maxdomain is core domain + * - core current domain is not accessed at the soma level + * + * @tparam T Domain datatype + * @return Pair of [lower, upper] inclusive bounds. + */ + template + std::pair core_current_domain_slot(NDRectangle& ndrect) const { + try { + return std::any_cast>( + _core_current_domain_slot(ndrect)); + } catch (const std::exception& e) { + throw TileDBSOMAError(e.what()); + } + } + + protected: + virtual void _set_dim_points( + const std::unique_ptr& query, + const SOMAContext& ctx, + const std::any& points) const = 0; + + virtual void _set_dim_ranges( + const std::unique_ptr& query, + const SOMAContext& ctx, + const std::any& ranges) const = 0; + + virtual void _set_current_domain_slot( + NDRectangle& rectangle, std::span domain) const = 0; + + virtual std::pair _can_set_current_domain_slot( + std::optional& rectangle, + std::span new_domain) const = 0; + + virtual std::any _core_domain_slot() const = 0; + + virtual std::any _non_empty_domain_slot(Array& array) const = 0; + + virtual std::any _core_current_domain_slot( + const SOMAContext& ctx, Array& array) const = 0; + + virtual std::any _core_current_domain_slot(NDRectangle& ndrect) const = 0; +}; + +template <> +std::pair SOMAColumn::core_domain_slot() + const; + +template <> +std::pair 
+SOMAColumn::core_current_domain_slot( + const SOMAContext& ctx, Array& array) const; + +} // namespace tiledbsoma +#endif \ No newline at end of file diff --git a/libtiledbsoma/src/soma/soma_dimension.cc b/libtiledbsoma/src/soma/soma_dimension.cc new file mode 100644 index 0000000000..7f667bf323 --- /dev/null +++ b/libtiledbsoma/src/soma/soma_dimension.cc @@ -0,0 +1,767 @@ +/** + * @file soma_dimension.cc + * + * @section LICENSE + * + * The MIT License + * + * @copyright Copyright (c) 2024 TileDB, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * @section DESCRIPTION + * + * This file defines the SOMADimension class. 
+ */ + +#include "soma_dimension.h" +#include "utils/arrow_adapter.h" + +namespace tiledbsoma { + +std::shared_ptr SOMADimension::create( + std::shared_ptr ctx, + ArrowSchema* schema, + ArrowArray* array, + const std::string& soma_type, + std::string_view type_metadata, + PlatformConfig platform_config) { + auto dimension = ArrowAdapter::tiledb_dimension_from_arrow_schema( + ctx, schema, array, soma_type, type_metadata, "", "", platform_config); + + return std::make_shared(SOMADimension(dimension)); +} + +void SOMADimension::_set_dim_points( + const std::unique_ptr& query, + const SOMAContext&, + const std::any& points) const { + switch (dimension.type()) { + case TILEDB_UINT8: + query->select_points( + dimension.name(), + std::any_cast>(points)); + break; + case TILEDB_UINT16: + query->select_points( + dimension.name(), + std::any_cast>(points)); + break; + case TILEDB_UINT32: + query->select_points( + dimension.name(), + std::any_cast>(points)); + break; + case TILEDB_UINT64: + query->select_points( + dimension.name(), + std::any_cast>(points)); + break; + case TILEDB_INT8: + query->select_points( + dimension.name(), + std::any_cast>(points)); + break; + case TILEDB_INT16: + query->select_points( + dimension.name(), + std::any_cast>(points)); + break; + case TILEDB_INT32: + query->select_points( + dimension.name(), + std::any_cast>(points)); + break; + case TILEDB_DATETIME_YEAR: + case TILEDB_DATETIME_MONTH: + case TILEDB_DATETIME_WEEK: + case TILEDB_DATETIME_DAY: + case TILEDB_DATETIME_HR: + case TILEDB_DATETIME_MIN: + case TILEDB_DATETIME_SEC: + case TILEDB_DATETIME_MS: + case TILEDB_DATETIME_US: + case TILEDB_DATETIME_NS: + case TILEDB_DATETIME_PS: + case TILEDB_DATETIME_FS: + case TILEDB_DATETIME_AS: + case TILEDB_INT64: + query->select_points( + dimension.name(), + std::any_cast>(points)); + break; + case TILEDB_FLOAT32: + query->select_points( + dimension.name(), + std::any_cast>(points)); + break; + case TILEDB_FLOAT64: + query->select_points( + 
dimension.name(), + std::any_cast>(points)); + break; + case TILEDB_STRING_UTF8: + case TILEDB_STRING_ASCII: + case TILEDB_CHAR: + case TILEDB_BLOB: + query->select_points( + dimension.name(), + std::any_cast>(points)); + break; + default: + throw TileDBSOMAError(std::format( + "[SOMADimension] Unknown dimension type {}", + impl::type_to_str(dimension.type()))); + } +} + +void SOMADimension::_set_dim_ranges( + const std::unique_ptr& query, + const SOMAContext&, + const std::any& ranges) const { + switch (dimension.type()) { + case TILEDB_UINT8: + query->select_ranges( + dimension.name(), + std::any_cast>>( + ranges)); + break; + case TILEDB_UINT16: + query->select_ranges( + dimension.name(), + std::any_cast>>( + ranges)); + break; + case TILEDB_UINT32: + query->select_ranges( + dimension.name(), + std::any_cast>>( + ranges)); + break; + case TILEDB_UINT64: + query->select_ranges( + dimension.name(), + std::any_cast>>( + ranges)); + break; + case TILEDB_INT8: + query->select_ranges( + dimension.name(), + std::any_cast>>(ranges)); + break; + case TILEDB_INT16: + query->select_ranges( + dimension.name(), + std::any_cast>>( + ranges)); + break; + case TILEDB_INT32: + query->select_ranges( + dimension.name(), + std::any_cast>>( + ranges)); + break; + case TILEDB_DATETIME_YEAR: + case TILEDB_DATETIME_MONTH: + case TILEDB_DATETIME_WEEK: + case TILEDB_DATETIME_DAY: + case TILEDB_DATETIME_HR: + case TILEDB_DATETIME_MIN: + case TILEDB_DATETIME_SEC: + case TILEDB_DATETIME_MS: + case TILEDB_DATETIME_US: + case TILEDB_DATETIME_NS: + case TILEDB_DATETIME_PS: + case TILEDB_DATETIME_FS: + case TILEDB_DATETIME_AS: + case TILEDB_INT64: + query->select_ranges( + dimension.name(), + std::any_cast>>( + ranges)); + break; + case TILEDB_FLOAT32: + query->select_ranges( + dimension.name(), + std::any_cast>>( + ranges)); + break; + case TILEDB_FLOAT64: + query->select_ranges( + dimension.name(), + std::any_cast>>( + ranges)); + break; + case TILEDB_STRING_UTF8: + case TILEDB_STRING_ASCII: 
+ case TILEDB_CHAR: + case TILEDB_BLOB: + case TILEDB_GEOM_WKT: + case TILEDB_GEOM_WKB: + query->select_ranges( + dimension.name(), + std::any_cast>>( + ranges)); + break; + default: + throw TileDBSOMAError(std::format( + "[SOMADimension] Unknown dimension type {}", + impl::type_to_str(dimension.type()))); + } +} + +void SOMADimension::_set_current_domain_slot( + NDRectangle& rectangle, std::span domain) const { + if (domain.size() != 1) { + throw TileDBSOMAError(std::format( + "[SOMADimension][_set_current_domain_slot] Invalid domain size. " + "Expected 1, got {}", + domain.size())); + } + + switch (dimension.type()) { + case TILEDB_UINT8: { + auto dom = std::any_cast>(domain[0]); + rectangle.set_range(dimension.name(), dom[0], dom[1]); + } break; + case TILEDB_UINT16: { + auto dom = std::any_cast>(domain[0]); + rectangle.set_range(dimension.name(), dom[0], dom[1]); + } break; + case TILEDB_UINT32: { + auto dom = std::any_cast>(domain[0]); + rectangle.set_range(dimension.name(), dom[0], dom[1]); + } break; + case TILEDB_UINT64: { + auto dom = std::any_cast>(domain[0]); + rectangle.set_range(dimension.name(), dom[0], dom[1]); + } break; + case TILEDB_INT8: { + auto dom = std::any_cast>(domain[0]); + rectangle.set_range(dimension.name(), dom[0], dom[1]); + } break; + case TILEDB_INT16: { + auto dom = std::any_cast>(domain[0]); + rectangle.set_range(dimension.name(), dom[0], dom[1]); + } break; + case TILEDB_INT32: { + auto dom = std::any_cast>(domain[0]); + rectangle.set_range(dimension.name(), dom[0], dom[1]); + } break; + case TILEDB_DATETIME_YEAR: + case TILEDB_DATETIME_MONTH: + case TILEDB_DATETIME_WEEK: + case TILEDB_DATETIME_DAY: + case TILEDB_DATETIME_HR: + case TILEDB_DATETIME_MIN: + case TILEDB_DATETIME_SEC: + case TILEDB_DATETIME_MS: + case TILEDB_DATETIME_US: + case TILEDB_DATETIME_NS: + case TILEDB_DATETIME_PS: + case TILEDB_DATETIME_FS: + case TILEDB_DATETIME_AS: + case TILEDB_INT64: { + auto dom = std::any_cast>(domain[0]); + 
+ rectangle.set_range(dimension.name(), dom[0], dom[1]); + } break; + case TILEDB_FLOAT32: { + auto dom = std::any_cast>(domain[0]); + rectangle.set_range(dimension.name(), dom[0], dom[1]); + } break; + case TILEDB_FLOAT64: { + auto dom = std::any_cast>(domain[0]); + rectangle.set_range(dimension.name(), dom[0], dom[1]); + } break; + case TILEDB_STRING_ASCII: + case TILEDB_STRING_UTF8: + case TILEDB_CHAR: + case TILEDB_BLOB: + case TILEDB_GEOM_WKT: + case TILEDB_GEOM_WKB: { + // Here is an intersection of a few oddities: + // + // * Core domain for string dims must be a nullptr pair; it cannot + // be + // anything else. + // * TileDB-Py shows this by using an empty-string pair, which we + // imitate. + // * Core current domain for string dims must _not_ be a nullptr + // pair. + // * In TileDB-SOMA, unless the user specifies otherwise, we use "" + // for + // min and "\x7f" for max. (We could use "\xff" but that causes + // display problems in Python.) + // + // To work with all these factors, if the current domain is the + // default + // "" to "\x7f", return an empty-string pair just as we do for + // domain. (There was some pre-1.15 software using "\xff" and it's + // super-cheap to check for that as well.) 
+            auto dom = std::any_cast>(domain[0]);
+            if (dom[0] == "" && dom[1] == "") {
+                rectangle.set_range(dimension.name(), "", "\x7f");
+            } else {
+                throw TileDBSOMAError(std::format(
+                    "[SOMADimension][_set_current_domain_slot] domain (\"{}\", "
+                    "\"{}\") cannot be set for "
+                    "string index columns: please use "
+                    "(\"\", \"\")",
+                    dom[0],
+                    dom[1]));
+            }
+
+        } break;
+        default:
+            throw TileDBSOMAError(std::format(
+                "[SOMADimension][_set_current_domain_slot] Unknown datatype {}",
+                tiledb::impl::type_to_str(dimension.type())));
+    }
+}
+
+/**
+ * Checks whether `new_domain` would be acceptable as the current domain of
+ * this dimension. Nothing is modified.
+ *
+ * NOTE(review): template arguments (on std::pair, std::optional, std::span,
+ * std::array, std::any_cast, ...) appear to be missing in this copy of the
+ * patch -- confirm against the upstream file before building.
+ *
+ * @param rectangle When it holds a value, the new range is checked against
+ * this rectangle's range for the dimension and must contain it (shrinking is
+ * rejected). When empty, the new range is checked against
+ * `_core_domain_slot()` and must lie within it.
+ * @param new_domain Must hold exactly one element: the (lower, upper) pair
+ * for this dimension, stored in a std::any.
+ * @return (true, "") when the domain is acceptable, otherwise (false, reason).
+ * For string-like dimension types only ("", "") is acceptable.
+ * @throws TileDBSOMAError if `new_domain.size() != 1` or the dimension type
+ * is not handled by the switch below.
+ */
+std::pair SOMADimension::_can_set_current_domain_slot(
+    std::optional& rectangle,
+    std::span new_domain) const {
+    if (new_domain.size() != 1) {
+        throw TileDBSOMAError(std::format(
+            "[SOMADimension][_can_set_current_domain_slot] Expected domain "
+            "size is 1, found {}",
+            new_domain.size()));
+    }
+
+    // Shared range check, invoked once per numeric/datetime case below.
+    auto comparator =
+        [&](
+            const std::array& new_dom) -> std::pair {
+        if (new_dom[0] > new_dom[1]) {
+            return std::pair(
+                false,
+                std::format(
+                    "index-column name {}: new lower > new upper",
+                    dimension.name()));
+        }
+
+        // If we're checking against the core current domain: the user-provided
+        // domain must contain the core current domain.
+        //
+        // If we're checking against the core (max) domain: the user-provided
+        // domain must be contained within the core (max) domain.
+
+        if (rectangle.has_value()) {
+            auto dom = rectangle.value().range(dimension.name());
+
+            if (new_dom[0] > dom[0]) {
+                return std::pair(
+                    false,
+                    std::format(
+                        "index-column name {}: new lower > old lower (downsize "
+                        "is unsupported)",
+                        dimension.name()));
+            }
+            if (new_dom[1] < dom[1]) {
+                return std::pair(
+                    false,
+                    std::format(
+                        "index-column name {}: new upper < old upper (downsize "
+                        "is unsupported)",
+                        dimension.name()));
+            }
+        } else {
+            auto dom = std::any_cast>(_core_domain_slot());
+
+            if (new_dom[0] < dom.first) {
+                return std::pair(
+                    false,
+                    std::format(
+                        "index-column name {}: new lower < limit lower",
+                        dimension.name()));
+            }
+            if (new_dom[1] > dom.second) {
+                return std::pair(
+                    false,
+                    std::format(
+                        "index-column name {}: new upper > limit upper",
+                        dimension.name()));
+            }
+        }
+
+        return std::pair(true, "");
+    };
+
+    switch (dimension.type()) {
+        case TILEDB_UINT8:
+            return comparator(
+                std::any_cast>(new_domain[0]));
+        case TILEDB_UINT16:
+            return comparator(
+                std::any_cast>(new_domain[0]));
+        case TILEDB_UINT32:
+            return comparator(
+                std::any_cast>(new_domain[0]));
+        case TILEDB_UINT64:
+            return comparator(
+                std::any_cast>(new_domain[0]));
+        case TILEDB_INT8:
+            return comparator(
+                std::any_cast>(new_domain[0]));
+        case TILEDB_INT16:
+            return comparator(
+                std::any_cast>(new_domain[0]));
+        case TILEDB_INT32:
+            return comparator(
+                std::any_cast>(new_domain[0]));
+        // All datetime types share the TILEDB_INT64 branch.
+        case TILEDB_DATETIME_YEAR:
+        case TILEDB_DATETIME_MONTH:
+        case TILEDB_DATETIME_WEEK:
+        case TILEDB_DATETIME_DAY:
+        case TILEDB_DATETIME_HR:
+        case TILEDB_DATETIME_MIN:
+        case TILEDB_DATETIME_SEC:
+        case TILEDB_DATETIME_MS:
+        case TILEDB_DATETIME_US:
+        case TILEDB_DATETIME_NS:
+        case TILEDB_DATETIME_PS:
+        case TILEDB_DATETIME_FS:
+        case TILEDB_DATETIME_AS:
+        case TILEDB_INT64:
+            return comparator(
+                std::any_cast>(new_domain[0]));
+        case TILEDB_FLOAT32:
+            return comparator(
+                std::any_cast>(new_domain[0]));
+        case TILEDB_FLOAT64:
+            return comparator(
+                std::any_cast>(new_domain[0]));
+        case TILEDB_STRING_ASCII:
+        case TILEDB_STRING_UTF8:
+        case TILEDB_CHAR:
+        case TILEDB_BLOB:
+        case TILEDB_GEOM_WKT:
+        case TILEDB_GEOM_WKB: {
+            // String-like dimensions bypass the comparator: the only value
+            // accepted is the ("", "") placeholder.
+            auto dom = std::any_cast>(new_domain[0]);
+            if (dom[0] != "" || dom[1] != "") {
+                return std::pair(
+                    false,
+                    "domain cannot be set for string index columns: please use "
+                    "(\"\", \"\")");
+            }
+
+            return std::pair(true, "");
+        }
+        default:
+            throw TileDBSOMAError(std::format(
+                "[SOMADimension][_can_set_current_domain_slot] Unknown datatype "
+                "{}",
+                tiledb::impl::type_to_str(dimension.type())));
+    }
+}
+
+/**
+ * Returns the domain reported by `dimension.domain()`, wrapped in a std::any.
+ *
+ * Only integer, datetime (handled together with TILEDB_INT64) and floating
+ * point dimension types have a case here; every other type, including the
+ * string-like ones, reaches `default`.
+ *
+ * @throws TileDBSOMAError for any dimension type without a case below.
+ */
+std::any SOMADimension::_core_domain_slot() const {
+    switch (dimension.type()) {
+        case TILEDB_UINT8:
+            return std::make_any>(
+                dimension.domain());
+        case TILEDB_UINT16:
+            return std::make_any>(
+                dimension.domain());
+        case TILEDB_UINT32:
+            return std::make_any>(
+                dimension.domain());
+        case TILEDB_UINT64:
+            return std::make_any>(
+                dimension.domain());
+        case TILEDB_INT8:
+            return std::make_any>(
+                dimension.domain());
+        case TILEDB_INT16:
+            return std::make_any>(
+                dimension.domain());
+        case TILEDB_INT32:
+            return std::make_any>(
+                dimension.domain());
+        case TILEDB_DATETIME_YEAR:
+        case TILEDB_DATETIME_MONTH:
+        case TILEDB_DATETIME_WEEK:
+        case TILEDB_DATETIME_DAY:
+        case TILEDB_DATETIME_HR:
+        case TILEDB_DATETIME_MIN:
+        case TILEDB_DATETIME_SEC:
+        case TILEDB_DATETIME_MS:
+        case TILEDB_DATETIME_US:
+        case TILEDB_DATETIME_NS:
+        case TILEDB_DATETIME_PS:
+        case TILEDB_DATETIME_FS:
+        case TILEDB_DATETIME_AS:
+        case TILEDB_INT64:
+            return std::make_any>(
+                dimension.domain());
+        case TILEDB_FLOAT32:
+            return std::make_any>(
+                dimension.domain());
+        case TILEDB_FLOAT64:
+            return std::make_any>(
+                dimension.domain());
+        default:
+            throw TileDBSOMAError(std::format(
+                "[SOMADimension][_core_domain_slot] Unknown dimension type {}",
+                impl::type_to_str(dimension.type())));
+    }
+}
+
+/**
+ * Returns the non-empty domain of `array` along this dimension, wrapped in a
+ * std::any.
+ *
+ * Numeric and datetime types use `array.non_empty_domain(name)`; string-like
+ * types use `array.non_empty_domain_var(name)`.
+ *
+ * @param array Array that is queried for its non-empty domain.
+ * @throws TileDBSOMAError for any dimension type without a case below.
+ */
+std::any SOMADimension::_non_empty_domain_slot(Array& array) const {
+    switch (dimension.type()) {
+        case TILEDB_UINT8:
+            return std::make_any>(
+                array.non_empty_domain(dimension.name()));
+        case TILEDB_UINT16:
+            return std::make_any>(
+                array.non_empty_domain(dimension.name()));
+        case TILEDB_UINT32:
+            return std::make_any>(
+                array.non_empty_domain(dimension.name()));
+        case TILEDB_UINT64:
+            return std::make_any>(
+                array.non_empty_domain(dimension.name()));
+        case TILEDB_INT8:
+            return std::make_any>(
+                array.non_empty_domain(dimension.name()));
+        case TILEDB_INT16:
+            return std::make_any>(
+                array.non_empty_domain(dimension.name()));
+        case TILEDB_INT32:
+            return std::make_any>(
+                array.non_empty_domain(dimension.name()));
+        case TILEDB_DATETIME_YEAR:
+        case TILEDB_DATETIME_MONTH:
+        case TILEDB_DATETIME_WEEK:
+        case TILEDB_DATETIME_DAY:
+        case TILEDB_DATETIME_HR:
+        case TILEDB_DATETIME_MIN:
+        case TILEDB_DATETIME_SEC:
+        case TILEDB_DATETIME_MS:
+        case TILEDB_DATETIME_US:
+        case TILEDB_DATETIME_NS:
+        case TILEDB_DATETIME_PS:
+        case TILEDB_DATETIME_FS:
+        case TILEDB_DATETIME_AS:
+        case TILEDB_INT64:
+            return std::make_any>(
+                array.non_empty_domain(dimension.name()));
+        case TILEDB_FLOAT32:
+            return std::make_any>(
+                array.non_empty_domain(dimension.name()));
+        case TILEDB_FLOAT64:
+            return std::make_any>(
+                array.non_empty_domain(dimension.name()));
+        case TILEDB_STRING_ASCII:
+        case TILEDB_STRING_UTF8:
+        case TILEDB_BLOB:
+        case TILEDB_CHAR:
+        case TILEDB_GEOM_WKB:
+        case TILEDB_GEOM_WKT:
+            return std::make_any>(
+                array.non_empty_domain_var(dimension.name()));
+        default:
+            throw TileDBSOMAError(std::format(
+                "[SOMADimension][_non_empty_domain_slot] Unknown dimension "
+                "type {}",
+                impl::type_to_str(dimension.type())));
+    }
+}
+
+/**
+ * Reads the current domain from the schema of `array`, takes its
+ * NDRectangle, and delegates to the NDRectangle overload of
+ * `_core_current_domain_slot`.
+ *
+ * @param ctx Supplies the TileDB context used to read the current domain.
+ * @param array Array whose schema holds the current domain.
+ */
+std::any SOMADimension::_core_current_domain_slot(
+    const SOMAContext& ctx, Array& array) const {
+    CurrentDomain
+        current_domain = tiledb::ArraySchemaExperimental::current_domain(
+            *ctx.tiledb_ctx(), array.schema());
+    NDRectangle ndrect = current_domain.ndrectangle();
+
+    return _core_current_domain_slot(ndrect);
+}
+
+/**
+ * Returns the range stored in `ndrect` for this dimension as a (lower, upper)
+ * pair wrapped in a std::any.
+ *
+ * NOTE(review): template arguments (on std::array, std::make_any,
+ * std::make_pair, range, ...) appear to be missing in this copy of the
+ * patch -- confirm against the upstream file before building.
+ *
+ * @param ndrect Rectangle that is queried with `range(dimension.name())`.
+ * @throws TileDBSOMAError for any dimension type without a case below.
+ */
+std::any SOMADimension::_core_current_domain_slot(NDRectangle& ndrect) const {
+    switch (dimension.type()) {
+        case TILEDB_UINT8: {
+            std::array domain = ndrect.range(
+                dimension.name());
+            return std::make_any>(
+                std::make_pair(domain[0], domain[1]));
+        }
+        case TILEDB_UINT16: {
+            std::array domain = ndrect.range(
+                dimension.name());
+            return std::make_any>(
+                std::make_pair(domain[0], domain[1]));
+        }
+        case TILEDB_UINT32: {
+            std::array domain = ndrect.range(
+                dimension.name());
+            return std::make_any>(
+                std::make_pair(domain[0], domain[1]));
+        }
+        case TILEDB_UINT64: {
+            std::array domain = ndrect.range(
+                dimension.name());
+            return std::make_any>(
+                std::make_pair(domain[0], domain[1]));
+        }
+        case TILEDB_INT8: {
+            std::array domain = ndrect.range(
+                dimension.name());
+            return std::make_any>(
+                std::make_pair(domain[0], domain[1]));
+        }
+        case TILEDB_INT16: {
+            std::array domain = ndrect.range(
+                dimension.name());
+            return std::make_any>(
+                std::make_pair(domain[0], domain[1]));
+        }
+        case TILEDB_INT32: {
+            std::array domain = ndrect.range(
+                dimension.name());
+            return std::make_any>(
+                std::make_pair(domain[0], domain[1]));
+        }
+        // All datetime types share the TILEDB_INT64 branch.
+        case TILEDB_DATETIME_YEAR:
+        case TILEDB_DATETIME_MONTH:
+        case TILEDB_DATETIME_WEEK:
+        case TILEDB_DATETIME_DAY:
+        case TILEDB_DATETIME_HR:
+        case TILEDB_DATETIME_MIN:
+        case TILEDB_DATETIME_SEC:
+        case TILEDB_DATETIME_MS:
+        case TILEDB_DATETIME_US:
+        case TILEDB_DATETIME_NS:
+        case TILEDB_DATETIME_PS:
+        case TILEDB_DATETIME_FS:
+        case TILEDB_DATETIME_AS:
+        case TILEDB_INT64: {
+            std::array domain = ndrect.range(
+                dimension.name());
+            return std::make_any>(
+                std::make_pair(domain[0], domain[1]));
+        }
+        case TILEDB_FLOAT32: {
+            std::array domain = ndrect.range(
+                dimension.name());
+            return std::make_any>(
+                std::make_pair(domain[0], domain[1]));
+        }
+        case TILEDB_FLOAT64: {
+            std::array domain = ndrect.range(
+                dimension.name());
+            return std::make_any>(
+                std::make_pair(domain[0], domain[1]));
+        }
+        case TILEDB_STRING_UTF8:
+        case TILEDB_STRING_ASCII:
+        case TILEDB_CHAR:
+        case TILEDB_BLOB:
+        case TILEDB_GEOM_WKT:
+        case TILEDB_GEOM_WKB: {
+            std::array domain = ndrect.range(
+                dimension.name());
+            return std::make_any>(
+                std::make_pair(domain[0], domain[1]));
+        }
+        default:
+            // NOTE(review): unlike the sibling functions, this message
+            // carries no "[_core_current_domain_slot]" tag.
+            throw TileDBSOMAError(std::format(
+                "[SOMADimension] Unknown dimension type {}",
+                impl::type_to_str(dimension.type())));
+    }
+}
+
+/**
+ * Builds an ArrowArray from the result of `domain_slot(ctx, array, kind)`
+ * for this dimension.
+ *
+ * The switch on `domain_type().value()` selects which `domain_slot` call is
+ * made; string-like types go through
+ * `ArrowAdapter::make_arrow_array_child_string`, all other handled types
+ * through `ArrowAdapter::make_arrow_array_child`.
+ *
+ * NOTE(review): the TILEDB_TIME_* types are accepted here but have no case
+ * in the `_core_*`/`_non_empty_*` switches of this file, and TILEDB_BLOB is
+ * handled there but not here -- confirm this asymmetry is intended.
+ *
+ * @param ctx Forwarded to `domain_slot`.
+ * @param array Forwarded to `domain_slot`.
+ * @param kind Which domain to report; forwarded to `domain_slot`.
+ * @return Raw ArrowArray pointer; presumably owned by the caller -- TODO
+ * confirm against ArrowAdapter.
+ * @throws TileDBSOMAError for any type without a case below.
+ */
+ArrowArray* SOMADimension::arrow_domain_slot(
+    const SOMAContext& ctx, Array& array, enum Domainish kind) const {
+    switch (domain_type().value()) {
+        case TILEDB_INT64:
+        case TILEDB_DATETIME_YEAR:
+        case TILEDB_DATETIME_MONTH:
+        case TILEDB_DATETIME_WEEK:
+        case TILEDB_DATETIME_DAY:
+        case TILEDB_DATETIME_HR:
+        case TILEDB_DATETIME_MIN:
+        case TILEDB_DATETIME_SEC:
+        case TILEDB_DATETIME_MS:
+        case TILEDB_DATETIME_US:
+        case TILEDB_DATETIME_NS:
+        case TILEDB_DATETIME_PS:
+        case TILEDB_DATETIME_FS:
+        case TILEDB_DATETIME_AS:
+        case TILEDB_TIME_HR:
+        case TILEDB_TIME_MIN:
+        case TILEDB_TIME_SEC:
+        case TILEDB_TIME_MS:
+        case TILEDB_TIME_US:
+        case TILEDB_TIME_NS:
+        case TILEDB_TIME_PS:
+        case TILEDB_TIME_FS:
+        case TILEDB_TIME_AS:
+            return ArrowAdapter::make_arrow_array_child(
+                domain_slot(ctx, array, kind));
+        case TILEDB_UINT64:
+            return ArrowAdapter::make_arrow_array_child(
+                domain_slot(ctx, array, kind));
+        case TILEDB_INT32:
+            return ArrowAdapter::make_arrow_array_child(
+                domain_slot(ctx, array, kind));
+        case TILEDB_UINT32:
+            return ArrowAdapter::make_arrow_array_child(
+                domain_slot(ctx, array, kind));
+        case TILEDB_INT16:
+            return ArrowAdapter::make_arrow_array_child(
+                domain_slot(ctx, array, kind));
+        case TILEDB_UINT16:
+            return ArrowAdapter::make_arrow_array_child(
+                domain_slot(ctx, array, kind));
+        case TILEDB_INT8:
+            return ArrowAdapter::make_arrow_array_child(
+                domain_slot(ctx, array, kind));
+        case TILEDB_UINT8:
+            return ArrowAdapter::make_arrow_array_child(
+                domain_slot(ctx, array, kind));
+        case TILEDB_FLOAT64:
+            return ArrowAdapter::make_arrow_array_child(
+                domain_slot(ctx, array, kind));
+        case TILEDB_FLOAT32:
+            return ArrowAdapter::make_arrow_array_child(
+                domain_slot(ctx, array, kind));
+        case TILEDB_STRING_ASCII:
+        case TILEDB_STRING_UTF8:
+        case TILEDB_CHAR:
+        case TILEDB_GEOM_WKB:
+        case TILEDB_GEOM_WKT:
+            return ArrowAdapter::make_arrow_array_child_string(
+                domain_slot(ctx, array, kind));
+        default:
+            throw TileDBSOMAError(std::format(
+                "[SOMADimension][arrow_domain_slot] dim {} has unhandled "
+                "type "
+                "{}",
+                name(),
+                tiledb::impl::type_to_str(domain_type().value())));
+    }
+}
+
+/**
+ * Builds the Arrow schema for the wrapped dimension through
+ * `ArrowAdapter::arrow_schema_from_tiledb_dimension` and returns the pointer
+ * obtained from `.release()`. Both parameters are unnamed and unused.
+ *
+ * @return Raw ArrowSchema pointer; presumably the caller takes ownership,
+ * since it comes from `.release()` -- TODO confirm.
+ */
+ArrowSchema* SOMADimension::arrow_schema_slot(const SOMAContext&, Array&) {
+    return ArrowAdapter::arrow_schema_from_tiledb_dimension(dimension)
+        .release();
+}
+
+}  // namespace tiledbsoma
\ No newline at end of file
diff --git a/libtiledbsoma/src/soma/soma_dimension.h b/libtiledbsoma/src/soma/soma_dimension.h
new file mode 100644
index 0000000000..b556476bc2
--- /dev/null
+++ b/libtiledbsoma/src/soma/soma_dimension.h
@@ -0,0 +1,143 @@
+/**
+ * @file soma_dimension.h
+ *
+ * @section LICENSE
+ *
+ * The MIT License
+ *
+ * @copyright Copyright (c) 2024 TileDB, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ * @section DESCRIPTION
+ *
+ * This file defines the SOMADimension class. SOMADimension acts as a wrapper
+ * to a TileDB Dimension and implements functions to perform queries as well as
+ * core domain and current domain operations. It provides a common interface
+ * identical to TileDB attributes and composite columns.
+ */
+
+#ifndef SOMA_DIMENSION_H
+#define SOMA_DIMENSION_H
+
+// NOTE(review): the header names of the bare #include lines below appear to
+// be missing from this copy of the patch -- confirm against upstream.
+#include
+#include
+
+#include
+#include "soma_column.h"
+
+namespace tiledbsoma {
+
+// NOTE(review): a using-directive at namespace scope in a header also applies
+// to every file that includes it.
+using namespace tiledb;
+
+/**
+ * SOMAColumn implementation backed by a single TileDB Dimension.
+ *
+ * NOTE(review): template arguments (std::shared_ptr, std::optional,
+ * std::vector, ...) appear to be missing throughout this copy of the patch.
+ */
+class SOMADimension : public SOMAColumn {
+   public:
+    /**
+     * Factory declared here; its definition is not part of this header.
+     * Presumably builds the dimension from the Arrow schema/array pair --
+     * TODO confirm against the .cc file.
+     */
+    static std::shared_ptr create(
+        std::shared_ptr ctx,
+        ArrowSchema* schema,
+        ArrowArray* array,
+        const std::string& soma_type,
+        std::string_view type_metadata,
+        PlatformConfig platform_config);
+
+    /** Wraps a copy of the given TileDB dimension. */
+    SOMADimension(Dimension dimension)
+        : dimension(dimension) {
+    }
+
+    /** Returns the name of the wrapped dimension. */
+    inline std::string name() const override {
+        return dimension.name();
+    }
+
+    /** Always true for this class. */
+    inline bool isIndexColumn() const override {
+        return true;
+    }
+
+    /**
+     * Forwards to `query->select_columns` with a one-element list holding
+     * this dimension's name.
+     */
+    inline void select_columns(
+        const std::unique_ptr& query,
+        bool if_not_empty = false) const override {
+        query->select_columns(std::vector({dimension.name()}), if_not_empty);
+    };
+
+    /** Always `SOMA_COLUMN_DIMENSION`. */
+    inline soma_column_datatype_t type() const override {
+        return soma_column_datatype_t::SOMA_COLUMN_DIMENSION;
+    }
+
+    /** Returns the TileDB datatype of the wrapped dimension. */
+    inline std::optional domain_type() const override {
+        return dimension.type();
+    }
+
+    /** Always `std::nullopt` for this class. */
+    inline std::optional data_type() const override {
+        return std::nullopt;
+    }
+
+    /** Returns a one-element list holding the wrapped dimension. */
+    inline std::optional> tiledb_dimensions() override {
+        return std::vector({dimension});
+    }
+
+    /** Always `std::nullopt` for this class. */
+    inline std::optional> tiledb_attributes() override {
+        return std::nullopt;
+    }
+
+    /** Always `std::nullopt` for this class. */
+    inline std::optional> tiledb_enumerations()
+        override {
+        return std::nullopt;
+    }
+
+    // The remaining members are declared only; definitions are not part of
+    // this header.
+    ArrowArray* arrow_domain_slot(
+        const SOMAContext& ctx,
+        Array& array,
+        enum Domainish kind) const override;
+
+    ArrowSchema* arrow_schema_slot(
+        const SOMAContext& ctx, Array& array) override;
+
+   protected:
+    void _set_dim_points(
+        const std::unique_ptr& query,
+        const SOMAContext& ctx,
+        const std::any& ranges) const override;
+
+    void _set_dim_ranges(
+        const std::unique_ptr& query,
+        const SOMAContext& ctx,
+        const std::any& ranges) const override;
+
+    void _set_current_domain_slot(
+        NDRectangle& rectangle,
+        std::span domain) const override;
+
+    std::pair _can_set_current_domain_slot(
+        std::optional& rectangle,
+        std::span new_domain) const override;
+
+    std::any _core_domain_slot() const override;
+
+    std::any _non_empty_domain_slot(Array& array) const override;
+
+    std::any _core_current_domain_slot(
+        const SOMAContext& ctx, Array& array) const override;
+
+    std::any _core_current_domain_slot(NDRectangle& ndrect) const override;
+
+   private:
+    // The wrapped TileDB dimension, stored by value.
+    Dimension dimension;
+};
+}  // namespace tiledbsoma
+
+#endif
\ No newline at end of file
diff --git a/libtiledbsoma/src/tiledbsoma/tiledbsoma b/libtiledbsoma/src/tiledbsoma/tiledbsoma
index 3f71c123a4..cfb3ab8243 100644
--- a/libtiledbsoma/src/tiledbsoma/tiledbsoma
+++ b/libtiledbsoma/src/tiledbsoma/tiledbsoma
@@ -49,6 +49,8 @@
 #include "soma/column_buffer.h"
 #include "soma/soma_array.h"
 #include "soma/soma_collection.h"
+#include "soma/soma_column.h"
+#include "soma/soma_dimension.h"
 #include "soma/soma_dataframe.h"
 #include "soma/soma_group.h"
 #include "soma/soma_experiment.h"
diff --git a/libtiledbsoma/test/CMakeLists.txt b/libtiledbsoma/test/CMakeLists.txt
index 0427cbe03f..de78fddca1 100644
--- a/libtiledbsoma/test/CMakeLists.txt
+++ b/libtiledbsoma/test/CMakeLists.txt
@@ -29,6 +29,7 @@ add_executable(unit_soma
     unit_soma_dense_ndarray.cc
     unit_soma_sparse_ndarray.cc
     unit_soma_collection.cc
+    unit_soma_column.cc
     unit_soma_scene.cc
     unit_soma_geometry_dataframe.cc
     unit_soma_point_cloud_dataframe.cc
diff
--git a/libtiledbsoma/test/common.cc b/libtiledbsoma/test/common.cc index d194626783..0fff451d86 100644 --- a/libtiledbsoma/test/common.cc +++ b/libtiledbsoma/test/common.cc @@ -35,13 +35,6 @@ namespace helper { -// This non-obvious number is: -// * Something that fits into signed 32-bit integer for R-friendliness; -// * Is a comfortable tile-extent distance away from 2^31-1 for default -// core tile extent. (Using 2^31-1 exactly would result in a core -// array-creation error.) -const int CORE_DOMAIN_MAX = 2147483646; - static std::unique_ptr _create_index_cols_info_array( const std::vector& dim_infos); diff --git a/libtiledbsoma/test/common.h b/libtiledbsoma/test/common.h index df0ecd3aef..f5909d2b32 100644 --- a/libtiledbsoma/test/common.h +++ b/libtiledbsoma/test/common.h @@ -61,6 +61,13 @@ static const std::string src_path = TILEDBSOMA_SOURCE_ROOT; namespace helper { +// This non-obvious number is: +// * Something that fits into signed 32-bit integer for R-friendliness; +// * Is a comfortable tile-extent distance away from 2^31-1 for default +// core tile extent. (Using 2^31-1 exactly would result in a core +// array-creation error.) +const int CORE_DOMAIN_MAX = 2147483646; + // E.g. "d0" is of type TILEDB_INT64 with dim_max 1000 and current-domain // feature enabled struct DimInfo { diff --git a/libtiledbsoma/test/unit_soma_column.cc b/libtiledbsoma/test/unit_soma_column.cc new file mode 100644 index 0000000000..6a12bafe02 --- /dev/null +++ b/libtiledbsoma/test/unit_soma_column.cc @@ -0,0 +1,437 @@ +/** + * @file unit_soma_column.cc + * + * @section LICENSE + * + * The MIT License + * + * @copyright Copyright (c) 2024 TileDB, Inc. 
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ * @section DESCRIPTION
+ *
+ * This file manages unit tests for implementation of SOMAColumn class. This is
+ * temporary and to be removed once SOMAColumn is fully integrated.
+ */
+
+// NOTE(review): the header names of the bare #include lines below, and the
+// template arguments used throughout this file, appear to be missing from
+// this copy of the patch -- confirm against upstream.
+#include
+#include
+#include
+#include "common.h"
+
+const int64_t SOMA_JOINID_DIM_MAX = 99;
+const int64_t SOMA_JOINID_RESIZE_DIM_MAX = 199;
+
+// This is a keystroke-reduction fixture for some similar unit-test cases. For
+// convenience there are dims/attrs of type int64, uint32, and string. (Feel
+// free to add more types.) The main value-adds of this fixture are (a) simple
+// keystroke-reduction; (b) you get to pick which ones are the dim(s) and which
+// are the attr(s).
+struct VariouslyIndexedDataFrameFixture {
+    std::shared_ptr ctx_;
+    std::string uri_;
+
+    // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+    // Using Catch2's TEST_CASE_METHOD we can't pass constructor args.
+    // This is a call-after-construction method.
+    void set_up(std::shared_ptr ctx, std::string uri) {
+        ctx_ = ctx;
+        uri_ = uri;
+    }
+
+    // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+    // Helpers for setting up dim/attr configs and data
+    static const inline int64_t i64_dim_max = SOMA_JOINID_DIM_MAX;
+    static const inline int64_t u32_dim_max = 9999;
+    static const inline int64_t str_dim_max = 0;  // not used for string dims
+
+    static const inline std::string i64_name = "soma_joinid";
+    static const inline std::string u32_name = "myuint32";
+    static const inline std::string str_name = "mystring";
+
+    tiledb_datatype_t i64_datatype = TILEDB_INT64;
+    tiledb_datatype_t u32_datatype = TILEDB_UINT32;
+    tiledb_datatype_t str_datatype = TILEDB_STRING_ASCII;
+
+    std::string i64_arrow_format = ArrowAdapter::tdb_to_arrow_type(
+        i64_datatype);
+    std::string u32_arrow_format = ArrowAdapter::tdb_to_arrow_type(
+        u32_datatype);
+    std::string attr_1_arrow_format = ArrowAdapter::tdb_to_arrow_type(
+        str_datatype);
+
+    // Each *_dim_info() returns a DimInfo for the matching name/datatype;
+    // the non-string variants pass "N/A" for both string bounds.
+    helper::DimInfo i64_dim_info() {
+        return helper::DimInfo(
+            {.name = i64_name,
+             .tiledb_datatype = i64_datatype,
+             .dim_max = i64_dim_max,
+             .string_lo = "N/A",
+             .string_hi = "N/A"});
+    }
+    helper::DimInfo u32_dim_info() {
+        return helper::DimInfo(
+            {.name = u32_name,
+             .tiledb_datatype = u32_datatype,
+             .dim_max = u32_dim_max,
+             .string_lo = "N/A",
+             .string_hi = "N/A"});
+    }
+    helper::DimInfo str_dim_info(std::string string_lo, std::string string_hi) {
+        return helper::DimInfo(
+            {.name = str_name,
+             .tiledb_datatype = str_datatype,
+             .dim_max = str_dim_max,
+             .string_lo = string_lo,
+             .string_hi = string_hi});
+    }
+
+    // Each *_attr_info() returns an AttrInfo for the matching name/datatype.
+    helper::AttrInfo i64_attr_info(std::string name = i64_name) {
+        return helper::AttrInfo(
+            {.name = name, .tiledb_datatype = i64_datatype});
+    }
+    helper::AttrInfo u32_attr_info() {
+        return helper::AttrInfo(
+            {.name = u32_name, .tiledb_datatype = u32_datatype});
+    }
+    helper::AttrInfo str_attr_info() {
+        return helper::AttrInfo(
+            {.name = str_name, .tiledb_datatype = str_datatype});
+    }
+
+    // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+    // Helper methods for create/open/write/etc.
+
+    // Creates a SOMADataFrame at uri_ from the given dim/attr descriptions.
+    void create(
+        const std::vector& dim_infos,
+        const std::vector& attr_infos) {
+        auto [schema, index_columns] =
+            helper::create_arrow_schema_and_index_columns(
+                dim_infos, attr_infos);
+        SOMADataFrame::create(
+            uri_,
+            std::move(schema),
+            ArrowTable(
+                std::move(index_columns.first),
+                std::move(index_columns.second)),
+            ctx_);
+    }
+
+    // Same as above, additionally forwarding a platform config and an
+    // optional timestamp range.
+    void create(
+        const std::vector& dim_infos,
+        const std::vector& attr_infos,
+        const PlatformConfig& platform_config,
+        std::optional timestamp_range = std::nullopt) {
+        auto [schema, index_columns] =
+            helper::create_arrow_schema_and_index_columns(
+                dim_infos, attr_infos);
+        SOMADataFrame::create(
+            uri_,
+            std::move(schema),
+            ArrowTable(
+                std::move(index_columns.first),
+                std::move(index_columns.second)),
+            ctx_,
+            platform_config,
+            timestamp_range);
+    }
+
+    // Opens the dataframe at uri_ in the requested mode.
+    std::unique_ptr open(
+        OpenMode mode,
+        ResultOrder result_order = ResultOrder::automatic,
+        std::optional timestamp_range = std::nullopt) {
+        return SOMADataFrame::open(
+            uri_,
+            mode,
+            ctx_,
+            {},  // column_names
+            result_order,
+            timestamp_range);
+    }
+
+    // Opens its own write handle on uri_, writes two rows (join ids
+    // sjid_base + 1 and sjid_base + 2, uint32 values 1234 and 5678, strings
+    // "apple" and "bat"), then closes the handle.
+    void write_sjid_u32_str_data_from(int64_t sjid_base) {
+        auto sdf = SOMADataFrame::open(uri_, OpenMode::write, ctx_);
+
+        auto i64_data = std::vector({sjid_base + 1, sjid_base + 2});
+
+        auto u32_data = std::vector({1234, 5678});
+
+        // We like to think we're writing an array of strings ...
+        auto strings = std::vector({"apple", "bat"});
+        // ... but really we're writing an array of characters along
+        // with offsets data.
+        //
+        // It would be possible here to just hard-code a string "applebat" and
+        // an offsets array {0, 5, 8}. The following bits simply automate that.
+        std::string char_data("");
+        std::vector char_offsets(0);
+        uint64_t offset = 0;
+        for (auto e : strings) {
+            char_data += e;
+            char_offsets.push_back(offset);
+            offset += e.size();
+        }
+        // Final offset marks the end of the last string.
+        char_offsets.push_back(offset);
+
+        sdf->set_column_data(i64_name, i64_data.size(), i64_data.data());
+        sdf->set_column_data(
+            str_name, strings.size(), char_data.data(), char_offsets.data());
+        sdf->set_column_data(u32_name, u32_data.size(), u32_data.data());
+        sdf->write();
+
+        sdf->close();
+    }
+};
+
+// Builds one SOMADimension per entry of dim_infos through
+// SOMADimension::create and checks the reported TileDB type and core domain.
+TEST_CASE("SOMAColumn: SOMADimension") {
+    auto ctx = std::make_shared();
+    PlatformConfig platform_config{};
+
+    std::vector dim_infos(
+        {helper::DimInfo(
+             {.name = "dimension",
+              .tiledb_datatype = TILEDB_UINT32,
+              .dim_max = 100,
+              .string_lo = "N/A",
+              .string_hi = "N/A"}),
+         helper::DimInfo(
+             {.name = "dimension",
+              .tiledb_datatype = TILEDB_FLOAT64,
+              .dim_max = 100,
+              .string_lo = "N/A",
+              .string_hi = "N/A"}),
+         helper::DimInfo(
+             {.name = "dimension",
+              .tiledb_datatype = TILEDB_INT64,
+              .dim_max = 100,
+              .string_lo = "N/A",
+              .string_hi = "N/A"}),
+         helper::DimInfo(
+             {.name = "dimension",
+              .tiledb_datatype = TILEDB_STRING_ASCII,
+              .dim_max = 100,
+              .string_lo = "N/A",
+              .string_hi = "N/A"})});
+
+    // NOTE(review): geom_dim_infos and spatial_dim_infos are not referenced
+    // anywhere else in this test case.
+    std::vector geom_dim_infos({helper::DimInfo(
+        {.name = "dimension",
+         .tiledb_datatype = TILEDB_GEOM_WKB,
+         .dim_max = 100,
+         .string_lo = "N/A",
+         .string_hi = "N/A"})});
+
+    std::vector spatial_dim_infos(
+        {helper::DimInfo(
+             {.name = "x",
+              .tiledb_datatype = TILEDB_FLOAT64,
+              .dim_max = 200,
+              .string_lo = "N/A",
+              .string_hi = "N/A"}),
+         helper::DimInfo(
+             {.name = "y",
+              .tiledb_datatype = TILEDB_FLOAT64,
+              .dim_max = 100,
+              .string_lo = "N/A",
+              .string_hi = "N/A"})});
+
+    auto index_columns = helper::create_column_index_info(dim_infos);
+
+    std::vector> columns;
+
+    for (int64_t i = 0; i < index_columns.second->n_children; ++i) {
+        columns.push_back(SOMADimension::create(
+            ctx->tiledb_ctx(),
+            index_columns.second->children[i],
+            index_columns.first->children[i],
+            "SOMAGeometryDataFrame",
+            "",
+            platform_config));
+
+        REQUIRE(
+            columns.back()->tiledb_dimensions().value()[0].type() ==
+            dim_infos[i].tiledb_datatype);
+    }
+
+    // NOTE(review): columns[1] is checked twice and columns[0] never; the
+    // first assertion presumably targets columns[0] -- verify.
+    REQUIRE(
+        columns[1]->core_domain_slot() ==
+        std::make_pair(0, helper::CORE_DOMAIN_MAX));
+    REQUIRE(
+        columns[1]->core_domain_slot() ==
+        std::make_pair(0, helper::CORE_DOMAIN_MAX));
+    REQUIRE(
+        columns[2]->core_domain_slot() ==
+        std::make_pair(0, helper::CORE_DOMAIN_MAX));
+    REQUIRE(
+        columns[3]->core_domain_slot() ==
+        std::make_pair("", ""));
+}
+
+// Creates a dataframe indexed by (string, uint32) with an int64 attribute and
+// compares the domain accessors of SOMADataFrame with those of SOMADimension
+// objects built directly from the schema's dimensions; then writes two rows
+// and queries them back through the columns.
+TEST_CASE_METHOD(
+    VariouslyIndexedDataFrameFixture,
+    "SOMAColumn: query variant-indexed dataframe dim-str-u32 attr-sjid",
+    "[SOMADataFrame]") {
+    // NOTE(review): string_lo/string_hi are "" in both runs, so
+    // specify_domain only changes the URI suffix and which assertion form is
+    // used further down.
+    auto specify_domain = GENERATE(false, true);
+    SECTION(std::format("- specify_domain={}", specify_domain)) {
+        std::string suffix1 = specify_domain ? "true" : "false";
+        set_up(
+            std::make_shared(),
+            "mem://unit-test-column-variant-indexed-dataframe-4-" + suffix1);
+
+        std::string string_lo = "";
+        std::string string_hi = "";
+        std::vector dim_infos(
+            {str_dim_info(string_lo, string_hi), u32_dim_info()});
+        std::vector attr_infos({i64_attr_info()});
+
+        // Create
+        create(dim_infos, attr_infos);
+
+        // Check current domain
+        auto sdf = open(OpenMode::read);
+
+        // External column initialization
+        auto raw_array = tiledb::Array(*ctx_->tiledb_ctx(), uri_, TILEDB_READ);
+        std::vector> columns;
+
+        for (auto dimension : sdf->tiledb_schema()->domain().dimensions()) {
+            columns.push_back(
+                std::make_shared(SOMADimension(dimension)));
+        }
+
+        CurrentDomain current_domain = sdf->get_current_domain_for_test();
+
+        REQUIRE(!current_domain.is_empty());
+        REQUIRE(current_domain.type() == TILEDB_NDRECTANGLE);
+        NDRectangle ndrect = current_domain.ndrectangle();
+
+        std::array str_range = ndrect.range(
+            dim_infos[0].name);
+        std::pair
+            str_external = columns[0]->core_current_domain_slot(
+                *ctx_, raw_array);
+
+        // Can we write empty strings in this range?
+        REQUIRE(str_range[0] <= "");
+        REQUIRE(str_external.first <= "");
+        REQUIRE(str_range[1] >= "");
+        REQUIRE(str_external.second >= "");
+        // Can we write ASCII values in this range?
+        REQUIRE(str_range[0] < " ");
+        REQUIRE(str_external.first <= " ");
+        REQUIRE(str_range[1] > "~");
+        // REQUIRE(str_external.second >= "~");
+
+        std::array u32_range = ndrect.range(
+            dim_infos[1].name);
+        std::pair
+            u32_external = columns[1]->core_current_domain_slot(
+                *ctx_, raw_array);
+        REQUIRE(u32_range[0] == u32_external.first);
+        REQUIRE(u32_range[1] == u32_external.second);
+
+        // Check shape before write
+        std::optional actual = sdf->maybe_soma_joinid_shape();
+        REQUIRE(!actual.has_value());
+
+        // Check domainish accessors before resize
+        ArrowTable non_empty_domain = sdf->get_non_empty_domain();
+        std::vector
+            ned_str = ArrowAdapter::get_table_string_column_by_name(
+                non_empty_domain, "mystring");
+
+        std::vector
+            ned_str_col = ArrowAdapter::get_array_string_column(
+                columns[0]->arrow_domain_slot(
+                    *ctx_, raw_array, Domainish::kind_non_empty_domain),
+                columns[0]->arrow_schema_slot(*ctx_, raw_array));
+
+        ArrowTable soma_domain = sdf->get_soma_domain();
+        std::vector
+            dom_str = ArrowAdapter::get_table_string_column_by_name(
+                soma_domain, "mystring");
+
+        std::vector
+            dom_str_col = ArrowAdapter::get_array_string_column(
+                columns[0]->arrow_domain_slot(
+                    *ctx_, raw_array, Domainish::kind_core_current_domain),
+                columns[0]->arrow_schema_slot(*ctx_, raw_array));
+
+        ArrowTable soma_maxdomain = sdf->get_soma_maxdomain();
+        std::vector
+            maxdom_str = ArrowAdapter::get_table_string_column_by_name(
+                soma_maxdomain, "mystring");
+
+        std::vector
+            maxdom_str_col = ArrowAdapter::get_array_string_column(
+                columns[0]->arrow_domain_slot(
+                    *ctx_, raw_array, Domainish::kind_core_domain),
+                columns[0]->arrow_schema_slot(*ctx_, raw_array));
+
+        REQUIRE(ned_str == std::vector({"", ""}));
+
+        // The dataframe-level and column-level accessors must agree.
+        REQUIRE(ned_str == ned_str_col);
+        REQUIRE(dom_str == dom_str_col);
+        REQUIRE(maxdom_str == maxdom_str_col);
+
+        if (specify_domain) {
+            REQUIRE(dom_str[0] == dim_infos[0].string_lo);
+            REQUIRE(dom_str[1] == dim_infos[0].string_hi);
+        } else {
+            REQUIRE(dom_str == std::vector({"", ""}));
+        }
+        REQUIRE(maxdom_str == std::vector({"", ""}));
+
+        sdf->close();
+
+        // NOTE(review): write_sjid_u32_str_data_from opens and closes its own
+        // write handle, so the handle opened here is not used for the write.
+        sdf = open(OpenMode::write);
+        write_sjid_u32_str_data_from(0);
+
+        sdf->close();
+
+        sdf = open(OpenMode::read);
+        REQUIRE(sdf->nnz() == 2);
+
+        sdf->close();
+
+        auto external_query = std::make_unique(
+            open(OpenMode::read), ctx_->tiledb_ctx());
+
+        // Point query on the uint32 dimension: 1234 is one of the two
+        // values written above.
+        columns[1]->select_columns(external_query);
+        columns[1]->set_dim_point(external_query, *ctx_, 1234);
+
+        // Configure query and allocate result buffers
+        auto ext_res = external_query->read_next().value();
+
+        REQUIRE(ext_res->num_rows() == 1);
+
+        external_query->reset();
+
+        // Range query on the string dimension: of the written values "apple"
+        // and "bat", only "apple" lies in ["apple", "b"].
+        columns[0]->select_columns(external_query);
+        columns[0]->set_dim_ranges(
+            external_query,
+            *ctx_,
+            std::vector(
+                {std::make_pair("apple", "b")}));
+
+        // Configure query and allocate result buffers
+        ext_res = external_query->read_next().value();
+
+        REQUIRE(ext_res->num_rows() == 1);
+    }
+}