Skip to content

Commit

Permalink
[c++] Support Enumeration in C++ codebase (#1519)
Browse files Browse the repository at this point in the history
* [c++] Support `Enumeration` in C++ Codebase

* Addition of `SOMAArray::get_enum` and `SOMAArray::get_enum_label_on_attr`
* Attach an enumeration/dictionary to the `ColumnBuffer` if applicable;
  this is used when converting from `ArrayBuffers` to Arrow Tables in
  the Python and R APIs

* Add `get_attr_to_enum_mapping` Function

* WIP fix bug where attr name was passed instead of enum name

* Add Unit Tests for Enumeration in C++

* `to_varlen_buffers` Returns `std::string`

* Prior to TileDB-Inc/TileDB#4272, the SOMA unit tests were erroneously
  writing a byte vector for string dimensions, which maps to `TILEDB_BLOB`
  rather than `TILEDB_STRING_ASCII`.

* Update SOMA Array get_metadata Signatures

* Depend on 2.17.0-rc0

* resolve an incomplete merge with #1559

* [r] Update tiledb-r to RC (borrowed from #1663, #1665)

* [r] Undo brown-bag typo in helper script

* 2.17.0

---------

Co-authored-by: John Kerl <[email protected]>
Co-authored-by: Dirk Eddelbuettel <[email protected]>
  • Loading branch information
3 people authored Sep 14, 2023
1 parent 04b9b80 commit aa1c3fe
Show file tree
Hide file tree
Showing 11 changed files with 194 additions and 81 deletions.
8 changes: 8 additions & 0 deletions .github/workflows/r-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,14 @@ jobs:

- name: Install BioConductor package SingleCellExperiment
run: cd apis/r && tools/r-ci.sh install_bioc SingleCellExperiment

- name: Install rc version of tiledb-r (macOS)
if: ${{ matrix.os == 'macOS-latest' }}
run: cd apis/r && Rscript -e "install.packages('tiledb', repos = c('https://eddelbuettel.r-universe.dev', 'https://cloud.r-project.org'))"

- name: Install rc version of tiledb-r (linux)
if: ${{ matrix.os != 'macOS-latest' }}
run: cd apis/r && Rscript -e "options(bspm.version.check=TRUE); install.packages('tiledb', repos = c('https://eddelbuettel.r-universe.dev/bin/linux/jammy/4.3/', 'https://cloud.r-project.org'))"

- name: Dependencies
run: cd apis/r && tools/r-ci.sh install_all
Expand Down
4 changes: 2 additions & 2 deletions apis/r/tools/get_tarball.R
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
#!/usr/bin/env Rscript

## version pinning info
tiledb_core_version <- "2.16.2"
tiledb_core_sha1 <- "07b65de"
tiledb_core_version <- "2.17.0"
tiledb_core_sha1 <- "93c173d"

if ( ! dir.exists("inst/") ) {
stop("No 'inst/' directory. Exiting.", call. = FALSE)
Expand Down
28 changes: 14 additions & 14 deletions libtiledbsoma/cmake/Modules/FindTileDB_EP.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,8 @@ else()
# NB When updating the pinned URLs here, please also update in file apis/r/tools/get_tarball.R
if(DOWNLOAD_TILEDB_PREBUILT)
if (WIN32) # Windows
SET(DOWNLOAD_URL "https://github.com/TileDB-Inc/TileDB/releases/download/2.16.2/tiledb-windows-x86_64-2.16.2-07b65de.zip")
SET(DOWNLOAD_SHA1 "1cda23235ceeff70cb2b30e0c0e22fcd9fd83b51")
SET(DOWNLOAD_URL "https://github.com/TileDB-Inc/TileDB/releases/download/2.17.0/tiledb-windows-x86_64-2.17.0-93c173d.zip")
SET(DOWNLOAD_SHA1 "d43589b22de95d45b40de9918d105a6174ec352e")
elseif(APPLE) # OSX

# Status quo as of 2023-05-18:
Expand All @@ -76,22 +76,22 @@ else()
# o CMAKE_SYSTEM_PROCESSOR is x86_64

if (CMAKE_OSX_ARCHITECTURES STREQUAL x86_64)
SET(DOWNLOAD_URL "https://github.com/TileDB-Inc/TileDB/releases/download/2.16.2/tiledb-macos-x86_64-2.16.2-07b65de.tar.gz")
SET(DOWNLOAD_SHA1 "355233cee1515857c91b2f12fe4f7bbc1ac02465")
SET(DOWNLOAD_URL "https://github.com/TileDB-Inc/TileDB/releases/download/2.17.0/tiledb-macos-x86_64-2.17.0-93c173d.tar.gz")
SET(DOWNLOAD_SHA1 "9a232015cbf09c5bd37375537cef80a382e1ffa4")
elseif (CMAKE_OSX_ARCHITECTURES STREQUAL arm64)
SET(DOWNLOAD_URL "https://github.com/TileDB-Inc/TileDB/releases/download/2.16.2/tiledb-macos-arm64-2.16.2-07b65de.tar.gz")
SET(DOWNLOAD_SHA1 "5aad92b76e6fe3f7129f514ed926ef1c8af4bfa3")
SET(DOWNLOAD_URL "https://github.com/TileDB-Inc/TileDB/releases/download/2.17.0/tiledb-macos-arm64-2.17.0-93c173d.tar.gz")
SET(DOWNLOAD_SHA1 "b861b90b462963db44fe0217087fac3510fd6293")
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)")
SET(DOWNLOAD_URL "https://github.com/TileDB-Inc/TileDB/releases/download/2.16.2/tiledb-macos-x86_64-2.16.2-07b65de.tar.gz")
SET(DOWNLOAD_SHA1 "355233cee1515857c91b2f12fe4f7bbc1ac02465")
SET(DOWNLOAD_URL "https://github.com/TileDB-Inc/TileDB/releases/download/2.17.0/tiledb-macos-x86_64-2.17.0-93c173d.tar.gz")
SET(DOWNLOAD_SHA1 "9a232015cbf09c5bd37375537cef80a382e1ffa4")
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64" OR CMAKE_SYSTEM_PROCESSOR MATCHES "^arm")
SET(DOWNLOAD_URL "https://github.com/TileDB-Inc/TileDB/releases/download/2.16.2/tiledb-macos-arm64-2.16.2-07b65de.tar.gz")
SET(DOWNLOAD_SHA1 "5aad92b76e6fe3f7129f514ed926ef1c8af4bfa3")
SET(DOWNLOAD_URL "https://github.com/TileDB-Inc/TileDB/releases/download/2.17.0/tiledb-macos-arm64-2.17.0-93c173d.tar.gz")
SET(DOWNLOAD_SHA1 "b861b90b462963db44fe0217087fac3510fd6293")
endif()

else() # Linux
SET(DOWNLOAD_URL "https://github.com/TileDB-Inc/TileDB/releases/download/2.16.2/tiledb-linux-x86_64-2.16.2-07b65de.tar.gz")
SET(DOWNLOAD_SHA1 "b9fc44a104f31a9348a399e55ef9e32903b99590")
SET(DOWNLOAD_URL "https://github.com/TileDB-Inc/TileDB/releases/download/2.17.0/tiledb-linux-x86_64-2.17.0-93c173d.tar.gz")
SET(DOWNLOAD_SHA1 "5c04c07a73d3fe48a9ba8f3ad8af5e1912a39ce8")
endif()

ExternalProject_Add(ep_tiledb
Expand All @@ -113,8 +113,8 @@ else()
else() # Build from source
ExternalProject_Add(ep_tiledb
PREFIX "externals"
URL "https://github.com/TileDB-Inc/TileDB/archive/2.16.2.zip"
URL_HASH SHA1=d54ff7fc4c3a1c5afb1027bab1ba011ae47c3d79
URL "https://github.com/TileDB-Inc/TileDB/archive/2.17.0.zip"
URL_HASH SHA1=bbf5b34fec1c729f048f48bf1a0f03abb447d7de
DOWNLOAD_NAME "tiledb.zip"
CMAKE_ARGS
-DCMAKE_INSTALL_PREFIX=${EP_INSTALL_PREFIX}
Expand Down
33 changes: 21 additions & 12 deletions libtiledbsoma/src/soma/column_buffer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -42,26 +42,31 @@ using namespace tiledb;

std::shared_ptr<ColumnBuffer> ColumnBuffer::create(
std::shared_ptr<Array> array, std::string_view name) {
return ColumnBuffer::create(array->schema(), name);
}

std::shared_ptr<ColumnBuffer> ColumnBuffer::create(
ArraySchema schema, std::string_view name) {
auto schema = array->schema();
auto name_str = std::string(name); // string for TileDB API

if (schema.has_attribute(name_str)) {
auto attr = schema.attribute(name_str);
auto type = attr.type();
bool is_var = attr.cell_val_num() == TILEDB_VAR_NUM;
bool is_nullable = attr.nullable();
auto enum_name = AttributeExperimental::get_enumeration_name(
schema.context(), attr);
std::optional<Enumeration> enumeration = std::nullopt;
if (enum_name.has_value()) {
enumeration = std::make_optional<Enumeration>(
ArrayExperimental::get_enumeration(
schema.context(), *array, *enum_name));
}

if (!is_var && attr.cell_val_num() != 1) {
throw TileDBSOMAError(
"[ColumnBuffer] Values per cell > 1 is not supported: " +
name_str);
}

return ColumnBuffer::alloc(schema, name_str, type, is_var, is_nullable);
return ColumnBuffer::alloc(
schema, name_str, type, is_var, is_nullable, enumeration);

} else if (schema.domain().has_dimension(name_str)) {
auto dim = schema.domain().dimension(name_str);
Expand All @@ -76,7 +81,8 @@ std::shared_ptr<ColumnBuffer> ColumnBuffer::create(
name_str);
}

return ColumnBuffer::alloc(schema, name_str, type, is_var, false);
return ColumnBuffer::alloc(
schema, name_str, type, is_var, false, std::nullopt);
}

throw TileDBSOMAError("[ColumnBuffer] Column name not found: " + name_str);
Expand Down Expand Up @@ -109,13 +115,15 @@ ColumnBuffer::ColumnBuffer(
size_t num_cells,
size_t num_bytes,
bool is_var,
bool is_nullable)
bool is_nullable,
std::optional<Enumeration> enumeration)
: name_(name)
, type_(type)
, type_size_(tiledb::impl::type_size(type))
, num_cells_(0)
, is_var_(is_var)
, is_nullable_(is_nullable) {
, is_nullable_(is_nullable)
, enumeration_(enumeration) {
LOG_DEBUG(fmt::format(
"[ColumnBuffer] '{}' {} bytes is_var={} is_nullable={}",
name,
Expand Down Expand Up @@ -192,7 +200,8 @@ std::shared_ptr<ColumnBuffer> ColumnBuffer::alloc(
std::string_view name,
tiledb_datatype_t type,
bool is_var,
bool is_nullable) {
bool is_nullable,
std::optional<Enumeration> enumeration) {
// Set number of bytes for the data buffer. Override with a value from
// the config if present.
auto num_bytes = DEFAULT_ALLOC_BYTES;
Expand Down Expand Up @@ -224,7 +233,7 @@ std::shared_ptr<ColumnBuffer> ColumnBuffer::alloc(
num_bytes / tiledb::impl::type_size(type);

return std::make_shared<ColumnBuffer>(
name, type, num_cells, num_bytes, is_var, is_nullable);
name, type, num_cells, num_bytes, is_var, is_nullable, enumeration);
}

} // namespace tiledbsoma
} // namespace tiledbsoma
36 changes: 20 additions & 16 deletions libtiledbsoma/src/soma/column_buffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
#include <stdexcept> // for windows: error C2039: 'runtime_error': is not a member of 'std'

#include <tiledb/tiledb>
#include <tiledb/tiledb_experimental>

#include "../utils/common.h"
#include "../utils/logger.h"
Expand Down Expand Up @@ -69,28 +70,20 @@ class ColumnBuffer {
static std::shared_ptr<ColumnBuffer> create(
std::shared_ptr<Array> array, std::string_view name);

/**
* @brief Create a ColumnBuffer from a schema and column name.
*
* @param schema TileDB schema
* @param name TileDB dimension or attribute name
* @return ColumnBuffer
*/
static std::shared_ptr<ColumnBuffer> create(
ArraySchema schema, std::string_view name);

/**
* @brief Create a ColumnBuffer from an array, column name, and data.
*
* @param schema TileDB schema
* @param array TileDB array
* @param name TileDB dimension or attribute name
* @param data Data to set in buffer
* @return ColumnBuffer
*/
template <typename T>
static std::shared_ptr<ColumnBuffer> create(
ArraySchema schema, std::string_view name, std::vector<T> data) {
auto column_buff = ColumnBuffer::create(schema, name);
std::shared_ptr<Array> array,
std::string_view name,
std::vector<T> data) {
auto column_buff = ColumnBuffer::create(array, name);
column_buff->num_cells_ = data.size();
column_buff->data_.resize(data.size());
column_buff->data_.assign(
Expand Down Expand Up @@ -118,14 +111,16 @@ class ColumnBuffer {
* @param num_bytes Number of bytes to allocate for data
* @param is_var Column type is variable length
* @param is_nullable Column can contain null values
* @param enumeration Optional Enumeration associated with column
*/
ColumnBuffer(
std::string_view name,
tiledb_datatype_t type,
size_t num_cells,
size_t num_bytes,
bool is_var = false,
bool is_nullable = false);
bool is_nullable = false,
std::optional<Enumeration> enumeration = std::nullopt);

ColumnBuffer() = delete;
ColumnBuffer(const ColumnBuffer&) = delete;
Expand Down Expand Up @@ -243,6 +238,10 @@ class ColumnBuffer {
return is_nullable_;
}

/**
 * @brief Return the Enumeration associated with this column, if any.
 *
 * The stored optional is returned by value, so the caller receives its own
 * copy rather than a reference into this buffer.
 *
 * @return std::optional<Enumeration> Copy of the stored enumeration_ member
 */
std::optional<Enumeration> get_enumeration() const {
return enumeration_;
}

/**
* @brief Convert the data bytemap to a bitmap in place.
*
Expand Down Expand Up @@ -272,14 +271,16 @@ class ColumnBuffer {
* @param type TileDB datatype
* @param is_var True if variable length data
* @param is_nullable True if nullable data
* @param enumeration Optional Enumeration associated with column
* @return ColumnBuffer
*/
static std::shared_ptr<ColumnBuffer> alloc(
ArraySchema schema,
std::string_view name,
tiledb_datatype_t type,
bool is_var,
bool is_nullable);
bool is_nullable,
std::optional<Enumeration> enumeration);

//===================================================================
//= private non-static
Expand All @@ -303,6 +304,9 @@ class ColumnBuffer {
// If true, the data is nullable
bool is_nullable_;

// If applicable, the Enumeration associated with the column
std::optional<Enumeration> enumeration_;

// Data buffer.
std::vector<std::byte> data_;

Expand All @@ -314,4 +318,4 @@ class ColumnBuffer {
};

} // namespace tiledbsoma
#endif
#endif
24 changes: 24 additions & 0 deletions libtiledbsoma/src/soma/soma_array.cc
Original file line number Diff line number Diff line change
Expand Up @@ -472,6 +472,30 @@ std::vector<std::string> SOMAArray::dimension_names() const {
return result;
}

std::map<std::string, Enumeration> SOMAArray::get_attr_to_enum_mapping() {
std::map<std::string, Enumeration> result;
for (uint32_t i = 0; i < arr_->schema().attribute_num(); ++i) {
auto attr = arr_->schema().attribute(i);
if (attr_has_enum(attr.name())) {
auto enmr_label = *get_enum_label_on_attr(attr.name());
auto enmr = ArrayExperimental::get_enumeration(
*ctx_, *arr_, enmr_label);
result.insert({attr.name(), enmr});
}
}
return result;
}

/**
 * @brief Look up the name (label) of the enumeration attached to an
 * attribute.
 *
 * @param attr_name Name of the attribute to inspect
 * @return std::optional<std::string> Whatever
 * AttributeExperimental::get_enumeration_name reports for that attribute
 */
std::optional<std::string> SOMAArray::get_enum_label_on_attr(
    std::string attr_name) {
    auto array_schema = arr_->schema();
    // Kept as a named lvalue: it is handed to the TileDB call below.
    auto target_attr = array_schema.attribute(attr_name);
    std::optional<std::string> label =
        AttributeExperimental::get_enumeration_name(*ctx_, target_attr);
    return label;
}

/**
 * @brief Tell whether an attribute has an enumeration attached.
 *
 * @param attr_name Name of the attribute to inspect
 * @return true if get_enum_label_on_attr() yields a label for the attribute,
 * false otherwise
 */
bool SOMAArray::attr_has_enum(std::string attr_name) {
    const auto maybe_label = get_enum_label_on_attr(attr_name);
    if (maybe_label.has_value()) {
        return true;
    }
    return false;
}

void SOMAArray::set_metadata(
const std::string& key,
tiledb_datatype_t value_type,
Expand Down
Loading

0 comments on commit aa1c3fe

Please sign in to comment.