Skip to content

Commit

Permalink
[r] Support enumerations in update_obs/update_var (#1711)
Browse files Browse the repository at this point in the history
* [r] Support enumerations in update_obs/update_var [WIP]

* Allow updating a `SOMADataFrame` with enums

* Add tests where user explicitly sets levels (should fail because of tiledb-r)

* [c++] Support return of `ordered` enumerations (#1717)

* [c++] Support return of `ordered` enumerations

* [r] Install tiledb-r from r-universe

* Run ordered enum tests

* Add tests for missing levels in enums

* code-review feedback

---------

Co-authored-by: Paul Hoffman <[email protected]>
Co-authored-by: Dirk Eddelbuettel <[email protected]>
  • Loading branch information
3 people authored Sep 28, 2023
1 parent fd9d458 commit 1a3b5c3
Show file tree
Hide file tree
Showing 5 changed files with 209 additions and 22 deletions.
27 changes: 21 additions & 6 deletions apis/r/R/SOMADataFrame.R
Original file line number Diff line number Diff line change
Expand Up @@ -326,13 +326,28 @@ SOMADataFrame <- R6::R6Class(
# Add columns
for (add_col in add_cols) {
spdl::info("[SOMADataFrame update]: adding column '{}'", add_col)
se <- tiledb::tiledb_array_schema_evolution_add_attribute(
object = se,
attr = tiledb_attr_from_arrow_field(
field = new_schema$GetFieldByName(add_col),
tiledb_create_options = tiledb_create_options
)

col_type <- new_schema$GetFieldByName(add_col)$type
attr <- tiledb_attr_from_arrow_field(
field = new_schema$GetFieldByName(add_col),
tiledb_create_options = tiledb_create_options
)

if (inherits(col_type, "DictionaryType")) {
spdl::debug(
"[SOMADataFrame update]: adding column '{}' as an enumerated type",
add_col
)
se <- tiledb::tiledb_array_schema_evolution_add_enumeration(
object = se,
name = add_col,
enums = levels(values$GetColumnByName(add_col)$as_vector()),
ordered = col_type$ordered
)
attr <- tiledb::tiledb_attribute_set_enumeration_name(attr, add_col)
}

se <- tiledb::tiledb_array_schema_evolution_add_attribute(se, attr)
}

se <- tiledb::tiledb_array_schema_evolution_array_evolve(se, self$uri)
Expand Down
145 changes: 142 additions & 3 deletions apis/r/tests/testthat/test-SOMADataFrame.R
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,7 @@ test_that("creation with ordered factors", {
uri <- withr::local_tempdir("soma-dataframe-ordered")
n <- 10L
df <- data.frame(
soma_joinid = bit64::as.integer64(seq_len(length.out = n) - 1L),
soma_joinid = bit64::seq.integer64(from = 0L, to = n - 1L),
int = seq_len(length.out = n),
bool = rep_len(c(TRUE, FALSE), length.out = n),
ord = ordered(rep_len(c("g1", "g2", "g3"), length.out = n))
Expand Down Expand Up @@ -259,7 +259,7 @@ test_that("explicit casting of ordered factors to regular factors", {
uri <- withr::local_tempdir("soma-dataframe-unordered")
n <- 10L
df <- data.frame(
soma_joinid = bit64::as.integer64(seq_len(length.out = n) - 1L),
soma_joinid = bit64::seq.integer64(from = 0L, to = n - 1L),
int = seq_len(length.out = n),
bool = rep_len(c(TRUE, FALSE), length.out = n),
ord = ordered(rep_len(c("g1", "g2", "g3"), length.out = n))
Expand Down Expand Up @@ -291,7 +291,6 @@ test_that("explicit casting of ordered factors to regular factors", {
expect_identical(levels(ord), levels(df$ord))
})


test_that("SOMADataFrame read", {
skip_if(!extended_tests())
uri <- extract_dataset("soma-dataframe-pbmc3k-processed-obs")
Expand Down Expand Up @@ -601,6 +600,69 @@ test_that("SOMADataFrame can be updated", {
tbl1 <- SOMADataFrameOpen(uri, mode = "READ")$read()$concat()
expect_true(tbl1$Equals(tbl0))

# Add a new enum and update
tbl0$frobo <- factor(sample(letters[1:3], nrow(tbl0), replace = TRUE))
expect_no_condition(sdf <- SOMADataFrameOpen(uri, mode = "WRITE")$update(tbl0))

# Verify enum was added on disk
expect_s3_class(
tbl1 <- SOMADataFrameOpen(uri, mode = "READ")$read()$concat(),
"Table"
)
expect_identical(as.data.frame(tbl1), as.data.frame(tbl0))
expect_s3_class(
tbl1$GetColumnByName("frobo")$as_vector(),
"factor",
exact = TRUE
)

# Add a new enum where levels aren't in appearance- or alphabetical-order
tbl0 <- tbl1
tbl0$rlvl <- factor(
rep_len(c("red", "blue", "green"), nrow(tbl0)),
levels = c("green", "red", "blue")
)
expect_identical(
levels(tbl0$GetColumnByName("rlvl")$as_vector()),
c("green", "red", "blue")
)
expect_no_condition(sdf <- SOMADataFrameOpen(uri, mode = "WRITE")$update(tbl0))

# Verify unordered enum was added on disk
expect_s3_class(
tbl1 <- SOMADataFrameOpen(uri, mode = "READ")$read()$concat(),
"Table"
)
expect_identical(as.data.frame(tbl1), as.data.frame(tbl0))
expect_s3_class(
tbl1$GetColumnByName("rlvl")$as_vector(),
"factor",
exact = TRUE
)
expect_identical(
levels(tbl1$GetColumnByName("rlvl")$as_vector()),
c("green", "red", "blue")
)

# Add a new ordered and update
tbl0 <- tbl1
tbl0$ord <- ordered(sample(c("g1", "g2", "g3"), nrow(tbl0), replace = TRUE))
expect_no_condition(sdf <- SOMADataFrameOpen(uri, mode = "WRITE")$update(tbl0))

# Verify ordered was added on disk
expect_s3_class(
tbl1 <- SOMADataFrameOpen(uri, mode = "READ")$read()$concat(),
"Table"
)

# Read ordered enums
expect_identical(as.data.frame(tbl1), as.data.frame(tbl0))
expect_s3_class(
tbl1$GetColumnByName("ord")$as_vector(),
c("ordered", "factor"),
exact = TRUE
)

# Error if attempting to drop an array dimension
tbl0$foo <- NULL # drop the indexed dimension
expect_error(
Expand Down Expand Up @@ -654,3 +716,80 @@ test_that("SOMADataFrame can be updated from a data frame", {
"'row_index_name' conflicts with an existing column name"
)
})

test_that("missing levels in enums", {
skip_if_not_installed("tiledb", "0.21.0")
skip_if(!extended_tests())
uri <- withr::local_tempdir("soma-dataframe-missing-levels")
n <- 10L
df <- data.frame(
soma_joinid = bit64::seq.integer64(from = 0L, to = n - 1L),
int = seq_len(length.out = n),
enum = factor(
x = rep_len(c("g1", "g2", "g3"), length.out = n),
levels = c("g1", "g3")
)
)
expect_true(any(is.na(df$enum)))

# Create SOMADataFrame w/ missing enum levels
tbl <- arrow::as_arrow_table(df)
lvls <- sapply(
setdiff(names(df), "soma_joinid"),
function(x) levels(df[[x]]),
simplify = FALSE,
USE.NAMES = TRUE
)
sdf <- SOMADataFrameCreate(uri, tbl$schema, levels = lvls)
on.exit(sdf$close())
sdf$write(tbl)
sdf$close()

# Test missingness is preserved
expect_s3_class(sdf <- SOMADataFrameOpen(uri), "SOMADataFrame")
expect_true(tiledb::tiledb_array_has_enumeration(sdf$object)["enum"])
expect_s4_class(
attr <- tiledb::attrs(sdf$tiledb_schema())$enum,
"tiledb_attr"
)
expect_identical(
tiledb::tiledb_attribute_get_enumeration(attr, sdf$object),
levels(df$enum)
)
expect_true(tiledb::tiledb_attribute_get_nullable(attr))

# Test reading preserves missingness
expect_identical(sdf$object[]$enum, df$enum)
tbl0 <- sdf$read()$concat()
expect_identical(tbl0$enum$as_vector(), df$enum)
sdf$close()

# Update w/ missing enum levels
tbl0$miss <- factor(
rep_len(letters[1:3], length.out = n),
levels = c("b", "a")
)
expect_true(any(is.na(tbl0$miss$as_vector())))
sdf <- SOMADataFrameOpen(uri, mode = "WRITE")
expect_no_condition(sdf$update(tbl0))
sdf$close()

# Test missingness is preserved when updating
expect_s3_class(sdf <- SOMADataFrameOpen(uri), "SOMADataFrame")
expect_true(tiledb::tiledb_array_has_enumeration(sdf$object)["miss"])
expect_s4_class(
attr <- tiledb::attrs(sdf$tiledb_schema())$miss,
"tiledb_attr"
)
expect_identical(
tiledb::tiledb_attribute_get_enumeration(attr, sdf$object),
levels(tbl0$miss$as_vector())
)
expect_true(tiledb::tiledb_attribute_get_nullable(attr))

# Test reading preserves updated missingness
expect_identical(sdf$object[]$miss, tbl0$miss$as_vector())
tbl1 <- sdf$read()$concat()
expect_identical(tbl1$miss$as_vector(), tbl0$miss$as_vector())
sdf$close()
})
38 changes: 28 additions & 10 deletions libtiledbsoma/src/soma/column_buffer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -53,10 +53,12 @@ std::shared_ptr<ColumnBuffer> ColumnBuffer::create(
auto enum_name = AttributeExperimental::get_enumeration_name(
schema.context(), attr);
std::optional<Enumeration> enumeration = std::nullopt;
bool is_ordered = false;
if (enum_name.has_value()) {
enumeration = std::make_optional<Enumeration>(
ArrayExperimental::get_enumeration(
schema.context(), *array, *enum_name));
auto enmr = ArrayExperimental::get_enumeration(
schema.context(), *array, *enum_name);
is_ordered = enmr.ordered();
enumeration = std::make_optional<Enumeration>(enmr);
}

if (!is_var && attr.cell_val_num() != 1) {
Expand All @@ -66,7 +68,13 @@ std::shared_ptr<ColumnBuffer> ColumnBuffer::create(
}

return ColumnBuffer::alloc(
schema, name_str, type, is_var, is_nullable, enumeration);
schema,
name_str,
type,
is_var,
is_nullable,
enumeration,
is_ordered);

} else if (schema.domain().has_dimension(name_str)) {
auto dim = schema.domain().dimension(name_str);
Expand All @@ -82,7 +90,7 @@ std::shared_ptr<ColumnBuffer> ColumnBuffer::create(
}

return ColumnBuffer::alloc(
schema, name_str, type, is_var, false, std::nullopt);
schema, name_str, type, is_var, false, std::nullopt, false);
}

throw TileDBSOMAError("[ColumnBuffer] Column name not found: " + name_str);
Expand Down Expand Up @@ -116,14 +124,16 @@ ColumnBuffer::ColumnBuffer(
size_t num_bytes,
bool is_var,
bool is_nullable,
std::optional<Enumeration> enumeration)
std::optional<Enumeration> enumeration,
bool is_ordered)
: name_(name)
, type_(type)
, type_size_(tiledb::impl::type_size(type))
, num_cells_(0)
, is_var_(is_var)
, is_nullable_(is_nullable)
, enumeration_(enumeration) {
, enumeration_(enumeration)
, is_ordered_(is_ordered) {
LOG_DEBUG(fmt::format(
"[ColumnBuffer] '{}' {} bytes is_var={} is_nullable={}",
name,
Expand Down Expand Up @@ -201,7 +211,8 @@ std::shared_ptr<ColumnBuffer> ColumnBuffer::alloc(
tiledb_datatype_t type,
bool is_var,
bool is_nullable,
std::optional<Enumeration> enumeration) {
std::optional<Enumeration> enumeration,
bool is_ordered) {
// Set number of bytes for the data buffer. Override with a value from
// the config if present.
auto num_bytes = DEFAULT_ALLOC_BYTES;
Expand Down Expand Up @@ -233,7 +244,14 @@ std::shared_ptr<ColumnBuffer> ColumnBuffer::alloc(
num_bytes / tiledb::impl::type_size(type);

return std::make_shared<ColumnBuffer>(
name, type, num_cells, num_bytes, is_var, is_nullable, enumeration);
name,
type,
num_cells,
num_bytes,
is_var,
is_nullable,
enumeration,
is_ordered);
}

} // namespace tiledbsoma
} // namespace tiledbsoma
18 changes: 15 additions & 3 deletions libtiledbsoma/src/soma/column_buffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ class ColumnBuffer {
* @param is_var Column type is variable length
* @param is_nullable Column can contain null values
* @param enumeration Optional Enumeration associated with column
* @param is_ordered Optional Enumeration is ordered
*/
ColumnBuffer(
std::string_view name,
Expand All @@ -120,7 +121,8 @@ class ColumnBuffer {
size_t num_bytes,
bool is_var = false,
bool is_nullable = false,
std::optional<Enumeration> enumeration = std::nullopt);
std::optional<Enumeration> enumeration = std::nullopt,
bool is_ordered = false);

ColumnBuffer() = delete;
ColumnBuffer(const ColumnBuffer&) = delete;
Expand Down Expand Up @@ -328,6 +330,13 @@ class ColumnBuffer {
return tcb::span<char>(enum_str_.data(), enum_str_.length());
}

/**
* @brief Return true if the buffer contains an ordered enumeration.
*/
bool is_ordered() const {
return is_ordered_;
}

private:
//===================================================================
//= private static
Expand All @@ -342,6 +351,7 @@ class ColumnBuffer {
* @param is_var True if variable length data
* @param is_nullable True if nullable data
* @param enumeration Optional Enumeration associated with column
* @param is_ordered Optional Enumeration is ordered
* @return ColumnBuffer
*/
static std::shared_ptr<ColumnBuffer> alloc(
Expand All @@ -350,7 +360,8 @@ class ColumnBuffer {
tiledb_datatype_t type,
bool is_var,
bool is_nullable,
std::optional<Enumeration> enumeration);
std::optional<Enumeration> enumeration,
bool is_ordered);

//===================================================================
//= private non-static
Expand Down Expand Up @@ -395,7 +406,8 @@ class ColumnBuffer {
// Enumerations (optional) as string and offsets
std::string enum_str_;
std::vector<uint32_t> enum_offsets_;
bool is_ordered_ = false;
};

} // namespace tiledbsoma
#endif
#endif
3 changes: 3 additions & 0 deletions libtiledbsoma/src/utils/arrow_adapter.h
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,9 @@ class ArrowAdapter {
column->validity_to_bitmap();
array->buffers[0] = column->validity().data();
}
if (column->is_ordered()) {
schema->flags |= ARROW_FLAG_DICTIONARY_ORDERED;
}

/* Workaround to cast TILEDB_BOOL from uint8 to 1-bit Arrow boolean. */
if (column->type() == TILEDB_BOOL) {
Expand Down

0 comments on commit 1a3b5c3

Please sign in to comment.