Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Backport release-1.5] [r] Support enumerations in update_obs/update_var #1734

Merged
merged 1 commit into from
Sep 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 21 additions & 6 deletions apis/r/R/SOMADataFrame.R
Original file line number Diff line number Diff line change
Expand Up @@ -326,13 +326,28 @@ SOMADataFrame <- R6::R6Class(
# Add columns
for (add_col in add_cols) {
spdl::info("[SOMADataFrame update]: adding column '{}'", add_col)
se <- tiledb::tiledb_array_schema_evolution_add_attribute(
object = se,
attr = tiledb_attr_from_arrow_field(
field = new_schema$GetFieldByName(add_col),
tiledb_create_options = tiledb_create_options
)

col_type <- new_schema$GetFieldByName(add_col)$type
attr <- tiledb_attr_from_arrow_field(
field = new_schema$GetFieldByName(add_col),
tiledb_create_options = tiledb_create_options
)

if (inherits(col_type, "DictionaryType")) {
spdl::debug(
"[SOMADataFrame update]: adding column '{}' as an enumerated type",
add_col
)
se <- tiledb::tiledb_array_schema_evolution_add_enumeration(
object = se,
name = add_col,
enums = levels(values$GetColumnByName(add_col)$as_vector()),
ordered = col_type$ordered
)
attr <- tiledb::tiledb_attribute_set_enumeration_name(attr, add_col)
}

se <- tiledb::tiledb_array_schema_evolution_add_attribute(se, attr)
}

se <- tiledb::tiledb_array_schema_evolution_array_evolve(se, self$uri)
Expand Down
145 changes: 142 additions & 3 deletions apis/r/tests/testthat/test-SOMADataFrame.R
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,7 @@ test_that("creation with ordered factors", {
uri <- withr::local_tempdir("soma-dataframe-ordered")
n <- 10L
df <- data.frame(
soma_joinid = bit64::as.integer64(seq_len(length.out = n) - 1L),
soma_joinid = bit64::seq.integer64(from = 0L, to = n - 1L),
int = seq_len(length.out = n),
bool = rep_len(c(TRUE, FALSE), length.out = n),
ord = ordered(rep_len(c("g1", "g2", "g3"), length.out = n))
Expand Down Expand Up @@ -259,7 +259,7 @@ test_that("explicit casting of ordered factors to regular factors", {
uri <- withr::local_tempdir("soma-dataframe-unordered")
n <- 10L
df <- data.frame(
soma_joinid = bit64::as.integer64(seq_len(length.out = n) - 1L),
soma_joinid = bit64::seq.integer64(from = 0L, to = n - 1L),
int = seq_len(length.out = n),
bool = rep_len(c(TRUE, FALSE), length.out = n),
ord = ordered(rep_len(c("g1", "g2", "g3"), length.out = n))
Expand Down Expand Up @@ -291,7 +291,6 @@ test_that("explicit casting of ordered factors to regular factors", {
expect_identical(levels(ord), levels(df$ord))
})


test_that("SOMADataFrame read", {
skip_if(!extended_tests())
uri <- extract_dataset("soma-dataframe-pbmc3k-processed-obs")
Expand Down Expand Up @@ -601,6 +600,69 @@ test_that("SOMADataFrame can be updated", {
tbl1 <- SOMADataFrameOpen(uri, mode = "READ")$read()$concat()
expect_true(tbl1$Equals(tbl0))

# Add a new enum and update
tbl0$frobo <- factor(sample(letters[1:3], nrow(tbl0), replace = TRUE))
expect_no_condition(sdf <- SOMADataFrameOpen(uri, mode = "WRITE")$update(tbl0))

# Verify enum was added on disk
expect_s3_class(
tbl1 <- SOMADataFrameOpen(uri, mode = "READ")$read()$concat(),
"Table"
)
expect_identical(as.data.frame(tbl1), as.data.frame(tbl0))
expect_s3_class(
tbl1$GetColumnByName("frobo")$as_vector(),
"factor",
exact = TRUE
)

# Add a new enum where levels aren't in appearance- or alphabetical-order
tbl0 <- tbl1
tbl0$rlvl <- factor(
rep_len(c("red", "blue", "green"), nrow(tbl0)),
levels = c("green", "red", "blue")
)
expect_identical(
levels(tbl0$GetColumnByName("rlvl")$as_vector()),
c("green", "red", "blue")
)
expect_no_condition(sdf <- SOMADataFrameOpen(uri, mode = "WRITE")$update(tbl0))

# Verify unordered enum was added on disk
expect_s3_class(
tbl1 <- SOMADataFrameOpen(uri, mode = "READ")$read()$concat(),
"Table"
)
expect_identical(as.data.frame(tbl1), as.data.frame(tbl0))
expect_s3_class(
tbl1$GetColumnByName("rlvl")$as_vector(),
"factor",
exact = TRUE
)
expect_identical(
levels(tbl1$GetColumnByName("rlvl")$as_vector()),
c("green", "red", "blue")
)

# Add a new ordered and update
tbl0 <- tbl1
tbl0$ord <- ordered(sample(c("g1", "g2", "g3"), nrow(tbl0), replace = TRUE))
expect_no_condition(sdf <- SOMADataFrameOpen(uri, mode = "WRITE")$update(tbl0))

# Verify ordered was added on disk
expect_s3_class(
tbl1 <- SOMADataFrameOpen(uri, mode = "READ")$read()$concat(),
"Table"
)

# Read ordered enums
expect_identical(as.data.frame(tbl1), as.data.frame(tbl0))
expect_s3_class(
tbl1$GetColumnByName("ord")$as_vector(),
c("ordered", "factor"),
exact = TRUE
)

# Error if attempting to drop an array dimension
tbl0$foo <- NULL # drop the indexed dimension
expect_error(
Expand Down Expand Up @@ -654,3 +716,80 @@ test_that("SOMADataFrame can be updated from a data frame", {
"'row_index_name' conflicts with an existing column name"
)
})

test_that("missing levels in enums", {
skip_if_not_installed("tiledb", "0.21.0")
skip_if(!extended_tests())
uri <- withr::local_tempdir("soma-dataframe-missing-levels")
n <- 10L
df <- data.frame(
soma_joinid = bit64::seq.integer64(from = 0L, to = n - 1L),
int = seq_len(length.out = n),
enum = factor(
x = rep_len(c("g1", "g2", "g3"), length.out = n),
levels = c("g1", "g3")
)
)
expect_true(any(is.na(df$enum)))

# Create SOMADataFrame w/ missing enum levels
tbl <- arrow::as_arrow_table(df)
lvls <- sapply(
setdiff(names(df), "soma_joinid"),
function(x) levels(df[[x]]),
simplify = FALSE,
USE.NAMES = TRUE
)
sdf <- SOMADataFrameCreate(uri, tbl$schema, levels = lvls)
on.exit(sdf$close())
sdf$write(tbl)
sdf$close()

# Test missingness is preserved
expect_s3_class(sdf <- SOMADataFrameOpen(uri), "SOMADataFrame")
expect_true(tiledb::tiledb_array_has_enumeration(sdf$object)["enum"])
expect_s4_class(
attr <- tiledb::attrs(sdf$tiledb_schema())$enum,
"tiledb_attr"
)
expect_identical(
tiledb::tiledb_attribute_get_enumeration(attr, sdf$object),
levels(df$enum)
)
expect_true(tiledb::tiledb_attribute_get_nullable(attr))

# Test reading preserves missingness
expect_identical(sdf$object[]$enum, df$enum)
tbl0 <- sdf$read()$concat()
expect_identical(tbl0$enum$as_vector(), df$enum)
sdf$close()

# Update w/ missing enum levels
tbl0$miss <- factor(
rep_len(letters[1:3], length.out = n),
levels = c("b", "a")
)
expect_true(any(is.na(tbl0$miss$as_vector())))
sdf <- SOMADataFrameOpen(uri, mode = "WRITE")
expect_no_condition(sdf$update(tbl0))
sdf$close()

# Test missingness is preserved when updating
expect_s3_class(sdf <- SOMADataFrameOpen(uri), "SOMADataFrame")
expect_true(tiledb::tiledb_array_has_enumeration(sdf$object)["miss"])
expect_s4_class(
attr <- tiledb::attrs(sdf$tiledb_schema())$miss,
"tiledb_attr"
)
expect_identical(
tiledb::tiledb_attribute_get_enumeration(attr, sdf$object),
levels(tbl0$miss$as_vector())
)
expect_true(tiledb::tiledb_attribute_get_nullable(attr))

# Test reading preserves updated missingness
expect_identical(sdf$object[]$miss, tbl0$miss$as_vector())
tbl1 <- sdf$read()$concat()
expect_identical(tbl1$miss$as_vector(), tbl0$miss$as_vector())
sdf$close()
})
38 changes: 28 additions & 10 deletions libtiledbsoma/src/soma/column_buffer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -53,10 +53,12 @@ std::shared_ptr<ColumnBuffer> ColumnBuffer::create(
auto enum_name = AttributeExperimental::get_enumeration_name(
schema.context(), attr);
std::optional<Enumeration> enumeration = std::nullopt;
bool is_ordered = false;
if (enum_name.has_value()) {
enumeration = std::make_optional<Enumeration>(
ArrayExperimental::get_enumeration(
schema.context(), *array, *enum_name));
auto enmr = ArrayExperimental::get_enumeration(
schema.context(), *array, *enum_name);
is_ordered = enmr.ordered();
enumeration = std::make_optional<Enumeration>(enmr);
}

if (!is_var && attr.cell_val_num() != 1) {
Expand All @@ -66,7 +68,13 @@ std::shared_ptr<ColumnBuffer> ColumnBuffer::create(
}

return ColumnBuffer::alloc(
schema, name_str, type, is_var, is_nullable, enumeration);
schema,
name_str,
type,
is_var,
is_nullable,
enumeration,
is_ordered);

} else if (schema.domain().has_dimension(name_str)) {
auto dim = schema.domain().dimension(name_str);
Expand All @@ -82,7 +90,7 @@ std::shared_ptr<ColumnBuffer> ColumnBuffer::create(
}

return ColumnBuffer::alloc(
schema, name_str, type, is_var, false, std::nullopt);
schema, name_str, type, is_var, false, std::nullopt, false);
}

throw TileDBSOMAError("[ColumnBuffer] Column name not found: " + name_str);
Expand Down Expand Up @@ -116,14 +124,16 @@ ColumnBuffer::ColumnBuffer(
size_t num_bytes,
bool is_var,
bool is_nullable,
std::optional<Enumeration> enumeration)
std::optional<Enumeration> enumeration,
bool is_ordered)
: name_(name)
, type_(type)
, type_size_(tiledb::impl::type_size(type))
, num_cells_(0)
, is_var_(is_var)
, is_nullable_(is_nullable)
, enumeration_(enumeration) {
, enumeration_(enumeration)
, is_ordered_(is_ordered) {
LOG_DEBUG(fmt::format(
"[ColumnBuffer] '{}' {} bytes is_var={} is_nullable={}",
name,
Expand Down Expand Up @@ -201,7 +211,8 @@ std::shared_ptr<ColumnBuffer> ColumnBuffer::alloc(
tiledb_datatype_t type,
bool is_var,
bool is_nullable,
std::optional<Enumeration> enumeration) {
std::optional<Enumeration> enumeration,
bool is_ordered) {
// Set number of bytes for the data buffer. Override with a value from
// the config if present.
auto num_bytes = DEFAULT_ALLOC_BYTES;
Expand Down Expand Up @@ -233,7 +244,14 @@ std::shared_ptr<ColumnBuffer> ColumnBuffer::alloc(
num_bytes / tiledb::impl::type_size(type);

return std::make_shared<ColumnBuffer>(
name, type, num_cells, num_bytes, is_var, is_nullable, enumeration);
name,
type,
num_cells,
num_bytes,
is_var,
is_nullable,
enumeration,
is_ordered);
}

} // namespace tiledbsoma
} // namespace tiledbsoma
18 changes: 15 additions & 3 deletions libtiledbsoma/src/soma/column_buffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ class ColumnBuffer {
* @param is_var Column type is variable length
* @param is_nullable Column can contain null values
* @param enumeration Optional Enumeration associated with column
* @param is_ordered Optional Enumeration is ordered
*/
ColumnBuffer(
std::string_view name,
Expand All @@ -120,7 +121,8 @@ class ColumnBuffer {
size_t num_bytes,
bool is_var = false,
bool is_nullable = false,
std::optional<Enumeration> enumeration = std::nullopt);
std::optional<Enumeration> enumeration = std::nullopt,
bool is_ordered = false);

ColumnBuffer() = delete;
ColumnBuffer(const ColumnBuffer&) = delete;
Expand Down Expand Up @@ -328,6 +330,13 @@ class ColumnBuffer {
return tcb::span<char>(enum_str_.data(), enum_str_.length());
}

/**
* @brief Return true if the buffer contains an ordered enumeration.
*/
bool is_ordered() const {
return is_ordered_;
}

private:
//===================================================================
//= private static
Expand All @@ -342,6 +351,7 @@ class ColumnBuffer {
* @param is_var True if variable length data
* @param is_nullable True if nullable data
* @param enumeration Optional Enumeration associated with column
* @param is_ordered Optional Enumeration is ordered
* @return ColumnBuffer
*/
static std::shared_ptr<ColumnBuffer> alloc(
Expand All @@ -350,7 +360,8 @@ class ColumnBuffer {
tiledb_datatype_t type,
bool is_var,
bool is_nullable,
std::optional<Enumeration> enumeration);
std::optional<Enumeration> enumeration,
bool is_ordered);

//===================================================================
//= private non-static
Expand Down Expand Up @@ -395,7 +406,8 @@ class ColumnBuffer {
// Enumerations (optional) as string and offsets
std::string enum_str_;
std::vector<uint32_t> enum_offsets_;
bool is_ordered_ = false;
};

} // namespace tiledbsoma
#endif
#endif
3 changes: 3 additions & 0 deletions libtiledbsoma/src/utils/arrow_adapter.h
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,9 @@ class ArrowAdapter {
column->validity_to_bitmap();
array->buffers[0] = column->validity().data();
}
if (column->is_ordered()) {
schema->flags |= ARROW_FLAG_DICTIONARY_ORDERED;
}

/* Workaround to cast TILEDB_BOOL from uint8 to 1-bit Arrow boolean. */
if (column->type() == TILEDB_BOOL) {
Expand Down
Loading