From e90d7db6f2f2e2d256948617630cfa559d36b0d3 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 18 Sep 2023 11:13:42 -0400 Subject: [PATCH 1/9] [python] Consolidation and vacuuming are now platform configuration options (#1690) (#1696) * Consolidation and vacuuming are now platform configuration options Commit and fragment_metadata consolidation and vacuuming can improve the opening and query performance of SOMA experiments. Vacuuming requires slight coordination though and should not happen by default. Instead a platform config allows the user to control these operations based. This will be expanded to defaults for top-level `io` packages where its more likely a user is doing a one-shot ingestion and will want automatic handling. A new platform config, `consolidate_and_vacuum` has been added which is a boolean to handle this behavior. * set is_mac for ci-minimal workflow Co-authored-by: Seth Shelnutt --- .github/workflows/python-ci-minimal.yml | 1 + apis/python/src/tiledbsoma/_dataframe.py | 9 +++--- apis/python/src/tiledbsoma/_dense_nd_array.py | 7 +++-- .../python/src/tiledbsoma/_sparse_nd_array.py | 19 +++++++----- apis/python/src/tiledbsoma/_tiledb_array.py | 29 ++++++++++++++++--- .../options/_tiledb_create_options.py | 3 ++ 6 files changed, 51 insertions(+), 17 deletions(-) diff --git a/.github/workflows/python-ci-minimal.yml b/.github/workflows/python-ci-minimal.yml index a1eb0a153f..ce9f3a0f4e 100644 --- a/.github/workflows/python-ci-minimal.yml +++ b/.github/workflows/python-ci-minimal.yml @@ -29,6 +29,7 @@ jobs: python_version: ${{ matrix.python-version }} cc: ${{ matrix.cc }} cxx: ${{ matrix.cxx }} + is_mac: ${{ contains(matrix.os, 'macos') }} report_codecov: ${{ matrix.python-version == '3.10' }} run_lint: ${{ matrix.python-version == '3.10' }} secrets: inherit diff --git a/apis/python/src/tiledbsoma/_dataframe.py b/apis/python/src/tiledbsoma/_dataframe.py index 
3d77312763..cb118f9fd4 100644 --- a/apis/python/src/tiledbsoma/_dataframe.py +++ b/apis/python/src/tiledbsoma/_dataframe.py @@ -401,7 +401,6 @@ def write( """ _util.check_type("values", values, (pa.Table,)) - del platform_config # unused dim_cols_map: Dict[str, pd.DataFrame] = {} attr_cols_map: Dict[str, pd.DataFrame] = {} dim_names_set = self.index_column_names @@ -437,14 +436,17 @@ def write( dim_cols_list = [dim_cols_map[name] for name in self.index_column_names] dim_cols_tuple = tuple(dim_cols_list) self._handle.writer[dim_cols_tuple] = attr_cols_map - self._consolidate_and_vacuum_fragment_metadata() + tiledb_create_options = TileDBCreateOptions.from_platform_config( + platform_config + ) + if tiledb_create_options.consolidate_and_vacuum: + self._consolidate_and_vacuum() return self def _set_reader_coord( self, sr: clib.SOMAArray, dim_idx: int, dim: tiledb.Dim, coord: object ) -> bool: - if coord is None: return True # No constraint; select all in this dimension @@ -582,7 +584,6 @@ def _set_reader_coord_by_py_seq_or_np_array( def _set_reader_coord_by_numeric_slice( self, sr: clib.SOMAArray, dim_idx: int, dim: tiledb.Dim, coord: Slice[Any] ) -> bool: - try: lo_hi = _util.slice_to_numeric_range(coord, dim.domain) except _util.NonNumericDimensionError: diff --git a/apis/python/src/tiledbsoma/_dense_nd_array.py b/apis/python/src/tiledbsoma/_dense_nd_array.py index 542579d5c8..e538444930 100644 --- a/apis/python/src/tiledbsoma/_dense_nd_array.py +++ b/apis/python/src/tiledbsoma/_dense_nd_array.py @@ -172,9 +172,12 @@ def write( """ _util.check_type("values", values, (pa.Tensor,)) - del platform_config # Currently unused. 
self._handle.writer[coords] = values.to_numpy() - self._consolidate_and_vacuum_fragment_metadata() + tiledb_create_options = TileDBCreateOptions.from_platform_config( + platform_config + ) + if tiledb_create_options.consolidate_and_vacuum: + self._consolidate_and_vacuum() return self @classmethod diff --git a/apis/python/src/tiledbsoma/_sparse_nd_array.py b/apis/python/src/tiledbsoma/_sparse_nd_array.py index 97103d06f4..2fd86f5258 100644 --- a/apis/python/src/tiledbsoma/_sparse_nd_array.py +++ b/apis/python/src/tiledbsoma/_sparse_nd_array.py @@ -183,9 +183,11 @@ def write( Lifecycle: Experimental. """ - del platform_config # Currently unused. arr = self._handle.writer + tiledb_create_options = TileDBCreateOptions.from_platform_config( + platform_config + ) if isinstance(values, pa.SparseCOOTensor): # Write bulk data @@ -197,8 +199,9 @@ def write( bounding_box = self._compute_bounding_box_metadata(maxes) self._set_bounding_box_metadata(bounding_box) - # Consolidate non-bulk data - self._consolidate_and_vacuum_fragment_metadata() + if tiledb_create_options.consolidate_and_vacuum: + # Consolidate non-bulk data + self._consolidate_and_vacuum() return self if isinstance(values, (pa.SparseCSCMatrix, pa.SparseCSRMatrix)): @@ -216,8 +219,9 @@ def write( bounding_box = self._compute_bounding_box_metadata([nr - 1, nc - 1]) self._set_bounding_box_metadata(bounding_box) - # Consolidate non-bulk data - self._consolidate_and_vacuum_fragment_metadata() + if tiledb_create_options.consolidate_and_vacuum: + # Consolidate non-bulk data + self._consolidate_and_vacuum() return self if isinstance(values, pa.Table): @@ -241,8 +245,9 @@ def write( bounding_box = self._compute_bounding_box_metadata(maxes) self._set_bounding_box_metadata(bounding_box) - # Consolidate non-bulk data - self._consolidate_and_vacuum_fragment_metadata() + if tiledb_create_options.consolidate_and_vacuum: + # Consolidate non-bulk data + self._consolidate_and_vacuum() return self raise TypeError( diff --git 
a/apis/python/src/tiledbsoma/_tiledb_array.py b/apis/python/src/tiledbsoma/_tiledb_array.py index a5bdeed4a2..516563f4a8 100644 --- a/apis/python/src/tiledbsoma/_tiledb_array.py +++ b/apis/python/src/tiledbsoma/_tiledb_array.py @@ -6,7 +6,7 @@ import ctypes import os import sys -from typing import Any, Dict, Optional, Sequence, Tuple +from typing import Any, Dict, List, Optional, Sequence, Tuple import pyarrow as pa import tiledb @@ -194,7 +194,9 @@ def _create_internal( cls._set_create_metadata(handle) return handle - def _consolidate_and_vacuum_fragment_metadata(self) -> None: + def _consolidate_and_vacuum( + self, modes: List[str] = ["fragment_meta", "commits"] + ) -> None: """ This post-ingestion helper consolidates and vacuums fragment metadata and commit files -- this is quick to do, and positively impacts query performance. It does _not_ consolidate @@ -202,12 +204,31 @@ def _consolidate_and_vacuum_fragment_metadata(self) -> None: discretion. """ - for mode in ["fragment_meta", "commits"]: + for mode in modes: + self._consolidate(modes=[mode]) + self._vacuum(modes=[mode]) + def _consolidate(self, modes: List[str] = ["fragment_meta", "commits"]) -> None: + """ + This post-ingestion helper consolidates by default fragment metadata and commit files -- + this is quick to do, and positively impacts query performance. + """ + + for mode in modes: cfg = self._ctx.config() cfg["sm.consolidation.mode"] = mode - cfg["sm.vacuum.mode"] = mode ctx = tiledb.Ctx(cfg) tiledb.consolidate(self.uri, ctx=ctx) + + def _vacuum(self, modes: List[str] = ["fragment_meta", "commits"]) -> None: + """ + This post-ingestion helper vacuums by default fragment metadata and commit files. Vacuuming is not multi-process safe and requires coordination that nothing is currently reading the files that will be vacuumed. 
+ """ + + for mode in modes: + cfg = self._ctx.config() + cfg["sm.vacuum.mode"] = mode + ctx = tiledb.Ctx(cfg) + tiledb.vacuum(self.uri, ctx=ctx) diff --git a/apis/python/src/tiledbsoma/options/_tiledb_create_options.py b/apis/python/src/tiledbsoma/options/_tiledb_create_options.py index a60c687197..74b69f9a33 100644 --- a/apis/python/src/tiledbsoma/options/_tiledb_create_options.py +++ b/apis/python/src/tiledbsoma/options/_tiledb_create_options.py @@ -143,6 +143,9 @@ class TileDBCreateOptions: attrs: Mapping[str, _ColumnConfig] = attrs_.field( factory=dict, converter=_normalize_columns ) + consolidate_and_vacuum: bool = attrs_.field( + validator=vld.instance_of(bool), default=False + ) @classmethod def from_platform_config( From 3bf320a652d82681d79961c1270aca839d15d8ef Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 18 Sep 2023 12:18:11 -0400 Subject: [PATCH 2/9] [ci] MacOS CI updates (#1695) (#1697) Co-authored-by: John Kerl --- .github/workflows/python-ci-minimal.yml | 3 +++ libtiledbsoma/src/soma/logger_public.h | 2 -- libtiledbsoma/src/utils/logger.cc | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/python-ci-minimal.yml b/.github/workflows/python-ci-minimal.yml index ce9f3a0f4e..04b0b9890e 100644 --- a/.github/workflows/python-ci-minimal.yml +++ b/.github/workflows/python-ci-minimal.yml @@ -23,6 +23,9 @@ jobs: - runs-on: ubuntu-22.04 cc: gcc-11 cxx: g++-11 + - runs-on: macos-12 + cc: clang + cxx: clang++ uses: ./.github/workflows/python-ci-single.yml with: os: ${{ matrix.os }} diff --git a/libtiledbsoma/src/soma/logger_public.h b/libtiledbsoma/src/soma/logger_public.h index a3483b76fb..b2745a4d96 100644 --- a/libtiledbsoma/src/soma/logger_public.h +++ b/libtiledbsoma/src/soma/logger_public.h @@ -38,8 +38,6 @@ #include // for windows: error C2039: 'runtime_error': is not a member of 'std' -#include - namespace tiledbsoma { /** Set log level for global 
logger and optionally set a logfile. */ diff --git a/libtiledbsoma/src/utils/logger.cc b/libtiledbsoma/src/utils/logger.cc index 5a0690d4fe..ba1d1a9f21 100644 --- a/libtiledbsoma/src/utils/logger.cc +++ b/libtiledbsoma/src/utils/logger.cc @@ -68,9 +68,9 @@ Logger::Logger() { logger_->sinks().back().get()); console_sink->set_color( spdlog::level::critical, console_sink->red_bold); + logger_->set_pattern(LOG_PATTERN); #endif } - logger_->set_pattern(LOG_PATTERN); set_level("INFO"); } From 6bf76617cb2e8d2319b2112f724d0a189befb4ad Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 18 Sep 2023 12:33:59 -0400 Subject: [PATCH 3/9] [python] Flatten categorical `soma_joinid` if presented at `write` (#1698) (#1699) Co-authored-by: John Kerl --- apis/python/src/tiledbsoma/_dataframe.py | 21 +++++++++----- apis/python/tests/test_dataframe.py | 35 ++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 7 deletions(-) diff --git a/apis/python/src/tiledbsoma/_dataframe.py b/apis/python/src/tiledbsoma/_dataframe.py index cb118f9fd4..0133941ff5 100644 --- a/apis/python/src/tiledbsoma/_dataframe.py +++ b/apis/python/src/tiledbsoma/_dataframe.py @@ -409,18 +409,25 @@ def write( for name in values.schema.names: col = values.column(name) n = len(col) + cols_map = dim_cols_map if name in dim_names_set else attr_cols_map if pa.types.is_dictionary(col.type) and col.num_chunks != 0: - attr = self._handle.schema.attr(name) - if attr.enum_label is not None: - # Normal case: writing categorical data to categorical schema. - cols_map[name] = col.chunk(0).indices.to_pandas() - else: - # Schema is non-categorical but the user is writing categorical. - # Simply decategoricalize for them. + if name in dim_names_set: + # Dims are never categorical. Decategoricalize for them. 
cols_map[name] = pa.chunked_array( [chunk.dictionary_decode() for chunk in col.chunks] ) + else: + attr = self._handle.schema.attr(name) + if attr.enum_label is not None: + # Normal case: writing categorical data to categorical schema. + cols_map[name] = col.chunk(0).indices.to_pandas() + else: + # Schema is non-categorical but the user is writing categorical. + # Simply decategoricalize for them. + cols_map[name] = pa.chunked_array( + [chunk.dictionary_decode() for chunk in col.chunks] + ) else: cols_map[name] = col.to_pandas() diff --git a/apis/python/tests/test_dataframe.py b/apis/python/tests/test_dataframe.py index 8a817e8bf5..77376064f5 100644 --- a/apis/python/tests/test_dataframe.py +++ b/apis/python/tests/test_dataframe.py @@ -914,6 +914,41 @@ def test_write_categorical_types(tmp_path): assert (df == sdf.read().concat().to_pandas()).all().all() +def test_write_categorical_dims(tmp_path): + """ + Categories are not supported as dims. Here we test our handling of what we + do when we are given them as input. + """ + schema = pa.schema( + [ + ("soma_joinid", pa.int64()), + ("string", pa.dictionary(pa.int8(), pa.large_string())), + ] + ) + with soma.DataFrame.create( + tmp_path.as_posix(), + schema=schema, + index_column_names=["soma_joinid"], + enumerations={ + "enum-string": ["b", "a"], + }, + ordered_enumerations=[], + column_to_enumerations={ + "string": "enum-string", + }, + ) as sdf: + df = pd.DataFrame( + data={ + "soma_joinid": pd.Categorical([0, 1, 2, 3], categories=[0, 1, 2, 3]), + "string": pd.Categorical(["a", "b", "a", "b"], categories=["b", "a"]), + } + ) + sdf.write(pa.Table.from_pandas(df)) + + with soma.DataFrame.open(tmp_path.as_posix()) as sdf: + assert (df == sdf.read().concat().to_pandas()).all().all() + + def test_result_order(tmp_path): # cf. 
https://docs.tiledb.com/main/background/key-concepts-and-data-format#data-layout schema = pa.schema( From 3740ed17039d75e2e4e6b670a0f68cc9b265d647 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 20 Sep 2023 14:36:20 -0400 Subject: [PATCH 4/9] Minor follow-up to #1695 (#1702) (#1705) * logger->set_pattern call should be unconditional, not windows-only * Remove EXCLUDE_FROM_ALL annotation on unit_soma, so that the target updates reliably * Add note about failure mode on a fork Co-authored-by: Isaiah Norton --- apis/python/version.py | 3 +++ libtiledbsoma/src/utils/logger.cc | 2 +- libtiledbsoma/test/CMakeLists.txt | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/apis/python/version.py b/apis/python/version.py index 39147c50e7..3ad057b092 100644 --- a/apis/python/version.py +++ b/apis/python/version.py @@ -85,6 +85,9 @@ def readGitVersion(): + # NOTE: this will fail if on a fork with unsynchronized tags. 
+ # use `git fetch --tags upstream` + # and `git push --tags ` try: proc = subprocess.Popen( ("git", "describe", "--long", "--tags", "--match", "[0-9]*.*"), diff --git a/libtiledbsoma/src/utils/logger.cc b/libtiledbsoma/src/utils/logger.cc index ba1d1a9f21..8c6d43a06a 100644 --- a/libtiledbsoma/src/utils/logger.cc +++ b/libtiledbsoma/src/utils/logger.cc @@ -62,13 +62,13 @@ Logger::Logger() { logger_ = spdlog::get(CONSOLE_LOGGER); if (logger_ == nullptr) { logger_ = spdlog::stdout_color_mt(CONSOLE_LOGGER); + logger_->set_pattern(LOG_PATTERN); #if !defined(_WIN32) // change color of critical messages auto console_sink = static_cast( logger_->sinks().back().get()); console_sink->set_color( spdlog::level::critical, console_sink->red_bold); - logger_->set_pattern(LOG_PATTERN); #endif } set_level("INFO"); diff --git a/libtiledbsoma/test/CMakeLists.txt b/libtiledbsoma/test/CMakeLists.txt index 816730726f..d503041765 100644 --- a/libtiledbsoma/test/CMakeLists.txt +++ b/libtiledbsoma/test/CMakeLists.txt @@ -19,7 +19,7 @@ find_package(Spdlog_EP REQUIRED) find_package(Catch_EP REQUIRED) -add_executable(unit_soma EXCLUDE_FROM_ALL +add_executable(unit_soma $ unit_column_buffer.cc unit_managed_query.cc From 29df697147927993be510100c5f31306d6119f40 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 22 Sep 2023 10:24:34 -0400 Subject: [PATCH 5/9] [r] Support `factor` and `ordered` in `SOMADataFrameCreate` (#1701) (#1709) * [r] Support `factor` and `ordered` in `SOMADataFrameCreate` * Rework test to not require `nanoarrow` * Remove no-longer-need install of tiledb-r from r-universe [ci skip] * Rename test file * Small refactor in new test file * Add explicit tibble import for as_tibble (but not a new dependency) Co-authored-by: Dirk Eddelbuettel --- .github/workflows/r-ci.yml | 10 +- apis/r/DESCRIPTION | 3 +- apis/r/NAMESPACE | 1 + apis/r/R/SOMADataFrame.R | 3 +- apis/r/R/utils-arrow.R | 15 ++- 
apis/r/tests/testthat/test-OrderedAndFactor.R | 112 ++++++++++++++++++ 6 files changed, 130 insertions(+), 14 deletions(-) create mode 100644 apis/r/tests/testthat/test-OrderedAndFactor.R diff --git a/.github/workflows/r-ci.yml b/.github/workflows/r-ci.yml index c602f8a51c..fdb088aa93 100644 --- a/.github/workflows/r-ci.yml +++ b/.github/workflows/r-ci.yml @@ -40,15 +40,7 @@ jobs: - name: Install BioConductor package SingleCellExperiment run: cd apis/r && tools/r-ci.sh install_bioc SingleCellExperiment - - - name: Install r-universe build of tiledb-r (macOS) - if: ${{ matrix.os == 'macOS-latest' }} - run: cd apis/r && Rscript -e "install.packages('tiledb', repos = c('https://eddelbuettel.r-universe.dev', 'https://cloud.r-project.org'))" - - - name: Install r-universe build of tiledb-r (linux) - if: ${{ matrix.os != 'macOS-latest' }} - run: cd apis/r && Rscript -e "options(bspm.version.check=TRUE); install.packages('tiledb', repos = c('https://eddelbuettel.r-universe.dev/bin/linux/jammy/4.3/', 'https://cloud.r-project.org'))" - + - name: Dependencies run: cd apis/r && tools/r-ci.sh install_all diff --git a/apis/r/DESCRIPTION b/apis/r/DESCRIPTION index 6edc3b018c..67148e6de9 100644 --- a/apis/r/DESCRIPTION +++ b/apis/r/DESCRIPTION @@ -44,7 +44,8 @@ Imports: data.table, spdl, rlang, - tools + tools, + tibble LinkingTo: Rcpp, RcppSpdlog diff --git a/apis/r/NAMESPACE b/apis/r/NAMESPACE index e303812728..af8cb98ef1 100644 --- a/apis/r/NAMESPACE +++ b/apis/r/NAMESPACE @@ -98,6 +98,7 @@ importFrom(spdl,debug) importFrom(spdl,info) importFrom(spdl,setup) importFrom(stats,setNames) +importFrom(tibble,as_tibble) importFrom(tools,R_user_dir) importFrom(tools,file_path_sans_ext) importFrom(urltools,url_compose) diff --git a/apis/r/R/SOMADataFrame.R b/apis/r/R/SOMADataFrame.R index cffa235c4c..4c1116f27d 100644 --- a/apis/r/R/SOMADataFrame.R +++ b/apis/r/R/SOMADataFrame.R @@ -351,7 +351,8 @@ SOMADataFrame <- R6::R6Class( stopifnot( "'schema' must be a valid Arrow schema" = 
is_arrow_schema(schema), - is.character(index_column_names) && length(index_column_names) > 0, + "'index_column_names' must be a non-empty character vector" = + is.character(index_column_names) && length(index_column_names) > 0, "All 'index_column_names' must be defined in the 'schema'" = assert_subset(index_column_names, schema$names, "indexed field"), "Column names must not start with reserved prefix 'soma_'" = diff --git a/apis/r/R/utils-arrow.R b/apis/r/R/utils-arrow.R index 955ad647de..e085ef89c6 100644 --- a/apis/r/R/utils-arrow.R +++ b/apis/r/R/utils-arrow.R @@ -30,6 +30,10 @@ is_arrow_schema <- function(x) { is_arrow_object(x) && inherits(x, "Schema") } +is_arrow_dictionary <- function(x) { + is_arrow_object(x) && inherits(x, "Field") && inherits(x$type, "DictionaryType") +} + #' Convert Arrow types to supported TileDB type #' List of TileDB types supported in R: https://github.com/TileDB-Inc/TileDB-R/blob/8014da156b5fee5b4cc221d57b4aa7d388abc968/inst/tinytest/test_dim.R#L97-L121 #' @@ -316,16 +320,21 @@ check_arrow_schema_data_types <- function(from, to) { } #' Extract levels from dictionaries +#' @importFrom tibble as_tibble #' @noRd -extract_levels <- function(arrtbl) { +extract_levels <- function(arrtbl, exclude_cols=c("soma_joinid")) { stopifnot("Argument must be an Arrow Table object" = is_arrow_table(arrtbl)) - nm <- names(arrtbl) # we go over the table column by column + nm <- names(arrtbl) # we go over the table column by column + nm <- nm[-match(exclude_cols, nm)] # but skip soma_joinid etc as in exclude_cols reslst <- vector(mode = "list", length = length(nm)) names(reslst) <- nm # and fill a named list, entries default to NULL for (n in nm) { - if (inherits(arrow::infer_type(arrtbl[[n]]), "DictionaryType")) { + inftp <- arrow::infer_type(arrtbl[[n]]) + if (inherits(inftp, "DictionaryType")) { # levels() extracts the enumeration levels from the factor vector we have reslst[[n]] <- levels(arrtbl[[n]]$as_vector()) + # set 'ordered' attribute + 
attr(reslst[[n]], "ordered") <- inftp$ordered } } reslst diff --git a/apis/r/tests/testthat/test-OrderedAndFactor.R b/apis/r/tests/testthat/test-OrderedAndFactor.R new file mode 100644 index 0000000000..9769568911 --- /dev/null +++ b/apis/r/tests/testthat/test-OrderedAndFactor.R @@ -0,0 +1,112 @@ +test_that("SOMADataFrame round-trip with factor and ordered", { + skip_if(!extended_tests()) + + uri <- tempfile() + + ## borrowed from tiledb-r test file test_ordered.R + ## A data.frame with an ordered column, taken from package `earth` and its `etitanic` cleaned + + ## dataset of Titanic survivors (with NAs removed). + ## + ## et <- earth::etitanic + ## et$pclass <- as.ordered(et$pclass) + ## set.seed(42) + ## et <- et[sort(sample(nrow(et), 100)), ] + ## dput(et) + ## + ## Slightly edited (for code alignment) `dput(et)` output below + et <- structure(list(pclass = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, + 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, + 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, + 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, + 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, + 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, + 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), + levels = c("1st", "2nd", "3rd"), class = c("ordered", "factor")), + survived = c(0L, 0L, 1L, 1L, 1L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, + 1L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, + 0L, 0L, 0L, 1L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, + 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, + 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, + 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, + 0L, 0L, 0L), + sex = structure(c(1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, + 2L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, + 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, + 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, + 2L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, + 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, + 2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), + levels = c("female", "male"), class = "factor"), + age = c(2, 24, 29, 58, 59, 28, 36, + 27, 39, 27, 48, 24, 19, 22, 48, 35, 38, 16, 65, 28.5, 35, 34, + 32, 43, 49, 31, 30, 18, 28, 32, 19, 40, 0.833299994, 19, 37, + 32, 34, 54, 8, 27, 34, 16, 21, 62, 21, 23, 36, 29, 41, 33, 25, + 25, 18.5, 13, 20, 6, 32, 21, 18, 26, 32, 29, 18.5, 21, 17, 37, + 35, 30, 22, 47, 26, 21, 28, 25, 28, 43, 22, 30, 20.5, 51, 35, + 28, 19, 28, 29, 41, 19, 28, 8, 39, 2, 45, 30, 33, 21, 24, 11.5, + 18, 36, 45.5), + sibsp = c(1L, 0L, 0L, 0L, 2L, 0L, 1L, 1L, 1L, + 1L, 1L, 3L, 3L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 1L, + 0L, 1L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, + 0L, 1L, 0L, 2L, 2L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 4L, 1L, + 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 2L, 2L, + 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 4L, + 0L, 1L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L), + parch = c(2L, 1L, 0L, 0L, 0L, 0L, 2L, 0L, 0L, 2L, 0L, 2L, 2L, 2L, 0L, 0L, 0L, 1L, + 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 2L, 0L, + 0L, 0L, 1L, 0L, 2L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 2L, + 0L, 0L, 0L, 2L, 0L, 2L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, + 0L, 0L, 0L, 0L, 0L, 2L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, + 1L, 0L, 4L, 5L, 0L, 0L, 1L, 5L, 1L, 4L, 0L, 0L, 0L, 0L, 1L, 0L, + 0L, 0L)), + row.names = c("3", "17", "25", "34", "43", "53", "58", + "65", "85", "91", "100", "112", "115", "123", "146", "165", "169", + "188", "206", "223", "258", "260", "279", "282", "295", "299", + "324", "327", "335", "337", "338", "353", "360", "365", "369", + "390", "397", "398", "399", "402", "415", "417", "420", "433", + "445", "448", "449", "453", "533", "543", "556", "568", "569", + "602", "616", "624", "656", "676", "677", 
"678", "685", "689", + "693", "697", "701", "711", "730", "761", "786", "794", "804", + "807", "839", "854", "864", "869", "953", "975", "978", "980", + "996", "1022", "1051", "1084", "1101", "1107", "1109", "1127", + "1146", "1147", "1157", "1212", "1219", "1223", "1225", "1238", + "1264", "1289", "1299", "1302"), + class = "data.frame") + expect_true(is.data.frame(et)) + + ett <- data.frame(soma_joinid=bit64::as.integer64(seq(1, nrow(et))), et) + ## quick write with tiledb-r so that we get a schema from the manifested array + ## there should possibly be a helper function to create the schema from a data.frame + turi <- tempfile() + expect_silent(tiledb::fromDataFrame(ett, turi, col_index="soma_joinid")) + + tsch <- tiledb::schema(turi) + expect_true(inherits(tsch, "tiledb_array_schema")) + + sch <- tiledbsoma:::arrow_schema_from_tiledb_schema(tsch) + expect_true(inherits(sch, "Schema")) + + att <- arrow::as_arrow_table(ett) + expect_true(inherits(att, "Table")) + + lvls <- tiledbsoma:::extract_levels(att) + expect_true(is.list(lvls)) + expect_equal(length(lvls), ncol(et)) # et, not ett or tsch or sch as no soma_joinid + expect_equal(names(lvls), colnames(et)) + + sdf <- SOMADataFrameCreate(uri, sch, levels=lvls) + expect_true(inherits(sdf, "SOMADataFrame")) + + sdf$write(att) + + op <- getOption("arrow.int64_downcast") + options("arrow.int64_downcast"=FALSE) # else it becomes int + ndf <- SOMADataFrameOpen(uri)$read()$concat() + expect_true(inherits(ndf, "Table")) + + expect_equivalent(tibble::as_tibble(ndf), tibble::as_tibble(att)) + + options("arrow.int64_downcast"=op) + +}) From 1dff10e11215bd4068b9946d86f94287e712c0bf Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 22 Sep 2023 10:24:51 -0400 Subject: [PATCH 6/9] [c++] Fix For Empty-Indexed Multi-Dimensional Array (#1706) (#1708) Co-authored-by: Vivian Nguyen --- apis/python/tests/test_sparse_nd_array.py | 29 +++++++++++++++++++++++ 
libtiledbsoma/src/soma/managed_query.cc | 2 +- libtiledbsoma/src/soma/managed_query.h | 24 +++++++++++++------ 3 files changed, 47 insertions(+), 8 deletions(-) diff --git a/apis/python/tests/test_sparse_nd_array.py b/apis/python/tests/test_sparse_nd_array.py index 1d7201b362..628aa1a7d5 100644 --- a/apis/python/tests/test_sparse_nd_array.py +++ b/apis/python/tests/test_sparse_nd_array.py @@ -1100,3 +1100,32 @@ def test_timestamped_ops(tmp_path): [0, 0], ] assert a.nnz == 1 + + +def test_empty_indexed_read(tmp_path): + """ + Verify that queries expected to return empty results actually + work. There are edge cases around SparseTensors, which are unable + to represent empty arrays. + """ + shape = (10, 100) + soma.SparseNDArray.create( + tmp_path.as_posix(), type=pa.uint16(), shape=shape + ).close() + + data = create_random_tensor("coo", shape, np.float64, 1.0) + with soma.SparseNDArray.open(tmp_path.as_posix(), "w") as a: + a.write(data) + + with soma.SparseNDArray.open(tmp_path.as_posix()) as a: + coords = [slice(None), slice(None)] + assert sum(len(t) for t in a.read(coords).tables()) == 1000 + + coords = [[3], [4]] + assert sum(len(t) for t in a.read(coords).tables()) == 1 + + coords = [[3], []] + assert sum(len(t) for t in a.read(coords).tables()) == 0 + + coords = [[], [4]] + assert sum(len(t) for t in a.read(coords).tables()) == 0 diff --git a/libtiledbsoma/src/soma/managed_query.cc b/libtiledbsoma/src/soma/managed_query.cc index 10a1b39fcc..c3f9c5fe1e 100644 --- a/libtiledbsoma/src/soma/managed_query.cc +++ b/libtiledbsoma/src/soma/managed_query.cc @@ -65,7 +65,7 @@ void ManagedQuery::reset() { } subarray_range_set_ = false; - subarray_range_empty_ = true; + subarray_range_empty_ = {}; columns_.clear(); results_complete_ = true; total_num_cells_ = 0; diff --git a/libtiledbsoma/src/soma/managed_query.h b/libtiledbsoma/src/soma/managed_query.h index 685f28c593..f6a1d86a1f 100644 --- a/libtiledbsoma/src/soma/managed_query.h +++ 
b/libtiledbsoma/src/soma/managed_query.h @@ -98,9 +98,10 @@ class ManagedQuery { void select_ranges( const std::string& dim, const std::vector>& ranges) { subarray_range_set_ = true; + subarray_range_empty_[dim] = true; for (auto& [start, stop] : ranges) { subarray_->add_range(dim, start, stop); - subarray_range_empty_ = false; + subarray_range_empty_[dim] = false; } } @@ -114,9 +115,10 @@ class ManagedQuery { template void select_points(const std::string& dim, const std::vector& points) { subarray_range_set_ = true; + subarray_range_empty_[dim] = true; for (auto& point : points) { subarray_->add_range(dim, point, point); - subarray_range_empty_ = false; + subarray_range_empty_[dim] = false; } } @@ -130,9 +132,10 @@ class ManagedQuery { template void select_points(const std::string& dim, const tcb::span points) { subarray_range_set_ = true; + subarray_range_empty_[dim] = true; for (auto& point : points) { subarray_->add_range(dim, point, point); - subarray_range_empty_ = false; + subarray_range_empty_[dim] = false; } } @@ -147,7 +150,7 @@ class ManagedQuery { void select_point(const std::string& dim, const T& point) { subarray_->add_range(dim, point, point); subarray_range_set_ = true; - subarray_range_empty_ = false; + subarray_range_empty_[dim] = false; } /** @@ -388,7 +391,14 @@ class ManagedQuery { * @return true if the query contains only empty ranges. 
*/ bool is_empty_query() { - return subarray_range_set_ && subarray_range_empty_; + bool has_empty = false; + for (auto subdim : subarray_range_empty_) { + if (subdim.second == true) { + has_empty = true; + break; + } + } + return subarray_range_set_ && has_empty; } /** @@ -440,8 +450,8 @@ class ManagedQuery { // True if a range has been added to the subarray bool subarray_range_set_ = false; - // True unless a non-empty range has been added to the subarray - bool subarray_range_empty_ = true; + // Map whether the dimension is empty (true) or not + std::map subarray_range_empty_ = {}; // Set of column names to read (dim and attr). If empty, query all columns. std::vector columns_; From 9697d4f4d01d15649bc02d085cb7a57aa1ac089a Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 27 Sep 2023 13:45:20 -0500 Subject: [PATCH 7/9] [r] Replace three local utility functions with use of RcppInt64 (#1721) (#1724) Co-authored-by: Dirk Eddelbuettel --- apis/r/DESCRIPTION | 3 ++- apis/r/src/rinterface.cpp | 3 ++- apis/r/src/rutilities.cpp | 13 ++++++------ apis/r/src/rutilities.h | 42 +++------------------------------------ 4 files changed, 14 insertions(+), 47 deletions(-) diff --git a/apis/r/DESCRIPTION b/apis/r/DESCRIPTION index 67148e6de9..fcf263ed46 100644 --- a/apis/r/DESCRIPTION +++ b/apis/r/DESCRIPTION @@ -48,7 +48,8 @@ Imports: tibble LinkingTo: Rcpp, - RcppSpdlog + RcppSpdlog, + RcppInt64 Additional_repositories: https://ghrr.github.io/drat Roxygen: list(markdown = TRUE) RoxygenNote: 7.2.3 diff --git a/apis/r/src/rinterface.cpp b/apis/r/src/rinterface.cpp index 082ebdb92f..812a52859b 100644 --- a/apis/r/src/rinterface.cpp +++ b/apis/r/src/rinterface.cpp @@ -1,5 +1,6 @@ #include // for R interface to C++ #include // for C interface to Arrow +#include // for fromInteger64 // we currently get deprecation warnings by default which are noisy #ifndef TILEDB_NO_API_DEPRECATION_WARNINGS @@ -233,5 +234,5 @@ 
bool check_arrow_array_tag(Rcpp::XPtr xp) { Rcpp::NumericVector shape(const std::string& uri, Rcpp::Nullable config = R_NilValue) { auto sr = tdbs::SOMAArray::open(OpenMode::read, uri, "unnamed", config_vector_to_map(Rcpp::wrap(config))); - return makeInteger64(sr->shape()); + return Rcpp::toInteger64(sr->shape()); } diff --git a/apis/r/src/rutilities.cpp b/apis/r/src/rutilities.cpp index 9fa81545d9..ddd6ee69a3 100644 --- a/apis/r/src/rutilities.cpp +++ b/apis/r/src/rutilities.cpp @@ -6,6 +6,7 @@ #include // for R interface to C++ #include // for C interface to Arrow +#include // for fromInteger64 // We get these via nanoarrow and must cannot include carrow.h again #define ARROW_SCHEMA_AND_ARRAY_DEFINED 1 @@ -26,7 +27,7 @@ void apply_dim_points(tdbs::SOMAArray *sr, bool suitable = false; if (tp == TILEDB_UINT64) { Rcpp::NumericVector payload = lst[nm]; - std::vector iv = getInt64Vector(payload); + std::vector iv = Rcpp::fromInteger64(payload, false); std::vector uv(iv.size()); const std::pair pr = dm->domain(); for (size_t i=0; i iv = getInt64Vector(payload); + std::vector iv = Rcpp::fromInteger64(payload, false); const std::pair pr = dm->domain(); for (size_t i=0; i= pr.first && iv[i] <= pr.second) { @@ -103,8 +104,8 @@ void apply_dim_ranges(tdbs::SOMAArray* sr, std::vector> vp(mm.nrow()); const std::pair pr = dm->domain(); for (int i=0; i(makeScalarInteger64(lo[i])); - uint64_t h = static_cast(makeScalarInteger64(hi[i])); + uint64_t l = static_cast(Rcpp::fromInteger64(lo[i])); + uint64_t h = static_cast(Rcpp::fromInteger64(hi[i])); vp[i] = std::make_pair(std::max(l,pr.first), std::min(h, pr.second)); spdl::info("[apply_dim_ranges] Applying dim point {} on {} with {} - {}", i, nm, l, h) ; suitable = l < pr.second && h > pr.first; // lower must be less than max, higher more than min @@ -112,8 +113,8 @@ void apply_dim_ranges(tdbs::SOMAArray* sr, if (suitable) sr->set_dim_ranges(nm, vp); } else if (tp == TILEDB_INT64) { Rcpp::NumericMatrix mm = lst[nm]; - std::vector 
lo = getInt64Vector(mm.column(0)); - std::vector hi = getInt64Vector(mm.column(1)); + std::vector lo = Rcpp::fromInteger64(mm.column(0), false); + std::vector hi = Rcpp::fromInteger64(mm.column(1), false); std::vector> vp(mm.nrow()); const std::pair pr = dm->domain(); for (int i=0; i& vec) { - size_t n = vec.size(); - - Rcpp::NumericVector num(n); - std::memcpy(&(num[0]), vec.data(), n * sizeof(double)); - - num.attr("class") = "integer64"; - return (num); -} - -// Convert to a scalar int64_t -// -inline int64_t makeScalarInteger64(const double val) { - int64_t newval; - memcpy(&newval, &val, sizeof(double)); - return newval; -} - -// Create a int64_t vector from a NumericVector -// -inline std::vector getInt64Vector(Rcpp::NumericVector vec) { - size_t n = vec.size(); - std::vector num(n); - std::memcpy(&(num[0]), &(vec[0]), n * sizeof(double)); - return num; -} - +#define TILEDB_VERSION TileDB_Version(TILEDB_VERSION_MAJOR, \ + TILEDB_VERSION_MINOR, \ + TILEDB_VERSION_PATCH) // Applies (named list of) vectors of points to the named dimensions void apply_dim_points( From 3d021ce59e8f50acc1f46f50911ca6cf5128037d Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 27 Sep 2023 15:18:16 -0400 Subject: [PATCH 8/9] [python] update numba to 0.58 (#1723) (#1726) The newer version lets us clean up some existing dependency complications -- especially an incompatibility with newer versions of numpy, which was getting more problematic as time went on. 
Co-authored-by: Mike Lin --- apis/python/setup.py | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/apis/python/setup.py b/apis/python/setup.py index bf205f8260..93c58cba21 100644 --- a/apis/python/setup.py +++ b/apis/python/setup.py @@ -271,21 +271,13 @@ def run(self): "anndata < 0.9; python_version<'3.8'", "anndata; python_version>='3.8'", "attrs>=22.2", - # Pinning numba & its particular numpy constraints: - # The old pip solver (<=2020) doesn't deal with the transitive - # requirements (scanpy -> numba -> numpy) properly resulting in broken - # installation of incompatible numpy>=1.24. Issue #1051 - # These pins can be removed either when there's a new numba release - # with less-particular numpy version constraints, or if we decide we no - # longer need to support the old pip solver (default on ubuntu 20.04). - # - # Also: numba doesn't support Python 3.11 until 0.57.0rc1. - # It' not preferable to pin to an RC dependency, so we only do this - # when we must, which is for 3.11. - "numba==0.56.4; python_version<'3.11'", - "numba==0.57; python_version=='3.11'", - "numpy>=1.18,<1.24; python_version<'3.11'", - "numpy>=1.18,<1.25; python_version=='3.11'", + "numba~=0.58.0; python_version>='3.8'", + # Older numba version needed for Python 3.7. + # This older numba version was also incompatible with newer numpy + # versions, and the old pip solver (<=2020) needed us to explicate + # that constraint here (issue #1051). 
+ "numba==0.56.4; python_version<'3.8'", + "numpy>=1.18,<1.24; python_version<'3.8'", "pandas", "pyarrow>=9.0.0", "scanpy>=1.9.2", From 2c23a4e2e1376393e41196cae55a18c185c798cc Mon Sep 17 00:00:00 2001 From: John Kerl Date: Wed, 27 Sep 2023 16:16:53 -0400 Subject: [PATCH 9/9] [r] Backport #1720 to `release-1.5` (#1727) --- apis/r/DESCRIPTION | 2 +- apis/r/NEWS.md | 1 + apis/r/R/SOMADataFrame.R | 5 ++ apis/r/tests/testthat/test-SOMADataFrame.R | 70 ++++++++++++++++++++++ 4 files changed, 77 insertions(+), 1 deletion(-) diff --git a/apis/r/DESCRIPTION b/apis/r/DESCRIPTION index fcf263ed46..58a89016da 100644 --- a/apis/r/DESCRIPTION +++ b/apis/r/DESCRIPTION @@ -6,7 +6,7 @@ Description: Interface for working with 'TileDB'-based Stack of Matrices, like those commonly used for single cell data analysis. It is documented at ; a formal specification available is at . -Version: 1.4.3.1 +Version: 1.4.3.2 Authors@R: c( person(given = "Aaron", family = "Wolen", role = c("cre", "aut"), email = "aaron@tiledb.com", diff --git a/apis/r/NEWS.md b/apis/r/NEWS.md index 8c097f5761..82c903b6b2 100644 --- a/apis/r/NEWS.md +++ b/apis/r/NEWS.md @@ -4,6 +4,7 @@ * Add support for writing `SummarizedExperiment` and `SingleCellExperiment` object to SOMAs * Add support for bounding boxes for sparse arrays +* Add support for creating `SOMADataFrames` with `ordered()` columns # tiledbsoma 1.4.0 diff --git a/apis/r/R/SOMADataFrame.R b/apis/r/R/SOMADataFrame.R index 4c1116f27d..b58a7e35b1 100644 --- a/apis/r/R/SOMADataFrame.R +++ b/apis/r/R/SOMADataFrame.R @@ -97,6 +97,11 @@ SOMADataFrame <- R6::R6Class( field <- schema$GetFieldByName(field_name) field_type <- tiledb_type_from_arrow_type(field$type) + # Check if the field is ordered and mark it as such + if (!is.null(x = levels[[field_name]]) && isTRUE(field$type$ordered)) { + attr(levels[[field_name]], 'ordered') <- attr(levels[[field_name]], 'ordered', exact = TRUE) %||% TRUE + } + tdb_attrs[[field_name]] <- tiledb::tiledb_attr( name = 
field_name, type = field_type, diff --git a/apis/r/tests/testthat/test-SOMADataFrame.R b/apis/r/tests/testthat/test-SOMADataFrame.R index ccd83e863e..178553128d 100644 --- a/apis/r/tests/testthat/test-SOMADataFrame.R +++ b/apis/r/tests/testthat/test-SOMADataFrame.R @@ -222,6 +222,76 @@ test_that("int64 values are stored correctly", { gc() }) +test_that("creation with ordered factors", { + skip_if_not_installed("tiledb", "0.21.0") + skip_if(!extended_tests()) + uri <- withr::local_tempdir("soma-dataframe-ordered") + n <- 10L + df <- data.frame( + soma_joinid = bit64::as.integer64(seq_len(length.out = n) - 1L), + int = seq_len(length.out = n), + bool = rep_len(c(TRUE, FALSE), length.out = n), + ord = ordered(rep_len(c("g1", "g2", "g3"), length.out = n)) + ) + tbl <- arrow::as_arrow_table(df) + expect_true(tbl$schema$GetFieldByName("ord")$type$ordered) + expect_no_condition(sdf <- SOMADataFrameCreate( + uri = uri, + schema = tbl$schema, + levels = sapply( + X = df[, setdiff(names(df), "soma_joinid")], + FUN = levels, + simplify = FALSE, + USE.NAMES = TRUE + ) + )) + expect_no_condition(sdf$write(values = tbl)) + expect_s3_class(sdf <- SOMADataFrameOpen(uri), "SOMADataFrame") + expect_true(sdf$schema()$GetFieldByName("ord")$type$ordered) + expect_s3_class(ord <- sdf$object[]$ord, c("ordered", "factor"), exact = TRUE) + expect_length(ord, n) + expect_identical(levels(ord), levels(df$ord)) +}) + +test_that("explicit casting of ordered factors to regular factors", { + skip_if_not_installed("tiledb", "0.21.0") + skip_if(!extended_tests()) + uri <- withr::local_tempdir("soma-dataframe-unordered") + n <- 10L + df <- data.frame( + soma_joinid = bit64::as.integer64(seq_len(length.out = n) - 1L), + int = seq_len(length.out = n), + bool = rep_len(c(TRUE, FALSE), length.out = n), + ord = ordered(rep_len(c("g1", "g2", "g3"), length.out = n)) + ) + tbl <- arrow::as_arrow_table(df) + expect_true(tbl$schema$GetFieldByName("ord")$type$ordered) + lvls <- sapply( + X = df[, 
setdiff(names(df), "soma_joinid")], + FUN = levels, + simplify = FALSE, + USE.NAMES = TRUE + ) + for (col in names(lvls)) { + if (!is.null(lvls[[col]])) { + attr(lvls[[col]], 'ordered') <- FALSE + } + } + expect_no_condition(sdf <- SOMADataFrameCreate( + uri = uri, + schema = tbl$schema, + levels = lvls + )) + expect_no_condition(sdf$write(values = tbl)) + expect_s3_class(sdf <- SOMADataFrameOpen(uri), "SOMADataFrame") + expect_false(sdf$schema()$GetFieldByName("ord")$type$ordered) + expect_s3_class(ord <- sdf$object[]$ord, "factor", exact = TRUE) + expect_false(is.ordered(ord)) + expect_length(ord, n) + expect_identical(levels(ord), levels(df$ord)) +}) + + test_that("SOMADataFrame read", { skip_if(!extended_tests()) uri <- extract_dataset("soma-dataframe-pbmc3k-processed-obs")