From e90d7db6f2f2e2d256948617630cfa559d36b0d3 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 18 Sep 2023 11:13:42 -0400 Subject: [PATCH 1/9] [python] Consolidation and vacuuming are now platform configuration options (#1690) (#1696) * Consolidation and vacuuming are now platform configuration options Commit and fragment_metadata consolidation and vacuuming can improve the opening and query performance of SOMA experiments. Vacuuming requires slight coordination though and should not happen by default. Instead a platform config allows the user to control these operations based. This will be expanded to defaults for top-level `io` packages where its more likely a user is doing a one-shot ingestion and will want automatic handling. A new platform config, `consolidate_and_vacuum` has been added which is a boolean to handle this behavior. * set is_mac for ci-minimal workflow Co-authored-by: Seth Shelnutt --- .github/workflows/python-ci-minimal.yml | 1 + apis/python/src/tiledbsoma/_dataframe.py | 9 +++--- apis/python/src/tiledbsoma/_dense_nd_array.py | 7 +++-- .../python/src/tiledbsoma/_sparse_nd_array.py | 19 +++++++----- apis/python/src/tiledbsoma/_tiledb_array.py | 29 ++++++++++++++++--- .../options/_tiledb_create_options.py | 3 ++ 6 files changed, 51 insertions(+), 17 deletions(-) diff --git a/.github/workflows/python-ci-minimal.yml b/.github/workflows/python-ci-minimal.yml index a1eb0a153f..ce9f3a0f4e 100644 --- a/.github/workflows/python-ci-minimal.yml +++ b/.github/workflows/python-ci-minimal.yml @@ -29,6 +29,7 @@ jobs: python_version: ${{ matrix.python-version }} cc: ${{ matrix.cc }} cxx: ${{ matrix.cxx }} + is_mac: ${{ contains(matrix.os, 'macos') }} report_codecov: ${{ matrix.python-version == '3.10' }} run_lint: ${{ matrix.python-version == '3.10' }} secrets: inherit diff --git a/apis/python/src/tiledbsoma/_dataframe.py b/apis/python/src/tiledbsoma/_dataframe.py index 
3d77312763..cb118f9fd4 100644 --- a/apis/python/src/tiledbsoma/_dataframe.py +++ b/apis/python/src/tiledbsoma/_dataframe.py @@ -401,7 +401,6 @@ def write( """ _util.check_type("values", values, (pa.Table,)) - del platform_config # unused dim_cols_map: Dict[str, pd.DataFrame] = {} attr_cols_map: Dict[str, pd.DataFrame] = {} dim_names_set = self.index_column_names @@ -437,14 +436,17 @@ def write( dim_cols_list = [dim_cols_map[name] for name in self.index_column_names] dim_cols_tuple = tuple(dim_cols_list) self._handle.writer[dim_cols_tuple] = attr_cols_map - self._consolidate_and_vacuum_fragment_metadata() + tiledb_create_options = TileDBCreateOptions.from_platform_config( + platform_config + ) + if tiledb_create_options.consolidate_and_vacuum: + self._consolidate_and_vacuum() return self def _set_reader_coord( self, sr: clib.SOMAArray, dim_idx: int, dim: tiledb.Dim, coord: object ) -> bool: - if coord is None: return True # No constraint; select all in this dimension @@ -582,7 +584,6 @@ def _set_reader_coord_by_py_seq_or_np_array( def _set_reader_coord_by_numeric_slice( self, sr: clib.SOMAArray, dim_idx: int, dim: tiledb.Dim, coord: Slice[Any] ) -> bool: - try: lo_hi = _util.slice_to_numeric_range(coord, dim.domain) except _util.NonNumericDimensionError: diff --git a/apis/python/src/tiledbsoma/_dense_nd_array.py b/apis/python/src/tiledbsoma/_dense_nd_array.py index 542579d5c8..e538444930 100644 --- a/apis/python/src/tiledbsoma/_dense_nd_array.py +++ b/apis/python/src/tiledbsoma/_dense_nd_array.py @@ -172,9 +172,12 @@ def write( """ _util.check_type("values", values, (pa.Tensor,)) - del platform_config # Currently unused. 
self._handle.writer[coords] = values.to_numpy() - self._consolidate_and_vacuum_fragment_metadata() + tiledb_create_options = TileDBCreateOptions.from_platform_config( + platform_config + ) + if tiledb_create_options.consolidate_and_vacuum: + self._consolidate_and_vacuum() return self @classmethod diff --git a/apis/python/src/tiledbsoma/_sparse_nd_array.py b/apis/python/src/tiledbsoma/_sparse_nd_array.py index 97103d06f4..2fd86f5258 100644 --- a/apis/python/src/tiledbsoma/_sparse_nd_array.py +++ b/apis/python/src/tiledbsoma/_sparse_nd_array.py @@ -183,9 +183,11 @@ def write( Lifecycle: Experimental. """ - del platform_config # Currently unused. arr = self._handle.writer + tiledb_create_options = TileDBCreateOptions.from_platform_config( + platform_config + ) if isinstance(values, pa.SparseCOOTensor): # Write bulk data @@ -197,8 +199,9 @@ def write( bounding_box = self._compute_bounding_box_metadata(maxes) self._set_bounding_box_metadata(bounding_box) - # Consolidate non-bulk data - self._consolidate_and_vacuum_fragment_metadata() + if tiledb_create_options.consolidate_and_vacuum: + # Consolidate non-bulk data + self._consolidate_and_vacuum() return self if isinstance(values, (pa.SparseCSCMatrix, pa.SparseCSRMatrix)): @@ -216,8 +219,9 @@ def write( bounding_box = self._compute_bounding_box_metadata([nr - 1, nc - 1]) self._set_bounding_box_metadata(bounding_box) - # Consolidate non-bulk data - self._consolidate_and_vacuum_fragment_metadata() + if tiledb_create_options.consolidate_and_vacuum: + # Consolidate non-bulk data + self._consolidate_and_vacuum() return self if isinstance(values, pa.Table): @@ -241,8 +245,9 @@ def write( bounding_box = self._compute_bounding_box_metadata(maxes) self._set_bounding_box_metadata(bounding_box) - # Consolidate non-bulk data - self._consolidate_and_vacuum_fragment_metadata() + if tiledb_create_options.consolidate_and_vacuum: + # Consolidate non-bulk data + self._consolidate_and_vacuum() return self raise TypeError( diff --git 
a/apis/python/src/tiledbsoma/_tiledb_array.py b/apis/python/src/tiledbsoma/_tiledb_array.py index a5bdeed4a2..516563f4a8 100644 --- a/apis/python/src/tiledbsoma/_tiledb_array.py +++ b/apis/python/src/tiledbsoma/_tiledb_array.py @@ -6,7 +6,7 @@ import ctypes import os import sys -from typing import Any, Dict, Optional, Sequence, Tuple +from typing import Any, Dict, List, Optional, Sequence, Tuple import pyarrow as pa import tiledb @@ -194,7 +194,9 @@ def _create_internal( cls._set_create_metadata(handle) return handle - def _consolidate_and_vacuum_fragment_metadata(self) -> None: + def _consolidate_and_vacuum( + self, modes: List[str] = ["fragment_meta", "commits"] + ) -> None: """ This post-ingestion helper consolidates and vacuums fragment metadata and commit files -- this is quick to do, and positively impacts query performance. It does _not_ consolidate @@ -202,12 +204,31 @@ def _consolidate_and_vacuum_fragment_metadata(self) -> None: discretion. """ - for mode in ["fragment_meta", "commits"]: + for mode in modes: + self._consolidate(modes=[mode]) + self._vacuum(modes=[mode]) + def _consolidate(self, modes: List[str] = ["fragment_meta", "commits"]) -> None: + """ + This post-ingestion helper consolidates by default fragment metadata and commit files -- + this is quick to do, and positively impacts query performance. + """ + + for mode in modes: cfg = self._ctx.config() cfg["sm.consolidation.mode"] = mode - cfg["sm.vacuum.mode"] = mode ctx = tiledb.Ctx(cfg) tiledb.consolidate(self.uri, ctx=ctx) + + def _vacuum(self, modes: List[str] = ["fragment_meta", "commits"]) -> None: + """ + This post-ingestion helper vacuums by default fragment metadata and commit files. Vacuuming is not multi-process safe and requires coordination that nothing is currently reading the files that will be vacuumed. 
+ """ + + for mode in modes: + cfg = self._ctx.config() + cfg["sm.vacuum.mode"] = mode + ctx = tiledb.Ctx(cfg) + tiledb.vacuum(self.uri, ctx=ctx) diff --git a/apis/python/src/tiledbsoma/options/_tiledb_create_options.py b/apis/python/src/tiledbsoma/options/_tiledb_create_options.py index a60c687197..74b69f9a33 100644 --- a/apis/python/src/tiledbsoma/options/_tiledb_create_options.py +++ b/apis/python/src/tiledbsoma/options/_tiledb_create_options.py @@ -143,6 +143,9 @@ class TileDBCreateOptions: attrs: Mapping[str, _ColumnConfig] = attrs_.field( factory=dict, converter=_normalize_columns ) + consolidate_and_vacuum: bool = attrs_.field( + validator=vld.instance_of(bool), default=False + ) @classmethod def from_platform_config( From 3bf320a652d82681d79961c1270aca839d15d8ef Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 18 Sep 2023 12:18:11 -0400 Subject: [PATCH 2/9] [ci] MacOS CI updates (#1695) (#1697) Co-authored-by: John Kerl --- .github/workflows/python-ci-minimal.yml | 3 +++ libtiledbsoma/src/soma/logger_public.h | 2 -- libtiledbsoma/src/utils/logger.cc | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/python-ci-minimal.yml b/.github/workflows/python-ci-minimal.yml index ce9f3a0f4e..04b0b9890e 100644 --- a/.github/workflows/python-ci-minimal.yml +++ b/.github/workflows/python-ci-minimal.yml @@ -23,6 +23,9 @@ jobs: - runs-on: ubuntu-22.04 cc: gcc-11 cxx: g++-11 + - runs-on: macos-12 + cc: clang + cxx: clang++ uses: ./.github/workflows/python-ci-single.yml with: os: ${{ matrix.os }} diff --git a/libtiledbsoma/src/soma/logger_public.h b/libtiledbsoma/src/soma/logger_public.h index a3483b76fb..b2745a4d96 100644 --- a/libtiledbsoma/src/soma/logger_public.h +++ b/libtiledbsoma/src/soma/logger_public.h @@ -38,8 +38,6 @@ #include // for windows: error C2039: 'runtime_error': is not a member of 'std' -#include - namespace tiledbsoma { /** Set log level for global 
logger and optionally set a logfile. */ diff --git a/libtiledbsoma/src/utils/logger.cc b/libtiledbsoma/src/utils/logger.cc index 5a0690d4fe..ba1d1a9f21 100644 --- a/libtiledbsoma/src/utils/logger.cc +++ b/libtiledbsoma/src/utils/logger.cc @@ -68,9 +68,9 @@ Logger::Logger() { logger_->sinks().back().get()); console_sink->set_color( spdlog::level::critical, console_sink->red_bold); + logger_->set_pattern(LOG_PATTERN); #endif } - logger_->set_pattern(LOG_PATTERN); set_level("INFO"); } From 6bf76617cb2e8d2319b2112f724d0a189befb4ad Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 18 Sep 2023 12:33:59 -0400 Subject: [PATCH 3/9] [python] Flatten categorical `soma_joinid` if presented at `write` (#1698) (#1699) Co-authored-by: John Kerl --- apis/python/src/tiledbsoma/_dataframe.py | 21 +++++++++----- apis/python/tests/test_dataframe.py | 35 ++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 7 deletions(-) diff --git a/apis/python/src/tiledbsoma/_dataframe.py b/apis/python/src/tiledbsoma/_dataframe.py index cb118f9fd4..0133941ff5 100644 --- a/apis/python/src/tiledbsoma/_dataframe.py +++ b/apis/python/src/tiledbsoma/_dataframe.py @@ -409,18 +409,25 @@ def write( for name in values.schema.names: col = values.column(name) n = len(col) + cols_map = dim_cols_map if name in dim_names_set else attr_cols_map if pa.types.is_dictionary(col.type) and col.num_chunks != 0: - attr = self._handle.schema.attr(name) - if attr.enum_label is not None: - # Normal case: writing categorical data to categorical schema. - cols_map[name] = col.chunk(0).indices.to_pandas() - else: - # Schema is non-categorical but the user is writing categorical. - # Simply decategoricalize for them. + if name in dim_names_set: + # Dims are never categorical. Decategoricalize for them. 
cols_map[name] = pa.chunked_array( [chunk.dictionary_decode() for chunk in col.chunks] ) + else: + attr = self._handle.schema.attr(name) + if attr.enum_label is not None: + # Normal case: writing categorical data to categorical schema. + cols_map[name] = col.chunk(0).indices.to_pandas() + else: + # Schema is non-categorical but the user is writing categorical. + # Simply decategoricalize for them. + cols_map[name] = pa.chunked_array( + [chunk.dictionary_decode() for chunk in col.chunks] + ) else: cols_map[name] = col.to_pandas() diff --git a/apis/python/tests/test_dataframe.py b/apis/python/tests/test_dataframe.py index 8a817e8bf5..77376064f5 100644 --- a/apis/python/tests/test_dataframe.py +++ b/apis/python/tests/test_dataframe.py @@ -914,6 +914,41 @@ def test_write_categorical_types(tmp_path): assert (df == sdf.read().concat().to_pandas()).all().all() +def test_write_categorical_dims(tmp_path): + """ + Categories are not supported as dims. Here we test our handling of what we + do when we are given them as input. + """ + schema = pa.schema( + [ + ("soma_joinid", pa.int64()), + ("string", pa.dictionary(pa.int8(), pa.large_string())), + ] + ) + with soma.DataFrame.create( + tmp_path.as_posix(), + schema=schema, + index_column_names=["soma_joinid"], + enumerations={ + "enum-string": ["b", "a"], + }, + ordered_enumerations=[], + column_to_enumerations={ + "string": "enum-string", + }, + ) as sdf: + df = pd.DataFrame( + data={ + "soma_joinid": pd.Categorical([0, 1, 2, 3], categories=[0, 1, 2, 3]), + "string": pd.Categorical(["a", "b", "a", "b"], categories=["b", "a"]), + } + ) + sdf.write(pa.Table.from_pandas(df)) + + with soma.DataFrame.open(tmp_path.as_posix()) as sdf: + assert (df == sdf.read().concat().to_pandas()).all().all() + + def test_result_order(tmp_path): # cf. 
https://docs.tiledb.com/main/background/key-concepts-and-data-format#data-layout schema = pa.schema( From 3740ed17039d75e2e4e6b670a0f68cc9b265d647 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 20 Sep 2023 14:36:20 -0400 Subject: [PATCH 4/9] Minor follow-up to #1695 (#1702) (#1705) * logger->set_pattern call should be unconditional, not windows-only * Remove EXCLUDE_FROM_ALL annotation on unit_soma, so that the target updates reliably * Add note about failure mode on a fork Co-authored-by: Isaiah Norton --- apis/python/version.py | 3 +++ libtiledbsoma/src/utils/logger.cc | 2 +- libtiledbsoma/test/CMakeLists.txt | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/apis/python/version.py b/apis/python/version.py index 39147c50e7..3ad057b092 100644 --- a/apis/python/version.py +++ b/apis/python/version.py @@ -85,6 +85,9 @@ def readGitVersion(): + # NOTE: this will fail if on a fork with unsynchronized tags. 
+ # use `git fetch --tags upstream` + # and `git push --tags ` try: proc = subprocess.Popen( ("git", "describe", "--long", "--tags", "--match", "[0-9]*.*"), diff --git a/libtiledbsoma/src/utils/logger.cc b/libtiledbsoma/src/utils/logger.cc index ba1d1a9f21..8c6d43a06a 100644 --- a/libtiledbsoma/src/utils/logger.cc +++ b/libtiledbsoma/src/utils/logger.cc @@ -62,13 +62,13 @@ Logger::Logger() { logger_ = spdlog::get(CONSOLE_LOGGER); if (logger_ == nullptr) { logger_ = spdlog::stdout_color_mt(CONSOLE_LOGGER); + logger_->set_pattern(LOG_PATTERN); #if !defined(_WIN32) // change color of critical messages auto console_sink = static_cast( logger_->sinks().back().get()); console_sink->set_color( spdlog::level::critical, console_sink->red_bold); - logger_->set_pattern(LOG_PATTERN); #endif } set_level("INFO"); diff --git a/libtiledbsoma/test/CMakeLists.txt b/libtiledbsoma/test/CMakeLists.txt index 816730726f..d503041765 100644 --- a/libtiledbsoma/test/CMakeLists.txt +++ b/libtiledbsoma/test/CMakeLists.txt @@ -19,7 +19,7 @@ find_package(Spdlog_EP REQUIRED) find_package(Catch_EP REQUIRED) -add_executable(unit_soma EXCLUDE_FROM_ALL +add_executable(unit_soma $ unit_column_buffer.cc unit_managed_query.cc From 29df697147927993be510100c5f31306d6119f40 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 22 Sep 2023 10:24:34 -0400 Subject: [PATCH 5/9] [r] Support `factor` and `ordered` in `SOMADataFrameCreate` (#1701) (#1709) * [r] Support `factor` and `ordered` in `SOMADataFrameCreate` * Rework test to not require `nanoarrow` * Remove no-longer-need install of tiledb-r from r-universe [ci skip] * Rename test file * Small refactor in new test file * Add explicit tibble import for as_tibble (but not a new dependency) Co-authored-by: Dirk Eddelbuettel --- .github/workflows/r-ci.yml | 10 +- apis/r/DESCRIPTION | 3 +- apis/r/NAMESPACE | 1 + apis/r/R/SOMADataFrame.R | 3 +- apis/r/R/utils-arrow.R | 15 ++- 
apis/r/tests/testthat/test-OrderedAndFactor.R | 112 ++++++++++++++++++ 6 files changed, 130 insertions(+), 14 deletions(-) create mode 100644 apis/r/tests/testthat/test-OrderedAndFactor.R diff --git a/.github/workflows/r-ci.yml b/.github/workflows/r-ci.yml index c602f8a51c..fdb088aa93 100644 --- a/.github/workflows/r-ci.yml +++ b/.github/workflows/r-ci.yml @@ -40,15 +40,7 @@ jobs: - name: Install BioConductor package SingleCellExperiment run: cd apis/r && tools/r-ci.sh install_bioc SingleCellExperiment - - - name: Install r-universe build of tiledb-r (macOS) - if: ${{ matrix.os == 'macOS-latest' }} - run: cd apis/r && Rscript -e "install.packages('tiledb', repos = c('https://eddelbuettel.r-universe.dev', 'https://cloud.r-project.org'))" - - - name: Install r-universe build of tiledb-r (linux) - if: ${{ matrix.os != 'macOS-latest' }} - run: cd apis/r && Rscript -e "options(bspm.version.check=TRUE); install.packages('tiledb', repos = c('https://eddelbuettel.r-universe.dev/bin/linux/jammy/4.3/', 'https://cloud.r-project.org'))" - + - name: Dependencies run: cd apis/r && tools/r-ci.sh install_all diff --git a/apis/r/DESCRIPTION b/apis/r/DESCRIPTION index 6edc3b018c..67148e6de9 100644 --- a/apis/r/DESCRIPTION +++ b/apis/r/DESCRIPTION @@ -44,7 +44,8 @@ Imports: data.table, spdl, rlang, - tools + tools, + tibble LinkingTo: Rcpp, RcppSpdlog diff --git a/apis/r/NAMESPACE b/apis/r/NAMESPACE index e303812728..af8cb98ef1 100644 --- a/apis/r/NAMESPACE +++ b/apis/r/NAMESPACE @@ -98,6 +98,7 @@ importFrom(spdl,debug) importFrom(spdl,info) importFrom(spdl,setup) importFrom(stats,setNames) +importFrom(tibble,as_tibble) importFrom(tools,R_user_dir) importFrom(tools,file_path_sans_ext) importFrom(urltools,url_compose) diff --git a/apis/r/R/SOMADataFrame.R b/apis/r/R/SOMADataFrame.R index cffa235c4c..4c1116f27d 100644 --- a/apis/r/R/SOMADataFrame.R +++ b/apis/r/R/SOMADataFrame.R @@ -351,7 +351,8 @@ SOMADataFrame <- R6::R6Class( stopifnot( "'schema' must be a valid Arrow schema" = 
is_arrow_schema(schema), - is.character(index_column_names) && length(index_column_names) > 0, + "'index_column_names' must be a non-empty character vector" = + is.character(index_column_names) && length(index_column_names) > 0, "All 'index_column_names' must be defined in the 'schema'" = assert_subset(index_column_names, schema$names, "indexed field"), "Column names must not start with reserved prefix 'soma_'" = diff --git a/apis/r/R/utils-arrow.R b/apis/r/R/utils-arrow.R index 955ad647de..e085ef89c6 100644 --- a/apis/r/R/utils-arrow.R +++ b/apis/r/R/utils-arrow.R @@ -30,6 +30,10 @@ is_arrow_schema <- function(x) { is_arrow_object(x) && inherits(x, "Schema") } +is_arrow_dictionary <- function(x) { + is_arrow_object(x) && inherits(x, "Field") && inherits(x$type, "DictionaryType") +} + #' Convert Arrow types to supported TileDB type #' List of TileDB types supported in R: https://github.com/TileDB-Inc/TileDB-R/blob/8014da156b5fee5b4cc221d57b4aa7d388abc968/inst/tinytest/test_dim.R#L97-L121 #' @@ -316,16 +320,21 @@ check_arrow_schema_data_types <- function(from, to) { } #' Extract levels from dictionaries +#' @importFrom tibble as_tibble #' @noRd -extract_levels <- function(arrtbl) { +extract_levels <- function(arrtbl, exclude_cols=c("soma_joinid")) { stopifnot("Argument must be an Arrow Table object" = is_arrow_table(arrtbl)) - nm <- names(arrtbl) # we go over the table column by column + nm <- names(arrtbl) # we go over the table column by column + nm <- nm[-match(exclude_cols, nm)] # but skip soma_joinid etc as in exclude_cols reslst <- vector(mode = "list", length = length(nm)) names(reslst) <- nm # and fill a named list, entries default to NULL for (n in nm) { - if (inherits(arrow::infer_type(arrtbl[[n]]), "DictionaryType")) { + inftp <- arrow::infer_type(arrtbl[[n]]) + if (inherits(inftp, "DictionaryType")) { # levels() extracts the enumeration levels from the factor vector we have reslst[[n]] <- levels(arrtbl[[n]]$as_vector()) + # set 'ordered' attribute + 
attr(reslst[[n]], "ordered") <- inftp$ordered } } reslst diff --git a/apis/r/tests/testthat/test-OrderedAndFactor.R b/apis/r/tests/testthat/test-OrderedAndFactor.R new file mode 100644 index 0000000000..9769568911 --- /dev/null +++ b/apis/r/tests/testthat/test-OrderedAndFactor.R @@ -0,0 +1,112 @@ +test_that("SOMADataFrame round-trip with factor and ordered", { + skip_if(!extended_tests()) + + uri <- tempfile() + + ## borrowed from tiledb-r test file test_ordered.R + ## A data.frame with an ordered column, taken from package `earth` and its `etitanic` cleaned + + ## dataset of Titanic survivors (with NAs removed). + ## + ## et <- earth::etitanic + ## et$pclass <- as.ordered(et$pclass) + ## set.seed(42) + ## et <- et[sort(sample(nrow(et), 100)), ] + ## dput(et) + ## + ## Slightly edited (for code alignment) `dput(et)` output below + et <- structure(list(pclass = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, + 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, + 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, + 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, + 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, + 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, + 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), + levels = c("1st", "2nd", "3rd"), class = c("ordered", "factor")), + survived = c(0L, 0L, 1L, 1L, 1L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, + 1L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, + 0L, 0L, 0L, 1L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, + 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, + 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, + 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, + 0L, 0L, 0L), + sex = structure(c(1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, + 2L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, + 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, + 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, + 2L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, + 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, + 2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), + levels = c("female", "male"), class = "factor"), + age = c(2, 24, 29, 58, 59, 28, 36, + 27, 39, 27, 48, 24, 19, 22, 48, 35, 38, 16, 65, 28.5, 35, 34, + 32, 43, 49, 31, 30, 18, 28, 32, 19, 40, 0.833299994, 19, 37, + 32, 34, 54, 8, 27, 34, 16, 21, 62, 21, 23, 36, 29, 41, 33, 25, + 25, 18.5, 13, 20, 6, 32, 21, 18, 26, 32, 29, 18.5, 21, 17, 37, + 35, 30, 22, 47, 26, 21, 28, 25, 28, 43, 22, 30, 20.5, 51, 35, + 28, 19, 28, 29, 41, 19, 28, 8, 39, 2, 45, 30, 33, 21, 24, 11.5, + 18, 36, 45.5), + sibsp = c(1L, 0L, 0L, 0L, 2L, 0L, 1L, 1L, 1L, + 1L, 1L, 3L, 3L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 1L, + 0L, 1L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, + 0L, 1L, 0L, 2L, 2L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 4L, 1L, + 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 2L, 2L, + 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 4L, + 0L, 1L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L), + parch = c(2L, 1L, 0L, 0L, 0L, 0L, 2L, 0L, 0L, 2L, 0L, 2L, 2L, 2L, 0L, 0L, 0L, 1L, + 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 2L, 0L, + 0L, 0L, 1L, 0L, 2L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 2L, + 0L, 0L, 0L, 2L, 0L, 2L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, + 0L, 0L, 0L, 0L, 0L, 2L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, + 1L, 0L, 4L, 5L, 0L, 0L, 1L, 5L, 1L, 4L, 0L, 0L, 0L, 0L, 1L, 0L, + 0L, 0L)), + row.names = c("3", "17", "25", "34", "43", "53", "58", + "65", "85", "91", "100", "112", "115", "123", "146", "165", "169", + "188", "206", "223", "258", "260", "279", "282", "295", "299", + "324", "327", "335", "337", "338", "353", "360", "365", "369", + "390", "397", "398", "399", "402", "415", "417", "420", "433", + "445", "448", "449", "453", "533", "543", "556", "568", "569", + "602", "616", "624", "656", "676", "677", 
"678", "685", "689", + "693", "697", "701", "711", "730", "761", "786", "794", "804", + "807", "839", "854", "864", "869", "953", "975", "978", "980", + "996", "1022", "1051", "1084", "1101", "1107", "1109", "1127", + "1146", "1147", "1157", "1212", "1219", "1223", "1225", "1238", + "1264", "1289", "1299", "1302"), + class = "data.frame") + expect_true(is.data.frame(et)) + + ett <- data.frame(soma_joinid=bit64::as.integer64(seq(1, nrow(et))), et) + ## quick write with tiledb-r so that we get a schema from the manifested array + ## there should possibly be a helper function to create the schema from a data.frame + turi <- tempfile() + expect_silent(tiledb::fromDataFrame(ett, turi, col_index="soma_joinid")) + + tsch <- tiledb::schema(turi) + expect_true(inherits(tsch, "tiledb_array_schema")) + + sch <- tiledbsoma:::arrow_schema_from_tiledb_schema(tsch) + expect_true(inherits(sch, "Schema")) + + att <- arrow::as_arrow_table(ett) + expect_true(inherits(att, "Table")) + + lvls <- tiledbsoma:::extract_levels(att) + expect_true(is.list(lvls)) + expect_equal(length(lvls), ncol(et)) # et, not ett or tsch or sch as no soma_joinid + expect_equal(names(lvls), colnames(et)) + + sdf <- SOMADataFrameCreate(uri, sch, levels=lvls) + expect_true(inherits(sdf, "SOMADataFrame")) + + sdf$write(att) + + op <- getOption("arrow.int64_downcast") + options("arrow.int64_downcast"=FALSE) # else it becomes int + ndf <- SOMADataFrameOpen(uri)$read()$concat() + expect_true(inherits(ndf, "Table")) + + expect_equivalent(tibble::as_tibble(ndf), tibble::as_tibble(att)) + + options("arrow.int64_downcast"=op) + +}) From 1dff10e11215bd4068b9946d86f94287e712c0bf Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 22 Sep 2023 10:24:51 -0400 Subject: [PATCH 6/9] [c++] Fix For Empty-Indexed Multi-Dimensional Array (#1706) (#1708) Co-authored-by: Vivian Nguyen --- apis/python/tests/test_sparse_nd_array.py | 29 +++++++++++++++++++++++ 
libtiledbsoma/src/soma/managed_query.cc | 2 +- libtiledbsoma/src/soma/managed_query.h | 24 +++++++++++++------ 3 files changed, 47 insertions(+), 8 deletions(-) diff --git a/apis/python/tests/test_sparse_nd_array.py b/apis/python/tests/test_sparse_nd_array.py index 1d7201b362..628aa1a7d5 100644 --- a/apis/python/tests/test_sparse_nd_array.py +++ b/apis/python/tests/test_sparse_nd_array.py @@ -1100,3 +1100,32 @@ def test_timestamped_ops(tmp_path): [0, 0], ] assert a.nnz == 1 + + +def test_empty_indexed_read(tmp_path): + """ + Verify that queries expected to return empty results actually + work. There are edge cases around SparseTensors, which are unable + to represent empty arrays. + """ + shape = (10, 100) + soma.SparseNDArray.create( + tmp_path.as_posix(), type=pa.uint16(), shape=shape + ).close() + + data = create_random_tensor("coo", shape, np.float64, 1.0) + with soma.SparseNDArray.open(tmp_path.as_posix(), "w") as a: + a.write(data) + + with soma.SparseNDArray.open(tmp_path.as_posix()) as a: + coords = [slice(None), slice(None)] + assert sum(len(t) for t in a.read(coords).tables()) == 1000 + + coords = [[3], [4]] + assert sum(len(t) for t in a.read(coords).tables()) == 1 + + coords = [[3], []] + assert sum(len(t) for t in a.read(coords).tables()) == 0 + + coords = [[], [4]] + assert sum(len(t) for t in a.read(coords).tables()) == 0 diff --git a/libtiledbsoma/src/soma/managed_query.cc b/libtiledbsoma/src/soma/managed_query.cc index 10a1b39fcc..c3f9c5fe1e 100644 --- a/libtiledbsoma/src/soma/managed_query.cc +++ b/libtiledbsoma/src/soma/managed_query.cc @@ -65,7 +65,7 @@ void ManagedQuery::reset() { } subarray_range_set_ = false; - subarray_range_empty_ = true; + subarray_range_empty_ = {}; columns_.clear(); results_complete_ = true; total_num_cells_ = 0; diff --git a/libtiledbsoma/src/soma/managed_query.h b/libtiledbsoma/src/soma/managed_query.h index 685f28c593..f6a1d86a1f 100644 --- a/libtiledbsoma/src/soma/managed_query.h +++ 
b/libtiledbsoma/src/soma/managed_query.h @@ -98,9 +98,10 @@ class ManagedQuery { void select_ranges( const std::string& dim, const std::vector>& ranges) { subarray_range_set_ = true; + subarray_range_empty_[dim] = true; for (auto& [start, stop] : ranges) { subarray_->add_range(dim, start, stop); - subarray_range_empty_ = false; + subarray_range_empty_[dim] = false; } } @@ -114,9 +115,10 @@ class ManagedQuery { template void select_points(const std::string& dim, const std::vector& points) { subarray_range_set_ = true; + subarray_range_empty_[dim] = true; for (auto& point : points) { subarray_->add_range(dim, point, point); - subarray_range_empty_ = false; + subarray_range_empty_[dim] = false; } } @@ -130,9 +132,10 @@ class ManagedQuery { template void select_points(const std::string& dim, const tcb::span points) { subarray_range_set_ = true; + subarray_range_empty_[dim] = true; for (auto& point : points) { subarray_->add_range(dim, point, point); - subarray_range_empty_ = false; + subarray_range_empty_[dim] = false; } } @@ -147,7 +150,7 @@ class ManagedQuery { void select_point(const std::string& dim, const T& point) { subarray_->add_range(dim, point, point); subarray_range_set_ = true; - subarray_range_empty_ = false; + subarray_range_empty_[dim] = false; } /** @@ -388,7 +391,14 @@ class ManagedQuery { * @return true if the query contains only empty ranges. 
*/ bool is_empty_query() { - return subarray_range_set_ && subarray_range_empty_; + bool has_empty = false; + for (auto subdim : subarray_range_empty_) { + if (subdim.second == true) { + has_empty = true; + break; + } + } + return subarray_range_set_ && has_empty; } /** @@ -440,8 +450,8 @@ class ManagedQuery { // True if a range has been added to the subarray bool subarray_range_set_ = false; - // True unless a non-empty range has been added to the subarray - bool subarray_range_empty_ = true; + // Map whether the dimension is empty (true) or not + std::map subarray_range_empty_ = {}; // Set of column names to read (dim and attr). If empty, query all columns. std::vector columns_; From 9697d4f4d01d15649bc02d085cb7a57aa1ac089a Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 27 Sep 2023 13:45:20 -0500 Subject: [PATCH 7/9] [r] Replace three local utility functions with use of RcppInt64 (#1721) (#1724) Co-authored-by: Dirk Eddelbuettel --- apis/r/DESCRIPTION | 3 ++- apis/r/src/rinterface.cpp | 3 ++- apis/r/src/rutilities.cpp | 13 ++++++------ apis/r/src/rutilities.h | 42 +++------------------------------------ 4 files changed, 14 insertions(+), 47 deletions(-) diff --git a/apis/r/DESCRIPTION b/apis/r/DESCRIPTION index 67148e6de9..fcf263ed46 100644 --- a/apis/r/DESCRIPTION +++ b/apis/r/DESCRIPTION @@ -48,7 +48,8 @@ Imports: tibble LinkingTo: Rcpp, - RcppSpdlog + RcppSpdlog, + RcppInt64 Additional_repositories: https://ghrr.github.io/drat Roxygen: list(markdown = TRUE) RoxygenNote: 7.2.3 diff --git a/apis/r/src/rinterface.cpp b/apis/r/src/rinterface.cpp index 082ebdb92f..812a52859b 100644 --- a/apis/r/src/rinterface.cpp +++ b/apis/r/src/rinterface.cpp @@ -1,5 +1,6 @@ #include // for R interface to C++ #include // for C interface to Arrow +#include // for fromInteger64 // we currently get deprecation warnings by default which are noisy #ifndef TILEDB_NO_API_DEPRECATION_WARNINGS @@ -233,5 +234,5 @@ 
bool check_arrow_array_tag(Rcpp::XPtr xp) { Rcpp::NumericVector shape(const std::string& uri, Rcpp::Nullable config = R_NilValue) { auto sr = tdbs::SOMAArray::open(OpenMode::read, uri, "unnamed", config_vector_to_map(Rcpp::wrap(config))); - return makeInteger64(sr->shape()); + return Rcpp::toInteger64(sr->shape()); } diff --git a/apis/r/src/rutilities.cpp b/apis/r/src/rutilities.cpp index 9fa81545d9..ddd6ee69a3 100644 --- a/apis/r/src/rutilities.cpp +++ b/apis/r/src/rutilities.cpp @@ -6,6 +6,7 @@ #include // for R interface to C++ #include // for C interface to Arrow +#include // for fromInteger64 // We get these via nanoarrow and must cannot include carrow.h again #define ARROW_SCHEMA_AND_ARRAY_DEFINED 1 @@ -26,7 +27,7 @@ void apply_dim_points(tdbs::SOMAArray *sr, bool suitable = false; if (tp == TILEDB_UINT64) { Rcpp::NumericVector payload = lst[nm]; - std::vector iv = getInt64Vector(payload); + std::vector iv = Rcpp::fromInteger64(payload, false); std::vector uv(iv.size()); const std::pair pr = dm->domain(); for (size_t i=0; i iv = getInt64Vector(payload); + std::vector iv = Rcpp::fromInteger64(payload, false); const std::pair pr = dm->domain(); for (size_t i=0; i= pr.first && iv[i] <= pr.second) { @@ -103,8 +104,8 @@ void apply_dim_ranges(tdbs::SOMAArray* sr, std::vector> vp(mm.nrow()); const std::pair pr = dm->domain(); for (int i=0; i(makeScalarInteger64(lo[i])); - uint64_t h = static_cast(makeScalarInteger64(hi[i])); + uint64_t l = static_cast(Rcpp::fromInteger64(lo[i])); + uint64_t h = static_cast(Rcpp::fromInteger64(hi[i])); vp[i] = std::make_pair(std::max(l,pr.first), std::min(h, pr.second)); spdl::info("[apply_dim_ranges] Applying dim point {} on {} with {} - {}", i, nm, l, h) ; suitable = l < pr.second && h > pr.first; // lower must be less than max, higher more than min @@ -112,8 +113,8 @@ void apply_dim_ranges(tdbs::SOMAArray* sr, if (suitable) sr->set_dim_ranges(nm, vp); } else if (tp == TILEDB_INT64) { Rcpp::NumericMatrix mm = lst[nm]; - std::vector 
lo = getInt64Vector(mm.column(0)); - std::vector hi = getInt64Vector(mm.column(1)); + std::vector lo = Rcpp::fromInteger64(mm.column(0), false); + std::vector hi = Rcpp::fromInteger64(mm.column(1), false); std::vector> vp(mm.nrow()); const std::pair pr = dm->domain(); for (int i=0; i& vec) { - size_t n = vec.size(); - - Rcpp::NumericVector num(n); - std::memcpy(&(num[0]), vec.data(), n * sizeof(double)); - - num.attr("class") = "integer64"; - return (num); -} - -// Convert to a scalar int64_t -// -inline int64_t makeScalarInteger64(const double val) { - int64_t newval; - memcpy(&newval, &val, sizeof(double)); - return newval; -} - -// Create a int64_t vector from a NumericVector -// -inline std::vector getInt64Vector(Rcpp::NumericVector vec) { - size_t n = vec.size(); - std::vector num(n); - std::memcpy(&(num[0]), &(vec[0]), n * sizeof(double)); - return num; -} - +#define TILEDB_VERSION TileDB_Version(TILEDB_VERSION_MAJOR, \ + TILEDB_VERSION_MINOR, \ + TILEDB_VERSION_PATCH) // Applies (named list of) vectors of points to the named dimensions void apply_dim_points( From 3d021ce59e8f50acc1f46f50911ca6cf5128037d Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 27 Sep 2023 15:18:16 -0400 Subject: [PATCH 8/9] [python] update numba to 0.58 (#1723) (#1726) The newer version lets us clean up some existing dependency complications -- especially an incompatibility with newer versions of numpy, which was getting more problematic as time went on. 
Co-authored-by: Mike Lin --- apis/python/setup.py | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/apis/python/setup.py b/apis/python/setup.py index bf205f8260..93c58cba21 100644 --- a/apis/python/setup.py +++ b/apis/python/setup.py @@ -271,21 +271,13 @@ def run(self): "anndata < 0.9; python_version<'3.8'", "anndata; python_version>='3.8'", "attrs>=22.2", - # Pinning numba & its particular numpy constraints: - # The old pip solver (<=2020) doesn't deal with the transitive - # requirements (scanpy -> numba -> numpy) properly resulting in broken - # installation of incompatible numpy>=1.24. Issue #1051 - # These pins can be removed either when there's a new numba release - # with less-particular numpy version constraints, or if we decide we no - # longer need to support the old pip solver (default on ubuntu 20.04). - # - # Also: numba doesn't support Python 3.11 until 0.57.0rc1. - # It' not preferable to pin to an RC dependency, so we only do this - # when we must, which is for 3.11. - "numba==0.56.4; python_version<'3.11'", - "numba==0.57; python_version=='3.11'", - "numpy>=1.18,<1.24; python_version<'3.11'", - "numpy>=1.18,<1.25; python_version=='3.11'", + "numba~=0.58.0; python_version>='3.8'", + # Older numba version needed for Python 3.7. + # This older numba version was also incompatible with newer numpy + # versions, and the old pip solver (<=2020) needed us to explicate + # that constraint here (issue #1051). 
+ "numba==0.56.4; python_version<'3.8'", + "numpy>=1.18,<1.24; python_version<'3.8'", "pandas", "pyarrow>=9.0.0", "scanpy>=1.9.2", From 2c23a4e2e1376393e41196cae55a18c185c798cc Mon Sep 17 00:00:00 2001 From: John Kerl Date: Wed, 27 Sep 2023 16:16:53 -0400 Subject: [PATCH 9/9] [r] Backport #1720 to `release-1.5` (#1727) --- apis/r/DESCRIPTION | 2 +- apis/r/NEWS.md | 1 + apis/r/R/SOMADataFrame.R | 5 ++ apis/r/tests/testthat/test-SOMADataFrame.R | 70 ++++++++++++++++++++++ 4 files changed, 77 insertions(+), 1 deletion(-) diff --git a/apis/r/DESCRIPTION b/apis/r/DESCRIPTION index fcf263ed46..58a89016da 100644 --- a/apis/r/DESCRIPTION +++ b/apis/r/DESCRIPTION @@ -6,7 +6,7 @@ Description: Interface for working with 'TileDB'-based Stack of Matrices, like those commonly used for single cell data analysis. It is documented at ; a formal specification available is at . -Version: 1.4.3.1 +Version: 1.4.3.2 Authors@R: c( person(given = "Aaron", family = "Wolen", role = c("cre", "aut"), email = "aaron@tiledb.com", diff --git a/apis/r/NEWS.md b/apis/r/NEWS.md index 8c097f5761..82c903b6b2 100644 --- a/apis/r/NEWS.md +++ b/apis/r/NEWS.md @@ -4,6 +4,7 @@ * Add support for writing `SummarizedExperiment` and `SingleCellExperiment` object to SOMAs * Add support for bounding boxes for sparse arrays +* Add support for creating `SOMADataFrames` with `ordered()` columns # tiledbsoma 1.4.0 diff --git a/apis/r/R/SOMADataFrame.R b/apis/r/R/SOMADataFrame.R index 4c1116f27d..b58a7e35b1 100644 --- a/apis/r/R/SOMADataFrame.R +++ b/apis/r/R/SOMADataFrame.R @@ -97,6 +97,11 @@ SOMADataFrame <- R6::R6Class( field <- schema$GetFieldByName(field_name) field_type <- tiledb_type_from_arrow_type(field$type) + # Check if the field is ordered and mark it as such + if (!is.null(x = levels[[field_name]]) && isTRUE(field$type$ordered)) { + attr(levels[[field_name]], 'ordered') <- attr(levels[[field_name]], 'ordered', exact = TRUE) %||% TRUE + } + tdb_attrs[[field_name]] <- tiledb::tiledb_attr( name = 
field_name, type = field_type, diff --git a/apis/r/tests/testthat/test-SOMADataFrame.R b/apis/r/tests/testthat/test-SOMADataFrame.R index ccd83e863e..178553128d 100644 --- a/apis/r/tests/testthat/test-SOMADataFrame.R +++ b/apis/r/tests/testthat/test-SOMADataFrame.R @@ -222,6 +222,76 @@ test_that("int64 values are stored correctly", { gc() }) +test_that("creation with ordered factors", { + skip_if_not_installed("tiledb", "0.21.0") + skip_if(!extended_tests()) + uri <- withr::local_tempdir("soma-dataframe-ordered") + n <- 10L + df <- data.frame( + soma_joinid = bit64::as.integer64(seq_len(length.out = n) - 1L), + int = seq_len(length.out = n), + bool = rep_len(c(TRUE, FALSE), length.out = n), + ord = ordered(rep_len(c("g1", "g2", "g3"), length.out = n)) + ) + tbl <- arrow::as_arrow_table(df) + expect_true(tbl$schema$GetFieldByName("ord")$type$ordered) + expect_no_condition(sdf <- SOMADataFrameCreate( + uri = uri, + schema = tbl$schema, + levels = sapply( + X = df[, setdiff(names(df), "soma_joinid")], + FUN = levels, + simplify = FALSE, + USE.NAMES = TRUE + ) + )) + expect_no_condition(sdf$write(values = tbl)) + expect_s3_class(sdf <- SOMADataFrameOpen(uri), "SOMADataFrame") + expect_true(sdf$schema()$GetFieldByName("ord")$type$ordered) + expect_s3_class(ord <- sdf$object[]$ord, c("ordered", "factor"), exact = TRUE) + expect_length(ord, n) + expect_identical(levels(ord), levels(df$ord)) +}) + +test_that("explicit casting of ordered factors to regular factors", { + skip_if_not_installed("tiledb", "0.21.0") + skip_if(!extended_tests()) + uri <- withr::local_tempdir("soma-dataframe-unordered") + n <- 10L + df <- data.frame( + soma_joinid = bit64::as.integer64(seq_len(length.out = n) - 1L), + int = seq_len(length.out = n), + bool = rep_len(c(TRUE, FALSE), length.out = n), + ord = ordered(rep_len(c("g1", "g2", "g3"), length.out = n)) + ) + tbl <- arrow::as_arrow_table(df) + expect_true(tbl$schema$GetFieldByName("ord")$type$ordered) + lvls <- sapply( + X = df[, 
setdiff(names(df), "soma_joinid")], + FUN = levels, + simplify = FALSE, + USE.NAMES = TRUE + ) + for (col in names(lvls)) { + if (!is.null(lvls[[col]])) { + attr(lvls[[col]], 'ordered') <- FALSE + } + } + expect_no_condition(sdf <- SOMADataFrameCreate( + uri = uri, + schema = tbl$schema, + levels = lvls + )) + expect_no_condition(sdf$write(values = tbl)) + expect_s3_class(sdf <- SOMADataFrameOpen(uri), "SOMADataFrame") + expect_false(sdf$schema()$GetFieldByName("ord")$type$ordered) + expect_s3_class(ord <- sdf$object[]$ord, "factor", exact = TRUE) + expect_false(is.ordered(ord)) + expect_length(ord, n) + expect_identical(levels(ord), levels(df$ord)) +}) + + test_that("SOMADataFrame read", { skip_if(!extended_tests()) uri <- extract_dataset("soma-dataframe-pbmc3k-processed-obs")