No PR #1730

Closed · wants to merge 9 commits
4 changes: 4 additions & 0 deletions .github/workflows/python-ci-minimal.yml
@@ -23,12 +23,16 @@ jobs:
- runs-on: ubuntu-22.04
cc: gcc-11
cxx: g++-11
- runs-on: macos-12
cc: clang
cxx: clang++
uses: ./.github/workflows/python-ci-single.yml
with:
os: ${{ matrix.os }}
python_version: ${{ matrix.python-version }}
cc: ${{ matrix.cc }}
cxx: ${{ matrix.cxx }}
is_mac: ${{ contains(matrix.os, 'macos') }}
report_codecov: ${{ matrix.python-version == '3.10' }}
run_lint: ${{ matrix.python-version == '3.10' }}
secrets: inherit
10 changes: 1 addition & 9 deletions .github/workflows/r-ci.yml
@@ -40,15 +40,7 @@ jobs:

- name: Install BioConductor package SingleCellExperiment
run: cd apis/r && tools/r-ci.sh install_bioc SingleCellExperiment

- name: Install r-universe build of tiledb-r (macOS)
if: ${{ matrix.os == 'macOS-latest' }}
run: cd apis/r && Rscript -e "install.packages('tiledb', repos = c('https://eddelbuettel.r-universe.dev', 'https://cloud.r-project.org'))"

- name: Install r-universe build of tiledb-r (linux)
if: ${{ matrix.os != 'macOS-latest' }}
run: cd apis/r && Rscript -e "options(bspm.version.check=TRUE); install.packages('tiledb', repos = c('https://eddelbuettel.r-universe.dev/bin/linux/jammy/4.3/', 'https://cloud.r-project.org'))"


- name: Dependencies
run: cd apis/r && tools/r-ci.sh install_all

22 changes: 7 additions & 15 deletions apis/python/setup.py
@@ -271,21 +271,13 @@ def run(self):
"anndata < 0.9; python_version<'3.8'",
"anndata; python_version>='3.8'",
"attrs>=22.2",
# Pinning numba & its particular numpy constraints:
# The old pip solver (<=2020) doesn't deal with the transitive
# requirements (scanpy -> numba -> numpy) properly, resulting in broken
# installation of incompatible numpy>=1.24. Issue #1051
# These pins can be removed either when there's a new numba release
# with less-particular numpy version constraints, or if we decide we no
# longer need to support the old pip solver (default on ubuntu 20.04).
#
# Also: numba doesn't support Python 3.11 until 0.57.0rc1.
# It's not preferable to pin to an RC dependency, so we only do this
# when we must, which is for 3.11.
"numba==0.56.4; python_version<'3.11'",
"numba==0.57; python_version=='3.11'",
"numpy>=1.18,<1.24; python_version<'3.11'",
"numpy>=1.18,<1.25; python_version=='3.11'",
"numba~=0.58.0; python_version>='3.8'",
# Older numba version needed for Python 3.7.
# This older numba version was also incompatible with newer numpy
# versions, and the old pip solver (<=2020) required us to state
# that constraint explicitly here (issue #1051).
"numba==0.56.4; python_version<'3.8'",
"numpy>=1.18,<1.24; python_version<'3.8'",
"pandas",
"pyarrow>=9.0.0",
"scanpy>=1.9.2",
30 changes: 19 additions & 11 deletions apis/python/src/tiledbsoma/_dataframe.py
@@ -401,7 +400,6 @@ def write(
"""
_util.check_type("values", values, (pa.Table,))

del platform_config # unused
dim_cols_map: Dict[str, pd.DataFrame] = {}
attr_cols_map: Dict[str, pd.DataFrame] = {}
dim_names_set = self.index_column_names
@@ -410,18 +409,25 @@
for name in values.schema.names:
col = values.column(name)
n = len(col)

cols_map = dim_cols_map if name in dim_names_set else attr_cols_map
if pa.types.is_dictionary(col.type) and col.num_chunks != 0:
attr = self._handle.schema.attr(name)
if attr.enum_label is not None:
# Normal case: writing categorical data to categorical schema.
cols_map[name] = col.chunk(0).indices.to_pandas()
else:
# Schema is non-categorical but the user is writing categorical.
# Simply decategoricalize for them.
if name in dim_names_set:
# Dims are never categorical. Decategoricalize for them.
cols_map[name] = pa.chunked_array(
[chunk.dictionary_decode() for chunk in col.chunks]
)
else:
attr = self._handle.schema.attr(name)
if attr.enum_label is not None:
# Normal case: writing categorical data to categorical schema.
cols_map[name] = col.chunk(0).indices.to_pandas()
else:
# Schema is non-categorical but the user is writing categorical.
# Simply decategoricalize for them.
cols_map[name] = pa.chunked_array(
[chunk.dictionary_decode() for chunk in col.chunks]
)
else:
cols_map[name] = col.to_pandas()

@@ -437,14 +443,17 @@
dim_cols_list = [dim_cols_map[name] for name in self.index_column_names]
dim_cols_tuple = tuple(dim_cols_list)
self._handle.writer[dim_cols_tuple] = attr_cols_map
self._consolidate_and_vacuum_fragment_metadata()
tiledb_create_options = TileDBCreateOptions.from_platform_config(
platform_config
)
if tiledb_create_options.consolidate_and_vacuum:
self._consolidate_and_vacuum()

return self

def _set_reader_coord(
self, sr: clib.SOMAArray, dim_idx: int, dim: tiledb.Dim, coord: object
) -> bool:

if coord is None:
return True # No constraint; select all in this dimension

@@ -582,7 +591,6 @@ def _set_reader_coord_by_py_seq_or_np_array(
def _set_reader_coord_by_numeric_slice(
self, sr: clib.SOMAArray, dim_idx: int, dim: tiledb.Dim, coord: Slice[Any]
) -> bool:

try:
lo_hi = _util.slice_to_numeric_range(coord, dim.domain)
except _util.NonNumericDimensionError:
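To make the new dim branch concrete: it relies on pyarrow's `dictionary_decode()`. A minimal standalone sketch of that step (the column and its values are illustrative, not taken from the PR):

```python
import pyarrow as pa

# A dictionary-encoded (i.e. categorical) column, such as
# pa.Table.from_pandas() produces for a pd.Categorical column.
col = pa.chunked_array([pa.array([0, 1, 2, 3]).dictionary_encode()])

# Dims are never categorical, so the write path flattens each chunk
# back to plain values before handing it to the TileDB writer.
decoded = pa.chunked_array([chunk.dictionary_decode() for chunk in col.chunks])

assert decoded.type == pa.int64()  # the dictionary encoding is gone
```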
7 changes: 5 additions & 2 deletions apis/python/src/tiledbsoma/_dense_nd_array.py
@@ -172,9 +172,12 @@ def write(
"""
_util.check_type("values", values, (pa.Tensor,))

del platform_config # Currently unused.
self._handle.writer[coords] = values.to_numpy()
self._consolidate_and_vacuum_fragment_metadata()
tiledb_create_options = TileDBCreateOptions.from_platform_config(
platform_config
)
if tiledb_create_options.consolidate_and_vacuum:
self._consolidate_and_vacuum()
return self

@classmethod
19 changes: 12 additions & 7 deletions apis/python/src/tiledbsoma/_sparse_nd_array.py
@@ -183,9 +183,11 @@
Lifecycle:
Experimental.
"""
del platform_config # Currently unused.

arr = self._handle.writer
tiledb_create_options = TileDBCreateOptions.from_platform_config(
platform_config
)

if isinstance(values, pa.SparseCOOTensor):
# Write bulk data
@@ -197,8 +199,9 @@
bounding_box = self._compute_bounding_box_metadata(maxes)
self._set_bounding_box_metadata(bounding_box)

# Consolidate non-bulk data
self._consolidate_and_vacuum_fragment_metadata()
if tiledb_create_options.consolidate_and_vacuum:
# Consolidate non-bulk data
self._consolidate_and_vacuum()
return self

if isinstance(values, (pa.SparseCSCMatrix, pa.SparseCSRMatrix)):
@@ -216,8 +219,9 @@
bounding_box = self._compute_bounding_box_metadata([nr - 1, nc - 1])
self._set_bounding_box_metadata(bounding_box)

# Consolidate non-bulk data
self._consolidate_and_vacuum_fragment_metadata()
if tiledb_create_options.consolidate_and_vacuum:
# Consolidate non-bulk data
self._consolidate_and_vacuum()
return self

if isinstance(values, pa.Table):
@@ -241,8 +245,9 @@
bounding_box = self._compute_bounding_box_metadata(maxes)
self._set_bounding_box_metadata(bounding_box)

# Consolidate non-bulk data
self._consolidate_and_vacuum_fragment_metadata()
if tiledb_create_options.consolidate_and_vacuum:
# Consolidate non-bulk data
self._consolidate_and_vacuum()
return self

raise TypeError(
29 changes: 25 additions & 4 deletions apis/python/src/tiledbsoma/_tiledb_array.py
@@ -6,7 +6,7 @@
import ctypes
import os
import sys
from typing import Any, Dict, Optional, Sequence, Tuple
from typing import Any, Dict, List, Optional, Sequence, Tuple

import pyarrow as pa
import tiledb
@@ -194,20 +194,41 @@ def _create_internal(
cls._set_create_metadata(handle)
return handle

def _consolidate_and_vacuum_fragment_metadata(self) -> None:
def _consolidate_and_vacuum(
self, modes: List[str] = ["fragment_meta", "commits"]
) -> None:
"""
This post-ingestion helper consolidates and vacuums fragment metadata and commit files --
this is quick to do, and positively impacts query performance. It does _not_ consolidate
bulk array data, which is more time-consuming and should be done at the user's opt-in
discretion.
"""

for mode in ["fragment_meta", "commits"]:
for mode in modes:
self._consolidate(modes=[mode])
self._vacuum(modes=[mode])

def _consolidate(self, modes: List[str] = ["fragment_meta", "commits"]) -> None:
"""
This post-ingestion helper consolidates by default fragment metadata and commit files --
this is quick to do, and positively impacts query performance.
"""

for mode in modes:
cfg = self._ctx.config()
cfg["sm.consolidation.mode"] = mode
cfg["sm.vacuum.mode"] = mode
ctx = tiledb.Ctx(cfg)

tiledb.consolidate(self.uri, ctx=ctx)

def _vacuum(self, modes: List[str] = ["fragment_meta", "commits"]) -> None:
"""
This post-ingestion helper vacuums by default fragment metadata and commit files. Vacuuming is not multi-process safe and requires coordination that nothing is currently reading the files that will be vacuumed.
"""

for mode in modes:
cfg = self._ctx.config()
cfg["sm.vacuum.mode"] = mode
ctx = tiledb.Ctx(cfg)

tiledb.vacuum(self.uri, ctx=ctx)
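Splitting `_consolidate` from `_vacuum` lets callers run the reader-safe half on its own. As a sketch of what the two helpers do under the hood in plain tiledb-py (assuming `uri` names an existing array):

```python
import tiledb

def consolidate_then_vacuum(uri: str, base_ctx: tiledb.Ctx) -> None:
    for mode in ["fragment_meta", "commits"]:
        cfg = base_ctx.config()
        cfg["sm.consolidation.mode"] = mode
        cfg["sm.vacuum.mode"] = mode
        ctx = tiledb.Ctx(cfg)
        # Consolidation can run while readers are active ...
        tiledb.consolidate(uri, ctx=ctx)
        # ... but vacuuming deletes files, so it should only run once
        # nothing can still be reading them.
        tiledb.vacuum(uri, ctx=ctx)
```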
3 changes: 3 additions & 0 deletions apis/python/src/tiledbsoma/options/_tiledb_create_options.py
@@ -143,6 +143,9 @@ class TileDBCreateOptions:
attrs: Mapping[str, _ColumnConfig] = attrs_.field(
factory=dict, converter=_normalize_columns
)
consolidate_and_vacuum: bool = attrs_.field(
validator=vld.instance_of(bool), default=False
)

@classmethod
def from_platform_config(
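With the flag defaulting to `False`, post-write consolidation is now opt-in across the `write()` methods above. A hedged usage sketch (the URI and table are illustrative and assume a dataframe already created with a matching schema; the `{"tiledb": {"create": ...}}` nesting is the shape `from_platform_config` reads):

```python
import pyarrow as pa
import tiledbsoma as soma

# Opt in to post-write consolidation and vacuuming of fragment
# metadata and commit files.
cfg = {"tiledb": {"create": {"consolidate_and_vacuum": True}}}

with soma.DataFrame.open("file:///tmp/sdf", "w") as sdf:  # illustrative URI
    table = pa.Table.from_pydict({"soma_joinid": [0, 1], "value": [1.0, 2.0]})
    sdf.write(table, platform_config=cfg)
```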
35 changes: 35 additions & 0 deletions apis/python/tests/test_dataframe.py
@@ -914,6 +914,41 @@ def test_write_categorical_types(tmp_path):
assert (df == sdf.read().concat().to_pandas()).all().all()


def test_write_categorical_dims(tmp_path):
"""
Categories are not supported as dims. Here we test our handling of what we
do when we are given them as input.
"""
schema = pa.schema(
[
("soma_joinid", pa.int64()),
("string", pa.dictionary(pa.int8(), pa.large_string())),
]
)
with soma.DataFrame.create(
tmp_path.as_posix(),
schema=schema,
index_column_names=["soma_joinid"],
enumerations={
"enum-string": ["b", "a"],
},
ordered_enumerations=[],
column_to_enumerations={
"string": "enum-string",
},
) as sdf:
df = pd.DataFrame(
data={
"soma_joinid": pd.Categorical([0, 1, 2, 3], categories=[0, 1, 2, 3]),
"string": pd.Categorical(["a", "b", "a", "b"], categories=["b", "a"]),
}
)
sdf.write(pa.Table.from_pandas(df))

with soma.DataFrame.open(tmp_path.as_posix()) as sdf:
assert (df == sdf.read().concat().to_pandas()).all().all()


def test_result_order(tmp_path):
# cf. https://docs.tiledb.com/main/background/key-concepts-and-data-format#data-layout
schema = pa.schema(
29 changes: 29 additions & 0 deletions apis/python/tests/test_sparse_nd_array.py
@@ -1100,3 +1100,32 @@ def test_timestamped_ops(tmp_path):
[0, 0],
]
assert a.nnz == 1


def test_empty_indexed_read(tmp_path):
"""
Verify that queries expected to return empty results actually
work. There are edge cases around SparseTensors, which are unable
to represent empty arrays.
"""
shape = (10, 100)
soma.SparseNDArray.create(
tmp_path.as_posix(), type=pa.uint16(), shape=shape
).close()

data = create_random_tensor("coo", shape, np.float64, 1.0)
with soma.SparseNDArray.open(tmp_path.as_posix(), "w") as a:
a.write(data)

with soma.SparseNDArray.open(tmp_path.as_posix()) as a:
coords = [slice(None), slice(None)]
assert sum(len(t) for t in a.read(coords).tables()) == 1000

coords = [[3], [4]]
assert sum(len(t) for t in a.read(coords).tables()) == 1

coords = [[3], []]
assert sum(len(t) for t in a.read(coords).tables()) == 0

coords = [[], [4]]
assert sum(len(t) for t in a.read(coords).tables()) == 0
3 changes: 3 additions & 0 deletions apis/python/version.py
@@ -85,6 +85,9 @@


def readGitVersion():
# NOTE: this will fail on a fork whose tags are out of sync with upstream.
# Use `git fetch --tags upstream` and then `git push --tags <your fork>`
# to synchronize them.
try:
proc = subprocess.Popen(
("git", "describe", "--long", "--tags", "--match", "[0-9]*.*"),
8 changes: 5 additions & 3 deletions apis/r/DESCRIPTION
@@ -6,7 +6,7 @@ Description: Interface for working with 'TileDB'-based Stack of Matrices,
like those commonly used for single cell data analysis. It is documented at
<https://github.com/single-cell-data>; a formal specification is available at
<https://github.com/single-cell-data/SOMA/blob/main/abstract_specification.md>.
Version: 1.4.3.1
Version: 1.4.3.2
Authors@R: c(
person(given = "Aaron", family = "Wolen",
role = c("cre", "aut"), email = "[email protected]",
@@ -44,10 +44,12 @@ Imports:
data.table,
spdl,
rlang,
tools
tools,
tibble
LinkingTo:
Rcpp,
RcppSpdlog
RcppSpdlog,
RcppInt64
Additional_repositories: https://ghrr.github.io/drat
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.2.3
1 change: 1 addition & 0 deletions apis/r/NAMESPACE
@@ -98,6 +98,7 @@ importFrom(spdl,debug)
importFrom(spdl,info)
importFrom(spdl,setup)
importFrom(stats,setNames)
importFrom(tibble,as_tibble)
importFrom(tools,R_user_dir)
importFrom(tools,file_path_sans_ext)
importFrom(urltools,url_compose)
1 change: 1 addition & 0 deletions apis/r/NEWS.md
@@ -4,6 +4,7 @@

* Add support for writing `SummarizedExperiment` and `SingleCellExperiment` objects to SOMAs
* Add support for bounding boxes for sparse arrays
* Add support for creating `SOMADataFrames` with `ordered()` columns


# tiledbsoma 1.4.0
Expand Down