Skip to content

Commit

Permalink
Merge branch 'main' into de/sc-33875/use_rc_in_ci
Browse files Browse the repository at this point in the history
  • Loading branch information
johnkerl committed Sep 15, 2023
2 parents 0e802b9 + 84e53e5 commit af37773
Show file tree
Hide file tree
Showing 17 changed files with 335 additions and 109 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/r-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,11 @@ jobs:
- name: Install BioConductor package SingleCellExperiment
run: cd apis/r && tools/r-ci.sh install_bioc SingleCellExperiment

- name: Install rc version of tiledb-r (macOS)
- name: Install r-universe build of tiledb-r (macOS)
if: ${{ matrix.os == 'macOS-latest' }}
run: cd apis/r && Rscript -e "install.packages('tiledb', repos = c('https://eddelbuettel.r-universe.dev', 'https://cloud.r-project.org'))"

- name: Install rc version of tiledb-r (linux)
- name: Install r-universe build of tiledb-r (linux)
if: ${{ matrix.os != 'macOS-latest' }}
run: cd apis/r && Rscript -e "options(bspm.version.check=TRUE); install.packages('tiledb', repos = c('https://eddelbuettel.r-universe.dev/bin/linux/jammy/4.3/', 'https://cloud.r-project.org'))"

Expand Down
27 changes: 21 additions & 6 deletions .github/workflows/r-python-interop-testing.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,12 @@ name: TileDB-SOMA R-Python interop testing

on:
pull_request:
paths:
- "apis/python/**"
- "apis/r/**"
- "apis/system/**"
# TODO: leave this enabled for pre-merge signal for now. At some point we may want to go back to
# only having this signal post-merge.
#paths:
# - "apis/python/**"
# - "apis/r/**"
# - "apis/system/**"
push:
branches:
- main
Expand Down Expand Up @@ -38,6 +40,14 @@ jobs:
- name: MkVars
run: mkdir ~/.R && echo "CXX17FLAGS=-Wno-deprecated-declarations -Wno-deprecated" > ~/.R/Makevars

- name: Install r-universe build of tiledb-r (macOS)
if: ${{ matrix.os == 'macOS-latest' }}
run: cd apis/r && Rscript -e "install.packages('tiledb', repos = c('https://eddelbuettel.r-universe.dev', 'https://cloud.r-project.org'))"

- name: Install r-universe build of tiledb-r (linux)
if: ${{ matrix.os != 'macOS-latest' }}
run: cd apis/r && Rscript -e "options(bspm.version.check=TRUE); install.packages('tiledb', repos = c('https://eddelbuettel.r-universe.dev/bin/linux/jammy/4.3/', 'https://cloud.r-project.org'))"

- name: Build and install libtiledbsoma
run: sudo scripts/bld --prefix=/usr/local && sudo ldconfig

Expand All @@ -48,6 +58,9 @@ jobs:
FILE=$(ls -1t *.tar.gz | head -n 1)
R CMD INSTALL $FILE
- name: Show R package versions
run: Rscript -e 'tiledbsoma::show_package_versions()'

- name: Install testing prereqs
run: python -m pip -v install -U pip pytest-cov 'typeguard<3.0' types-setuptools

Expand All @@ -61,8 +74,10 @@ jobs:
- name: Install tiledbsoma
run: python -m pip -v install -e apis/python

- name: Show package versions
run: python scripts/show-versions.py
- name: Show Python package versions
run: |
python -c 'import tiledbsoma; tiledbsoma.show_package_versions()'
python scripts/show-versions.py
- name: Interop Tests
run: python -m pytest apis/system/tests/
50 changes: 46 additions & 4 deletions apis/python/src/tiledbsoma/_arrow_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,9 @@ def tiledb_type_from_arrow_type(
Raises:
TypeError: if the type is unsupported.
"""
if pa.types.is_dictionary(t):
t = t.index_type

arrow_to_tdb = _ARROW_TO_TDB_DIM if is_indexed_column else _ARROW_TO_TDB_ATTR
if t in arrow_to_tdb:
arrow_type = arrow_to_tdb[t]
Expand Down Expand Up @@ -142,7 +145,9 @@ def arrow_type_from_tiledb_dtype(
return pa.from_numpy_dtype(tiledb_dtype)


def tiledb_schema_to_arrow(tdb_schema: tiledb.ArraySchema) -> pa.Schema:
def tiledb_schema_to_arrow(
tdb_schema: tiledb.ArraySchema, uri: str, ctx: tiledb.ctx.Ctx
) -> pa.Schema:
arrow_schema_dict = {}
dom = tdb_schema.domain
for i in range(dom.ndim):
Expand All @@ -152,12 +157,32 @@ def tiledb_schema_to_arrow(tdb_schema: tiledb.ArraySchema) -> pa.Schema:
name = "unnamed"
arrow_schema_dict[name] = arrow_type_from_tiledb_dtype(dim.dtype)

# If there are any enumerated-type columns, we'll need to open the array once to get
# some information from there. If not, we'll need to open the array zero times.
# Open the array only if we'll need it for enum infos.
A = None
for i in range(tdb_schema.nattr):
attr = tdb_schema.attr(i)
name = attr.name
if name == "":
name = "unnamed"
arrow_schema_dict[name] = arrow_type_from_tiledb_dtype(attr.dtype, attr.isascii)
if attr.enum_label is not None: # enumerated
if A is None:
A = tiledb.open(uri, ctx=ctx)
info = A.enum(name)
arrow_schema_dict[name] = pa.dictionary(
index_type=arrow_type_from_tiledb_dtype(attr.dtype),
value_type=arrow_type_from_tiledb_dtype(
tiledb.datatypes.DataType.from_tiledb(info.type).np_dtype
),
ordered=info.ordered,
)
else: # non-enumerated
arrow_schema_dict[name] = arrow_type_from_tiledb_dtype(
attr.dtype, attr.isascii
)
if A is not None:
A.close()

return pa.schema(arrow_schema_dict)

Expand All @@ -170,8 +195,6 @@ def df_to_arrow(df: pd.DataFrame) -> pa.Table:
null_fields = set()
# Not for name, col in df.items() since we need df[k] on the left-hand sides
for k in df:
if df[k].dtype == "category":
df[k] = df[k].astype(df[k].cat.categories.dtype)
if df[k].isnull().any():
if df[k].isnull().all():
df[k] = pa.nulls(df.shape[0], pa.infer_type(df[k]))
Expand All @@ -182,6 +205,25 @@ def df_to_arrow(df: pd.DataFrame) -> pa.Table:
inplace=True,
)
null_fields.add(k)

# For categoricals, it's possible to get
# TypeError: Object of type bool_ is not JSON serializable
# deep within library functions. Debugging reveals that this happens when
# the df[key].values.ordered is of type np.bool_ rather than Python bool.
# So, we cast and reconstruct.
for key in df:
column = df[key]
if isinstance(column.dtype, pd.CategoricalDtype):
if hasattr(column.values, "categories"):
categories = column.values.categories

if hasattr(column.values, "ordered"):
ordered = bool(column.values.ordered)

df[key] = pd.Categorical(
values=column, categories=categories, ordered=ordered
)

arrow_table = pa.Table.from_pandas(df)
if null_fields:
md = arrow_table.schema.metadata
Expand Down
59 changes: 51 additions & 8 deletions apis/python/src/tiledbsoma/_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,14 @@
"""
Implementation of a SOMA DataFrame
"""
from typing import Any, Optional, Sequence, Tuple, Type, Union, cast
from typing import Any, Dict, Optional, Sequence, Tuple, Type, Union, cast

import numpy as np
import pandas as pd
import pyarrow as pa
import somacore
import tiledb
from numpy.typing import NDArray
from somacore import options
from typing_extensions import Self

Expand Down Expand Up @@ -133,6 +135,9 @@ def create(
platform_config: Optional[options.PlatformConfig] = None,
context: Optional[SOMATileDBContext] = None,
tiledb_timestamp: Optional[OpenTimestamp] = None,
enumerations: Optional[Dict[str, Union[Sequence[Any], NDArray[Any]]]] = None,
ordered_enumerations: Optional[Sequence[str]] = None,
column_to_enumerations: Optional[Dict[str, str]] = None,
) -> "DataFrame":
"""Creates the data structure on disk/S3/cloud.
Expand Down Expand Up @@ -168,6 +173,9 @@ def create(
If specified, overrides the default timestamp
used to open this object. If unset, uses the timestamp provided by
the context.
enumerations:
    If specified, a mapping from enumeration name to the sequence of
    values allowed for that enumeration.
Returns:
The DataFrame.
Expand Down Expand Up @@ -208,6 +216,9 @@ def create(
schema,
index_column_names,
domain,
enumerations or {},
ordered_enumerations or [],
column_to_enumerations or {},
TileDBCreateOptions.from_platform_config(platform_config),
context,
)
Expand Down Expand Up @@ -261,6 +272,17 @@ def count(self) -> int:
self._check_open_read()
return cast(int, self._soma_reader().nnz())

def enumeration(self, name: str) -> Tuple[Any, ...]:
    """Return the values of the named enumeration.

    Fetches the enumeration from the underlying SOMA reader and returns
    its values as a tuple.

    Args:
        name:
            Name of the enumeration to look up.

    Returns:
        Tuple[Any, ...]: the enumeration's values.
    """
    return tuple(self._soma_reader().get_enum(name))

def column_to_enumeration(self, name: str) -> str:
    """Return the name of the enumeration associated with the named column.

    Looks up the enumeration label on the attribute via the underlying
    SOMA reader and returns it as a string.
    """
    return str(self._soma_reader().get_enum_label_on_attr(name))

def __len__(self) -> int:
    """Return the number of rows in the dataframe (equivalent to ``df.count``)."""
    row_count: int = self.count
    return row_count
Expand Down Expand Up @@ -380,17 +402,19 @@ def write(
_util.check_type("values", values, (pa.Table,))

del platform_config # unused
dim_cols_map = {}
attr_cols_map = {}
dim_cols_map: Dict[str, pd.DataFrame] = {}
attr_cols_map: Dict[str, pd.DataFrame] = {}
dim_names_set = self.index_column_names
n = None

for name in values.schema.names:
n = len(values.column(name))
if name in dim_names_set:
dim_cols_map[name] = values.column(name).to_pandas()
col = values.column(name)
n = len(col)
cols_map = dim_cols_map if name in dim_names_set else attr_cols_map
if pa.types.is_dictionary(col.type) and col.num_chunks != 0:
cols_map[name] = col.chunk(0).indices.to_pandas()
else:
attr_cols_map[name] = values.column(name).to_pandas()
cols_map[name] = col.to_pandas()
if n is None:
raise ValueError(f"did not find any column names in {values.schema.names}")

Expand Down Expand Up @@ -634,7 +658,8 @@ def _canonicalize_schema(
raise ValueError(
f"All index names must be defined in the dataframe schema: '{index_column_name}' not in {schema_names_string}"
)
if schema.field(index_column_name).type not in [
dtype = schema.field(index_column_name).type
if not pa.types.is_dictionary(dtype) and dtype not in [
pa.int8(),
pa.uint8(),
pa.int16(),
Expand Down Expand Up @@ -665,6 +690,9 @@ def _build_tiledb_schema(
schema: pa.Schema,
index_column_names: Sequence[str],
domain: Optional[Sequence[Optional[Tuple[Any, Any]]]],
enumerations: Dict[str, Any],
ordered_enumerations: Sequence[str],
column_to_enumerations: Dict[str, str],
tiledb_create_options: TileDBCreateOptions,
context: SOMATileDBContext,
) -> tiledb.ArraySchema:
Expand Down Expand Up @@ -714,6 +742,17 @@ def _build_tiledb_schema(

dom = tiledb.Domain(dims, ctx=context.tiledb_ctx)

enums = []
if enumerations is not None:
for enum_name in enumerations:
enums.append(
tiledb.Enumeration(
enum_name,
enum_name in ordered_enumerations,
np.array(enumerations[enum_name]),
)
)

attrs = []
metadata = schema.metadata or {}
for attr_name in schema.names:
Expand All @@ -728,6 +767,9 @@ def _build_tiledb_schema(
filters=tiledb_create_options.attr_filters_tiledb(
attr_name, ["ZstdFilter"]
),
enum_label=column_to_enumerations[attr_name]
if attr_name in column_to_enumerations
else None,
ctx=context.tiledb_ctx,
)
attrs.append(attr)
Expand All @@ -737,6 +779,7 @@ def _build_tiledb_schema(
return tiledb.ArraySchema(
domain=dom,
attrs=attrs,
enums=enums,
sparse=True,
allows_duplicates=tiledb_create_options.allows_duplicates,
offsets_filters=tiledb_create_options.offsets_filters_tiledb(),
Expand Down
17 changes: 10 additions & 7 deletions apis/python/src/tiledbsoma/_query_condition.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,8 +128,8 @@ def __attrs_post_init__(self):
"(Is this an empty expression?)"
)

def init_query_condition(self, schema: tiledb.ArraySchema, query_attrs: List[str]):
qctree = QueryConditionTree(schema, query_attrs)
def init_query_condition(self, uri: str, query_attrs: List[str]):
qctree = QueryConditionTree(tiledb.open(uri), query_attrs)
self.c_obj = qctree.visit(self.tree.body)

if not isinstance(self.c_obj, clib.PyQueryCondition):
Expand All @@ -143,7 +143,7 @@ def init_query_condition(self, schema: tiledb.ArraySchema, query_attrs: List[str

@attrs.define
class QueryConditionTree(ast.NodeVisitor):
schema: tiledb.ArraySchema
array: tiledb.Array
query_attrs: List[str]

def visit_BitOr(self, node):
Expand Down Expand Up @@ -237,8 +237,11 @@ def aux_visit_Compare(

att = self.get_att_from_node(att)
val = self.get_val_from_node(val)

dt = self.schema.attr(att).dtype
enum_label = self.array.attr(att).enum_label
if enum_label is not None:
dt = self.array.enum(enum_label).dtype
else:
dt = self.array.attr(att).dtype
dtype = "string" if dt.kind in "SUa" else dt.name
val = self.cast_val_to_dtype(val, dtype)

Expand Down Expand Up @@ -318,8 +321,8 @@ def get_att_from_node(self, node: QueryConditionNodeElem) -> Any:
f"Incorrect type for attribute name: {ast.dump(node)}"
)

if not self.schema.has_attr(att):
if self.schema.domain.has_dim(att):
if not self.array.schema.has_attr(att):
if self.array.schema.domain.has_dim(att):
raise tiledb.TileDBError(
f"`{att}` is a dimension. QueryConditions currently only "
"work on attributes."
Expand Down
6 changes: 3 additions & 3 deletions apis/python/src/tiledbsoma/_tiledb_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def schema(self) -> pa.Schema:
Lifecycle:
Experimental.
"""
return tiledb_schema_to_arrow(self._tiledb_array_schema())
return tiledb_schema_to_arrow(self._tiledb_array_schema(), self.uri, self._ctx)

def _tiledb_array_schema(self) -> tiledb.ArraySchema:
"""Returns the TileDB array schema, for internal use."""
Expand Down Expand Up @@ -101,8 +101,8 @@ def _soma_reader(
# Leave empty arguments out of kwargs to allow C++ constructor defaults to apply, as
# they're not all wrapped in std::optional<>.
kwargs: Dict[str, object] = {}
if schema:
kwargs["schema"] = schema
# if schema:
# kwargs["schema"] = schema
if column_names:
kwargs["column_names"] = column_names
if query_condition:
Expand Down
9 changes: 6 additions & 3 deletions apis/python/src/tiledbsoma/io/_registration/signatures.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,12 @@ def _string_dict_from_arrow_schema(schema: pa.Schema) -> Dict[str, str]:
Converts an Arrow schema to a string/string dict, which is easier on the eyes,
easier to convert from/to JSON for distributed logging, and easier to do del-key on.
"""

retval = {name: _stringify_type(schema.field(name).type) for name in schema.names}

retval = {}
for name in schema.names:
arrow_type = schema.field(name).type
if pa.types.is_dictionary(arrow_type):
arrow_type = arrow_type.index_type
retval[name] = _stringify_type(arrow_type)
# The soma_joinid field is specific to SOMA data but does not exist in AnnData/H5AD. When we
# pre-check an AnnData/H5AD input to see if it's appendable to an existing SOMA experiment, we
# must not punish the AnnData/H5AD input for it not having a soma_joinid column in its obs and
Expand Down
Loading

0 comments on commit af37773

Please sign in to comment.