[python] Support Enumerations On Nullable Attributes and Query Conditions
nguyenv committed Aug 24, 2023
1 parent 2fbc777 commit a84e76c
Showing 13 changed files with 100 additions and 74 deletions.
2 changes: 1 addition & 1 deletion apis/python/devtools/outgestor
@@ -62,7 +62,7 @@ def main():
"--var-id-name",
help="Which var column name to use as index for outgested andata",
type=str,
default="obs_id",
default="var_id",
)
parser.add_argument(
"paths",
2 changes: 0 additions & 2 deletions apis/python/src/tiledbsoma/_arrow_types.py
@@ -173,8 +173,6 @@ def df_to_arrow(df: pd.DataFrame) -> pa.Table:
null_fields = set()
# Not for name, col in df.items() since we need df[k] on the left-hand sides
for k in df:
if df[k].dtype == "category":
df[k] = df[k].astype(df[k].cat.categories.dtype)
if df[k].isnull().any():
if df[k].isnull().all():
df[k] = pa.nulls(df.shape[0], pa.infer_type(df[k]))
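With the categorical cast removed, df_to_arrow lets pandas categoricals reach Arrow intact, where they become dictionary types, which is the representation the new enumeration support maps onto. A minimal sketch of that conversion, using only pandas and pyarrow (the column name is made up):

    import pandas as pd
    import pyarrow as pa

    # A categorical column is no longer cast down to its value dtype, so
    # Arrow infers a dictionary type for it.
    df = pd.DataFrame({"color": pd.Series(["red", "blue", "red"], dtype="category")})
    tbl = pa.Table.from_pandas(df, preserve_index=False)
    print(tbl.schema.field("color").type)  # dictionary<values=string, indices=int8, ordered=0>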
11 changes: 7 additions & 4 deletions apis/python/src/tiledbsoma/_dataframe.py
@@ -6,9 +6,10 @@
"""
Implementation of a SOMA DataFrame
"""
from typing import Any, Optional, Sequence, Tuple, Type, Union, cast
from typing import Any, Dict, Optional, Sequence, Tuple, Type, Union, cast

import numpy as np
import pandas as pd
import pyarrow as pa
import somacore
import tiledb
@@ -133,7 +134,9 @@ def create(
platform_config: Optional[options.PlatformConfig] = None,
context: Optional[SOMATileDBContext] = None,
tiledb_timestamp: Optional[OpenTimestamp] = None,
enumerations: Optional[dict[str, Sequence[Any]]] = None,
enumerations: Optional[
Dict[str, Union[Sequence[Any], np.ndarray[Any, Any]]]
] = None,
ordered_enumerations: Optional[Sequence[str]] = None,
column_to_enumerations: Optional[dict[str, str]] = None,
) -> "DataFrame":
@@ -400,8 +403,8 @@ def write(
_util.check_type("values", values, (pa.Table,))

del platform_config # unused
dim_cols_map = {}
attr_cols_map = {}
dim_cols_map: Dict[str, pd.DataFrame] = {}
attr_cols_map: Dict[str, pd.DataFrame] = {}
dim_names_set = self.index_column_names
n = None

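For context, here is a minimal sketch of how the widened create signature might be used. The URI, column name, and enumeration values are made up, and the keyword semantics are inferred from the signature above rather than from documentation:

    import pyarrow as pa
    import tiledbsoma

    schema = pa.schema(
        [("soma_joinid", pa.int64()), ("color", pa.dictionary(pa.int8(), pa.string()))]
    )
    sdf = tiledbsoma.DataFrame.create(
        "/tmp/enum-example",  # hypothetical URI
        schema=schema,
        enumerations={"color": ["red", "green", "blue"]},  # values may also be a numpy array
        column_to_enumerations={"color": "color"},  # attribute name -> enumeration name
    )
    sdf.close()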
19 changes: 12 additions & 7 deletions apis/python/src/tiledbsoma/_query_condition.py
@@ -128,8 +128,8 @@ def __attrs_post_init__(self):
"(Is this an empty expression?)"
)

def init_query_condition(self, schema: tiledb.ArraySchema, query_attrs: List[str]):
qctree = QueryConditionTree(schema, query_attrs)
def init_query_condition(self, uri: str, query_attrs: List[str]):
qctree = QueryConditionTree(tiledb.open(uri), query_attrs)
self.c_obj = qctree.visit(self.tree.body)

if not isinstance(self.c_obj, clib.PyQueryCondition):
@@ -143,7 +143,7 @@ def init_query_condition(self, schema: tiledb.ArraySchema, query_attrs: List[str

@attrs.define
class QueryConditionTree(ast.NodeVisitor):
schema: tiledb.ArraySchema
array: tiledb.Array
query_attrs: List[str]

def visit_BitOr(self, node):
@@ -237,8 +237,8 @@ def aux_visit_Compare(

att = self.get_att_from_node(att)
val = self.get_val_from_node(val)

dt = self.schema.attr(att).dtype

enum_label = self.array.attr(att).enum_label
if enum_label is not None:
dt = self.array.enum(enum_label).dtype
else:
dt = self.array.attr(att).dtype

dtype = "string" if dt.kind in "SUa" else dt.name
val = self.cast_val_to_dtype(val, dtype)

@@ -318,8 +323,8 @@ def get_att_from_node(self, node: QueryConditionNodeElem) -> Any:
f"Incorrect type for attribute name: {ast.dump(node)}"
)

if not self.schema.has_attr(att):
if self.schema.domain.has_dim(att):
if not self.array.schema.has_attr(att):
if self.array.schema.domain.has_dim(att):
raise tiledb.TileDBError(
f"`{att}` is a dimension. QueryConditions currently only "
"work on attributes."
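The practical effect of resolving enum_label first is that value filters compare against the enumeration's value dtype instead of the integer codes stored in the attribute. A minimal sketch, assuming a DataFrame with an enumerated string column named cell_type (both the URI and the column are hypothetical):

    import tiledbsoma

    with tiledbsoma.open("/tmp/enum-example") as sdf:  # hypothetical URI
        # "B cell" is cast against the enumeration's string dtype, not the
        # attribute's underlying integer index dtype.
        table = sdf.read(value_filter="cell_type == 'B cell'").concat()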
4 changes: 2 additions & 2 deletions apis/python/src/tiledbsoma/_tiledb_array.py
@@ -101,8 +101,8 @@ def _soma_reader(
# Leave empty arguments out of kwargs to allow C++ constructor defaults to apply, as
# they're not all wrapped in std::optional<>.
kwargs: Dict[str, object] = {}
if schema:
kwargs["schema"] = schema
# if schema:
# kwargs["schema"] = schema
if column_names:
kwargs["column_names"] = column_names
if query_condition:
56 changes: 24 additions & 32 deletions apis/python/src/tiledbsoma/io/conversions.py
@@ -44,49 +44,41 @@ def _to_tiledb_supported_dtype(dtype: _DT) -> _DT:

def to_tiledb_supported_array_type(name: str, x: _MT) -> _MT:
"""Converts datatypes unrepresentable by TileDB into datatypes it can represent.
E.g., categorical strings -> string.
See also `https://docs.scipy.org/doc/numpy-1.10.1/reference/arrays.dtypes.html <https://docs.scipy.org/doc/numpy-1.10.1/reference/arrays.dtypes.html>`_.
Preferentially converts to the underlying primitive type, as TileDB does not support
most complex types. NOTE: this does not support ``datetime64`` conversion.
Categoricals are a special case. If the underlying categorical type is a primitive,
convert to that. If the array contains NA/NaN (i.e. not in the category, code == -1),
raise error unless it is a float or string.
E.g., float16 -> float32
"""
if isinstance(x, (np.ndarray, sp.spmatrix)) or not is_categorical_dtype(x):
# mypy issues a spurious error here, but only when
# _to_tiledb_supported_dtype is decorated with @typeguard_ignore???
target_dtype = _to_tiledb_supported_dtype(x.dtype) # type: ignore[arg-type]
return x if target_dtype == x.dtype else x.astype(target_dtype)

categories = x.cat.categories
cat_dtype = categories.dtype
if cat_dtype.kind in ("f", "u", "i"):
if x.hasnans and cat_dtype.kind == "i":
raise ValueError(
f"Categorical column {name!r} contains NaN -- unable to convert to TileDB array."
)
# More mysterious spurious mypy errors.
target_dtype = _to_tiledb_supported_dtype(cat_dtype) # type: ignore[arg-type]
else:
# Into the weirdness. See if Pandas can help with edge cases.
inferred = infer_dtype(categories)
if x.hasnans and inferred in ("boolean", "bytes"):
raise ValueError(
"Categorical array contains NaN -- unable to convert to TileDB array."
)
target_dtype = np.dtype( # type: ignore[assignment]
_str_to_type.get(inferred, object)
)

return x.astype(target_dtype)
# categories = x.cat.categories
# cat_dtype = categories.dtype
# if cat_dtype.kind in ("f", "u", "i"):
# if x.hasnans and cat_dtype.kind == "i":
# raise ValueError(
# f"Categorical column {name!r} contains NaN -- unable to convert to TileDB array."
# )
# # More mysterious spurious mypy errors.
# target_dtype = _to_tiledb_supported_dtype(cat_dtype) # type: ignore[arg-type]
# else:
# # Into the weirdness. See if Pandas can help with edge cases.
# inferred = infer_dtype(categories)
# if x.hasnans and inferred in ("boolean", "bytes"):
# raise ValueError(
# "Categorical array contains NaN -- unable to convert to TileDB array."
# )
# target_dtype = np.dtype( # type: ignore[assignment]
# _str_to_type.get(inferred, object)
# )

# return x.astype(target_dtype)
return x


def csr_from_tiledb_df(df: pd.DataFrame, num_rows: int, num_cols: int) -> sp.csr_matrix:
"""Given a tiledb dataframe, return a ``scipy.sparse.csr_matrx``."""
return sp.csr_matrix(
(df["soma_data"], (df["soma_dim_0"], df["soma_dim_1"])),
shape=(num_rows, num_cols),
)
)
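With the categorical branch disabled, to_tiledb_supported_array_type passes categoricals through untouched (enumeration handling now happens at the Arrow layer), while other dtypes are still widened as before. A quick sketch of both paths, assuming the module keeps this import path:

    import numpy as np
    import pandas as pd
    from tiledbsoma.io.conversions import to_tiledb_supported_array_type

    s = pd.Series(["a", "b", "a"], dtype="category")
    assert to_tiledb_supported_array_type("col", s) is s  # passed through unchanged

    f16 = np.arange(4, dtype=np.float16)
    assert to_tiledb_supported_array_type("col", f16).dtype == np.float32  # still widened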
8 changes: 5 additions & 3 deletions apis/python/src/tiledbsoma/io/ingest.py
@@ -13,6 +13,7 @@
import time
from typing import (
Any,
Dict,
List,
Mapping,
Optional,
@@ -739,7 +740,7 @@ def _write_dataframe_impl(
try:
soma_df = _factory.open(df_uri, "w", soma_type=DataFrame, context=context)
except DoesNotExistError:
enums = {}
enums: Dict[str, Union[Sequence[Any], np.ndarray[Any, Any]]] = {}
col_to_enums = {}
for att in arrow_table.schema:
if pa.types.is_dictionary(att.type):
@@ -749,7 +750,7 @@
else:
enums[att.name] = cat
col_to_enums[att.name] = att.name

soma_df = DataFrame.create(
df_uri,
schema=arrow_table.schema,
@@ -1024,6 +1025,7 @@ def _update_dataframe(
for key in common_keys:
old_type = old_sig[key]
new_type = new_sig[key]

if old_type != new_type:
msgs.append(f"{key} type {old_type} != {new_type}")
if msgs:
@@ -1948,4 +1950,4 @@ def to_anndata(

logging.log_io(None, _util.format_elapsed(s, "FINISH Experiment.to_anndata"))

return anndata
return anndata
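A minimal sketch of the detection step above in isolation: dictionary-typed Arrow columns become named enumerations keyed by the column name, mirroring the enums and col_to_enums dicts in the diff (the table here is made up):

    import pyarrow as pa

    tbl = pa.table({"color": pa.array(["red", "blue", "red"]).dictionary_encode()})
    enums, col_to_enums = {}, {}
    for field in tbl.schema:
        if pa.types.is_dictionary(field.type):
            # Pull the category values out of the dictionary-encoded column.
            dict_array = tbl.column(field.name).combine_chunks()
            enums[field.name] = dict_array.dictionary.to_pylist()
            col_to_enums[field.name] = field.name
    # enums == {"color": ["red", "blue"]}; col_to_enums == {"color": "color"}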
12 changes: 8 additions & 4 deletions apis/python/src/tiledbsoma/io/registration/signatures.py
@@ -36,9 +36,13 @@ def _string_dict_from_arrow_schema(schema: pa.Schema) -> Dict[str, str]:
Converts an Arrow schema to a string/string dict, which is easier on the eyes,
easier to convert from/to JSON for distributed logging, and easier to do del-key on.
"""

retval = {name: _stringify_type(schema.field(name).type) for name in schema.names}

retval = {}
for name in schema.names:
arrow_type = schema.field(name).type
if pa.types.is_dictionary(arrow_type):
arrow_type = arrow_type.index_type
retval[name] = _stringify_type(arrow_type)

# The soma_joinid field is specific to SOMA data but does not exist in AnnData/H5AD. When we
# pre-check an AnnData/H5AD input to see if it's appendable to an existing SOMA experiment, we
# must not punish the AnnData/H5AD input for it not having a soma_joinid column in its obs and
@@ -241,7 +245,7 @@ def from_soma_experiment(
varm_dtypes[varm_layer_name] = str(
varm.schema.field("soma_data").type
)

return cls(
obs_schema=obs_schema,
var_schema=var_schema,
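The unwrapping above means a dictionary-encoded column contributes its index (code) type to the signature dict, so categorical columns compare stably when checking appendability. The same loop in isolation:

    import pyarrow as pa

    schema = pa.schema(
        [("color", pa.dictionary(pa.int32(), pa.string())), ("n", pa.int64())]
    )
    sig = {}
    for name in schema.names:
        arrow_type = schema.field(name).type
        if pa.types.is_dictionary(arrow_type):
            arrow_type = arrow_type.index_type  # e.g. int32, not the value type
        sig[name] = str(arrow_type)
    # sig == {"color": "int32", "n": "int64"}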
8 changes: 2 additions & 6 deletions apis/python/src/tiledbsoma/pytiledbsoma.cc
@@ -200,7 +200,6 @@ PYBIND11_MODULE(pytiledbsoma, m) {
std::string_view name,
std::optional<std::vector<std::string>> column_names_in,
py::object py_query_condition,
py::object py_schema,
std::string_view batch_size,
ResultOrder result_order,
std::map<std::string, std::string> platform_config,
@@ -222,7 +221,7 @@
// Column names will be updated with columns present
// in the query condition
auto new_column_names =
init_pyqc(py_schema, column_names)
init_pyqc(uri, column_names)
.cast<std::vector<std::string>>();

// Update the column_names list if it was not empty,
@@ -267,7 +266,6 @@
"name"_a = "unnamed",
"column_names"_a = py::none(),
"query_condition"_a = py::none(),
"schema"_a = py::none(),
"batch_size"_a = "auto",
"result_order"_a = ResultOrder::automatic,
"platform_config"_a = py::dict(),
@@ -278,7 +276,6 @@
[](SOMAArray& reader,
std::optional<std::vector<std::string>> column_names_in,
py::object py_query_condition,
py::object py_schema,
std::string_view batch_size,
ResultOrder result_order) {
// Handle optional args
@@ -298,7 +295,7 @@
// Column names will be updated with columns present in
// the query condition
auto new_column_names =
init_pyqc(py_schema, column_names)
init_pyqc(reader.uri(), column_names)
.cast<std::vector<std::string>>();

// Update the column_names list if it was not empty,
@@ -331,7 +328,6 @@
py::kw_only(),
"column_names"_a = py::none(),
"query_condition"_a = py::none(),
"schema"_a = py::none(),
"batch_size"_a = "auto",
"result_order"_a = ResultOrder::automatic)

Binary file added apis/python/testdata/categorical_int_nan.h5ad
Binary file not shown.
32 changes: 32 additions & 0 deletions apis/python/tests/test_basic_anndata_io.py
@@ -1,3 +1,4 @@
import math
import pathlib
import tempfile
from pathlib import Path
@@ -38,6 +39,24 @@ def h5ad_file_uns_string_array(request):
return input_path


@pytest.fixture
def h5ad_file_categorical_int_nan(request):
# This has obs["categ_int_nan"] as a categorical int but with math.nan as a
# "not-in-the-category" indicator. Such H5AD files do arise in the wild.
#
# Reference:
# import anndata as ad
# import pandas as pd
# import math
# adata = ad.read_h5ad("whatever.h5ad")
# s = pd.Series(list(range(80)), dtype="category")
# s[0] = math.nan
# adata.obs["categ_int_nan"] = s
# adata.write_h5ad("categorical_int_nan.h5ad")
input_path = HERE.parent / "testdata/categorical_int_nan.h5ad"
return input_path


@pytest.fixture
def adata(h5ad_file):
return anndata.read_h5ad(h5ad_file)
@@ -476,3 +495,16 @@ def test_null_obs(adata, tmp_path: Path):
# of the Pandas data frame
for k in adata.obs:
assert obs.attr(k).isnullable == adata.obs[k].isnull().any()


# There exist in the wild AnnData files with categorical-int columns where "not in the category"
# is indicated by a floating-point math.nan in the cell. Here we test that we can ingest such
# files.
def test_obs_with_categorical_int_nan_enumeration(
tmp_path, h5ad_file_categorical_int_nan
):
output_path = tmp_path.as_uri()

tiledbsoma.io.from_h5ad(
output_path, h5ad_file_categorical_int_nan, measurement_name="RNA"
)
2 changes: 1 addition & 1 deletion apis/python/tests/test_type_system.py
@@ -31,7 +31,7 @@
(pa.binary(), pa.large_binary()),
(pa.large_string(),) * 2,
(pa.large_binary(),) * 2,
(pa.dictionary(pa.int32(), pa.string()), pa.int32())
(pa.dictionary(pa.int32(), pa.string()), pa.int32()),
]

