Merge branch 'viviannguyen/sc-30316/enumerated-data-types-aka-categoricals-aka' into kerl/categ-int-nan-python-test
nguyenv authored Aug 24, 2023
2 parents 22368cf + a84e76c commit 2a8061d
Showing 9 changed files with 55 additions and 64 deletions.
2 changes: 0 additions & 2 deletions apis/python/src/tiledbsoma/_arrow_types.py
@@ -173,8 +173,6 @@ def df_to_arrow(df: pd.DataFrame) -> pa.Table:
     null_fields = set()
     # Not for name, col in df.items() since we need df[k] on the left-hand sides
     for k in df:
-        if df[k].dtype == "category":
-            df[k] = df[k].astype(df[k].cat.categories.dtype)
         if df[k].isnull().any():
             if df[k].isnull().all():
                 df[k] = pa.nulls(df.shape[0], pa.infer_type(df[k]))
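
With the two deleted lines gone, `df_to_arrow` no longer decategorizes columns before handing them to Arrow, so pandas categoricals now arrive as dictionary-encoded arrays. A minimal standalone sketch of the difference (plain pandas/pyarrow, not the TileDB-SOMA code path):

    import pandas as pd
    import pyarrow as pa

    df = pd.DataFrame({"color": pd.Categorical(["red", "blue", "red"])})

    # Old behavior (the deleted lines): cast the column back to its categories'
    # dtype, so Arrow sees a plain string column.
    plain = df["color"].astype(df["color"].cat.categories.dtype)
    assert pa.Table.from_pandas(plain.to_frame()).schema.field("color").type == pa.string()

    # New behavior: the categorical passes through and Arrow dictionary-encodes it.
    assert pa.types.is_dictionary(pa.Table.from_pandas(df).schema.field("color").type)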
19 changes: 12 additions & 7 deletions apis/python/src/tiledbsoma/_query_condition.py
@@ -128,8 +128,8 @@ def __attrs_post_init__(self):
                 "(Is this an empty expression?)"
             )

-    def init_query_condition(self, schema: tiledb.ArraySchema, query_attrs: List[str]):
-        qctree = QueryConditionTree(schema, query_attrs)
+    def init_query_condition(self, uri: str, query_attrs: List[str]):
+        qctree = QueryConditionTree(tiledb.open(uri), query_attrs)
         self.c_obj = qctree.visit(self.tree.body)

         if not isinstance(self.c_obj, clib.PyQueryCondition):
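
A usage sketch of the new entry point, assuming the class is imported from its module path (hypothetical URI): the caller now passes the array's location, and the condition tree opens the array itself so it can inspect attribute enumerations.

    from tiledbsoma._query_condition import QueryCondition

    qc = QueryCondition("percent_mito > 0.02")
    # Opens the array at the URI internally instead of taking a pre-fetched schema.
    qc.init_query_condition("file:///tmp/soma/obs", ["percent_mito"])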
@@ -143,7 +143,7 @@ def init_query_condition(self, schema: tiledb.ArraySchema, query_attrs: List[str]):

 @attrs.define
 class QueryConditionTree(ast.NodeVisitor):
-    schema: tiledb.ArraySchema
+    array: tiledb.Array
     query_attrs: List[str]

     def visit_BitOr(self, node):
@@ -237,8 +237,13 @@ def aux_visit_Compare(

         att = self.get_att_from_node(att)
         val = self.get_val_from_node(val)

-        dt = self.schema.attr(att).dtype
-
+        enum_label = self.array.attr(att).enum_label
+        if enum_label is not None:
+            dt = self.array.enum(enum_label).dtype
+        else:
+            dt = self.array.attr(att).dtype
+
         dtype = "string" if dt.kind in "SUa" else dt.name
         val = self.cast_val_to_dtype(val, dtype)
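
This branch is the core of the enumeration support: for an enumerated (categorical) attribute, a comparison such as `tissue == 'lung'` must be typed against the enumeration's value dtype (e.g. string labels), not the attribute's underlying integer code dtype. A sketch, assuming a tiledb-py build with enumeration support and a hypothetical array whose `tissue` attribute is enumerated:

    import tiledb

    with tiledb.open("file:///tmp/soma/obs") as A:  # hypothetical URI
        attr = A.attr("tissue")  # hypothetical enumerated attribute
        if attr.enum_label is not None:
            dt = A.enum(attr.enum_label).dtype  # label dtype, e.g. str
        else:
            dt = attr.dtype  # plain attribute dtype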

@@ -318,8 +323,8 @@ def get_att_from_node(self, node: QueryConditionNodeElem) -> Any:
                 f"Incorrect type for attribute name: {ast.dump(node)}"
             )

-        if not self.schema.has_attr(att):
-            if self.schema.domain.has_dim(att):
+        if not self.array.schema.has_attr(att):
+            if self.array.schema.domain.has_dim(att):
                 raise tiledb.TileDBError(
                     f"`{att}` is a dimension. QueryConditions currently only "
                     "work on attributes."
4 changes: 2 additions & 2 deletions apis/python/src/tiledbsoma/_tiledb_array.py
@@ -101,8 +101,8 @@ def _soma_reader(
         # Leave empty arguments out of kwargs to allow C++ constructor defaults to apply, as
         # they're not all wrapped in std::optional<>.
         kwargs: Dict[str, object] = {}
-        if schema:
-            kwargs["schema"] = schema
+        # if schema:
+        #     kwargs["schema"] = schema
         if column_names:
             kwargs["column_names"] = column_names
         if query_condition:
54 changes: 23 additions & 31 deletions apis/python/src/tiledbsoma/io/conversions.py
@@ -44,44 +44,36 @@ def _to_tiledb_supported_dtype(dtype: _DT) -> _DT:

 def to_tiledb_supported_array_type(name: str, x: _MT) -> _MT:
     """Converts datatypes unrepresentable by TileDB into datatypes it can represent.
-    E.g., categorical strings -> string.
-    See also `https://docs.scipy.org/doc/numpy-1.10.1/reference/arrays.dtypes.html <https://docs.scipy.org/doc/numpy-1.10.1/reference/arrays.dtypes.html>`_.
-    Preferentially converts to the underlying primitive type, as TileDB does not support
-    most complex types. NOTE: this does not support ``datetime64`` conversion.
-    Categoricals are a special case. If the underlying categorical type is a primitive,
-    convert to that. If the array contains NA/NaN (i.e. not in the category, code == -1),
-    raise error unless it is a float or string.
+    E.g., float16 -> float32
     """
     if isinstance(x, (np.ndarray, sp.spmatrix)) or not is_categorical_dtype(x):
         # mypy issues a spurious error here, but only when
         # _to_tiledb_supported_dtype is decorated with @typeguard_ignore???
         target_dtype = _to_tiledb_supported_dtype(x.dtype)  # type: ignore[arg-type]
         return x if target_dtype == x.dtype else x.astype(target_dtype)

-    categories = x.cat.categories
-    cat_dtype = categories.dtype
-    if cat_dtype.kind in ("f", "u", "i"):
-        if x.hasnans and cat_dtype.kind == "i":
-            raise ValueError(
-                f"Categorical column {name!r} contains NaN -- unable to convert to TileDB array."
-            )
-        # More mysterious spurious mypy errors.
-        target_dtype = _to_tiledb_supported_dtype(cat_dtype)  # type: ignore[arg-type]
-    else:
-        # Into the weirdness. See if Pandas can help with edge cases.
-        inferred = infer_dtype(categories)
-        if x.hasnans and inferred in ("boolean", "bytes"):
-            raise ValueError(
-                "Categorical array contains NaN -- unable to convert to TileDB array."
-            )
-        target_dtype = np.dtype(  # type: ignore[assignment]
-            _str_to_type.get(inferred, object)
-        )
-
-    return x.astype(target_dtype)
+    # categories = x.cat.categories
+    # cat_dtype = categories.dtype
+    # if cat_dtype.kind in ("f", "u", "i"):
+    #     if x.hasnans and cat_dtype.kind == "i":
+    #         raise ValueError(
+    #             f"Categorical column {name!r} contains NaN -- unable to convert to TileDB array."
+    #         )
+    #     # More mysterious spurious mypy errors.
+    #     target_dtype = _to_tiledb_supported_dtype(cat_dtype)  # type: ignore[arg-type]
+    # else:
+    #     # Into the weirdness. See if Pandas can help with edge cases.
+    #     inferred = infer_dtype(categories)
+    #     if x.hasnans and inferred in ("boolean", "bytes"):
+    #         raise ValueError(
+    #             "Categorical array contains NaN -- unable to convert to TileDB array."
+    #         )
+    #     target_dtype = np.dtype(  # type: ignore[assignment]
+    #         _str_to_type.get(inferred, object)
+    #     )
+
+    # return x.astype(target_dtype)
+    return x


def csr_from_tiledb_df(df: pd.DataFrame, num_rows: int, num_cols: int) -> sp.csr_matrix:
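
With the categorical special-casing commented out, `to_tiledb_supported_array_type` now returns categorical columns unchanged and only remaps primitive dtypes. A rough sketch of the resulting behavior (hypothetical column names):

    import numpy as np
    import pandas as pd

    from tiledbsoma.io.conversions import to_tiledb_supported_array_type

    # Categoricals pass through untouched; dictionary encoding is now handled
    # downstream (Arrow dictionary arrays / TileDB enumerations).
    cat = pd.Series(pd.Categorical(["a", "b", "a"]))
    assert to_tiledb_supported_array_type("obs_col", cat) is cat

    # Non-categoricals still get the primitive remapping, e.g. float16 -> float32.
    f16 = pd.Series(np.zeros(3, dtype=np.float16))
    assert to_tiledb_supported_array_type("x", f16).dtype == np.float32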
1 change: 1 addition & 0 deletions apis/python/src/tiledbsoma/io/ingest.py
@@ -1025,6 +1025,7 @@ def _update_dataframe(
     for key in common_keys:
         old_type = old_sig[key]
         new_type = new_sig[key]
+
         if old_type != new_type:
             msgs.append(f"{key} type {old_type} != {new_type}")
     if msgs:
12 changes: 8 additions & 4 deletions apis/python/src/tiledbsoma/io/registration/signatures.py
@@ -36,9 +36,13 @@ def _string_dict_from_arrow_schema(schema: pa.Schema) -> Dict[str, str]:
     Converts an Arrow schema to a string/string dict, which is easier on the eyes,
     easier to convert from/to JSON for distributed logging, and easier to do del-key on.
     """
-    retval = {name: _stringify_type(schema.field(name).type) for name in schema.names}
-
+    retval = {}
+    for name in schema.names:
+        arrow_type = schema.field(name).type
+        if pa.types.is_dictionary(arrow_type):
+            arrow_type = arrow_type.index_type
+        retval[name] = _stringify_type(arrow_type)

     # The soma_joinid field is specific to SOMA data but does not exist in AnnData/H5AD. When we
     # pre-check an AnnData/H5AD input to see if it's appendable to an existing SOMA experiment, we
     # must not punish the AnnData/H5AD input for it not having a soma_joinid column in its obs and
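
A small standalone sketch of the normalization above: a dictionary-encoded (categorical) Arrow field is recorded under its index type, so two inputs that differ only in dictionary encoding produce comparable signatures.

    import pyarrow as pa

    schema = pa.schema([("cell_type", pa.dictionary(pa.int32(), pa.string()))])
    arrow_type = schema.field("cell_type").type
    if pa.types.is_dictionary(arrow_type):
        arrow_type = arrow_type.index_type  # e.g. int32
    print(arrow_type)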
@@ -241,7 +245,7 @@ def from_soma_experiment(
                     varm_dtypes[varm_layer_name] = str(
                         varm.schema.field("soma_data").type
                     )

         return cls(
             obs_schema=obs_schema,
             var_schema=var_schema,
8 changes: 2 additions & 6 deletions apis/python/src/tiledbsoma/pytiledbsoma.cc
@@ -200,7 +200,6 @@ PYBIND11_MODULE(pytiledbsoma, m) {
            std::string_view name,
            std::optional<std::vector<std::string>> column_names_in,
            py::object py_query_condition,
-           py::object py_schema,
            std::string_view batch_size,
            ResultOrder result_order,
            std::map<std::string, std::string> platform_config,
@@ -222,7 +221,7 @@ PYBIND11_MODULE(pytiledbsoma, m) {
                // Column names will be updated with columns present
                // in the query condition
                auto new_column_names =
-                   init_pyqc(py_schema, column_names)
+                   init_pyqc(uri, column_names)
                        .cast<std::vector<std::string>>();

                // Update the column_names list if it was not empty,
@@ -267,7 +266,6 @@ PYBIND11_MODULE(pytiledbsoma, m) {
            "name"_a = "unnamed",
            "column_names"_a = py::none(),
            "query_condition"_a = py::none(),
-           "schema"_a = py::none(),
            "batch_size"_a = "auto",
            "result_order"_a = ResultOrder::automatic,
            "platform_config"_a = py::dict(),
@@ -278,7 +276,6 @@ PYBIND11_MODULE(pytiledbsoma, m) {
        [](SOMAArray& reader,
           std::optional<std::vector<std::string>> column_names_in,
           py::object py_query_condition,
-          py::object py_schema,
           std::string_view batch_size,
           ResultOrder result_order) {
            // Handle optional args
@@ -298,7 +295,7 @@ PYBIND11_MODULE(pytiledbsoma, m) {
            // Column names will be updated with columns present in
            // the query condition
            auto new_column_names =
-               init_pyqc(py_schema, column_names)
+               init_pyqc(reader.uri(), column_names)
                    .cast<std::vector<std::string>>();

            // Update the column_names list if it was not empty,
@@ -331,7 +328,6 @@ PYBIND11_MODULE(pytiledbsoma, m) {
        py::kw_only(),
        "column_names"_a = py::none(),
        "query_condition"_a = py::none(),
-       "schema"_a = py::none(),
        "batch_size"_a = "auto",
        "result_order"_a = ResultOrder::automatic)
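
On the Python side, the binding change simply removes `schema` from `SOMAArray`'s keyword arguments; the reader now resolves the array (and any enumerations) from its URI. A sketch mirroring the tests below, assuming these import paths and a hypothetical URI:

    import tiledbsoma.pytiledbsoma as clib
    from tiledbsoma._query_condition import QueryCondition

    qc = QueryCondition("percent_mito > 0.02")
    sr = clib.SOMAArray("file:///tmp/soma/obs", query_condition=qc)  # no schema kwarg
    sr.submit()
    table = sr.read_next()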
1 change: 1 addition & 0 deletions apis/python/tests/test_basic_anndata_io.py
@@ -1,3 +1,4 @@
+import math
 import pathlib
 import tempfile
 from pathlib import Path
18 changes: 6 additions & 12 deletions libtiledbsoma/test/test_query_condition.py
@@ -29,9 +29,7 @@ def pandas_query(uri, condition):

 def soma_query(uri, condition):
     qc = QueryCondition(condition)
-    schema = tiledb.open(uri).schema
-
-    sr = clib.SOMAArray(uri, query_condition=qc, schema=schema)
+    sr = clib.SOMAArray(uri, query_condition=qc)
     sr.submit()
     arrow_table = sr.read_next()
     assert sr.results_complete()
@@ -110,10 +108,9 @@ def test_query_condition_select_columns():
     condition = "percent_mito > 0.02"

     qc = QueryCondition(condition)
-    schema = tiledb.open(uri).schema

     sr = clib.SOMAArray(
-        uri, query_condition=qc, schema=schema, column_names=["n_genes"]
+        uri, query_condition=qc, column_names=["n_genes"]
     )
     sr.submit()
     arrow_table = sr.read_next()
@@ -128,9 +125,8 @@ def test_query_condition_all_columns():
     condition = "percent_mito > 0.02"

     qc = QueryCondition(condition)
-    schema = tiledb.open(uri).schema

-    sr = clib.SOMAArray(uri, query_condition=qc, schema=schema)
+    sr = clib.SOMAArray(uri, query_condition=qc)
     sr.submit()
     arrow_table = sr.read_next()
@@ -144,9 +140,8 @@ def test_query_condition_reset():
     condition = "percent_mito > 0.02"

     qc = QueryCondition(condition)
-    schema = tiledb.open(uri).schema

-    sr = clib.SOMAArray(uri, query_condition=qc, schema=schema)
+    sr = clib.SOMAArray(uri, query_condition=qc)
     sr.submit()
     arrow_table = sr.read_next()
@@ -158,7 +153,7 @@ def test_query_condition_reset():
     # ---------------------------------------------------------------
     condition = "percent_mito < 0.02"
     qc = QueryCondition(condition)
-    sr.reset(column_names=["percent_mito"], query_condition=qc, schema=schema)
+    sr.reset(column_names=["percent_mito"], query_condition=qc)

     sr.submit()
     arrow_table = sr.read_next()
@@ -221,14 +216,13 @@ def test_parsing_error_conditions(malformed_condition):
 def test_eval_error_conditions(malformed_condition):
     """Conditions which should not evaluate (but WILL parse)"""
     uri = os.path.join(SOMA_URI, "obs")
-    schema = tiledb.open(uri).schema

     # TODO: these raise the wrong error - it should be SOMAError. Change the test
     # when https://github.com/single-cell-data/TileDB-SOMA/issues/783 is fixed
     #
     with pytest.raises(RuntimeError):
         qc = QueryCondition(malformed_condition)
-        sr = clib.SOMAArray(uri, query_condition=qc, schema=schema)
+        sr = clib.SOMAArray(uri, query_condition=qc)
         sr.submit()
         sr.read_next()
