[python] Support enumerations in update_obs/update_var (#1707) (#1736)

* [python] Support enumerations in `update_obs`/`update_var` [WIP]
* expand unit tests
* silence a warning
* Avoid unnecessary MacOS 3.7 check
* experimenting
* code-review feedback

Co-authored-by: John Kerl <[email protected]>
single-cell-data · Sep 29, 2023 · 68d2630 · 68d2630
1 parent a8623c8
commit 68d2630
Show file tree

Hide file tree

Showing 4 changed files with 37 additions and 8 deletions.
diff --git a/.github/workflows/python-ci-single.yml b/.github/workflows/python-ci-single.yml
@@ -80,13 +80,14 @@ jobs:
         cache: pip
         cache-dependency-path: ./apis/python/setup.py
 
-    - name: Cache native libraries
-      uses: actions/cache@v3
-      with:
-        path: |
-          build
-          dist
-        key: libtiledbsoma-build-dist-${{ inputs.os }}-${{ inputs.python_version }}-${{ hashFiles('libtiledbsoma', 'scripts/bld') }}
+# Experiment for MacOS CI false negatives ...
+#    - name: Cache native libraries
+#      uses: actions/cache@v3
+#      with:
+#        path: |
+#          build
+#          dist
+#        key: libtiledbsoma-build-dist-${{ inputs.os }}-${{ inputs.python_version }}-${{ hashFiles('libtiledbsoma', 'scripts/bld') }}
 
     - name: Install testing prereqs
       run: python -m pip -v install -U pip pytest-cov 'typeguard<3.0' types-setuptools

diff --git a/apis/python/src/tiledbsoma/io/_registration/signatures.py b/apis/python/src/tiledbsoma/io/_registration/signatures.py
@@ -72,7 +72,11 @@ def _string_dict_from_pandas_dataframe(
         df.reset_index(inplace=True)
         if default_index_name in df:
             if "index" in df:
-                df.drop(columns=["index"], inplace=True)
+                # Avoid the warning:
+                # "A value is trying to be set on a copy of a slice from a DataFrame"
+                # which would occur if we did:
+                # df.drop(columns=["index"], inplace=True)
+                df = df.drop(columns=["index"])
         else:
             df.rename(columns={"index": default_index_name}, inplace=True)
     else:

diff --git a/apis/python/src/tiledbsoma/io/ingest.py b/apis/python/src/tiledbsoma/io/ingest.py
@@ -1513,12 +1513,24 @@ def _update_dataframe(
         # schema-creation logic.
         atype = arrow_schema.field(add_key).type
         dtype = tiledb_type_from_arrow_type(atype)
+
+        enum_label: Optional[str] = None
+        if pa.types.is_dictionary(arrow_table.schema.field(add_key).type):
+            enum_label = add_key
+            dt = cast(pd.CategoricalDtype, new_data[add_key].dtype)
+            se.add_enumeration(
+                tiledb.Enumeration(
+                    name=add_key, ordered=atype.ordered, values=list(dt.categories)
+                )
+            )
+
         filters = tiledb_create_options.attr_filters_tiledb(add_key, ["ZstdFilter"])
         se.add_attribute(
             tiledb.Attr(
                 name=add_key,
                 dtype=dtype,
                 filters=filters,
+                enum_label=enum_label,
             )
         )
 

diff --git a/apis/python/tests/test_update_dataframes.py b/apis/python/tests/test_update_dataframes.py
@@ -71,8 +71,14 @@ def test_add(adata, readback):
             new_obs = adata.obs
             new_var = adata.var
 
+    # boolean
     new_obs["is_g1"] = new_obs["groups"] == "g1"
+    # int
     new_obs["seq"] = np.arange(new_obs.shape[0], dtype=np.int32)
+    # categorical of string
+    new_obs["parity"] = pd.Categorical(
+        np.asarray([["even", "odd"][e % 2] for e in range(len(new_obs))])
+    )
 
     new_var["vst.mean.sq"] = new_var["vst.mean"] ** 2
 
@@ -83,9 +89,15 @@ def test_add(adata, readback):
     with tiledbsoma.Experiment.open(output_path) as exp:
         o2 = exp.obs.schema
         v2 = exp.ms["RNA"].var.schema
+        obs = exp.obs.read().concat().to_pandas()
 
     assert o2.field("is_g1").type == pa.bool_()
     assert o2.field("seq").type == pa.int32()
+    assert o2.field("parity").type == pa.dictionary(
+        index_type=pa.int8(), value_type=pa.string(), ordered=False
+    )
+    assert obs["parity"][0] == "even"
+    assert obs["parity"][1] == "odd"
     assert v2.field("vst.mean.sq").type == pa.float64()