From 516310ffc6e8ca8d07ae040f2b531ba00ec8fe65 Mon Sep 17 00:00:00 2001
From: John Kerl <kerl.john.r@gmail.com>
Date: Tue, 3 Oct 2023 09:09:25 -0400
Subject: [PATCH] [python] Check for downgraded categorical write (#1748)

* [python] Check for downgraded categorical write

* lint

* bugfix on append-is-rewrite corner case

* CI from 1749

* fix broken test

* unit-test case

* docstrings

* code-review feedback

Co-authored-by: Vivian Nguyen <vivian@tiledb.com>

* fix broken indentation in review suggestion

* code-review feedback

---------

Co-authored-by: Vivian Nguyen <vivian@tiledb.com>
---
 apis/python/src/tiledbsoma/_dataframe.py | 45 ++++++++++++++++--------
 apis/python/tests/test_dataframe.py      | 10 ++++--
 2 files changed, 38 insertions(+), 17 deletions(-)

diff --git a/apis/python/src/tiledbsoma/_dataframe.py b/apis/python/src/tiledbsoma/_dataframe.py
index 0133941ff5..fef2f992a6 100644
--- a/apis/python/src/tiledbsoma/_dataframe.py
+++ b/apis/python/src/tiledbsoma/_dataframe.py
@@ -388,6 +388,12 @@ def write(
                 containing all columns, including the index columns. The schema for the values must
                 match the schema for the :class:`DataFrame`.
 
+                If a column is of categorical type in the schema and a flattened/non-categorical
+                column is presented for data on write, a ``ValueError`` is raised.  If a column is
+                of non-categorical type in the schema and a categorical column is presented for data
+                on write, the data are written as an array of category values, and the category-type
+                information is not saved.
+
         Raises:
             TypeError:
                 If the ``values`` parameter is an unsupported type.
@@ -411,24 +417,35 @@ def write(
             n = len(col)
 
             cols_map = dim_cols_map if name in dim_names_set else attr_cols_map
-            if pa.types.is_dictionary(col.type) and col.num_chunks != 0:
-                if name in dim_names_set:
-                    # Dims are never categorical. Decategoricalize for them.
-                    cols_map[name] = pa.chunked_array(
-                        [chunk.dictionary_decode() for chunk in col.chunks]
-                    )
-                else:
-                    attr = self._handle.schema.attr(name)
-                    if attr.enum_label is not None:
-                        # Normal case: writing categorical data to categorical schema.
-                        cols_map[name] = col.chunk(0).indices.to_pandas()
-                    else:
-                        # Schema is non-categorical but the user is writing categorical.
-                        # Simply decategoricalize for them.
+            if pa.types.is_dictionary(col.type):
+                if col.num_chunks != 0:
+                    if name in dim_names_set:
+                        # Dims are never categorical. Decategoricalize for them.
                         cols_map[name] = pa.chunked_array(
                             [chunk.dictionary_decode() for chunk in col.chunks]
                         )
+                    else:
+                        attr = self._handle.schema.attr(name)
+                        if attr.enum_label is not None:
+                            # Normal case: writing categorical data to categorical schema.
+                            cols_map[name] = col.chunk(0).indices.to_pandas()
+                        else:
+                            # Schema is non-categorical but the user is writing categorical.
+                            # Simply decategoricalize for them.
+                            cols_map[name] = pa.chunked_array(
+                                [chunk.dictionary_decode() for chunk in col.chunks]
+                            )
+                else:
+                    cols_map[name] = col.to_pandas()
+
             else:
+                if name not in dim_names_set:
+                    attr = self._handle.schema.attr(name)
+                    if attr.enum_label is not None:
+                        raise ValueError(
+                            f"Categorical column {name} must be presented with categorical data"
+                        )
+
                 cols_map[name] = col.to_pandas()
 
         if n is None:
diff --git a/apis/python/tests/test_dataframe.py b/apis/python/tests/test_dataframe.py
index 77376064f5..faa2b67e60 100644
--- a/apis/python/tests/test_dataframe.py
+++ b/apis/python/tests/test_dataframe.py
@@ -130,7 +130,6 @@ def test_dataframe_with_enumeration(tmp_path):
         ]
     )
     enums = {"enmr1": ("a", "bb", "ccc"), "enmr2": ("cat", "dog")}
-
     with soma.DataFrame.create(
         tmp_path.as_posix(),
         schema=schema,
@@ -139,8 +138,13 @@ def test_dataframe_with_enumeration(tmp_path):
     ) as sdf:
         data = {}
         data["soma_joinid"] = [0, 1, 2, 3, 4]
-        data["foo"] = [2, 1, 2, 1, 0]
-        data["bar"] = [0, 1, 1, 0, 1]
+        data["foo"] = ["a", "bb", "ccc", "bb", "a"]
+        data["bar"] = ["cat", "dog", "cat", "cat", "cat"]
+        with pytest.raises(ValueError):
+            sdf.write(pa.Table.from_pydict(data))
+
+        data["foo"] = pd.Categorical(["a", "bb", "ccc", "bb", "a"])
+        data["bar"] = pd.Categorical(["cat", "dog", "cat", "cat", "cat"])
         sdf.write(pa.Table.from_pydict(data))
         assert sdf.enumeration("foo") == enums["enmr1"]
         assert sdf.enumeration("bar") == enums["enmr2"]