From 516310ffc6e8ca8d07ae040f2b531ba00ec8fe65 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Tue, 3 Oct 2023 09:09:25 -0400 Subject: [PATCH] [python] Check for downgraded categorical write (#1748) * [python] Check for downgraded categorical write * lint * bugfix on append-is-rewrite corner case * CI from 1749 * fix broken test * unit-test case * docstrings * code-review feedback Co-authored-by: Vivian Nguyen * fix broken indentation in review suggestion * code-review feedback --------- Co-authored-by: Vivian Nguyen --- apis/python/src/tiledbsoma/_dataframe.py | 45 ++++++++++++++++-------- apis/python/tests/test_dataframe.py | 10 ++++-- 2 files changed, 38 insertions(+), 17 deletions(-) diff --git a/apis/python/src/tiledbsoma/_dataframe.py b/apis/python/src/tiledbsoma/_dataframe.py index 0133941ff5..fef2f992a6 100644 --- a/apis/python/src/tiledbsoma/_dataframe.py +++ b/apis/python/src/tiledbsoma/_dataframe.py @@ -388,6 +388,12 @@ def write( containing all columns, including the index columns. The schema for the values must match the schema for the :class:`DataFrame`. + If a column is of categorical type in the schema and a flattened/non-categorical + column is presented for data on write, a ``ValueError`` is raised. If a column is + of non-categorical type in the schema and a categorical column is presented for data + on write, the data are written as an array of category values, and the category-type + information is not saved. + Raises: TypeError: If the ``values`` parameter is an unsupported type. @@ -411,24 +417,35 @@ def write( n = len(col) cols_map = dim_cols_map if name in dim_names_set else attr_cols_map - if pa.types.is_dictionary(col.type) and col.num_chunks != 0: - if name in dim_names_set: - # Dims are never categorical. Decategoricalize for them. - cols_map[name] = pa.chunked_array( - [chunk.dictionary_decode() for chunk in col.chunks] - ) - else: - attr = self._handle.schema.attr(name) - if attr.enum_label is not None: - # Normal case: writing categorical data to categorical schema. - cols_map[name] = col.chunk(0).indices.to_pandas() - else: - # Schema is non-categorical but the user is writing categorical. - # Simply decategoricalize for them. + if pa.types.is_dictionary(col.type): + if col.num_chunks != 0: + if name in dim_names_set: + # Dims are never categorical. Decategoricalize for them. cols_map[name] = pa.chunked_array( [chunk.dictionary_decode() for chunk in col.chunks] ) + else: + attr = self._handle.schema.attr(name) + if attr.enum_label is not None: + # Normal case: writing categorical data to categorical schema. + cols_map[name] = col.chunk(0).indices.to_pandas() + else: + # Schema is non-categorical but the user is writing categorical. + # Simply decategoricalize for them. + cols_map[name] = pa.chunked_array( + [chunk.dictionary_decode() for chunk in col.chunks] + ) + else: + cols_map[name] = col.to_pandas() + else: + if name not in dim_names_set: + attr = self._handle.schema.attr(name) + if attr.enum_label is not None: + raise ValueError( + f"Categorical column {name} must be presented with categorical data" + ) + cols_map[name] = col.to_pandas() if n is None: diff --git a/apis/python/tests/test_dataframe.py b/apis/python/tests/test_dataframe.py index 77376064f5..faa2b67e60 100644 --- a/apis/python/tests/test_dataframe.py +++ b/apis/python/tests/test_dataframe.py @@ -130,7 +130,6 @@ def test_dataframe_with_enumeration(tmp_path): ] ) enums = {"enmr1": ("a", "bb", "ccc"), "enmr2": ("cat", "dog")} - with soma.DataFrame.create( tmp_path.as_posix(), schema=schema, @@ -139,8 +138,13 @@ def test_dataframe_with_enumeration(tmp_path): ) as sdf: data = {} data["soma_joinid"] = [0, 1, 2, 3, 4] - data["foo"] = [2, 1, 2, 1, 0] - data["bar"] = [0, 1, 1, 0, 1] + data["foo"] = ["a", "bb", "ccc", "bb", "a"] + data["bar"] = ["cat", "dog", "cat", "cat", "cat"] + with pytest.raises(ValueError): + sdf.write(pa.Table.from_pydict(data)) + + data["foo"] = pd.Categorical(["a", "bb", "ccc", "bb", "a"]) + data["bar"] = pd.Categorical(["cat", "dog", "cat", "cat", "cat"]) sdf.write(pa.Table.from_pydict(data)) assert sdf.enumeration("foo") == enums["enmr1"] assert sdf.enumeration("bar") == enums["enmr2"]