Skip to content

Commit

Permalink
[python] Check for downgraded categorical write (#1748)
Browse files (browse the repository at this point in the history)
* [python] Check for downgraded categorical write

* lint

* bugfix on append-is-rewrite corner case

* CI from 1749

* fix broken test

* unit-test case

* docstrings

* code-review feedback

Co-authored-by: Vivian Nguyen <[email protected]>

* fix broken indentation in review suggestion

* code-review feedback

---------

Co-authored-by: Vivian Nguyen <[email protected]>
  • Loading branch information
2 people authored and github-actions[bot] committed Oct 3, 2023
1 parent ae580a3 commit 516310f
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 17 deletions.
45 changes: 31 additions & 14 deletions apis/python/src/tiledbsoma/_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -388,6 +388,12 @@ def write(
containing all columns, including the index columns. The schema for the values must
match the schema for the :class:`DataFrame`.
If a column is categorical in the schema but non-categorical (flattened) data is
presented for it on write, a ``ValueError`` is raised. If a column is
non-categorical in the schema but categorical data is presented for it on write,
the data are written as an array of category values, and the category-type
information is not saved.
Raises:
TypeError:
If the ``values`` parameter is an unsupported type.
Expand All @@ -411,24 +417,35 @@ def write(
n = len(col)

cols_map = dim_cols_map if name in dim_names_set else attr_cols_map
if pa.types.is_dictionary(col.type) and col.num_chunks != 0:
if name in dim_names_set:
# Dims are never categorical. Decategoricalize for them.
cols_map[name] = pa.chunked_array(
[chunk.dictionary_decode() for chunk in col.chunks]
)
else:
attr = self._handle.schema.attr(name)
if attr.enum_label is not None:
# Normal case: writing categorical data to categorical schema.
cols_map[name] = col.chunk(0).indices.to_pandas()
else:
# Schema is non-categorical but the user is writing categorical.
# Simply decategoricalize for them.
if pa.types.is_dictionary(col.type):
if col.num_chunks != 0:
if name in dim_names_set:
# Dims are never categorical. Decategoricalize for them.
cols_map[name] = pa.chunked_array(
[chunk.dictionary_decode() for chunk in col.chunks]
)
else:
attr = self._handle.schema.attr(name)
if attr.enum_label is not None:
# Normal case: writing categorical data to categorical schema.
cols_map[name] = col.chunk(0).indices.to_pandas()
else:
# Schema is non-categorical but the user is writing categorical.
# Simply decategoricalize for them.
cols_map[name] = pa.chunked_array(
[chunk.dictionary_decode() for chunk in col.chunks]
)
else:
cols_map[name] = col.to_pandas()

else:
if name not in dim_names_set:
attr = self._handle.schema.attr(name)
if attr.enum_label is not None:
raise ValueError(
f"Categorical column {name} must be presented with categorical data"
)

cols_map[name] = col.to_pandas()

if n is None:
Expand Down
10 changes: 7 additions & 3 deletions apis/python/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,6 @@ def test_dataframe_with_enumeration(tmp_path):
]
)
enums = {"enmr1": ("a", "bb", "ccc"), "enmr2": ("cat", "dog")}

with soma.DataFrame.create(
tmp_path.as_posix(),
schema=schema,
Expand All @@ -139,8 +138,13 @@ def test_dataframe_with_enumeration(tmp_path):
) as sdf:
data = {}
data["soma_joinid"] = [0, 1, 2, 3, 4]
data["foo"] = [2, 1, 2, 1, 0]
data["bar"] = [0, 1, 1, 0, 1]
data["foo"] = ["a", "bb", "ccc", "bb", "a"]
data["bar"] = ["cat", "dog", "cat", "cat", "cat"]
with pytest.raises(ValueError):
sdf.write(pa.Table.from_pydict(data))

data["foo"] = pd.Categorical(["a", "bb", "ccc", "bb", "a"])
data["bar"] = pd.Categorical(["cat", "dog", "cat", "cat", "cat"])
sdf.write(pa.Table.from_pydict(data))
assert sdf.enumeration("foo") == enums["enmr1"]
assert sdf.enumeration("bar") == enums["enmr2"]
Expand Down

0 comments on commit 516310f

Please sign in to comment.