Skip to content

Commit

Permalink
[python] Flatten categorical soma_joinid if presented at write (#1698) (#1699)
Browse files Browse the repository at this point in the history

Co-authored-by: John Kerl <[email protected]>
  • Loading branch information
github-actions[bot] and johnkerl authored Sep 18, 2023
1 parent 3bf320a commit 6bf7661
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 7 deletions.
21 changes: 14 additions & 7 deletions apis/python/src/tiledbsoma/_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -409,18 +409,25 @@ def write(
for name in values.schema.names:
col = values.column(name)
n = len(col)

cols_map = dim_cols_map if name in dim_names_set else attr_cols_map
if pa.types.is_dictionary(col.type) and col.num_chunks != 0:
attr = self._handle.schema.attr(name)
if attr.enum_label is not None:
# Normal case: writing categorical data to categorical schema.
cols_map[name] = col.chunk(0).indices.to_pandas()
else:
# Schema is non-categorical but the user is writing categorical.
# Simply decategoricalize for them.
if name in dim_names_set:
# Dims are never categorical. Decategoricalize for them.
cols_map[name] = pa.chunked_array(
[chunk.dictionary_decode() for chunk in col.chunks]
)
else:
attr = self._handle.schema.attr(name)
if attr.enum_label is not None:
# Normal case: writing categorical data to categorical schema.
cols_map[name] = col.chunk(0).indices.to_pandas()
else:
# Schema is non-categorical but the user is writing categorical.
# Simply decategoricalize for them.
cols_map[name] = pa.chunked_array(
[chunk.dictionary_decode() for chunk in col.chunks]
)
else:
cols_map[name] = col.to_pandas()

Expand Down
35 changes: 35 additions & 0 deletions apis/python/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -914,6 +914,41 @@ def test_write_categorical_types(tmp_path):
assert (df == sdf.read().concat().to_pandas()).all().all()


def test_write_categorical_dims(tmp_path):
    """
    Index (dim) columns cannot be categorical. Check that a table whose
    ``soma_joinid`` index column arrives dictionary-encoded can still be
    written, and that reading it back compares equal to the input frame.
    """
    uri = tmp_path.as_posix()

    # Input frame: both the index column and the attribute are categorical.
    df = pd.DataFrame(
        data={
            "soma_joinid": pd.Categorical([0, 1, 2, 3], categories=[0, 1, 2, 3]),
            "string": pd.Categorical(["a", "b", "a", "b"], categories=["b", "a"]),
        }
    )

    # On-disk schema: plain int64 index, dictionary-typed attribute.
    fields = [
        ("soma_joinid", pa.int64()),
        ("string", pa.dictionary(pa.int8(), pa.large_string())),
    ]
    schema = pa.schema(fields)

    with soma.DataFrame.create(
        uri,
        schema=schema,
        index_column_names=["soma_joinid"],
        enumerations={"enum-string": ["b", "a"]},
        ordered_enumerations=[],
        column_to_enumerations={"string": "enum-string"},
    ) as sdf:
        table = pa.Table.from_pandas(df)
        sdf.write(table)

    with soma.DataFrame.open(uri) as sdf:
        round_tripped = sdf.read().concat().to_pandas()
        assert (df == round_tripped).all().all()


def test_result_order(tmp_path):
# cf. https://docs.tiledb.com/main/background/key-concepts-and-data-format#data-layout
schema = pa.schema(
Expand Down

0 comments on commit 6bf7661

Please sign in to comment.