Skip to content

Commit

Permalink
[python] Error if extending enum past index type limit (#1986)
Browse files Browse the repository at this point in the history
  • Loading branch information
nguyenv authored Dec 14, 2023
1 parent d712bf2 commit 37329c9
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 1 deletion.
12 changes: 11 additions & 1 deletion apis/python/src/tiledbsoma/_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -426,8 +426,18 @@ def write(
col.dictionary, enmr.values(), assume_unique=True
)

# Guard against overflowing the dictionary index type: count the values
# currently reported by the enumeration plus the new ones about to be added.
index_capacity_current = len(enmr.values()) + len(update_vals)
# Largest integer representable by the column's dictionary index dtype.
index_capacity_max = np.iinfo(
    col_info.type.index_type.to_pandas_dtype()
).max
if index_capacity_max < index_capacity_current:
    # Both string fragments need the f-prefix; without it the index type
    # placeholder is printed literally instead of being interpolated.
    raise ValueError(
        f"Too many enumeration values ({index_capacity_current}) "
        f"for index type {col_info.type.index_type}"
    )

# only extend if there are new values
if update_vals:
if len(update_vals) != 0:
se = tiledb.ArraySchemaEvolution(self.context.tiledb_ctx)
if np.issubdtype(enmr.dtype.type, np.str_):
extend_vals = np.array(update_vals, "U")
Expand Down
49 changes: 49 additions & 0 deletions apis/python/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1307,3 +1307,52 @@ def test_multichunk_with_enums(tmp_path):
expected_df = pd.concat((df_0, df_1, df_2), ignore_index=True)

assert df.equals(expected_df)


def test_enum_extend_past_numerical_limit(tmp_path):
    """Extending an enumeration past its index type's limit raises ValueError.

    The ``obs`` column is dictionary-encoded with an ``int8`` index. The first
    write fills the enumeration with as many categories as the write path
    accepts for that index type; the second write introduces one more
    category and must be rejected.
    """
    uri = tmp_path.as_posix()

    schema = pa.schema(
        [
            ("soma_joinid", pa.int64()),
            (
                "obs",
                pa.dictionary(
                    index_type=pa.int8(), value_type=pa.large_string(), ordered=False
                ),
            ),
        ]
    )
    soma.DataFrame.create(uri, schema=schema).close()

    n_elem = 132
    # The write path rejects a write once the total number of enumeration
    # values exceeds np.iinfo(<index dtype>).max, so that value (127 for int8)
    # is the largest category count the first write can use without raising.
    n_cats = np.iinfo(np.int8).max
    df1 = pd.DataFrame(
        {
            "soma_joinid": pd.Series(np.arange(n_elem), dtype=np.int64),
            "obs": pd.Series(
                [f"enum_{i % n_cats}" for i in range(n_elem)], dtype="category"
            ),
        }
    )

    # use max number of possible categories
    tbl = pa.Table.from_pandas(df1, preserve_index=False)
    with soma.open(uri, mode="w") as A:
        A.write(tbl)

    more_elem = 4
    df2 = pd.DataFrame(
        {
            "soma_joinid": pd.Series(
                np.arange(n_elem, n_elem + more_elem), dtype=np.int64
            ),
            "obs": pd.Series(["TEST"] * more_elem, dtype="category"),
        }
    )

    # cannot add additional categories as already maxed out earlier; match on
    # the message so an unrelated ValueError does not satisfy the test
    tbl = pa.Table.from_pandas(df2, preserve_index=False)
    with pytest.raises(ValueError, match="Too many enumeration values"):
        with soma.open(uri, mode="w") as A:
            A.write(tbl)

0 comments on commit 37329c9

Please sign in to comment.