diff --git a/apis/python/src/tiledbsoma/_dataframe.py b/apis/python/src/tiledbsoma/_dataframe.py
index 1c0547e58c..90ce28cdb3 100644
--- a/apis/python/src/tiledbsoma/_dataframe.py
+++ b/apis/python/src/tiledbsoma/_dataframe.py
@@ -426,8 +426,19 @@ def write(
                         col.dictionary, enmr.values(), assume_unique=True
                     )
 
+                    # reject writes whose combined enumeration size exceeds the index type's max
+                    index_capacity_current = len(enmr.values()) + len(update_vals)
+                    index_capacity_max = np.iinfo(
+                        col_info.type.index_type.to_pandas_dtype()
+                    ).max
+                    if index_capacity_max < index_capacity_current:
+                        raise ValueError(
+                            f"Too many enumeration values ({index_capacity_current}) "
+                            f"for index type {col_info.type.index_type}"
+                        )
+
                     # only extend if there are new values
-                    if update_vals:
+                    if len(update_vals) != 0:
                         se = tiledb.ArraySchemaEvolution(self.context.tiledb_ctx)
                         if np.issubdtype(enmr.dtype.type, np.str_):
                             extend_vals = np.array(update_vals, "U")
diff --git a/apis/python/tests/test_dataframe.py b/apis/python/tests/test_dataframe.py
index d360577cd8..612d130fe3 100644
--- a/apis/python/tests/test_dataframe.py
+++ b/apis/python/tests/test_dataframe.py
@@ -1307,3 +1307,53 @@ def test_multichunk_with_enums(tmp_path):
     expected_df = pd.concat((df_0, df_1, df_2), ignore_index=True)
 
     assert df.equals(expected_df)
+
+
+def test_enum_extend_past_numerical_limit(tmp_path):
+    """Writing more categories than the index type allows raises ValueError."""
+    uri = tmp_path.as_posix()
+
+    schema = pa.schema(
+        [
+            ("soma_joinid", pa.int64()),
+            (
+                "obs",
+                pa.dictionary(
+                    index_type=pa.int8(), value_type=pa.large_string(), ordered=False
+                ),
+            ),
+        ]
+    )
+    soma.DataFrame.create(uri, schema=schema).close()
+
+    n_elem = 132
+    n_cats = 128
+    df1 = pd.DataFrame(
+        {
+            "soma_joinid": pd.Series(np.arange(n_elem), dtype=np.int64),
+            "obs": pd.Series(
+                [f"enum_{i % n_cats}" for i in range(n_elem)], dtype="category"
+            ),
+        }
+    )
+
+    # use max number of possible categories
+    tbl = pa.Table.from_pandas(df1, preserve_index=False)
+    with soma.open(uri, mode="w") as A:
+        A.write(tbl)
+
+    more_elem = 4
+    df2 = pd.DataFrame(
+        {
+            "soma_joinid": pd.Series(
+                np.arange(n_elem, n_elem + more_elem), dtype=np.int64
+            ),
+            "obs": pd.Series(["TEST"] * more_elem, dtype="category"),
+        }
+    )
+
+    # cannot add additional categories as already maxed out earlier
+    tbl = pa.Table.from_pandas(df2, preserve_index=False)
+    with pytest.raises(ValueError):
+        with soma.open(uri, mode="w") as A:
+            A.write(tbl)