-
Notifications
You must be signed in to change notification settings - Fork 25
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[python] Protect against huge enum-of-strings input #3354
Changes from all commits
7490e2b
7a82682
e1c2488
6e755aa
d4baf75
8435095
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,6 +13,7 @@ | |
import numpy as np | ||
import pandas as pd | ||
import pandas._typing as pdt | ||
import pandas.api.types | ||
import pyarrow as pa | ||
import scipy.sparse as sp | ||
|
||
|
@@ -25,19 +26,31 @@ | |
_MT = TypeVar("_MT", NPNDArray, sp.spmatrix, PDSeries) | ||
_str_to_type = {"boolean": bool, "string": str, "bytes": bytes} | ||
|
||
STRING_DECAT_THRESHOLD = 4096 | ||
""" | ||
For enum-of-string columns with a cardinality higher than this, we convert from | ||
enum-of-string in the AnnData ``obs``/``var``, to plain string in TileDB-SOMA | ||
``obs``/``var``. However, if we're appending to existing storage, we follow the | ||
schema there. | ||
""" | ||
|
||
def decategoricalize_obs_or_var(obs_or_var: pd.DataFrame) -> pd.DataFrame: | ||
"""Performs a typecast into types that TileDB can persist.""" | ||
if len(obs_or_var.columns) > 0: | ||
return pd.DataFrame.from_dict( | ||
{ | ||
str(k): to_tiledb_supported_array_type(str(k), v) | ||
for k, v in obs_or_var.items() | ||
}, | ||
) | ||
else: | ||
|
||
def obs_or_var_to_tiledb_supported_array_type(obs_or_var: pd.DataFrame) -> pd.DataFrame: | ||
""" | ||
Performs a typecast into types that TileDB can persist. This includes, as a | ||
performance improvement, converting high-cardinality categorical-of-string | ||
columns (cardinality > 4096) to plain string. | ||
""" | ||
if len(obs_or_var.columns) == 0: | ||
return obs_or_var.copy() | ||
|
||
return pd.DataFrame.from_dict( | ||
{ | ||
str(k): to_tiledb_supported_array_type(str(k), v) | ||
for k, v in obs_or_var.items() | ||
}, | ||
) | ||
|
||
|
||
@typeguard_ignore | ||
def _to_tiledb_supported_dtype(dtype: _DT) -> _DT: | ||
|
@@ -47,36 +60,26 @@ def _to_tiledb_supported_dtype(dtype: _DT) -> _DT: | |
|
||
|
||
def to_tiledb_supported_array_type(name: str, x: _MT) -> _MT: | ||
"""Converts datatypes unrepresentable by TileDB into datatypes it can represent. | ||
E.g., float16 -> float32 | ||
"""Converts datatypes unrepresentable by TileDB into datatypes it can represent, | ||
e.g., float16 -> float32. | ||
""" | ||
if isinstance(x, (np.ndarray, sp.spmatrix)) or not isinstance( | ||
x.dtype, pd.CategoricalDtype | ||
): | ||
target_dtype = _to_tiledb_supported_dtype(x.dtype) | ||
return x if target_dtype == x.dtype else x.astype(target_dtype) | ||
|
||
# categories = x.cat.categories | ||
# cat_dtype = categories.dtype | ||
# if cat_dtype.kind in ("f", "u", "i"): | ||
# if x.hasnans and cat_dtype.kind == "i": | ||
# raise ValueError( | ||
# f"Categorical column {name!r} contains NaN -- unable to convert to TileDB array." | ||
# ) | ||
# # More mysterious spurious mypy errors. | ||
# target_dtype = _to_tiledb_supported_dtype(cat_dtype) # type: ignore[arg-type] | ||
# else: | ||
# # Into the weirdness. See if Pandas can help with edge cases. | ||
# inferred = infer_dtype(categories) | ||
# if x.hasnans and inferred in ("boolean", "bytes"): | ||
# raise ValueError( | ||
# "Categorical array contains NaN -- unable to convert to TileDB array." | ||
# ) | ||
# target_dtype = np.dtype( # type: ignore[assignment] | ||
# _str_to_type.get(inferred, object) | ||
# ) | ||
|
||
# return x.astype(target_dtype) | ||
# If the column is categorical-of-string of high cardinality, we declare | ||
# this is likely a mistake, and it will definitely lead to performance | ||
# issues in subsequent processing. | ||
if isinstance(x, pd.Series) and isinstance(x.dtype, pd.CategoricalDtype): | ||
# Heuristic number | ||
if ( | ||
pandas.api.types.is_string_dtype(x) | ||
and len(x.cat.categories) > STRING_DECAT_THRESHOLD | ||
): | ||
return x.astype(str) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is a bug - assumes all categoricals are of type There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks @bkmartinjr -- I'll make a follow-on PR -- this one is "Protect against huge enum-of-strings input" -- I'll generalize the follow-on to "Protect against huge enum-of-anything input" |
||
|
||
return x | ||
|
||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should we add a warning or debugging message that even though the column is a category type, we are coverting that to a string?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What do you think @bkmartinjr ?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would not - I would instead just add it to the help/docstrings that this is the default behavior.
I don't find these warnings particularly useful, as most of this code runs in production pipelines. Better to document the behavior in the API docs, IMHO.