Skip to content

Commit

Permalink
[python] Extend #3354 to categoricals of arbitrary value type (#3415) (#3423)

Browse files Browse the repository at this point in the history

* [python] Extend #3354 to categoricals of arbitrary value type

* code-review feedback

* code-review feedback

* code-review feedback

* code-review feedback

Co-authored-by: John Kerl <[email protected]>
  • Loading branch information
github-actions[bot] and johnkerl authored Dec 11, 2024
1 parent 6a26b87 commit 2a3ed32
Show file tree
Hide file tree
Showing 3 changed files with 113 additions and 73 deletions.
12 changes: 5 additions & 7 deletions apis/python/src/tiledbsoma/io/conversions.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,13 @@
_MT = TypeVar("_MT", NPNDArray, sp.spmatrix, PDSeries)
_str_to_type = {"boolean": bool, "string": str, "bytes": bytes}

STRING_DECAT_THRESHOLD = 4096
COLUMN_DECAT_THRESHOLD = 32767
"""
For enum-of-string columns with a cardinality higher than this, we convert from
enum-of-string in the AnnData ``obs``/``var``, to plain string in TileDB-SOMA
``obs``/``var``. However, if we're appending to existing storage, we follow the
schema there.
schema there. Reasoning behind this choice: accommodate signed 16-bit index type.
See also https://github.com/single-cell-data/TileDB-SOMA/pull/3415.
"""


Expand Down Expand Up @@ -74,11 +75,8 @@ def to_tiledb_supported_array_type(name: str, x: _MT) -> _MT:
# issues in subsequent processing.
if isinstance(x, pd.Series) and isinstance(x.dtype, pd.CategoricalDtype):
# Heuristic number
if (
pandas.api.types.is_string_dtype(x)
and len(x.cat.categories) > STRING_DECAT_THRESHOLD
):
return x.astype(str)
if len(x.cat.categories) > COLUMN_DECAT_THRESHOLD:
return x.astype(x.cat.categories.dtype)

return x

Expand Down
95 changes: 45 additions & 50 deletions apis/python/src/tiledbsoma/io/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -1079,37 +1079,36 @@ def _extract_new_values_for_append_aux(
This does two things:
* Retains only the 'new' rows compared to existing storage.
Example is append-mode updates to the var dataframe for
which it's likely that most/all gene IDs have already been seen.
* Retains only the 'new' rows compared to existing storage. Example is
append-mode updates to the var dataframe for which it's likely that
most/all gene IDs have already been seen.
* String vs categorical of string:
* Categorical of type sometype vs plain sometype:
o If we're appending a plain-string column to existing
categorical-of-string storage, convert the about-to-be-written data
to categorical of string, to match.
o If we're appending a non-categorical column to existing categorical
storage, convert the about-to-be-written data to categorical of that
type, to match.
o If we're appending a categorical-of-string column to existing
plain-string storage, convert the about-to-be-written data
to plain string, to match.
o If we're appending a categorical column to existing non-categorical
storage, convert the about-to-be-written data to non-categorical, to
match.
Context: https://github.com/single-cell-data/TileDB-SOMA/issues/3353.
Namely, we find that AnnData's to_h5ad/from_h5ad can categoricalize (without
the user's knowledge or intention) string columns. For example, even
cell_id/barcode, for which there may be millions of distinct values, with no
gain to be had from dictionary encoding, will be converted to categorical.
We find that converting these high-cardinality enums to plain string is a
We find that converting these high-cardinality enums to non-enumerated is a
significant performance win for subsequent accesses. When we do an initial
ingest from AnnData to TileDB-SOMA, we convert from categorical-of-string to
plain string if the cardinality exceeds some threshold.
ingest from AnnData to TileDB-SOMA, we decategoricalize if the cardinality
exceeds some threshold.
All well and good -- except for one more complication which is append mode.
Namely, if the new column has high enough cardinality that we would
downgrade to plain string, but the existing storage has
categorical-of-string, we must write the new data as categorical-of-string.
Likewise, if the new column has low enough cardinality that we would keep it
as categorical-of-string, but the existing storage has plain string, we must
write the new data as plain strings.
downgrade to non-categorical, but the existing storage has categorical, we
must write the new data as categorical. Likewise, if the new column has low
enough cardinality that we would keep it as categorical, but the existing
storage has non-categorical, we must write the new data as non-categorical.
"""

# Retain only the new rows.
Expand All @@ -1133,34 +1132,23 @@ def _extract_new_values_for_append_aux(
# anything in that case. Regardless, we can't assume that the old
# and new schema have the same column names.

# Helper functions for
def is_str_type(typ: pa.DataType) -> bool:
return cast(bool, typ == pa.string() or typ == pa.large_string())

def is_str_col(field: pa.Field) -> bool:
return is_str_type(field.type)

def is_str_cat_col(field: pa.Field) -> bool:
if not pa.types.is_dictionary(field.type):
return False
return is_str_type(field.type.value_type)
def is_cat(field: pa.Field) -> bool:
return cast(bool, pa.types.is_dictionary(field.type))

# Make a quick check of the old and new schemas to see if any columns need
# changing between plain string and categorical-of-string. We're about to
# changing between non-categorical and categorical. We're about to
# duplicate the new data -- and we must, since pyarrow.Table is immutable --
# so let's only do that if we need to.
any_to_change = False
for name in new_schema.names:
if name not in old_schema.names:
continue
if is_str_col(old_schema.field(name)) and is_str_cat_col(
new_schema.field(name)
):
old_field = old_schema.field(name)
new_field = new_schema.field(name)
if not is_cat(old_field) and is_cat(new_field):
any_to_change = True
break
if is_str_cat_col(old_schema.field(name)) and is_str_col(
new_schema.field(name)
):
if is_cat(old_field) and not is_cat(new_field):
any_to_change = True
break

Expand All @@ -1170,24 +1158,31 @@ def is_str_cat_col(field: pa.Field) -> bool:
if name not in old_schema.names:
continue
column = arrow_table.column(name)
old_info = old_schema.field(name)
new_info = new_schema.field(name)
if is_str_col(old_info) and is_str_cat_col(new_info):
# Convert from categorical-of-string to plain string.
column = column.to_pylist()
elif is_str_cat_col(old_info) and is_str_col(new_info):
# Convert from plain string to categorical-of-string. Note:
old_field = old_schema.field(name)
new_field = new_schema.field(name)

# Note from https://enpiar.com/arrow-site/docs/python/data.html#tables
# we have the assertion that pa.Table columns are necessarily of type
# pa.ChunkedArray.
assert isinstance(column, pa.ChunkedArray)

if not is_cat(old_field) and is_cat(new_field):
# Convert from categorical to non-categorical. Note that if
# this is a pa.ChunkedArray, there is no .dictionary_decode()
# for it, but each chunk does have a .dictionary_decode().
column = pa.chunked_array(
[chunk.dictionary_decode() for chunk in column.chunks]
)

elif is_cat(old_field) and not is_cat(new_field):
# Convert from non-categorical to categorical. Note:
# libtiledbsoma already merges the enum mappings, e.g if the
# storage has red, yellow, & green, but our new data has some
# yellow, green, and orange.
column = pa.array(
column.to_pylist(),
pa.dictionary(
index_type=old_info.type.index_type,
value_type=old_info.type.value_type,
ordered=old_info.type.ordered,
),
column = pa.chunked_array(
[chunk.dictionary_encode() for chunk in column.chunks]
)

fields_dict[name] = column
arrow_table = pa.Table.from_pydict(fields_dict)

Expand Down
79 changes: 63 additions & 16 deletions apis/python/tests/test_basic_anndata_io.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import json
import random
import tempfile
from copy import deepcopy
from pathlib import Path
Expand Down Expand Up @@ -1359,22 +1360,34 @@ def test_nan_append(conftest_pbmc_small, dtype, nans, new_obs_ids):
def test_decat_append(tmp_path):

# Prepare the AnnData inputs
nobs_under = tiledbsoma.io.conversions.STRING_DECAT_THRESHOLD - 2
nobs_over = tiledbsoma.io.conversions.STRING_DECAT_THRESHOLD + 2
nobs_under = tiledbsoma.io.conversions.COLUMN_DECAT_THRESHOLD - 2
nobs_over = tiledbsoma.io.conversions.COLUMN_DECAT_THRESHOLD + 2
nvar = 100

obs_ids_under = [f"under_{e:08}" for e in range(nobs_under)]
obs_ids_over = [f"over_{e:08}" for e in range(nobs_over)]
var_ids = [f"gene_{e:08}" for e in range(nvar)]

enum_values_under = [f"enum_u_{e:06}" for e in range(nobs_under)]
enum_values_over = [f"enum_o_{e:06}" for e in range(nobs_over)]
string_enum_values_under = [f"enum_u_{e:06}" for e in range(nobs_under)]
string_enum_values_over = [f"enum_o_{e:06}" for e in range(nobs_over)]
float_enum_values_under = [1e6 + e for e in range(nobs_under)]
float_enum_values_over = [2e6 + e for e in range(nobs_over)]
bool_enum_values_under = random.choices([True, False], k=nobs_under)
bool_enum_values_over = random.choices([True, False], k=nobs_over)

obs_under = pd.DataFrame(
data={
"obs_id": np.asarray(obs_ids_under),
"is_primary_data": np.asarray([True] * nobs_under),
"myenum": pd.Series(np.asarray(enum_values_under), dtype="category"),
"string_enum": pd.Series(
np.asarray(string_enum_values_under), dtype="category"
),
"float_enum": pd.Series(
np.asarray(float_enum_values_under), dtype="category"
),
"bool_enum": pd.Series(
np.asarray(bool_enum_values_under), dtype="category"
),
}
)
obs_under.set_index("obs_id", inplace=True)
Expand All @@ -1383,7 +1396,13 @@ def test_decat_append(tmp_path):
data={
"obs_id": np.asarray(obs_ids_over),
"is_primary_data": np.asarray([True] * nobs_over),
"myenum": pd.Series(np.asarray(enum_values_over), dtype="category"),
"string_enum": pd.Series(
np.asarray(string_enum_values_over), dtype="category"
),
"float_enum": pd.Series(
np.asarray(float_enum_values_over), dtype="category"
),
"bool_enum": pd.Series(np.asarray(bool_enum_values_over), dtype="category"),
}
)
obs_over.set_index("obs_id", inplace=True)
Expand Down Expand Up @@ -1416,16 +1435,24 @@ def test_decat_append(tmp_path):
# Check that the low-cardinality categorical-of-string in the AnnData has
# been ingested to TileDB-SOMA enum-of-string.
with tiledbsoma.Experiment.open(path_under) as exp_under:
assert pa.types.is_dictionary(exp_under.obs.schema.field("myenum").type)
assert pa.types.is_dictionary(exp_under.obs.schema.field("string_enum").type)
assert pa.types.is_dictionary(exp_under.obs.schema.field("float_enum").type)
assert pa.types.is_dictionary(exp_under.obs.schema.field("bool_enum").type)
obs_table = exp_under.obs.read().concat()
assert obs_table.column("myenum").to_pylist() == enum_values_under
assert obs_table.column("string_enum").to_pylist() == string_enum_values_under
assert obs_table.column("float_enum").to_pylist() == float_enum_values_under
assert obs_table.column("bool_enum").to_pylist() == bool_enum_values_under

# Check that the high-cardinality categorical-of-string in the AnnData has
# been ingested to TileDB-SOMA plain string.
with tiledbsoma.Experiment.open(path_over) as exp_over:
assert not pa.types.is_dictionary(exp_over.obs.schema.field("myenum").type)
assert not pa.types.is_dictionary(exp_over.obs.schema.field("string_enum").type)
assert not pa.types.is_dictionary(exp_over.obs.schema.field("float_enum").type)
assert pa.types.is_dictionary(exp_over.obs.schema.field("bool_enum").type)
obs_table = exp_over.obs.read().concat()
assert obs_table.column("myenum").to_pylist() == enum_values_over
assert obs_table.column("string_enum").to_pylist() == string_enum_values_over
assert obs_table.column("float_enum").to_pylist() == float_enum_values_over
assert obs_table.column("bool_enum").to_pylist() == bool_enum_values_over

# Append over-the-threshold AnnData to under-the-threshold TileDB-SOMA
# storage, and vice versa.
Expand Down Expand Up @@ -1465,17 +1492,37 @@ def test_decat_append(tmp_path):

# Check that the appends happened successfully
with tiledbsoma.Experiment.open(path_under) as exp_under:
assert pa.types.is_dictionary(exp_under.obs.schema.field("myenum").type)
assert pa.types.is_dictionary(exp_under.obs.schema.field("string_enum").type)
assert pa.types.is_dictionary(exp_under.obs.schema.field("float_enum").type)
assert pa.types.is_dictionary(exp_under.obs.schema.field("bool_enum").type)
obs_table = exp_under.obs.read().concat()
assert (
obs_table.column("myenum").to_pylist()
== enum_values_under + enum_values_over
obs_table.column("string_enum").to_pylist()
== string_enum_values_under + string_enum_values_over
)
assert (
obs_table.column("float_enum").to_pylist()
== float_enum_values_under + float_enum_values_over
)
assert (
obs_table.column("bool_enum").to_pylist()
== bool_enum_values_under + bool_enum_values_over
)

with tiledbsoma.Experiment.open(path_over) as exp_over:
assert not pa.types.is_dictionary(exp_over.obs.schema.field("myenum").type)
assert not pa.types.is_dictionary(exp_over.obs.schema.field("string_enum").type)
assert not pa.types.is_dictionary(exp_over.obs.schema.field("float_enum").type)
assert pa.types.is_dictionary(exp_over.obs.schema.field("bool_enum").type)
obs_table = exp_over.obs.read().concat()
assert (
obs_table.column("myenum").to_pylist()
== enum_values_over + enum_values_under
obs_table.column("string_enum").to_pylist()
== string_enum_values_over + string_enum_values_under
)
assert (
obs_table.column("float_enum").to_pylist()
== float_enum_values_over + float_enum_values_under
)
assert (
obs_table.column("bool_enum").to_pylist()
== bool_enum_values_over + bool_enum_values_under
)

0 comments on commit 2a3ed32

Please sign in to comment.