[python] Support Enumerations On Nullable Attributes and Query Conditions
nguyenv committed Aug 24, 2023
1 parent 2fbc777 commit a84e76c
Showing 13 changed files with 100 additions and 74 deletions.
2 changes: 1 addition & 1 deletion apis/python/devtools/outgestor
@@ -62,7 +62,7 @@ def main():
"--var-id-name",
help="Which var column name to use as index for outgested andata",
type=str,
default="obs_id",
default="var_id",
)
parser.add_argument(
"paths",
2 changes: 0 additions & 2 deletions apis/python/src/tiledbsoma/_arrow_types.py
@@ -173,8 +173,6 @@ def df_to_arrow(df: pd.DataFrame) -> pa.Table:
null_fields = set()
# Not for name, col in df.items() since we need df[k] on the left-hand sides
for k in df:
if df[k].dtype == "category":
df[k] = df[k].astype(df[k].cat.categories.dtype)
if df[k].isnull().any():
if df[k].isnull().all():
df[k] = pa.nulls(df.shape[0], pa.infer_type(df[k]))
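With the categorical cast removed, df_to_arrow lets pandas categoricals reach Arrow intact, where they become dictionary types, which is the representation the new enumeration support maps onto. A minimal sketch of that conversion, using only pandas and pyarrow (the column name is made up):

    import pandas as pd
    import pyarrow as pa

    # A categorical column is no longer cast down to its value dtype, so
    # Arrow infers a dictionary type for it.
    df = pd.DataFrame({"color": pd.Series(["red", "blue", "red"], dtype="category")})
    tbl = pa.Table.from_pandas(df, preserve_index=False)
    print(tbl.schema.field("color").type)  # dictionary<values=string, indices=int8, ordered=0>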
11 changes: 7 additions & 4 deletions apis/python/src/tiledbsoma/_dataframe.py
@@ -6,9 +6,10 @@
"""
Implementation of a SOMA DataFrame
"""
from typing import Any, Optional, Sequence, Tuple, Type, Union, cast
from typing import Any, Dict, Optional, Sequence, Tuple, Type, Union, cast

import numpy as np
import pandas as pd
import pyarrow as pa
import somacore
import tiledb
@@ -133,7 +134,9 @@ def create(
platform_config: Optional[options.PlatformConfig] = None,
context: Optional[SOMATileDBContext] = None,
tiledb_timestamp: Optional[OpenTimestamp] = None,
enumerations: Optional[dict[str, Sequence[Any]]] = None,
enumerations: Optional[
Dict[str, Union[Sequence[Any], np.ndarray[Any, Any]]]
] = None,
ordered_enumerations: Optional[Sequence[str]] = None,
column_to_enumerations: Optional[dict[str, str]] = None,
) -> "DataFrame":
@@ -400,8 +403,8 @@ def write(
_util.check_type("values", values, (pa.Table,))

del platform_config # unused
dim_cols_map = {}
attr_cols_map = {}
dim_cols_map: Dict[str, pd.DataFrame] = {}
attr_cols_map: Dict[str, pd.DataFrame] = {}
dim_names_set = self.index_column_names
n = None

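For context, here is a minimal sketch of how the widened create signature might be used. The URI, column name, and enumeration values are made up, and the keyword semantics are inferred from the signature above rather than from documentation:

    import pyarrow as pa
    import tiledbsoma

    schema = pa.schema(
        [("soma_joinid", pa.int64()), ("color", pa.dictionary(pa.int8(), pa.string()))]
    )
    sdf = tiledbsoma.DataFrame.create(
        "/tmp/enum-example",  # hypothetical URI
        schema=schema,
        enumerations={"color": ["red", "green", "blue"]},  # values may also be a numpy array
        column_to_enumerations={"color": "color"},  # attribute name -> enumeration name
    )
    sdf.close()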
19 changes: 12 additions & 7 deletions apis/python/src/tiledbsoma/_query_condition.py
@@ -128,8 +128,8 @@ def __attrs_post_init__(self):
"(Is this an empty expression?)"
)

def init_query_condition(self, schema: tiledb.ArraySchema, query_attrs: List[str]):
qctree = QueryConditionTree(schema, query_attrs)
def init_query_condition(self, uri: str, query_attrs: List[str]):
qctree = QueryConditionTree(tiledb.open(uri), query_attrs)
self.c_obj = qctree.visit(self.tree.body)

if not isinstance(self.c_obj, clib.PyQueryCondition):
@@ -143,7 +143,7 @@ def init_query_condition(self, schema: tiledb.ArraySchema, query_attrs: List[str

@attrs.define
class QueryConditionTree(ast.NodeVisitor):
schema: tiledb.ArraySchema
array: tiledb.Array
query_attrs: List[str]

def visit_BitOr(self, node):
@@ -237,8 +237,8 @@ def aux_visit_Compare(

att = self.get_att_from_node(att)
val = self.get_val_from_node(val)

dt = self.schema.attr(att).dtype

enum_label = self.array.attr(att).enum_label
if enum_label is not None:
dt = self.array.enum(enum_label).dtype
else:
dt = self.array.attr(att).dtype

dtype = "string" if dt.kind in "SUa" else dt.name
val = self.cast_val_to_dtype(val, dtype)

@@ -318,8 +323,8 @@ def get_att_from_node(self, node: QueryConditionNodeElem) -> Any:
f"Incorrect type for attribute name: {ast.dump(node)}"
)

if not self.schema.has_attr(att):
if self.schema.domain.has_dim(att):
if not self.array.schema.has_attr(att):
if self.array.schema.domain.has_dim(att):
raise tiledb.TileDBError(
f"`{att}` is a dimension. QueryConditions currently only "
"work on attributes."
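The practical effect of resolving enum_label first is that value filters compare against the enumeration's value dtype instead of the integer codes stored in the attribute. A minimal sketch, assuming a DataFrame with an enumerated string column named cell_type (both the URI and the column are hypothetical):

    import tiledbsoma

    with tiledbsoma.open("/tmp/enum-example") as sdf:  # hypothetical URI
        # "B cell" is cast against the enumeration's string dtype, not the
        # attribute's underlying integer index dtype.
        table = sdf.read(value_filter="cell_type == 'B cell'").concat()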
4 changes: 2 additions & 2 deletions apis/python/src/tiledbsoma/_tiledb_array.py
@@ -101,8 +101,8 @@ def _soma_reader(
# Leave empty arguments out of kwargs to allow C++ constructor defaults to apply, as
# they're not all wrapped in std::optional<>.
kwargs: Dict[str, object] = {}
if schema:
kwargs["schema"] = schema
# if schema:
# kwargs["schema"] = schema
if column_names:
kwargs["column_names"] = column_names
if query_condition:
56 changes: 24 additions & 32 deletions apis/python/src/tiledbsoma/io/conversions.py
@@ -44,49 +44,41 @@ def _to_tiledb_supported_dtype(dtype: _DT) -> _DT:

def to_tiledb_supported_array_type(name: str, x: _MT) -> _MT:
"""Converts datatypes unrepresentable by TileDB into datatypes it can represent.
E.g., categorical strings -> string.
See also `https://docs.scipy.org/doc/numpy-1.10.1/reference/arrays.dtypes.html <https://docs.scipy.org/doc/numpy-1.10.1/reference/arrays.dtypes.html>`_.
Preferentially converts to the underlying primitive type, as TileDB does not support
most complex types. NOTE: this does not support ``datetime64`` conversion.
Categoricals are a special case. If the underlying categorical type is a primitive,
convert to that. If the array contains NA/NaN (i.e. not in the category, code == -1),
raise error unless it is a float or string.
E.g., float16 -> float32
"""
if isinstance(x, (np.ndarray, sp.spmatrix)) or not is_categorical_dtype(x):
# mypy issues a spurious error here, but only when
# _to_tiledb_supported_dtype is decorated with @typeguard_ignore???
target_dtype = _to_tiledb_supported_dtype(x.dtype) # type: ignore[arg-type]
return x if target_dtype == x.dtype else x.astype(target_dtype)

categories = x.cat.categories
cat_dtype = categories.dtype
if cat_dtype.kind in ("f", "u", "i"):
if x.hasnans and cat_dtype.kind == "i":
raise ValueError(
f"Categorical column {name!r} contains NaN -- unable to convert to TileDB array."
)
# More mysterious spurious mypy errors.
target_dtype = _to_tiledb_supported_dtype(cat_dtype) # type: ignore[arg-type]
else:
# Into the weirdness. See if Pandas can help with edge cases.
inferred = infer_dtype(categories)
if x.hasnans and inferred in ("boolean", "bytes"):
raise ValueError(
"Categorical array contains NaN -- unable to convert to TileDB array."
)
target_dtype = np.dtype( # type: ignore[assignment]
_str_to_type.get(inferred, object)
)

return x.astype(target_dtype)
# categories = x.cat.categories
# cat_dtype = categories.dtype
# if cat_dtype.kind in ("f", "u", "i"):
# if x.hasnans and cat_dtype.kind == "i":
# raise ValueError(
# f"Categorical column {name!r} contains NaN -- unable to convert to TileDB array."
# )
# # More mysterious spurious mypy errors.
# target_dtype = _to_tiledb_supported_dtype(cat_dtype) # type: ignore[arg-type]
# else:
# # Into the weirdness. See if Pandas can help with edge cases.
# inferred = infer_dtype(categories)
# if x.hasnans and inferred in ("boolean", "bytes"):
# raise ValueError(
# "Categorical array contains NaN -- unable to convert to TileDB array."
# )
# target_dtype = np.dtype( # type: ignore[assignment]
# _str_to_type.get(inferred, object)
# )

# return x.astype(target_dtype)
return x


def csr_from_tiledb_df(df: pd.DataFrame, num_rows: int, num_cols: int) -> sp.csr_matrix:
"""Given a tiledb dataframe, return a ``scipy.sparse.csr_matrx``."""
return sp.csr_matrix(
(df["soma_data"], (df["soma_dim_0"], df["soma_dim_1"])),
shape=(num_rows, num_cols),
)
)
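With the categorical branch disabled, to_tiledb_supported_array_type passes categoricals through untouched (enumeration handling now happens at the Arrow layer), while other dtypes are still widened as before. A quick sketch of both paths, assuming the module keeps this import path:

    import numpy as np
    import pandas as pd
    from tiledbsoma.io.conversions import to_tiledb_supported_array_type

    s = pd.Series(["a", "b", "a"], dtype="category")
    assert to_tiledb_supported_array_type("col", s) is s  # passed through unchanged

    f16 = np.arange(4, dtype=np.float16)
    assert to_tiledb_supported_array_type("col", f16).dtype == np.float32  # still widened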
8 changes: 5 additions & 3 deletions apis/python/src/tiledbsoma/io/ingest.py
@@ -13,6 +13,7 @@
import time
from typing import (
Any,
Dict,
List,
Mapping,
Optional,
@@ -739,7 +740,7 @@ def _write_dataframe_impl(
try:
soma_df = _factory.open(df_uri, "w", soma_type=DataFrame, context=context)
except DoesNotExistError:
enums = {}
enums: Dict[str, Union[Sequence[Any], np.ndarray[Any, Any]]] = {}
col_to_enums = {}
for att in arrow_table.schema:
if pa.types.is_dictionary(att.type):
@@ -749,7 +750,7 @@
else:
enums[att.name] = cat
col_to_enums[att.name] = att.name

soma_df = DataFrame.create(
df_uri,
schema=arrow_table.schema,
@@ -1024,6 +1025,7 @@ def _update_dataframe(
for key in common_keys:
old_type = old_sig[key]
new_type = new_sig[key]

if old_type != new_type:
msgs.append(f"{key} type {old_type} != {new_type}")
if msgs:
@@ -1948,4 +1950,4 @@ def to_anndata(

logging.log_io(None, _util.format_elapsed(s, "FINISH Experiment.to_anndata"))

return anndata
return anndata
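A minimal sketch of the detection step above in isolation: dictionary-typed Arrow columns become named enumerations keyed by the column name, mirroring the enums and col_to_enums dicts in the diff (the table here is made up):

    import pyarrow as pa

    tbl = pa.table({"color": pa.array(["red", "blue", "red"]).dictionary_encode()})
    enums, col_to_enums = {}, {}
    for field in tbl.schema:
        if pa.types.is_dictionary(field.type):
            # Pull the category values out of the dictionary-encoded column.
            dict_array = tbl.column(field.name).combine_chunks()
            enums[field.name] = dict_array.dictionary.to_pylist()
            col_to_enums[field.name] = field.name
    # enums == {"color": ["red", "blue"]}; col_to_enums == {"color": "color"}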
12 changes: 8 additions & 4 deletions apis/python/src/tiledbsoma/io/registration/signatures.py
@@ -36,9 +36,13 @@ def _string_dict_from_arrow_schema(schema: pa.Schema) -> Dict[str, str]:
Converts an Arrow schema to a string/string dict, which is easier on the eyes,
easier to convert from/to JSON for distributed logging, and easier to do del-key on.
"""

retval = {name: _stringify_type(schema.field(name).type) for name in schema.names}

retval = {}
for name in schema.names:
arrow_type = schema.field(name).type
if pa.types.is_dictionary(arrow_type):
arrow_type = arrow_type.index_type
retval[name] = _stringify_type(arrow_type)

# The soma_joinid field is specific to SOMA data but does not exist in AnnData/H5AD. When we
# pre-check an AnnData/H5AD input to see if it's appendable to an existing SOMA experiment, we
# must not punish the AnnData/H5AD input for it not having a soma_joinid column in its obs and
@@ -241,7 +245,7 @@ def from_soma_experiment(
varm_dtypes[varm_layer_name] = str(
varm.schema.field("soma_data").type
)

return cls(
obs_schema=obs_schema,
var_schema=var_schema,
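The unwrapping above means a dictionary-encoded column contributes its index (code) type to the signature dict, so categorical columns compare stably when checking appendability. The same loop in isolation:

    import pyarrow as pa

    schema = pa.schema(
        [("color", pa.dictionary(pa.int32(), pa.string())), ("n", pa.int64())]
    )
    sig = {}
    for name in schema.names:
        arrow_type = schema.field(name).type
        if pa.types.is_dictionary(arrow_type):
            arrow_type = arrow_type.index_type  # e.g. int32, not the value type
        sig[name] = str(arrow_type)
    # sig == {"color": "int32", "n": "int64"}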
8 changes: 2 additions & 6 deletions apis/python/src/tiledbsoma/pytiledbsoma.cc
@@ -200,7 +200,6 @@ PYBIND11_MODULE(pytiledbsoma, m) {
std::string_view name,
std::optional<std::vector<std::string>> column_names_in,
py::object py_query_condition,
py::object py_schema,
std::string_view batch_size,
ResultOrder result_order,
std::map<std::string, std::string> platform_config,
@@ -222,7 +221,7 @@
// Column names will be updated with columns present
// in the query condition
auto new_column_names =
init_pyqc(py_schema, column_names)
init_pyqc(uri, column_names)
.cast<std::vector<std::string>>();

// Update the column_names list if it was not empty,
@@ -267,7 +266,6 @@
"name"_a = "unnamed",
"column_names"_a = py::none(),
"query_condition"_a = py::none(),
"schema"_a = py::none(),
"batch_size"_a = "auto",
"result_order"_a = ResultOrder::automatic,
"platform_config"_a = py::dict(),
@@ -278,7 +276,6 @@
[](SOMAArray& reader,
std::optional<std::vector<std::string>> column_names_in,
py::object py_query_condition,
py::object py_schema,
std::string_view batch_size,
ResultOrder result_order) {
// Handle optional args
@@ -298,7 +295,7 @@
// Column names will be updated with columns present in
// the query condition
auto new_column_names =
init_pyqc(py_schema, column_names)
init_pyqc(reader.uri(), column_names)
.cast<std::vector<std::string>>();

// Update the column_names list if it was not empty,
@@ -331,7 +328,6 @@
py::kw_only(),
"column_names"_a = py::none(),
"query_condition"_a = py::none(),
"schema"_a = py::none(),
"batch_size"_a = "auto",
"result_order"_a = ResultOrder::automatic)

Binary file added apis/python/testdata/categorical_int_nan.h5ad
Binary file not shown.
32 changes: 32 additions & 0 deletions apis/python/tests/test_basic_anndata_io.py
@@ -1,3 +1,4 @@
import math
import pathlib
import tempfile
from pathlib import Path
@@ -38,6 +39,24 @@ def h5ad_file_uns_string_array(request):
return input_path


@pytest.fixture
def h5ad_file_categorical_int_nan(request):
# This has obs["categ_int_nan"] as a categorical int but with math.nan as a
# "not-in-the-category" indicator. Such H5AD files do arise in the wild.
#
# Reference:
# import anndata as ad
# import pandas as pd
# import math
# adata = ad.read_h5ad("whatever.h5ad")
# s = pd.Series(list(range(80)), dtype="category")
# s[0] = math.nan
# adata.obs["categ_int_nan"] = s
# adata.write_h5ad("categorical_int_nan.h5ad")
input_path = HERE.parent / "testdata/categorical_int_nan.h5ad"
return input_path


@pytest.fixture
def adata(h5ad_file):
return anndata.read_h5ad(h5ad_file)
@@ -476,3 +495,16 @@ def test_null_obs(adata, tmp_path: Path):
# of the Pandas data frame
for k in adata.obs:
assert obs.attr(k).isnullable == adata.obs[k].isnull().any()


# There exist in the wild AnnData files with categorical-int columns where "not in the category"
# is indicated by a floating-point math.nan in the cell. Here we test that we can ingest such
# files.
def test_obs_with_categorical_int_nan_enumeration(
tmp_path, h5ad_file_categorical_int_nan
):
output_path = tmp_path.as_uri()

tiledbsoma.io.from_h5ad(
output_path, h5ad_file_categorical_int_nan, measurement_name="RNA"
)
2 changes: 1 addition & 1 deletion apis/python/tests/test_type_system.py
@@ -31,7 +31,7 @@
(pa.binary(), pa.large_binary()),
(pa.large_string(),) * 2,
(pa.large_binary(),) * 2,
(pa.dictionary(pa.int32(), pa.string()), pa.int32())
(pa.dictionary(pa.int32(), pa.string()), pa.int32()),
]

