Merge branch 'viviannguyen/sc-30316/enumerated-data-types-aka-categoricals-aka' into kerl/categ-int-nan-python-test
nguyenv authored Aug 24, 2023
2 parents 22368cf + a84e76c commit 2a8061d
Showing 9 changed files with 55 additions and 64 deletions.
2 changes: 0 additions & 2 deletions apis/python/src/tiledbsoma/_arrow_types.py
@@ -173,8 +173,6 @@ def df_to_arrow(df: pd.DataFrame) -> pa.Table:
     null_fields = set()
     # Not for name, col in df.items() since we need df[k] on the left-hand sides
     for k in df:
-        if df[k].dtype == "category":
-            df[k] = df[k].astype(df[k].cat.categories.dtype)
         if df[k].isnull().any():
             if df[k].isnull().all():
                 df[k] = pa.nulls(df.shape[0], pa.infer_type(df[k]))
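
With the two deleted lines gone, `df_to_arrow` no longer decategorizes columns before handing them to Arrow, so pandas categoricals now arrive as dictionary-encoded arrays. A minimal standalone sketch of the difference (plain pandas/pyarrow, not the TileDB-SOMA code path):

    import pandas as pd
    import pyarrow as pa

    df = pd.DataFrame({"color": pd.Categorical(["red", "blue", "red"])})

    # Old behavior (the deleted lines): cast the column back to its categories'
    # dtype, so Arrow sees a plain string column.
    plain = df["color"].astype(df["color"].cat.categories.dtype)
    assert pa.Table.from_pandas(plain.to_frame()).schema.field("color").type == pa.string()

    # New behavior: the categorical passes through and Arrow dictionary-encodes it.
    assert pa.types.is_dictionary(pa.Table.from_pandas(df).schema.field("color").type)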
19 changes: 12 additions & 7 deletions apis/python/src/tiledbsoma/_query_condition.py
@@ -128,8 +128,8 @@ def __attrs_post_init__(self):
                 "(Is this an empty expression?)"
             )

-    def init_query_condition(self, schema: tiledb.ArraySchema, query_attrs: List[str]):
-        qctree = QueryConditionTree(schema, query_attrs)
+    def init_query_condition(self, uri: str, query_attrs: List[str]):
+        qctree = QueryConditionTree(tiledb.open(uri), query_attrs)
         self.c_obj = qctree.visit(self.tree.body)

         if not isinstance(self.c_obj, clib.PyQueryCondition):
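
A usage sketch of the new entry point, assuming the class is imported from its module path (hypothetical URI): the caller now passes the array's location, and the condition tree opens the array itself so it can inspect attribute enumerations.

    from tiledbsoma._query_condition import QueryCondition

    qc = QueryCondition("percent_mito > 0.02")
    # Opens the array at the URI internally instead of taking a pre-fetched schema.
    qc.init_query_condition("file:///tmp/soma/obs", ["percent_mito"])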
@@ -143,7 +143,7 @@ def init_query_condition(self, schema: tiledb.ArraySchema, query_attrs: List[str]):

 @attrs.define
 class QueryConditionTree(ast.NodeVisitor):
-    schema: tiledb.ArraySchema
+    array: tiledb.Array
     query_attrs: List[str]

     def visit_BitOr(self, node):
@@ -237,8 +237,13 @@ def aux_visit_Compare(

         att = self.get_att_from_node(att)
         val = self.get_val_from_node(val)

-        dt = self.schema.attr(att).dtype
-
+        enum_label = self.array.attr(att).enum_label
+        if enum_label is not None:
+            dt = self.array.enum(enum_label).dtype
+        else:
+            dt = self.array.attr(att).dtype
+
         dtype = "string" if dt.kind in "SUa" else dt.name
         val = self.cast_val_to_dtype(val, dtype)
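
This branch is the core of the enumeration support: for an enumerated (categorical) attribute, a comparison such as `tissue == 'lung'` must be typed against the enumeration's value dtype (e.g. string labels), not the attribute's underlying integer code dtype. A sketch, assuming a tiledb-py build with enumeration support and a hypothetical array whose `tissue` attribute is enumerated:

    import tiledb

    with tiledb.open("file:///tmp/soma/obs") as A:  # hypothetical URI
        attr = A.attr("tissue")  # hypothetical enumerated attribute
        if attr.enum_label is not None:
            dt = A.enum(attr.enum_label).dtype  # label dtype, e.g. str
        else:
            dt = attr.dtype  # plain attribute dtype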

@@ -318,8 +323,8 @@ def get_att_from_node(self, node: QueryConditionNodeElem) -> Any:
                 f"Incorrect type for attribute name: {ast.dump(node)}"
             )

-        if not self.schema.has_attr(att):
-            if self.schema.domain.has_dim(att):
+        if not self.array.schema.has_attr(att):
+            if self.array.schema.domain.has_dim(att):
                 raise tiledb.TileDBError(
                     f"`{att}` is a dimension. QueryConditions currently only "
                     "work on attributes."
4 changes: 2 additions & 2 deletions apis/python/src/tiledbsoma/_tiledb_array.py
@@ -101,8 +101,8 @@ def _soma_reader(
         # Leave empty arguments out of kwargs to allow C++ constructor defaults to apply, as
         # they're not all wrapped in std::optional<>.
         kwargs: Dict[str, object] = {}
-        if schema:
-            kwargs["schema"] = schema
+        # if schema:
+        #     kwargs["schema"] = schema
         if column_names:
             kwargs["column_names"] = column_names
         if query_condition:
54 changes: 23 additions & 31 deletions apis/python/src/tiledbsoma/io/conversions.py
@@ -44,44 +44,36 @@ def _to_tiledb_supported_dtype(dtype: _DT) -> _DT:

 def to_tiledb_supported_array_type(name: str, x: _MT) -> _MT:
     """Converts datatypes unrepresentable by TileDB into datatypes it can represent.
-    E.g., categorical strings -> string.
-    See also `https://docs.scipy.org/doc/numpy-1.10.1/reference/arrays.dtypes.html <https://docs.scipy.org/doc/numpy-1.10.1/reference/arrays.dtypes.html>`_.
-    Preferentially converts to the underlying primitive type, as TileDB does not support
-    most complex types. NOTE: this does not support ``datetime64`` conversion.
-    Categoricals are a special case. If the underlying categorical type is a primitive,
-    convert to that. If the array contains NA/NaN (i.e. not in the category, code == -1),
-    raise error unless it is a float or string.
+    E.g., float16 -> float32
     """
     if isinstance(x, (np.ndarray, sp.spmatrix)) or not is_categorical_dtype(x):
         # mypy issues a spurious error here, but only when
         # _to_tiledb_supported_dtype is decorated with @typeguard_ignore???
         target_dtype = _to_tiledb_supported_dtype(x.dtype)  # type: ignore[arg-type]
         return x if target_dtype == x.dtype else x.astype(target_dtype)

-    categories = x.cat.categories
-    cat_dtype = categories.dtype
-    if cat_dtype.kind in ("f", "u", "i"):
-        if x.hasnans and cat_dtype.kind == "i":
-            raise ValueError(
-                f"Categorical column {name!r} contains NaN -- unable to convert to TileDB array."
-            )
-        # More mysterious spurious mypy errors.
-        target_dtype = _to_tiledb_supported_dtype(cat_dtype)  # type: ignore[arg-type]
-    else:
-        # Into the weirdness. See if Pandas can help with edge cases.
-        inferred = infer_dtype(categories)
-        if x.hasnans and inferred in ("boolean", "bytes"):
-            raise ValueError(
-                "Categorical array contains NaN -- unable to convert to TileDB array."
-            )
-        target_dtype = np.dtype(  # type: ignore[assignment]
-            _str_to_type.get(inferred, object)
-        )
-
-    return x.astype(target_dtype)
+    # categories = x.cat.categories
+    # cat_dtype = categories.dtype
+    # if cat_dtype.kind in ("f", "u", "i"):
+    #     if x.hasnans and cat_dtype.kind == "i":
+    #         raise ValueError(
+    #             f"Categorical column {name!r} contains NaN -- unable to convert to TileDB array."
+    #         )
+    #     # More mysterious spurious mypy errors.
+    #     target_dtype = _to_tiledb_supported_dtype(cat_dtype)  # type: ignore[arg-type]
+    # else:
+    #     # Into the weirdness. See if Pandas can help with edge cases.
+    #     inferred = infer_dtype(categories)
+    #     if x.hasnans and inferred in ("boolean", "bytes"):
+    #         raise ValueError(
+    #             "Categorical array contains NaN -- unable to convert to TileDB array."
+    #         )
+    #     target_dtype = np.dtype(  # type: ignore[assignment]
+    #         _str_to_type.get(inferred, object)
+    #     )
+
+    # return x.astype(target_dtype)
+    return x


def csr_from_tiledb_df(df: pd.DataFrame, num_rows: int, num_cols: int) -> sp.csr_matrix:
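
With the categorical special-casing commented out, `to_tiledb_supported_array_type` now returns categorical columns unchanged and only remaps primitive dtypes. A rough sketch of the resulting behavior (hypothetical column names):

    import numpy as np
    import pandas as pd

    from tiledbsoma.io.conversions import to_tiledb_supported_array_type

    # Categoricals pass through untouched; dictionary encoding is now handled
    # downstream (Arrow dictionary arrays / TileDB enumerations).
    cat = pd.Series(pd.Categorical(["a", "b", "a"]))
    assert to_tiledb_supported_array_type("obs_col", cat) is cat

    # Non-categoricals still get the primitive remapping, e.g. float16 -> float32.
    f16 = pd.Series(np.zeros(3, dtype=np.float16))
    assert to_tiledb_supported_array_type("x", f16).dtype == np.float32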
1 change: 1 addition & 0 deletions apis/python/src/tiledbsoma/io/ingest.py
@@ -1025,6 +1025,7 @@ def _update_dataframe(
     for key in common_keys:
         old_type = old_sig[key]
         new_type = new_sig[key]
+
         if old_type != new_type:
             msgs.append(f"{key} type {old_type} != {new_type}")
     if msgs:
12 changes: 8 additions & 4 deletions apis/python/src/tiledbsoma/io/registration/signatures.py
@@ -36,9 +36,13 @@ def _string_dict_from_arrow_schema(schema: pa.Schema) -> Dict[str, str]:
     Converts an Arrow schema to a string/string dict, which is easier on the eyes,
     easier to convert from/to JSON for distributed logging, and easier to do del-key on.
     """
-    retval = {name: _stringify_type(schema.field(name).type) for name in schema.names}
-
+    retval = {}
+    for name in schema.names:
+        arrow_type = schema.field(name).type
+        if pa.types.is_dictionary(arrow_type):
+            arrow_type = arrow_type.index_type
+        retval[name] = _stringify_type(arrow_type)

     # The soma_joinid field is specific to SOMA data but does not exist in AnnData/H5AD. When we
     # pre-check an AnnData/H5AD input to see if it's appendable to an existing SOMA experiment, we
     # must not punish the AnnData/H5AD input for it not having a soma_joinid column in its obs and
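
A small standalone sketch of the normalization above: a dictionary-encoded (categorical) Arrow field is recorded under its index type, so two inputs that differ only in dictionary encoding produce comparable signatures.

    import pyarrow as pa

    schema = pa.schema([("cell_type", pa.dictionary(pa.int32(), pa.string()))])
    arrow_type = schema.field("cell_type").type
    if pa.types.is_dictionary(arrow_type):
        arrow_type = arrow_type.index_type  # e.g. int32
    print(arrow_type)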
@@ -241,7 +245,7 @@ def from_soma_experiment(
                     varm_dtypes[varm_layer_name] = str(
                         varm.schema.field("soma_data").type
                     )

         return cls(
             obs_schema=obs_schema,
             var_schema=var_schema,
8 changes: 2 additions & 6 deletions apis/python/src/tiledbsoma/pytiledbsoma.cc
@@ -200,7 +200,6 @@ PYBIND11_MODULE(pytiledbsoma, m) {
            std::string_view name,
            std::optional<std::vector<std::string>> column_names_in,
            py::object py_query_condition,
-           py::object py_schema,
            std::string_view batch_size,
            ResultOrder result_order,
            std::map<std::string, std::string> platform_config,
@@ -222,7 +221,7 @@ PYBIND11_MODULE(pytiledbsoma, m) {
                // Column names will be updated with columns present
                // in the query condition
                auto new_column_names =
-                   init_pyqc(py_schema, column_names)
+                   init_pyqc(uri, column_names)
                        .cast<std::vector<std::string>>();

                // Update the column_names list if it was not empty,
@@ -267,7 +266,6 @@ PYBIND11_MODULE(pytiledbsoma, m) {
            "name"_a = "unnamed",
            "column_names"_a = py::none(),
            "query_condition"_a = py::none(),
-           "schema"_a = py::none(),
            "batch_size"_a = "auto",
            "result_order"_a = ResultOrder::automatic,
            "platform_config"_a = py::dict(),
@@ -278,7 +276,6 @@ PYBIND11_MODULE(pytiledbsoma, m) {
        [](SOMAArray& reader,
           std::optional<std::vector<std::string>> column_names_in,
           py::object py_query_condition,
-          py::object py_schema,
           std::string_view batch_size,
           ResultOrder result_order) {
            // Handle optional args
@@ -298,7 +295,7 @@ PYBIND11_MODULE(pytiledbsoma, m) {
            // Column names will be updated with columns present in
            // the query condition
            auto new_column_names =
-               init_pyqc(py_schema, column_names)
+               init_pyqc(reader.uri(), column_names)
                    .cast<std::vector<std::string>>();

            // Update the column_names list if it was not empty,
@@ -331,7 +328,6 @@ PYBIND11_MODULE(pytiledbsoma, m) {
        py::kw_only(),
        "column_names"_a = py::none(),
        "query_condition"_a = py::none(),
-       "schema"_a = py::none(),
        "batch_size"_a = "auto",
        "result_order"_a = ResultOrder::automatic)
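
On the Python side, the binding change simply removes `schema` from `SOMAArray`'s keyword arguments; the reader now resolves the array (and any enumerations) from its URI. A sketch mirroring the tests below, assuming these import paths and a hypothetical URI:

    import tiledbsoma.pytiledbsoma as clib
    from tiledbsoma._query_condition import QueryCondition

    qc = QueryCondition("percent_mito > 0.02")
    sr = clib.SOMAArray("file:///tmp/soma/obs", query_condition=qc)  # no schema kwarg
    sr.submit()
    table = sr.read_next()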
1 change: 1 addition & 0 deletions apis/python/tests/test_basic_anndata_io.py
@@ -1,3 +1,4 @@
+import math
 import pathlib
 import tempfile
 from pathlib import Path
18 changes: 6 additions & 12 deletions libtiledbsoma/test/test_query_condition.py
@@ -29,9 +29,7 @@ def pandas_query(uri, condition):

 def soma_query(uri, condition):
     qc = QueryCondition(condition)
-    schema = tiledb.open(uri).schema
-
-    sr = clib.SOMAArray(uri, query_condition=qc, schema=schema)
+    sr = clib.SOMAArray(uri, query_condition=qc)
     sr.submit()
     arrow_table = sr.read_next()
     assert sr.results_complete()
@@ -110,10 +108,9 @@ def test_query_condition_select_columns():
     condition = "percent_mito > 0.02"

     qc = QueryCondition(condition)
-    schema = tiledb.open(uri).schema

     sr = clib.SOMAArray(
-        uri, query_condition=qc, schema=schema, column_names=["n_genes"]
+        uri, query_condition=qc, column_names=["n_genes"]
     )
     sr.submit()
     arrow_table = sr.read_next()
@@ -128,9 +125,8 @@ def test_query_condition_all_columns():
     condition = "percent_mito > 0.02"

     qc = QueryCondition(condition)
-    schema = tiledb.open(uri).schema

-    sr = clib.SOMAArray(uri, query_condition=qc, schema=schema)
+    sr = clib.SOMAArray(uri, query_condition=qc)
     sr.submit()
     arrow_table = sr.read_next()
@@ -144,9 +140,8 @@ def test_query_condition_reset():
     condition = "percent_mito > 0.02"

     qc = QueryCondition(condition)
-    schema = tiledb.open(uri).schema

-    sr = clib.SOMAArray(uri, query_condition=qc, schema=schema)
+    sr = clib.SOMAArray(uri, query_condition=qc)
     sr.submit()
     arrow_table = sr.read_next()
@@ -158,7 +153,7 @@ def test_query_condition_reset():
     # ---------------------------------------------------------------
     condition = "percent_mito < 0.02"
     qc = QueryCondition(condition)
-    sr.reset(column_names=["percent_mito"], query_condition=qc, schema=schema)
+    sr.reset(column_names=["percent_mito"], query_condition=qc)

     sr.submit()
     arrow_table = sr.read_next()
@@ -221,14 +216,13 @@ def test_parsing_error_conditions(malformed_condition):
 def test_eval_error_conditions(malformed_condition):
     """Conditions which should not evaluate (but WILL parse)"""
     uri = os.path.join(SOMA_URI, "obs")
-    schema = tiledb.open(uri).schema

     # TODO: these raise the wrong error - it should be SOMAError. Change the test
     # when https://github.com/single-cell-data/TileDB-SOMA/issues/783 is fixed
     #
     with pytest.raises(RuntimeError):
         qc = QueryCondition(malformed_condition)
-        sr = clib.SOMAArray(uri, query_condition=qc, schema=schema)
+        sr = clib.SOMAArray(uri, query_condition=qc)
         sr.submit()
         sr.read_next()
