Refactor QueryCondition
* Use a PyArrow Schema instead of a TileDB ArraySchema
* Remove the TileDB-Py dependency
* No longer require an attr-to-enum mapping to be passed for dictionary
  columns, since the value type can now be read directly from the PyArrow
  Schema (see the sketch below)
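
A minimal sketch of the idea behind the last point, using a hypothetical dictionary-encoded column named "cell_type" (the schema is illustrative, not from the commit): a PyArrow Schema already carries the dictionary's value type on the field, so no separate attr-to-enum mapping is needed.

import pyarrow as pa

# Hypothetical schema: "cell_type" is dictionary-encoded (categorical).
schema = pa.schema(
    [
        pa.field("soma_joinid", pa.int64()),
        pa.field("cell_type", pa.dictionary(pa.int32(), pa.large_string())),
    ]
)

dt = schema.field("cell_type").type
if pa.types.is_dictionary(dt):
    # The value type is available directly on the field; previously this
    # required an attr-to-enum mapping built from the TileDB ArraySchema.
    dt = dt.value_type

print(dt)  # large_string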
nguyenv committed Oct 18, 2023
1 parent 06aca87 commit c0d02b0
Showing 4 changed files with 89 additions and 169 deletions.
44 changes: 18 additions & 26 deletions apis/python/src/tiledbsoma/_query_condition.py
@@ -12,7 +12,6 @@
import attrs
import numpy as np
import pyarrow as pa
# import tiledb

from . import pytiledbsoma as clib
from ._exception import SOMAError
@@ -132,12 +131,9 @@ def __attrs_post_init__(self):
def init_query_condition(
self,
schema: pa.Schema,
enum_to_dtype: dict,
query_attrs: Optional[List[str]],
):
print(schema)

qctree = QueryConditionTree(schema, enum_to_dtype, query_attrs)
qctree = QueryConditionTree(schema, query_attrs)
self.c_obj = qctree.visit(self.tree.body)

if not isinstance(self.c_obj, clib.PyQueryCondition):
@@ -152,7 +148,6 @@ def init_query_condition(
@attrs.define
class QueryConditionTree(ast.NodeVisitor):
schema: pa.Schema
enum_to_dtype: dict
query_attrs: List[str]

def visit_BitOr(self, node):
@@ -228,30 +223,24 @@ def visit_Compare(self, node: ast.Compare) -> clib.PyQueryCondition:

variable = node.left.id
values = [self.get_val_from_node(val) for val in self.visit(rhs)]

# if self.schema.has_attr(variable):
# enum_label = self.schema.attr(variable).enum_label
# if enum_label is not None:
# dt = self.enum_to_dtype[enum_label]
# else:
# dt = self.schema.attr(variable).dtype
# else:
# dt = self.schema.attr_or_dim_dtype(variable)

dt = self.schema.field(variable).type
if pa.types.is_dictionary(dt):
dt = dt.value_type

if pa.types.is_string(dt) or pa.types.is_large_string(dt) or pa.types.is_binary(dt) or pa.types.is_large_binary(dt):
dtype = "string"
else:
dtype = np.dtype(dt.to_pandas_dtype()).name

# sdf.read(column_names=["foo"], value_filter='bar == 999') should
# result in bar being added to the column names. See also
# https://github.com/single-cell-data/TileDB-SOMA/issues/755
att = self.get_att_from_node(node.left)
if att not in self.query_attrs:
self.query_attrs.append(att)

if pa.types.is_string(dt) or pa.types.is_large_string(dt) or pa.types.is_binary(dt) or pa.types.is_large_binary(dt):
dtype = "string"
else:
dtype = dt


# dtype = "string" if dt.kind in "SUa" else dt.name
op = clib.TILEDB_IN if isinstance(operator, ast.In) else clib.TILEDB_NOT_IN
result = self.create_pyqc(dtype)(node.left.id, values, op)

@@ -267,12 +256,15 @@ def aux_visit_Compare(

att = self.get_att_from_node(att)
val = self.get_val_from_node(val)
enum_label = self.schema.attr(att).enum_label
if enum_label is not None:
dt = self.enum_to_dtype[enum_label]

dt = self.schema.field(att).type
if pa.types.is_dictionary(dt):
dt = dt.value_type

if pa.types.is_string(dt) or pa.types.is_large_string(dt) or pa.types.is_binary(dt) or pa.types.is_large_binary(dt):
dtype = "string"
else:
dt = self.schema.attr(att).dtype
dtype = "string" if dt.kind in "SUa" else dt.name
dtype = np.dtype(dt.to_pandas_dtype()).name
val = self.cast_val_to_dtype(val, dtype)

pyqc = clib.PyQueryCondition()
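
For reference, a condensed, standalone sketch of the dtype resolution the refactored Python code performs (column names here are illustrative): dictionary fields are unwrapped to their value type, string-like Arrow types map to the literal "string", and everything else is converted to a NumPy dtype name via to_pandas_dtype().

import numpy as np
import pyarrow as pa


def resolve_dtype(schema: pa.Schema, name: str) -> str:
    # Condensed version of the type handling shown in the hunks above.
    dt = schema.field(name).type
    if pa.types.is_dictionary(dt):
        dt = dt.value_type
    if (
        pa.types.is_string(dt)
        or pa.types.is_large_string(dt)
        or pa.types.is_binary(dt)
        or pa.types.is_large_binary(dt)
    ):
        return "string"
    return np.dtype(dt.to_pandas_dtype()).name


schema = pa.schema([("n_genes", pa.int32()), ("obs_id", pa.large_string())])
print(resolve_dtype(schema, "n_genes"))  # int32
print(resolve_dtype(schema, "obs_id"))   # string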
175 changes: 46 additions & 129 deletions apis/python/src/tiledbsoma/soma_array.cc
@@ -86,54 +86,13 @@ bool get_enum_is_ordered(SOMAArray& sr, std::string attr_name){
return attr_to_enmrs.at(attr_name).ordered();
}

/**
* @brief pybind11 bindings
*
*/
void load_soma_array(py::module &m) {
m.doc() = "SOMA acceleration library";

m.def("version", []() { return tiledbsoma::version::as_string(); });

m.def(
"config_logging",
[](const std::string& level, const std::string& logfile) {
LOG_CONFIG(level, logfile);
},
"level"_a,
"logfile"_a = "");

m.def("info", &LOG_INFO, "message"_a = "");
m.def("debug", &LOG_DEBUG, "message"_a = "");

m.def(
"tiledbsoma_stats_enable",
[]() { tiledbsoma::stats::enable(); },
"Enable TileDB internal statistics. Lifecycle: experimental.");
m.def(
"tiledbsoma_stats_disable",
[]() { tiledbsoma::stats::disable(); },
"Disable TileDB internal statistics. Lifecycle: experimental.");
m.def(
"tiledbsoma_stats_reset",
[]() { tiledbsoma::stats::reset(); },
"Reset all TileDB internal statistics to 0. Lifecycle: experimental.");
m.def(
"tiledbsoma_stats_dump",
[]() {
py::print(tiledbsoma::version::as_string());
std::string stats = tiledbsoma::stats::dump();
py::print(stats);
},
"Print TileDB internal statistics. Lifecycle: experimental.");

py::class_<SOMAArray>(m, "SOMAArray")
.def(
py::init(
[](std::string_view uri,
std::string_view name,
std::optional<std::vector<std::string>> column_names_in,
py::object py_query_condition,
std::string_view batch_size,
ResultOrder result_order,
std::map<std::string, std::string> platform_config,
@@ -144,41 +103,7 @@ void load_soma_array(py::module &m) {
column_names = *column_names_in;
}

// Handle query condition based on
// TileDB-Py::PyQuery::set_attr_cond()
QueryCondition* qc = nullptr;
if (!py_query_condition.is(py::none())) {
py::object init_pyqc = py_query_condition.attr(
"init_query_condition");

try {
// Column names will be updated with columns present
// in the query condition
auto new_column_names =
init_pyqc(uri, column_names, platform_config, timestamp)
.cast<std::vector<std::string>>();

// Update the column_names list if it was not empty,
// otherwise continue selecting all columns with an
// empty column_names list
if (!column_names.empty()) {
column_names = new_column_names;
}
} catch (const std::exception& e) {
throw TileDBSOMAError(e.what());
}

qc = py_query_condition.attr("c_obj")
.cast<PyQueryCondition>()
.ptr()
.get();
}

// Release python GIL after we're done accessing python
// objects
py::gil_scoped_release release;

auto reader = SOMAArray::open(
return SOMAArray::open(
OpenMode::read,
uri,
name,
@@ -187,29 +112,65 @@ void load_soma_array(py::module &m) {
batch_size,
result_order,
timestamp);

// Set query condition if present
if (qc) {
reader->set_condition(*qc);
}

return reader;
}),
"uri"_a,
py::kw_only(),
"name"_a = "unnamed",
"column_names"_a = py::none(),
"query_condition"_a = py::none(),
"batch_size"_a = "auto",
"result_order"_a = ResultOrder::automatic,
"platform_config"_a = py::dict(),
"timestamp"_a = py::none())

.def(
"set_condition",
[](SOMAArray& reader,
py::object py_query_condition,
py::object py_schema){
auto column_names = reader.column_names();
// Handle query condition based on
// TileDB-Py::PyQuery::set_attr_cond()
QueryCondition* qc = nullptr;
if (!py_query_condition.is(py::none())) {
py::object init_pyqc = py_query_condition.attr(
"init_query_condition");
try {
// Column names will be updated with columns present
// in the query condition
auto new_column_names =
init_pyqc(py_schema, column_names)
.cast<std::vector<std::string>>();
// Update the column_names list if it was not empty,
// otherwise continue selecting all columns with an
// empty column_names list
if (!column_names.empty()) {
column_names = new_column_names;
}
} catch (const std::exception& e) {
throw TileDBSOMAError(e.what());
}
qc = py_query_condition.attr("c_obj")
.cast<PyQueryCondition>()
.ptr()
.get();
}
reader.reset(column_names);

// Release python GIL after we're done accessing python
// objects
py::gil_scoped_release release;
// Set query condition if present
if (qc) {
reader.set_condition(*qc);
}
},
"py_query_condition"_a,
"py_schema"_a)

.def(
"reset",
[](SOMAArray& reader,
std::optional<std::vector<std::string>> column_names_in,
py::object py_query_condition,
std::string_view batch_size,
ResultOrder result_order) {
// Handle optional args
@@ -218,55 +179,11 @@ void load_soma_array(py::module &m) {
column_names = *column_names_in;
}

// Handle query condition based on
// TileDB-Py::PyQuery::set_attr_cond()
QueryCondition* qc = nullptr;
if (!py_query_condition.is(py::none())) {
py::object init_pyqc = py_query_condition.attr(
"init_query_condition");

try {
// Convert TileDB::Config to std::unordered map for pybind11 passing
std::unordered_map<std::string, std::string> cfg;
for (const auto& it : reader.ctx()->config()) {
cfg[it.first] = it.second;
}
// Column names will be updated with columns present in
// the query condition
auto new_column_names =
init_pyqc(reader.uri(), column_names, cfg, reader.timestamp())
.cast<std::vector<std::string>>();

// Update the column_names list if it was not empty,
// otherwise continue selecting all columns with an
// empty column_names list
if (!column_names.empty()) {
column_names = new_column_names;
}
} catch (const std::exception& e) {
throw TileDBSOMAError(e.what());
}

qc = py_query_condition.attr("c_obj")
.cast<PyQueryCondition>()
.ptr()
.get();
}

// Release python GIL after we're done accessing python objects
py::gil_scoped_release release;

// Reset state of the existing SOMAArray object
reader.reset(column_names, batch_size, result_order);

// Set query condition if present
if (qc) {
reader.set_condition(*qc);
}
},
py::kw_only(),
"column_names"_a = py::none(),
"query_condition"_a = py::none(),
"batch_size"_a = "auto",
"result_order"_a = ResultOrder::automatic)

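
A rough caller-side sketch of the reworked SOMAArray binding: the query condition is no longer passed to the constructor; it is applied afterwards through set_condition together with a PyArrow schema. The URI, import paths, and the QueryCondition constructor signature below are assumptions for illustration, not taken from this commit.

import pyarrow as pa

from tiledbsoma import pytiledbsoma as clib             # assumed import path
from tiledbsoma._query_condition import QueryCondition  # assumed import path

arrow_schema = pa.schema([("n_genes", pa.int32())])     # illustrative schema
sr = clib.SOMAArray("file:///tmp/example_array")        # hypothetical URI; no query condition at construction

qc = QueryCondition("n_genes > 500")                    # assumed: constructor takes an expression string
# set_condition calls qc.init_query_condition(schema, column_names) internally,
# then applies the resulting PyQueryCondition to the reader.
sr.set_condition(qc, arrow_schema)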
10 changes: 2 additions & 8 deletions apis/python/src/tiledbsoma/soma_dataframe.cc
@@ -62,13 +62,7 @@ void load_soma_dataframe(py::module &m) {
.def("set_condition",
[](SOMADataFrame& reader,
py::object py_query_condition,
py::object pa_schema){
auto attr_to_enum = reader.get_attr_to_enum_mapping();
std::map<std::string, py::dtype> enum_to_dtype;
for(auto const& [attr, enmr] : attr_to_enum){
enum_to_dtype[attr] = tdb_to_np_dtype(
enmr.type(), enmr.cell_val_num());
}
py::object pa_schema){
auto column_names = reader.column_names();
// Handle query condition based on
// TileDB-Py::PyQuery::set_attr_cond()
Expand All @@ -80,7 +74,7 @@ void load_soma_dataframe(py::module &m) {
// Column names will be updated with columns present
// in the query condition
auto new_column_names =
init_pyqc(pa_schema, enum_to_dtype, column_names)
init_pyqc(pa_schema, column_names)
.cast<std::vector<std::string>>();
// Update the column_names list if it was not empty,
// otherwise continue selecting all columns with an
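
One behavior the slimmed-down init_pyqc call keeps (per the issue referenced in the Python hunk above): columns that appear only in the value filter are added to the requested column names so the filter can be evaluated. An illustrative sketch, assuming the import path, the expression-string constructor, that the bound PyQueryCondition can be built standalone, and that init_query_condition returns the updated list (the binding casts its return value to a vector of strings):

import pyarrow as pa

from tiledbsoma._query_condition import QueryCondition  # assumed import path

schema = pa.schema([("foo", pa.int64()), ("bar", pa.int64())])
qc = QueryCondition("bar == 999")  # assumed: constructor takes an expression string

# "foo" is requested explicitly; "bar" is only referenced by the filter.
updated = qc.init_query_condition(schema, ["foo"])
print(updated)  # expected: ['foo', 'bar']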