diff --git a/HISTORY.md b/HISTORY.md index 908b2392ab..9fb7ad091a 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,3 +1,18 @@ +# Release 0.23.0 + +* TileDB-Py 0.23.0 includes TileDB Embedded [2.17.0](https://github.com/TileDB-Inc/TileDB/releases/tag/2.17.0) + +## Improvements + +* Support for "enumerated datatypes" (aka categoricals or factors). [#1790](https://github.com/TileDB-Inc/TileDB-Py/pull/1790) +* Introduce `Array.read_subarray` and `Array.write_subarray` APIs. [#1824](https://github.com/TileDB-Inc/TileDB-Py/pull/1824) +* Avoid importing Pandas until we actually use it. [#1825](https://github.com/TileDB-Inc/TileDB-Py/pull/1825) +* Make VFS accept path-like objects to refer to files. [#1818](https://github.com/TileDB-Inc/TileDB-Py/pull/1818) + +## Bug Fixes + +* Use object equality check in buffer conversion, fixes state serialization bug in distributed use-case. [#1822](https://github.com/TileDB-Inc/TileDB-Py/pull/1822) + # Release 0.22.3 * TileDB-Py 0.22.3 includes TileDB Embedded [2.16.3](https://github.com/TileDB-Inc/TileDB/releases/tag/2.16.3) diff --git a/misc/azure-release.yml b/misc/azure-release.yml index 662cd86b4b..36af00d2bb 100644 --- a/misc/azure-release.yml +++ b/misc/azure-release.yml @@ -6,9 +6,10 @@ stages: LIBTILEDB_VERSION: dev LIBTILEDB_SHA: dev ${{ else }}: - TILEDBPY_VERSION: 0.22.3 - LIBTILEDB_VERSION: 2.16.3 - LIBTILEDB_SHA: 194b5ae2941d7b6631fba367a7afdd79350332e7 + TILEDBPY_VERSION: 0.23.0 + # NOTE: *must* update both LIBTILEDB_VERSION and LIBTILEDB_SHA + LIBTILEDB_VERSION: 2.17.0 + LIBTILEDB_SHA: 93c173dbe46278c76db49b8ae26a4d5d2384ecb0 LIBTILEDB_REPO: https://github.com/TileDB-Inc/TileDB TILEDB_SRC: "$(Build.Repository.Localpath)/tiledb_src" TILEDB_BUILD: "$(Build.Repository.Localpath)/tiledb_build" diff --git a/setup.py b/setup.py index 649fb083e7..e805eab602 100644 --- a/setup.py +++ b/setup.py @@ -24,7 +24,7 @@ # - this is for builds-from-source # - release builds are controlled by `misc/azure-release.yml` # - this should be set 
to the current core release, not `dev` -TILEDB_VERSION = "2.16.3" +TILEDB_VERSION = "2.17.0" # allow overriding w/ environment variable TILEDB_VERSION = os.environ.get("TILEDB_VERSION") or TILEDB_VERSION diff --git a/tiledb/__init__.py b/tiledb/__init__.py index 0648e15f8f..d53de407da 100644 --- a/tiledb/__init__.py +++ b/tiledb/__init__.py @@ -45,6 +45,7 @@ from .dimension_label import DimLabel from .dimension_label_schema import DimLabelSchema from .domain import Domain +from .enumeration import Enumeration from .filestore import Filestore from .filter import ( BitShuffleFilter, diff --git a/tiledb/array_schema.py b/tiledb/array_schema.py index 05ffb7c205..28c2bfae05 100644 --- a/tiledb/array_schema.py +++ b/tiledb/array_schema.py @@ -51,9 +51,15 @@ def __init__( allows_duplicates: bool = False, sparse: bool = False, dim_labels={}, + enums=None, ctx: Ctx = None, ): super().__init__(ctx, lt.ArrayType.SPARSE if sparse else lt.ArrayType.DENSE) + + if enums is not None: + for enum_name in enums: + self._add_enumeration(self._ctx, enum_name) + if attrs is not None: for att in attrs: if not isinstance(att, Attr): diff --git a/tiledb/attribute.py b/tiledb/attribute.py index 46be0a369a..a3f770a99c 100644 --- a/tiledb/attribute.py +++ b/tiledb/attribute.py @@ -24,6 +24,7 @@ def __init__( var: bool = None, nullable: bool = False, filters: Union[FilterList, Sequence[Filter]] = None, + enum_label: str = None, ctx: Optional[Ctx] = None, ): """Class representing a TileDB array attribute. 
@@ -88,6 +89,9 @@ def __init__( if nullable is not None: self._nullable = nullable + if enum_label is not None: + self._set_enumeration_name(self._ctx, enum_label) + def __eq__(self, other): if not isinstance(other, Attr): return False @@ -202,6 +206,10 @@ def isascii(self) -> bool: """ return self._tiledb_dtype == lt.DataType.STRING_ASCII + @property + def enum_label(self): + return self._get_enumeration_name(self._ctx) + def __repr__(self): filters_str = "" if self.filters: @@ -217,11 +225,16 @@ def __repr__(self): else: attr_dtype = self.dtype + if self.enum_label is None: + enum_label = None + else: + enum_label = f"'{self.enum_label!s}'" + # filters_str must be last with no spaces return ( f"""Attr(name={repr(self.name)}, dtype='{attr_dtype!s}', """ - f"""var={self.isvar!s}, nullable={self.isnullable!s}""" - f"""{filters_str})""" + f"""var={self.isvar!s}, nullable={self.isnullable!s}, """ + f"""enum_label={enum_label}{filters_str})""" ) def _repr_html_(self): diff --git a/tiledb/cc/array.cc b/tiledb/cc/array.cc index bbd6200b4e..852c6c8f95 100644 --- a/tiledb/cc/array.cc +++ b/tiledb/cc/array.cc @@ -34,6 +34,7 @@ void init_array(py::module &m) { .def("uri", &Array::uri) .def("schema", &Array::schema) //.def("ptr", [](Array& arr){ return py::capsule(arr.ptr()); } ) + .def("open", (void (Array::*)(tiledb_query_type_t)) & Array::open) // open with encryption key .def("open", (void (Array::*)(tiledb_query_type_t, tiledb_encryption_type_t, diff --git a/tiledb/cc/attribute.cc b/tiledb/cc/attribute.cc index 3865f35b4e..e1992c5339 100644 --- a/tiledb/cc/attribute.cc +++ b/tiledb/cc/attribute.cc @@ -1,4 +1,5 @@ #include +#include #include #include @@ -39,6 +40,16 @@ py::array get_fill_value(Attribute &attr) { return py::array(value_type, value_num, value); } +void set_enumeration_name(Attribute &attr, const Context &ctx, + const std::string &enumeration_name) { + AttributeExperimental::set_enumeration_name(ctx, attr, enumeration_name); +} + +std::optional 
get_enumeration_name(Attribute &attr, + const Context &ctx) { + return AttributeExperimental::get_enumeration_name(ctx, attr); +} + void init_attribute(py::module &m) { py::class_(m, "Attribute") .def(py::init()) @@ -73,8 +84,11 @@ void init_attribute(py::module &m) { .def_property("_fill", get_fill_value, set_fill_value) + .def("_get_enumeration_name", get_enumeration_name) + + .def("_set_enumeration_name", set_enumeration_name) + .def("_dump", [](Attribute &attr) { attr.dump(); }); - ; } } // namespace libtiledbcpp \ No newline at end of file diff --git a/tiledb/cc/enumeration.cc b/tiledb/cc/enumeration.cc new file mode 100644 index 0000000000..1762a98780 --- /dev/null +++ b/tiledb/cc/enumeration.cc @@ -0,0 +1,77 @@ +#include +#include + +#include +#include +#include +#include + +#include "common.h" + +namespace libtiledbcpp { + +using namespace tiledb; +using namespace tiledbpy::common; +namespace py = pybind11; + +void init_enumeration(py::module &m) { + py::class_(m, "Enumeration") + .def(py::init()) + + .def(py::init([](const Context &ctx, const std::string &name, + std::vector &values, bool ordered, + tiledb_datatype_t type) { + return Enumeration::create(ctx, name, values, ordered, type); + })) + + .def(py::init([](const Context &ctx, const std::string &name, + bool ordered, py::array data, py::array offsets) { + tiledb_datatype_t data_type; + try { + data_type = np_to_tdb_dtype(data.dtype()); + } catch (const TileDBPyError &e) { + throw py::type_error(e.what()); + } + + py::buffer_info data_buffer = data.request(); + if (data_buffer.ndim != 1) + throw py::type_error("Only 1D Numpy arrays can be stored as " + "enumeration values"); + + py::size_t cell_val_num = + offsets.size() == 0 ? get_ncells(data.dtype()) : TILEDB_VAR_NUM; + + return Enumeration::create( + ctx, name, data_type, cell_val_num, ordered, data.data(), + data.nbytes(), offsets.size() == 0 ? 
nullptr : offsets.data(), + offsets.nbytes()); + })) + + .def(py::init(), py::keep_alive<1, 2>()) + + .def("__capsule__", + [](Enumeration &enmr) { + return py::capsule(enmr.ptr().get(), "enmr", nullptr); + }) + + .def_property_readonly("name", &Enumeration::name) + + .def_property_readonly("type", &Enumeration::type) + + .def_property_readonly("cell_val_num", &Enumeration::cell_val_num) + + .def_property_readonly("ordered", &Enumeration::ordered) + + .def("values", + [](Enumeration &enmr) { + auto data = enmr.as_vector(); + auto dtype = tdb_to_np_dtype(enmr.type(), enmr.cell_val_num()); + return py::array(dtype, data.size() / dtype.itemsize(), + data.data()); + }) + + .def("str_values", + [](Enumeration &enmr) { return enmr.as_vector(); }); +} + +} // namespace libtiledbcpp \ No newline at end of file diff --git a/tiledb/cc/schema.cc b/tiledb/cc/schema.cc index bdcce33eaa..56382dd86e 100644 --- a/tiledb/cc/schema.cc +++ b/tiledb/cc/schema.cc @@ -278,16 +278,23 @@ void init_schema(py::module &m) { .def("_has_attribute", &ArraySchema::has_attribute) #if TILEDB_VERSION_MAJOR == 2 && TILEDB_VERSION_MINOR >= 15 - .def("_has_dim_label", [](const ArraySchema &schema, const Context &ctx, - const std::string &name) { - return ArraySchemaExperimental::has_dimension_label(ctx, schema, name); - }); + .def("_has_dim_label", + [](const ArraySchema &schema, const Context &ctx, + const std::string &name) { + return ArraySchemaExperimental::has_dimension_label(ctx, schema, + name); + }) #else .def("_has_dim_label", [](const ArraySchema &, const Context &, const std::string &) { return false; - }); + }) #endif + + .def("_add_enumeration", [](const ArraySchema &schema, const Context &ctx, + const Enumeration &enmr) { + ArraySchemaExperimental::add_enumeration(ctx, schema, enmr); + }); } } // namespace libtiledbcpp diff --git a/tiledb/cc/tiledbcpp.cc b/tiledb/cc/tiledbcpp.cc index 374b59b93e..a308f04142 100644 --- a/tiledb/cc/tiledbcpp.cc +++ b/tiledb/cc/tiledbcpp.cc @@ -19,6 +19,7 
@@ void init_attribute(py::module &); void init_context(py::module &); void init_config(py::module &); void init_enums(py::module &); +void init_enumeration(py::module &); void init_dimension_label(py::module &m); void init_domain(py::module &m); void init_file_handle(py::module &); @@ -40,6 +41,7 @@ PYBIND11_MODULE(cc, m) { init_dimension_label(m); init_domain(m); init_enums(m); + init_enumeration(m); init_file_handle(m); init_filestore(m); init_filter(m); diff --git a/tiledb/core.cc b/tiledb/core.cc index 5047dbe1b1..c6b8625e7a 100644 --- a/tiledb/core.cc +++ b/tiledb/core.cc @@ -316,7 +316,7 @@ class PyQuery { // label buffer list std::unordered_map label_input_buffer_data_; - py::object pyschema_; + std::string uri_; public: tiledb_ctx_t *c_ctx_; @@ -349,7 +349,7 @@ class PyQuery { domain_ = std::shared_ptr(new Domain(array_schema_->domain())); - pyschema_ = array.attr("schema"); + uri_ = array.attr("uri").cast(); bool issparse = array_->schema().array_type() == TILEDB_SPARSE; @@ -450,7 +450,7 @@ class PyQuery { py::object init_pyqc = cond.attr("init_query_condition"); try { - init_pyqc(pyschema_, attrs_); + init_pyqc(uri_, attrs_); } catch (tiledb::TileDBError &e) { TPY_ERROR_LOC(e.what()); } catch (py::error_already_set &e) { diff --git a/tiledb/dataframe_.py b/tiledb/dataframe_.py index e6a5065881..23188cb8c7 100644 --- a/tiledb/dataframe_.py +++ b/tiledb/dataframe_.py @@ -77,7 +77,6 @@ def parse_tiledb_kwargs(kwargs): @dataclass(frozen=True) class ColumnInfo: - dtype: np.dtype repr: Optional[str] = None nullable: bool = False diff --git a/tiledb/enumeration.py b/tiledb/enumeration.py new file mode 100644 index 0000000000..d71343d7cf --- /dev/null +++ b/tiledb/enumeration.py @@ -0,0 +1,128 @@ +import io +from typing import Any, Optional, Sequence + +import numpy as np +from numpy.typing import NDArray + +import tiledb.cc as lt + +from .ctx import Ctx, CtxMixin +from .datatypes import DataType + + +class Enumeration(CtxMixin, lt.Enumeration): + """ + 
Represents a TileDB Enumeration. + """ + + def __init__( + self, name: str, ordered: bool, values: Sequence[Any], ctx: Optional[Ctx] = None + ): + """Class representing the TileDB Enumeration. + + :param name: The name of the to-be created Enumeration + :type name: str + :param ordered: Whether or not to consider this enumeration ordered + :type ordered: bool + :param values: A Numpy array of values for this enumeration + :type values: np.array + :param ctx: A TileDB context + :type ctx: tiledb.Ctx + """ + values = np.array(values) + if np.dtype(values.dtype).kind in "US": + dtype = ( + lt.DataType.STRING_UTF8 + if values.dtype.kind == "U" + else lt.DataType.STRING_ASCII + ) + super().__init__(ctx, name, values, ordered, dtype) + else: + super().__init__(ctx, name, ordered, values, np.array([])) + + @property + def name(self) -> str: + """The enumeration label string. + + :rtype: str + """ + return super().name + + @property + def dtype(self) -> np.dtype: + """Numpy dtype representation of the enumeration type. + + :rtype: numpy.dtype + """ + return DataType.from_tiledb(super().type).np_dtype + + @property + def cell_val_num(self) -> int: + """The enumeration's cell value number. + + :rtype: int + """ + return super().cell_val_num + + @property + def ordered(self) -> bool: + """True if the enumeration is ordered. + + :rtype: bool + """ + return super().ordered + + def values(self) -> NDArray: + """The values of the enumeration. 
+ + :rtype: NDArray + """ + if self.dtype.kind == "U": + return np.array(super().str_values()) + elif self.dtype.kind == "S": + return np.array(super().str_values(), dtype=np.bytes_) + else: + return super().values() + + def __eq__(self, other): + if not isinstance(other, Enumeration): + return False + # Equal only when every identifying field matches. + return all( + [ + self.name == other.name, + self.dtype == other.dtype, + self.cell_val_num == other.cell_val_num, + self.ordered == other.ordered, + np.array_equal(self.values(), other.values()), + ] + ) + + def __repr__(self): + return f"Enumeration(name='{self.name}', cell_val_num={self.cell_val_num}, ordered={self.ordered}, values={list(self.values())})" + + def _repr_html_(self): + output = io.StringIO() + + output.write("") + output.write("") + output.write("") + output.write("") + output.write("") + output.write("") + output.write(f"{self._repr_html_row_only_()}") + output.write("
NameData TypeOrdered
") + + return output.getvalue() + + def _repr_html_row_only_(self): + output = io.StringIO() + + output.write("") + output.write(f"{self.name}") + output.write(f"{self.dtype}") + output.write(f"{self.cell_val_num}") + output.write(f"{self.ordered}") + output.write("") + + return output.getvalue() diff --git a/tiledb/group.py b/tiledb/group.py index 06ab048248..4503fd75c5 100644 --- a/tiledb/group.py +++ b/tiledb/group.py @@ -218,7 +218,6 @@ def _iter(self, keys_only: bool = True, dump: bool = False): f"- Type: {val_dtype}\n" ) else: - yield key, val def __iter__(self): diff --git a/tiledb/libtiledb.pxd b/tiledb/libtiledb.pxd index 04694e7de4..c2ed474b48 100644 --- a/tiledb/libtiledb.pxd +++ b/tiledb/libtiledb.pxd @@ -133,6 +133,8 @@ cdef extern from "tiledb/tiledb.h": pass ctypedef struct tiledb_config_iter_t: pass + ctypedef struct tiledb_enumeration_t: + pass ctypedef struct tiledb_error_t: pass ctypedef struct tiledb_array_t: @@ -947,6 +949,12 @@ cdef extern from "tiledb/tiledb.h": void* end, int32_t* is_empty) + int32_t tiledb_array_get_enumeration( + tiledb_ctx_t* ctx, + const tiledb_array_t* array, + const char* name, + tiledb_enumeration_t** enumeration) + int tiledb_array_vacuum( tiledb_ctx_t* ctx, const char* array_uri, diff --git a/tiledb/libtiledb.pyx b/tiledb/libtiledb.pyx index b30832261f..f0e0b181ad 100644 --- a/tiledb/libtiledb.pyx +++ b/tiledb/libtiledb.pyx @@ -14,6 +14,7 @@ from json import loads as json_loads from ._generated_version import version_tuple as tiledbpy_version from .array_schema import ArraySchema +from .enumeration import Enumeration from .cc import TileDBError from .ctx import Config, Ctx, default_ctx from .vfs import VFS @@ -1226,6 +1227,24 @@ cdef class Array(object): :raises TypeError: invalid key type""" return self.schema.domain.dim(dim_id) + def enum(self, name): + """ + Return the Enumeration from the attribute name. 
+ + :param name: enumeration name + :type name: str + :rtype: `Enumeration` + """ + cdef tiledb_ctx_t* ctx_ptr = safe_ctx_ptr(self.ctx) + cdef tiledb_array_t* array_ptr = self.ptr + cdef bytes bname = unicode_path(name) + cdef const char* name_ptr = PyBytes_AS_STRING(bname) + cdef tiledb_enumeration_t* enum_ptr = NULL + rc = tiledb_array_get_enumeration(ctx_ptr, array_ptr, name_ptr, &enum_ptr) + if rc != TILEDB_OK: + _raise_ctx_err(ctx_ptr, rc) + return Enumeration.from_capsule(self.ctx, PyCapsule_New(enum_ptr, "enum", NULL)) + def delete_fragments(self, timestamp_start, timestamp_end): """ Delete a range of fragments from timestamp_start to timestamp_end. @@ -1908,6 +1927,12 @@ cdef class DenseArrayImpl(Array): return result[self.view_attr] else: result = self.subarray(selection) + for i in range(self.schema.nattr): + attr = self.schema.attr(i) + enum_label = attr.enum_label + if enum_label is not None: + values = self.enum(enum_label).values() + result[attr.name] = np.array([values[idx] for idx in result[attr.name]]) return result def __repr__(self): @@ -2888,7 +2913,14 @@ cdef class SparseArrayImpl(Array): >>> # A[5.0:579.9] """ - return self.subarray(selection) + result = self.subarray(selection) + for i in range(self.schema.nattr): + attr = self.schema.attr(i) + enum_label = attr.enum_label + if enum_label is not None: + values = self.enum(enum_label).values() + result[attr.name] = np.array([values[idx] for idx in result[attr.name]]) + return result def query(self, attrs=None, cond=None, attr_cond=None, dims=None, index_col=True, coords=None, order='U', use_arrow=None, diff --git a/tiledb/multirange_indexing.py b/tiledb/multirange_indexing.py index 0292df0746..3b20864e10 100644 --- a/tiledb/multirange_indexing.py +++ b/tiledb/multirange_indexing.py @@ -345,7 +345,7 @@ def _run_query(self) -> Dict[str, np.ndarray]: return self._empty_results self.pyquery.submit() - result_dict = _get_pyquery_results(self.pyquery, self.array.schema) + result_dict = 
_get_pyquery_results(self.pyquery, self.array) if self.result_shape is not None: for name, arr in result_dict.items(): # TODO check/test layout @@ -412,6 +412,17 @@ def _run_query(self) -> Union["pandas.DataFrame", "pyarrow.Table"]: tdb_attr = self.array.attr(pa_attr.name) + if tdb_attr.enum_label is not None: + enmr = self.array.enum(tdb_attr.enum_label) + col = pyarrow.DictionaryArray.from_arrays( + indices=table[pa_attr.name].combine_chunks(), + dictionary=enmr.values(), + ) + idx = pa_schema.get_field_index(pa_attr.name) + table = table.set_column(idx, pa_attr.name, col) + pa_schema = table.schema + continue + if np.issubdtype(tdb_attr.dtype, bool): # this is a workaround to cast TILEDB_BOOL types from uint8 # representation in Arrow to Boolean @@ -467,7 +478,7 @@ def _run_query(self) -> Union["pandas.DataFrame", "pyarrow.Table"]: df = table.to_pandas() else: - df = pandas.DataFrame(_get_pyquery_results(self.pyquery, self.array.schema)) + df = pandas.DataFrame(_get_pyquery_results(self.pyquery, self.array)) with timing("pandas_index_update_time"): return _update_df_from_meta(df, self.array.meta, self.query.index_col) @@ -621,9 +632,8 @@ def _iter_dim_names( return (dom.dim(i).name for i in range(dom.ndim)) -def _get_pyquery_results( - pyquery: PyQuery, schema: ArraySchema -) -> Dict[str, np.ndarray]: +def _get_pyquery_results(pyquery: PyQuery, array: Array) -> Dict[str, np.ndarray]: + schema = array.schema result_dict = OrderedDict() for name, item in pyquery.results().items(): if len(item[1]) > 0: @@ -635,6 +645,13 @@ def _get_pyquery_results( if not schema.has_dim_label(name) else schema.dim_label(name).dtype ) + + if schema.has_attr(name): + enum_label = schema.attr(name).enum_label + if enum_label is not None: + values = array.enum(enum_label).values() + arr = np.array([values[idx] for idx in arr]) + result_dict[name if name != "__attr" else ""] = arr return result_dict diff --git a/tiledb/query_condition.cc b/tiledb/query_condition.cc index 
65f2b2b79e..2ae68e04d3 100644 --- a/tiledb/query_condition.cc +++ b/tiledb/query_condition.cc @@ -60,6 +60,11 @@ class PyQueryCondition { py::capsule __capsule__() { return py::capsule(&qc_, "qc", nullptr); } + void set_use_enumeration(bool use_enumeration) { + QueryConditionExperimental::set_use_enumeration(ctx_, *qc_, + use_enumeration); + } + PyQueryCondition combine(PyQueryCondition rhs, tiledb_query_condition_combination_op_t combination_op) const { @@ -150,9 +155,9 @@ void init_query_condition(py::module &m) { tiledb_query_condition_op_t)>( &PyQueryCondition::init)) - .def("combine", &PyQueryCondition::combine) + .def("__capsule__", &PyQueryCondition::__capsule__) - .def("__capsule__", &PyQueryCondition::__capsule__); + .def("combine", &PyQueryCondition::combine); py::enum_(m, "tiledb_query_condition_op_t", py::arithmetic()) diff --git a/tiledb/query_condition.py b/tiledb/query_condition.py index cbcc059415..5f8422dbd2 100644 --- a/tiledb/query_condition.py +++ b/tiledb/query_condition.py @@ -8,7 +8,7 @@ from .cc import TileDBError from .ctx import Ctx, default_ctx -from .libtiledb import ArraySchema +from .libtiledb import Array """ A high level wrapper around the Pybind11 query_condition.cc implementation for @@ -130,8 +130,8 @@ def __post_init__(self): "(Is this an empty expression?)" ) - def init_query_condition(self, schema: ArraySchema, query_attrs: List[str]): - qctree = QueryConditionTree(self.ctx, schema, query_attrs) + def init_query_condition(self, uri: str, query_attrs: List[str]): + qctree = QueryConditionTree(self.ctx, Array.load_typed(uri), query_attrs) self.c_obj = qctree.visit(self.tree.body) if not isinstance(self.c_obj, qc.PyQueryCondition): @@ -144,7 +144,7 @@ def init_query_condition(self, schema: ArraySchema, query_attrs: List[str]): @dataclass class QueryConditionTree(ast.NodeVisitor): ctx: Ctx - schema: ArraySchema + array: Array query_attrs: List[str] def visit_BitOr(self, node): @@ -237,7 +237,15 @@ def aux_visit_Compare( variable = 
self.get_variable_from_node(variable) value = self.get_value_from_node(value) - dt = self.schema.attr_or_dim_dtype(variable) + if self.array.schema.has_attr(variable): + enum_label = self.array.attr(variable).enum_label + if enum_label is not None: + dt = self.array.enum(enum_label).dtype + else: + dt = self.array.attr(variable).dtype + else: + dt = self.array.schema.attr_or_dim_dtype(variable) + dtype = "string" if dt.kind in "SUa" else dt.name value = self.cast_value_to_dtype(value, dtype) @@ -310,17 +318,17 @@ def get_variable_from_node(self, node: QueryConditionNodeElem) -> Any: f"Incorrect type for variable name: {ast.dump(variable_node)}" ) - if self.schema.domain.has_dim(variable) and not self.schema.sparse: + if self.array.schema.domain.has_dim(variable) and not self.array.schema.sparse: raise TileDBError( "Cannot apply query condition to dimensions on dense arrays. " f"{variable} is a dimension." ) if isinstance(node, ast.Call): - if node.func.id == "attr" and not self.schema.has_attr(variable): + if node.func.id == "attr" and not self.array.schema.has_attr(variable): raise TileDBError(f"{node.func.id} is not an attribute.") - if node.func.id == "dim" and not self.schema.domain.has_dim(variable): + if node.func.id == "dim" and not self.array.schema.domain.has_dim(variable): raise TileDBError(f"{node.func.id} is not a dimension.") return variable diff --git a/tiledb/schema_evolution.cc b/tiledb/schema_evolution.cc index 5eb1a1e949..2d9ba38f73 100644 --- a/tiledb/schema_evolution.cc +++ b/tiledb/schema_evolution.cc @@ -66,6 +66,26 @@ void init_schema_evolution(py::module &m) { if (rc != TILEDB_OK) { TPY_ERROR_LOC(get_last_ctx_err_str(inst.ctx_, rc)); } + }) + .def("add_enumeration", + [](ArraySchemaEvolution &inst, py::object enum_py) { + tiledb_enumeration_t *enum_c = + (py::capsule)enum_py.attr("__capsule__")(); + if (enum_c == nullptr) + TPY_ERROR_LOC("Invalid Enumeration!"); + int rc = tiledb_array_schema_evolution_add_enumeration( + inst.ctx_, 
inst.evol_, enum_c); + if (rc != TILEDB_OK) { + TPY_ERROR_LOC(get_last_ctx_err_str(inst.ctx_, rc)); + } + }) + .def("drop_enumeration", + [](ArraySchemaEvolution &inst, const std::string &enumeration_name) { + int rc = tiledb_array_schema_evolution_drop_enumeration( + inst.ctx_, inst.evol_, enumeration_name.c_str()); + if (rc != TILEDB_OK) { + TPY_ERROR_LOC(get_last_ctx_err_str(inst.ctx_, rc)); + } }); } diff --git a/tiledb/tests/cc/test_cc.py b/tiledb/tests/cc/test_cc.py index 20ef3b88ee..4bd6e0813c 100644 --- a/tiledb/tests/cc/test_cc.py +++ b/tiledb/tests/cc/test_cc.py @@ -120,13 +120,19 @@ def test_array(): arr.close() #### - arrw = lt.Array(ctx, uri, lt.QueryType.WRITE) + arr = lt.Array(ctx, uri, lt.QueryType.WRITE) + arr.set_open_timestamp_start(1) + arr.set_open_timestamp_end(1) + arr.close() + arr.open(lt.QueryType.WRITE) data = b"abcdef" - arrw.put_metadata("key", lt.DataType.STRING_ASCII, data) - arrw.close() + arr.put_metadata("key", lt.DataType.STRING_ASCII, data) + arr.close() - arr = lt.Array(ctx, uri, lt.QueryType.READ) + arr.set_open_timestamp_start(1) + arr.set_open_timestamp_end(1) + arr.open(lt.QueryType.READ) assert arr.metadata_num() == 1 assert arr.has_metadata("key") mv = arr.get_metadata("key") @@ -139,11 +145,15 @@ def test_array(): arr.get_metadata_from_index(1) arr.close() - arrw = lt.Array(ctx, uri, lt.QueryType.WRITE) - arrw.delete_metadata("key") - arrw.close() + arr.open(lt.QueryType.WRITE) + arr.set_open_timestamp_start(2) + arr.set_open_timestamp_end(2) + arr.delete_metadata("key") + arr.close() - arr = lt.Array(ctx, uri, lt.QueryType.READ) + arr.set_open_timestamp_start(3) + arr.set_open_timestamp_end(3) + arr.open(lt.QueryType.READ) with pytest.raises(KeyError): arr.get_metadata("key") assert not arr.has_metadata("key")[0] diff --git a/tiledb/tests/datatypes.py b/tiledb/tests/datatypes.py index f34863ed13..a8c077f1b9 100644 --- a/tiledb/tests/datatypes.py +++ b/tiledb/tests/datatypes.py @@ -10,7 +10,6 @@ 
@pd.api.extensions.register_extension_dtype class RaggedDtype(pd.api.extensions.ExtensionDtype): - type = np.ndarray na_value = None diff --git a/tiledb/tests/test_domain_index.py b/tiledb/tests/test_domain_index.py index 15a1f86edf..daa0f47133 100644 --- a/tiledb/tests/test_domain_index.py +++ b/tiledb/tests/test_domain_index.py @@ -1,4 +1,4 @@ -#%% +# %% import numpy as np @@ -61,7 +61,6 @@ def test_fp_domain_indexing(self): A[X, Y, Z] = data with tiledb.SparseArray(array_path) as A: - # check direct slicing assert_array_equal(A.domain_index[X[0], Y[0], Z[0]]["data"], data[0]) diff --git a/tiledb/tests/test_enumeration.py b/tiledb/tests/test_enumeration.py new file mode 100644 index 0000000000..5b7486e026 --- /dev/null +++ b/tiledb/tests/test_enumeration.py @@ -0,0 +1,128 @@ +import numpy as np +import pytest +from numpy.testing import assert_array_equal + +import tiledb + +from .common import DiskTestCase, has_pandas + + +class EnumerationTest(DiskTestCase): + @pytest.mark.parametrize( + "name,data", + ( + ("int", np.array([0])), + ("float", np.array([1.0, 2.2, 5.8234, 94.23])), + ("str", np.array(["abc", "defghi", "jk"])), + ("utf8", np.array(["abc", "defghi", "jk"], dtype=np.str_)), + ("ascii", np.array([b"abc", b"defghi", b"jk"], dtype=np.bytes_)), + ), + ) + @pytest.mark.parametrize("ordered", [True, False]) + def test_enumeration_basic(self, name, ordered, data): + enmr = tiledb.Enumeration(name, ordered, data) + + assert enmr.name == name + assert enmr.ordered == ordered + assert_array_equal(enmr.values(), data) + if name in ("str", "utf8", "ascii"): + assert enmr.cell_val_num == tiledb.cc.TILEDB_VAR_NUM() + assert enmr.dtype.kind == data.dtype.kind + else: + assert enmr.cell_val_num == 1 + assert enmr.dtype.kind == data.dtype.kind + + def test_attribute_enumeration(self): + attr = tiledb.Attr() + attr.enum = "enum" + assert attr.enum == "enum" + + def test_array_schema_enumeration(self): + uri = self.path("test_array_schema_enumeration") + dom = 
tiledb.Domain(tiledb.Dim(domain=(1, 8), tile=1)) + enum1 = tiledb.Enumeration("enmr1", False, np.arange(3) * 10) + enum2 = tiledb.Enumeration("enmr2", False, ["a", "bb", "ccc"]) + attr1 = tiledb.Attr("attr1", dtype=np.int32, enum_label="enmr1") + attr2 = tiledb.Attr("attr2", dtype=np.int32, enum_label="enmr2") + attr3 = tiledb.Attr("attr3", dtype=np.int32) + schema = tiledb.ArraySchema( + domain=dom, attrs=(attr1, attr2, attr3), enums=(enum1, enum2) + ) + tiledb.Array.create(uri, schema) + + data1 = np.random.randint(0, 3, 8) + data2 = np.random.randint(0, 3, 8) + data3 = np.random.randint(0, 3, 8) + + with tiledb.open(uri, "w") as A: + A[:] = {"attr1": data1, "attr2": data2, "attr3": data3} + + with tiledb.open(uri, "r") as A: + assert A.enum("enmr1") == enum1 + assert attr1.enum_label == "enmr1" + assert A.attr("attr1").enum_label == "enmr1" + + assert A.enum("enmr2") == enum2 + assert attr2.enum_label == "enmr2" + assert A.attr("attr2").enum_label == "enmr2" + + with self.assertRaises(tiledb.TileDBError) as excinfo: + assert A.enum("enmr3") == [] + assert " No enumeration named 'enmr3'" in str(excinfo.value) + assert attr3.enum_label is None + assert A.attr("attr3").enum_label is None + + if has_pandas(): + assert_array_equal(A.df[:]["attr1"].cat.codes, data1) + assert_array_equal(A.df[:]["attr2"].cat.codes, data2) + + assert_array_equal(A.df[:]["attr1"], A.multi_index[:]["attr1"]) + assert_array_equal(A.df[:]["attr2"], A.multi_index[:]["attr2"]) + + assert_array_equal(A.df[:]["attr1"], A[:]["attr1"]) + assert_array_equal(A.df[:]["attr2"], A[:]["attr2"]) + + def test_array_schema_enumeration_nullable(self): + uri = self.path("test_array_schema_enumeration") + dom = tiledb.Domain(tiledb.Dim(domain=(1, 8), tile=1)) + enum1 = tiledb.Enumeration("enmr1", False, np.arange(3) * 10) + enum2 = tiledb.Enumeration("enmr2", False, ["a", "bb", "ccc"]) + attr1 = tiledb.Attr("attr1", dtype=np.int32, enum_label="enmr1") + attr2 = tiledb.Attr("attr2", dtype=np.int32, 
enum_label="enmr2") + attr3 = tiledb.Attr("attr3", dtype=np.int32) + schema = tiledb.ArraySchema( + domain=dom, attrs=(attr1, attr2, attr3), enums=(enum1, enum2) + ) + tiledb.Array.create(uri, schema) + + data1 = np.random.randint(0, 3, 8) + data2 = np.random.randint(0, 3, 8) + data3 = np.random.randint(0, 3, 8) + + with tiledb.open(uri, "w") as A: + A[:] = {"attr1": data1, "attr2": data2, "attr3": data3} + + with tiledb.open(uri, "r") as A: + assert A.enum("enmr1") == enum1 + assert attr1.enum_label == "enmr1" + assert A.attr("attr1").enum_label == "enmr1" + + assert A.enum("enmr2") == enum2 + assert attr2.enum_label == "enmr2" + assert A.attr("attr2").enum_label == "enmr2" + + with self.assertRaises(tiledb.TileDBError) as excinfo: + assert A.enum("enmr3") == [] + assert " No enumeration named 'enmr3'" in str(excinfo.value) + assert attr3.enum_label is None + assert A.attr("attr3").enum_label is None + + if has_pandas(): + assert_array_equal(A.df[:]["attr1"].cat.codes, data1) + assert_array_equal(A.df[:]["attr2"].cat.codes, data2) + + assert_array_equal(A.df[:]["attr1"], A.multi_index[:]["attr1"]) + assert_array_equal(A.df[:]["attr2"], A.multi_index[:]["attr2"]) + + assert_array_equal(A.df[:]["attr1"], A[:]["attr1"]) + assert_array_equal(A.df[:]["attr2"], A[:]["attr2"]) diff --git a/tiledb/tests/test_fixes.py b/tiledb/tests/test_fixes.py index 1314c8fd6c..aad8823d3b 100644 --- a/tiledb/tests/test_fixes.py +++ b/tiledb/tests/test_fixes.py @@ -253,7 +253,6 @@ class SOMA919Test(DiskTestCase): """ def run_test(self): - import tempfile import numpy as np diff --git a/tiledb/tests/test_fragments.py b/tiledb/tests/test_fragments.py index 5ffa71941e..a3c3c25035 100644 --- a/tiledb/tests/test_fragments.py +++ b/tiledb/tests/test_fragments.py @@ -546,6 +546,8 @@ def write_fragments(target_path, dshape, num_frags, ts_start=1): with tiledb.open(target_path, "w", timestamp=i) as A: A[[1, 2, 3]] = np.random.rand(dshape[1]) + tiledb.VFS() + src_dshape = (1, 3) src_num_frags = 10 
src_path = self.path("test_copy_fragments_to_existing_array_src") @@ -560,7 +562,7 @@ def write_fragments(target_path, dshape, num_frags, ts_start=1): ts = tuple((t, t) for t in range(1, 21)) - frags = tiledb.FragmentInfoList(dst_path) + frags = tiledb.array_fragments(dst_path) assert len(frags) == 10 assert frags.timestamp_range == ts[10:] diff --git a/tiledb/tests/test_libtiledb.py b/tiledb/tests/test_libtiledb.py index 2ccdde3ad7..6698e8baa6 100644 --- a/tiledb/tests/test_libtiledb.py +++ b/tiledb/tests/test_libtiledb.py @@ -2532,7 +2532,6 @@ def test_dense_datetime_vector(self): # Slice open spans with tiledb.DenseArray(uri, "r", attr="a1") as T: - # Convert datetime interval to integer offset/length into original array read_offset = int( (np.datetime64("2010-01-01") - start) / np.timedelta64(1, "D") diff --git a/tiledb/tests/test_multi_index-hp.py b/tiledb/tests/test_multi_index-hp.py index c7cbb41457..6f27801587 100644 --- a/tiledb/tests/test_multi_index-hp.py +++ b/tiledb/tests/test_multi_index-hp.py @@ -109,7 +109,6 @@ def test_multi_index_two_way_query(self, order, ranges, sparse_array_1d): @given(index_obj) def test_multi_index_inputs(self, sparse_array_1d, ind): - # TODO # currently we don't have a comparison target/mockup to check # as there is no direct numpy equivalent for this indexing mode diff --git a/tiledb/tests/test_multi_index.py b/tiledb/tests/test_multi_index.py index bf3f54b228..8c68f04db5 100644 --- a/tiledb/tests/test_multi_index.py +++ b/tiledb/tests/test_multi_index.py @@ -475,7 +475,6 @@ def test_multirange_2d_sparse_domain_utypes(self): A[coords] = coords with tiledb.open(path) as A: - res = A.multi_index[slice(coords[0], coords[-1])] assert_array_equal(res[attr_name], coords) assert_array_equal(res["__dim_0"].astype(dtype), coords) diff --git a/tiledb/tests/test_query_condition.py b/tiledb/tests/test_query_condition.py index e3d500c317..26a196b30e 100644 --- a/tiledb/tests/test_query_condition.py +++ 
b/tiledb/tests/test_query_condition.py @@ -815,6 +815,36 @@ def test_boolean_dense(self): assert all(self.filter_dense(result["a"], mask)) assert all(self.filter_dense(result["b"], mask)) + def test_qc_enumeration(self): + uri = self.path("test_qc_enumeration") + dom = tiledb.Domain(tiledb.Dim(domain=(1, 8), tile=1)) + enum1 = tiledb.Enumeration("enmr1", True, [0, 1, 2]) + enum2 = tiledb.Enumeration("enmr2", False, ["a", "bb", "ccc"]) + attr1 = tiledb.Attr("attr1", dtype=np.int32, enum_label="enmr1") + attr2 = tiledb.Attr("attr2", dtype=np.int32, enum_label="enmr2") + schema = tiledb.ArraySchema( + domain=dom, attrs=(attr1, attr2), enums=(enum1, enum2) + ) + tiledb.Array.create(uri, schema) + + data1 = np.random.randint(0, 3, 8) + data2 = np.random.randint(0, 3, 8) + + with tiledb.open(uri, "w") as A: + A[:] = {"attr1": data1, "attr2": data2} + + with tiledb.open(uri, "r") as A: + mask = A.attr("attr1").fill + result = A.query(cond="attr1 < 2", attrs=["attr1"])[:] + assert all(self.filter_dense(result["attr1"], mask) < 2) + + mask = A.attr("attr2").fill + result = A.query(cond="attr2 == 'bb'", attrs=["attr2"])[:] + assert all( + self.filter_dense(result["attr2"], mask) + == list(enum2.values()).index("bb") + ) + class QueryDeleteTest(DiskTestCase): def test_basic_sparse(self): diff --git a/tiledb/tests/test_repr.py b/tiledb/tests/test_repr.py index 0a6e068f33..1132c9a6ae 100644 --- a/tiledb/tests/test_repr.py +++ b/tiledb/tests/test_repr.py @@ -18,7 +18,7 @@ def test_attr_repr(self): attr = tiledb.Attr(name="itsanattr", dtype=np.float64) self.assertTrue( re.match( - r"Attr\(name=[u]?'itsanattr', dtype='float64', var=False, nullable=False\)", + r"Attr\(name=[u]?'itsanattr', dtype='float64', var=False, nullable=False, enum_label=None\)", repr(attr), ) )