Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Wrap Enumerated Datatype #1790

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions HISTORY.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,18 @@
# Release 0.23.0

* TileDB-Py 0.23.0 includes TileDB Embedded [2.17.0](https://github.com/TileDB-Inc/TileDB/releases/tag/2.17.0)

## Improvements

* Support for "enumerated datatypes" (aka categoricals or factors). [#1790](https://github.com/TileDB-Inc/TileDB-Py/pull/1790)
* Introduce `Array.read_subarray` and `Array.write_subarray` APIs. [#1824](https://github.com/TileDB-Inc/TileDB-Py/pull/1824)
* Avoid importing Pandas until we actually use it. [#1825](https://github.com/TileDB-Inc/TileDB-Py/pull/1825)
* Make VFS accept path-like objects to refer to files. [#1818](https://github.com/TileDB-Inc/TileDB-Py/pull/1818)

## Bug Fixes

* Use object equality check in buffer conversion, fixes state serialization bug in distributed use-case. [#1822](https://github.com/TileDB-Inc/TileDB-Py/pull/1822)

# Release 0.22.3

* TileDB-Py 0.22.3 includes TileDB Embedded [2.16.3](https://github.com/TileDB-Inc/TileDB/releases/tag/2.16.3)
Expand Down
7 changes: 4 additions & 3 deletions misc/azure-release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@ stages:
LIBTILEDB_VERSION: dev
LIBTILEDB_SHA: dev
${{ else }}:
TILEDBPY_VERSION: 0.22.3
LIBTILEDB_VERSION: 2.16.3
LIBTILEDB_SHA: 194b5ae2941d7b6631fba367a7afdd79350332e7
TILEDBPY_VERSION: 0.23.0
# NOTE: *must* update both LIBTILEDB_VERSION and LIBTILEDB_SHA
LIBTILEDB_VERSION: 2.17.0
LIBTILEDB_SHA: 93c173dbe46278c76db49b8ae26a4d5d2384ecb0
LIBTILEDB_REPO: https://github.com/TileDB-Inc/TileDB
TILEDB_SRC: "$(Build.Repository.Localpath)/tiledb_src"
TILEDB_BUILD: "$(Build.Repository.Localpath)/tiledb_build"
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
# - this is for builds-from-source
# - release builds are controlled by `misc/azure-release.yml`
# - this should be set to the current core release, not `dev`
TILEDB_VERSION = "2.16.3"
TILEDB_VERSION = "2.17.0"

# allow overriding w/ environment variable
TILEDB_VERSION = os.environ.get("TILEDB_VERSION") or TILEDB_VERSION
Expand Down
1 change: 1 addition & 0 deletions tiledb/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
from .dimension_label import DimLabel
from .dimension_label_schema import DimLabelSchema
from .domain import Domain
from .enumeration import Enumeration
from .filestore import Filestore
from .filter import (
BitShuffleFilter,
Expand Down
6 changes: 6 additions & 0 deletions tiledb/array_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,15 @@ def __init__(
allows_duplicates: bool = False,
sparse: bool = False,
dim_labels={},
enums=None,
ctx: Ctx = None,
):
super().__init__(ctx, lt.ArrayType.SPARSE if sparse else lt.ArrayType.DENSE)

if enums is not None:
for enum_name in enums:
self._add_enumeration(self._ctx, enum_name)

if attrs is not None:
for att in attrs:
if not isinstance(att, Attr):
Expand Down
17 changes: 15 additions & 2 deletions tiledb/attribute.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ def __init__(
var: bool = None,
nullable: bool = False,
filters: Union[FilterList, Sequence[Filter]] = None,
enum_label: str = None,
ctx: Optional[Ctx] = None,
):
"""Class representing a TileDB array attribute.
Expand Down Expand Up @@ -88,6 +89,9 @@ def __init__(
if nullable is not None:
self._nullable = nullable

if enum_label is not None:
self._set_enumeration_name(self._ctx, enum_label)

def __eq__(self, other):
if not isinstance(other, Attr):
return False
Expand Down Expand Up @@ -202,6 +206,10 @@ def isascii(self) -> bool:
"""
return self._tiledb_dtype == lt.DataType.STRING_ASCII

@property
def enum_label(self):
    """Enumeration name associated with this attribute.

    Delegates to ``_get_enumeration_name`` using this attribute's context
    and returns its result unchanged.
    """
    label = self._get_enumeration_name(self._ctx)
    return label

def __repr__(self):
filters_str = ""
if self.filters:
Expand All @@ -217,11 +225,16 @@ def __repr__(self):
else:
attr_dtype = self.dtype

if self.enum_label is None:
enum_label = None
else:
enum_label = f"'{self.enum_label!s}'"

# filters_str must be last with no spaces
return (
f"""Attr(name={repr(self.name)}, dtype='{attr_dtype!s}', """
f"""var={self.isvar!s}, nullable={self.isnullable!s}"""
f"""{filters_str})"""
f"""var={self.isvar!s}, nullable={self.isnullable!s}, """
f"""enum_label={enum_label}{filters_str})"""
)

def _repr_html_(self):
Expand Down
1 change: 1 addition & 0 deletions tiledb/cc/array.cc
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ void init_array(py::module &m) {
.def("uri", &Array::uri)
.def("schema", &Array::schema)
//.def("ptr", [](Array& arr){ return py::capsule(arr.ptr()); } )
.def("open", (void (Array::*)(tiledb_query_type_t)) & Array::open)
// open with encryption key
.def("open",
(void (Array::*)(tiledb_query_type_t, tiledb_encryption_type_t,
Expand Down
16 changes: 15 additions & 1 deletion tiledb/cc/attribute.cc
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include <tiledb/tiledb>
#include <tiledb/tiledb_experimental>

#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>
Expand Down Expand Up @@ -39,6 +40,16 @@ py::array get_fill_value(Attribute &attr) {
return py::array(value_type, value_num, value);
}

void set_enumeration_name(Attribute &attr, const Context &ctx,
const std::string &enumeration_name) {
AttributeExperimental::set_enumeration_name(ctx, attr, enumeration_name);
}

std::optional<std::string> get_enumeration_name(Attribute &attr,
const Context &ctx) {
return AttributeExperimental::get_enumeration_name(ctx, attr);
}

void init_attribute(py::module &m) {
py::class_<tiledb::Attribute>(m, "Attribute")
.def(py::init<Attribute>())
Expand Down Expand Up @@ -73,8 +84,11 @@ void init_attribute(py::module &m) {

.def_property("_fill", get_fill_value, set_fill_value)

.def("_get_enumeration_name", get_enumeration_name)

.def("_set_enumeration_name", set_enumeration_name)

.def("_dump", [](Attribute &attr) { attr.dump(); });
;
}

} // namespace libtiledbcpp
77 changes: 77 additions & 0 deletions tiledb/cc/enumeration.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
#include <tiledb/tiledb>
#include <tiledb/tiledb_experimental>

#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>
#include <pybind11/pytypes.h>
#include <pybind11/stl.h>

#include "common.h"

namespace libtiledbcpp {

using namespace tiledb;
using namespace tiledbpy::common;
namespace py = pybind11;

// Registers the `Enumeration` class on module `m`, exposing constructors,
// read-only metadata properties, and accessors for the enumeration values.
// pybind11 tries overloaded constructors in registration order, so the
// order of the `py::init` definitions below is significant.
void init_enumeration(py::module &m) {
py::class_<Enumeration>(m, "Enumeration")
// Construct from an existing Enumeration.
.def(py::init<Enumeration>())

// Construct from a list of strings; forwards directly to
// Enumeration::create with the caller-supplied datatype.
.def(py::init([](const Context &ctx, const std::string &name,
std::vector<std::string> &values, bool ordered,
tiledb_datatype_t type) {
return Enumeration::create(ctx, name, values, ordered, type);
}))

// Construct from raw numpy buffers: `data` holds the values and `offsets`
// is empty for fixed-size cells or non-empty for variable-size cells.
.def(py::init([](const Context &ctx, const std::string &name,
bool ordered, py::array data, py::array offsets) {
// Map the numpy dtype to a TileDB datatype; a TileDBPyError from the
// conversion is re-raised as a Python TypeError.
tiledb_datatype_t data_type;
try {
data_type = np_to_tdb_dtype(data.dtype());
} catch (const TileDBPyError &e) {
throw py::type_error(e.what());
}

// Only one-dimensional value arrays are accepted.
py::buffer_info data_buffer = data.request();
if (data_buffer.ndim != 1)
throw py::type_error("Only 1D Numpy arrays can be stored as "
"enumeration values");

// No offsets: cell size comes from the dtype via get_ncells.
// Offsets present: cells are variable-length (TILEDB_VAR_NUM).
py::size_t cell_val_num =
offsets.size() == 0 ? get_ncells(data.dtype()) : TILEDB_VAR_NUM;

// The raw pointers and byte counts are passed straight through; an empty
// offsets array is passed as nullptr (its nbytes() is then 0).
// NOTE(review): neither the dtype of `offsets` nor the memory layout
// (contiguity) of `data`/`offsets` is checked here — presumably the
// Python caller guarantees both; confirm.
return Enumeration::create(
ctx, name, data_type, cell_val_num, ordered, data.data(),
data.nbytes(), offsets.size() == 0 ? nullptr : offsets.data(),
offsets.nbytes());
}))

// Construct from a context and a capsule. keep_alive<1, 2> follows
// pybind11's call-policy numbering (1 = the constructed object,
// 2 = first explicit argument) — presumably keeping the Context alive
// for as long as the Enumeration exists; confirm.
.def(py::init<const Context &, py::capsule>(), py::keep_alive<1, 2>())

// Expose the underlying handle as a capsule named "enmr". The destructor
// argument is nullptr, so the capsule does not free the pointer.
.def("__capsule__",
[](Enumeration &enmr) {
return py::capsule(enmr.ptr().get(), "enmr", nullptr);
})

.def_property_readonly("name", &Enumeration::name)

.def_property_readonly("type", &Enumeration::type)

.def_property_readonly("cell_val_num", &Enumeration::cell_val_num)

.def_property_readonly("ordered", &Enumeration::ordered)

// Return the values as a numpy array: fetch the raw bytes, derive the numpy
// dtype from the TileDB type and cell_val_num, and size the array as
// byte-count / itemsize.
// NOTE(review): `data` is a local vector, so this relies on py::array
// copying from the pointer when no base object is supplied — confirm.
// NOTE(review): assumes dtype.itemsize() is non-zero for every type that
// reaches this path — confirm for variable-length types.
.def("values",
[](Enumeration &enmr) {
auto data = enmr.as_vector<std::byte>();
auto dtype = tdb_to_np_dtype(enmr.type(), enmr.cell_val_num());
return py::array(dtype, data.size() / dtype.itemsize(),
data.data());
})

// Return the values as a list of strings.
.def("str_values",
[](Enumeration &enmr) { return enmr.as_vector<std::string>(); });
}

} // namespace libtiledbcpp
17 changes: 12 additions & 5 deletions tiledb/cc/schema.cc
Original file line number Diff line number Diff line change
Expand Up @@ -278,16 +278,23 @@ void init_schema(py::module &m) {
.def("_has_attribute", &ArraySchema::has_attribute)

#if TILEDB_VERSION_MAJOR == 2 && TILEDB_VERSION_MINOR >= 15
.def("_has_dim_label", [](const ArraySchema &schema, const Context &ctx,
const std::string &name) {
return ArraySchemaExperimental::has_dimension_label(ctx, schema, name);
});
.def("_has_dim_label",
[](const ArraySchema &schema, const Context &ctx,
const std::string &name) {
return ArraySchemaExperimental::has_dimension_label(ctx, schema,
name);
})
#else
.def("_has_dim_label", [](const ArraySchema &, const Context &,
const std::string &) {
return false;
});
})
#endif

.def("_add_enumeration", [](const ArraySchema &schema, const Context &ctx,
const Enumeration &enmr) {
ArraySchemaExperimental::add_enumeration(ctx, schema, enmr);
});
}

} // namespace libtiledbcpp
2 changes: 2 additions & 0 deletions tiledb/cc/tiledbcpp.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ void init_attribute(py::module &);
void init_context(py::module &);
void init_config(py::module &);
void init_enums(py::module &);
void init_enumeration(py::module &);
void init_dimension_label(py::module &m);
void init_domain(py::module &m);
void init_file_handle(py::module &);
Expand All @@ -40,6 +41,7 @@ PYBIND11_MODULE(cc, m) {
init_dimension_label(m);
init_domain(m);
init_enums(m);
init_enumeration(m);
init_file_handle(m);
init_filestore(m);
init_filter(m);
Expand Down
6 changes: 3 additions & 3 deletions tiledb/core.cc
Original file line number Diff line number Diff line change
Expand Up @@ -316,7 +316,7 @@ class PyQuery {
// label buffer list
std::unordered_map<string, uint64_t> label_input_buffer_data_;

py::object pyschema_;
std::string uri_;

public:
tiledb_ctx_t *c_ctx_;
Expand Down Expand Up @@ -349,7 +349,7 @@ class PyQuery {
domain_ =
std::shared_ptr<tiledb::Domain>(new Domain(array_schema_->domain()));

pyschema_ = array.attr("schema");
uri_ = array.attr("uri").cast<std::string>();

bool issparse = array_->schema().array_type() == TILEDB_SPARSE;

Expand Down Expand Up @@ -450,7 +450,7 @@ class PyQuery {
py::object init_pyqc = cond.attr("init_query_condition");

try {
init_pyqc(pyschema_, attrs_);
init_pyqc(uri_, attrs_);
} catch (tiledb::TileDBError &e) {
TPY_ERROR_LOC(e.what());
} catch (py::error_already_set &e) {
Expand Down
1 change: 0 additions & 1 deletion tiledb/dataframe_.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,6 @@ def parse_tiledb_kwargs(kwargs):

@dataclass(frozen=True)
class ColumnInfo:

dtype: np.dtype
repr: Optional[str] = None
nullable: bool = False
Expand Down
Loading