Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improving Usability of ASCII Strings #1304

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions HISTORY.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# In Progress

## API Changes
* `TILEDB_STRING_ASCII` displaying as `str` instead of `bytes` [#1304](https://github.com/TileDB-Inc/TileDB-Py/pull/1304)

## Misc Updates
* Wheels will no longer be supported for macOS 10.15 Catalina; the minimum supported macOS version is now 11 Big Sur [#1300](https://github.com/TileDB-Inc/TileDB-Py/pull/1300)
* Wheels will no longer supported for Python 3.6 [#1300](https://github.com/TileDB-Inc/TileDB-Py/pull/1300)
Expand Down
2 changes: 1 addition & 1 deletion tiledb/core.cc
Original file line number Diff line number Diff line change
Expand Up @@ -246,10 +246,10 @@ py::dtype tiledb_dtype(tiledb_datatype_t type, uint32_t cell_val_num) {
std::string base_str;
switch (type) {
case TILEDB_CHAR:
case TILEDB_STRING_ASCII:
base_str = "|S";
break;
case TILEDB_STRING_UTF8:
case TILEDB_STRING_ASCII:
base_str = "|U";
break;
default:
Expand Down
5 changes: 2 additions & 3 deletions tiledb/dataframe_.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,9 +273,8 @@ def dim_for_column(name, values, dtype, tile, full_domain=False, dim_filters=Non
return tiledb.Dim(
name=name,
domain=(dim_min, dim_max),
# libtiledb only supports TILEDB_ASCII dimensions, so we must use
# nb.bytes_ which will force encoding on write
dtype=np.bytes_ if dtype == np.str_ else dtype,
# TileDB only supports TILEDB_STRING_ASCII dimensions for strings
dtype="ascii" if dtype in (np.bytes_, np.str_) else dtype,
tile=tile,
filters=dim_filters,
)
Expand Down
4 changes: 2 additions & 2 deletions tiledb/libmetadata.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -82,10 +82,10 @@ cdef object unpack_metadata_val(
):
assert value_num != 0, "internal error: unexpected value_num==0"

if value_type == TILEDB_STRING_UTF8:
if value_type == TILEDB_STRING_UTF8 or value_type == TILEDB_STRING_ASCII:
return value_ptr[:value_num].decode('UTF-8') if value_ptr != NULL else ''

if value_type == TILEDB_CHAR or value_type == TILEDB_STRING_ASCII:
if value_type == TILEDB_CHAR:
return value_ptr[:value_num] if value_ptr != NULL else b''

if value_ptr == NULL:
Expand Down
60 changes: 32 additions & 28 deletions tiledb/libtiledb.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ _tiledb_dtype_to_numpy_typeid_convert ={
TILEDB_INT16: np.NPY_INT16,
TILEDB_UINT16: np.NPY_UINT16,
TILEDB_CHAR: np.NPY_STRING,
TILEDB_STRING_ASCII: np.NPY_STRING,
TILEDB_STRING_ASCII: np.NPY_UNICODE,
TILEDB_STRING_UTF8: np.NPY_UNICODE,
}
IF LIBTILEDB_VERSION_MAJOR >= 2:
Expand All @@ -133,7 +133,7 @@ _tiledb_dtype_to_numpy_dtype_convert = {
TILEDB_INT16: np.int16,
TILEDB_UINT16: np.uint16,
TILEDB_CHAR: np.dtype('S1'),
TILEDB_STRING_ASCII: np.dtype('S'),
TILEDB_STRING_ASCII: np.dtype('U'),
TILEDB_STRING_UTF8: np.dtype('U1'),
}
IF LIBTILEDB_VERSION_MAJOR >= 2:
Expand Down Expand Up @@ -1654,14 +1654,11 @@ cdef class Attr(object):
:rtype: numpy.dtype

"""
cdef tiledb_datatype_t typ
check_error(self.ctx,
tiledb_attribute_get_type(self.ctx.ptr, self.ptr, &typ))
cdef uint32_t ncells = 0
check_error(self.ctx,
tiledb_attribute_get_cell_val_num(self.ctx.ptr, self.ptr, &ncells))

return np.dtype(_numpy_dtype(typ, ncells))
return np.dtype(_numpy_dtype(self._get_type(), ncells))

@property
def name(self):
Expand Down Expand Up @@ -1824,12 +1821,11 @@ cdef class Attr(object):
filters_str += repr(f) + ", "
filters_str += "])"

attr_dtype = "ascii" if self.isascii else self.dtype
dtype = "ascii" if self.isascii else self.dtype

# filters_str must be last with no spaces
return (f"""Attr(name={repr(self.name)}, dtype='{attr_dtype!s}', """
f"""var={self.isvar!s}, nullable={self.isnullable!s}"""
f"""{filters_str})""")
return (f"Attr(name={self.name!r}, dtype='{dtype}', var={self.isvar}, "
f"nullable={self.isnullable}{filters_str})")

def _repr_html_(self):
output = io.StringIO()
Expand All @@ -1852,7 +1848,7 @@ cdef class Attr(object):

output.write("<tr>")
output.write(f"<td>{self.name}</td>")
output.write(f"<td>{'ascii' if self.isascii else self.dtype}</td>")
output.write(f"<td>{self.isascii}</td>")
output.write(f"<td>{self.isvar}</td>")
output.write(f"<td>{self.isnullable}</td>")
output.write(f"<td>{self.filters._repr_html_()}</td>")
Expand Down Expand Up @@ -1903,8 +1899,12 @@ cdef class Dim(object):
if not ctx:
ctx = default_ctx()

is_string = (
isinstance(dtype, str) and dtype == "ascii"
) or np.dtype(dtype) in (np.str_, np.bytes_)

if var is not None:
if var and np.dtype(dtype) not in (np.str_, np.bytes_):
if var and not is_string:
raise TypeError("'var=True' specified for non-str/bytes dtype")

if domain is not None and len(domain) != 2:
Expand All @@ -1919,12 +1919,14 @@ cdef class Dim(object):
cdef void* tile_size_ptr = NULL
cdef np.dtype domain_dtype

if ((isinstance(dtype, str) and dtype == "ascii") or
dtype == np.dtype('S')):
if is_string:
# Handle var-len domain type
# (currently only TILEDB_STRING_ASCII)
# The dimension's domain is implicitly formed as
# coordinates are written.
if dtype != "ascii":
warnings.warn("Use 'ascii' for string dimensions.")
dtype = np.dtype("|U0")
dim_datatype = TILEDB_STRING_ASCII
else:
if domain is None or len(domain) != 2:
Expand Down Expand Up @@ -1985,17 +1987,19 @@ cdef class Dim(object):
self.ptr = dim_ptr

def __repr__(self):
filters_str = ""
filters = ""
if self.filters:
filters_str = ", filters=FilterList(["
filters = ", filters=FilterList(["
for f in self.filters:
filters_str += repr(f) + ", "
filters_str += "])"
filters += repr(f) + ", "
filters += "])"

dtype = "ascii" if self._get_type() == TILEDB_STRING_ASCII else self.dtype

# for consistency, print `var=True` for string-like types
varlen = "" if not self.dtype in (np.str_, np.bytes_) else ", var=True"
return "Dim(name={0!r}, domain={1!s}, tile={2!r}, dtype='{3!s}'{4}{5})" \
.format(self.name, self.domain, self.tile, self.dtype, varlen, filters_str)
varlen = "" if dtype != "ascii" else ", var=True"
return f"Dim(name={self.name!r}, domain={self.domain}, tile={self.tile!r}, dtype='{dtype}'{varlen}{filters})"


def _repr_html_(self) -> str:
output = io.StringIO()
Expand All @@ -2022,7 +2026,7 @@ cdef class Dim(object):
output.write(f"<td>{self.domain}</td>")
output.write(f"<td>{self.tile}</td>")
output.write(f"<td>{self.dtype}</td>")
output.write(f"<td>{self.dtype in (np.str_, np.bytes_)}</td>")
output.write(f"<td>{self.dtype == 'ascii'}</td>")
output.write(f"<td>{self.filters._repr_html_()}</td>")
output.write("</tr>")

Expand Down Expand Up @@ -2222,7 +2226,7 @@ cdef class Dim(object):
:rtype: tuple(numpy scalar, numpy scalar)

"""
if self.dtype == np.dtype('S'):
if self.dtype == np.dtype('U'):
return None, None
cdef const void* domain_ptr = NULL
check_error(self.ctx,
Expand Down Expand Up @@ -3864,9 +3868,8 @@ cdef class Array(object):
results.append((None, None))
continue

buf_dtype = 'S'
start_buf = np.empty(start_size, 'S' + str(start_size))
end_buf = np.empty(end_size, 'S' + str(end_size))
start_buf = np.empty(start_size, f"S{start_size}")
end_buf = np.empty(end_size, f"S{end_size}")
start_buf_ptr = np.PyArray_DATA(start_buf)
end_buf_ptr = np.PyArray_DATA(end_buf)
else:
Expand All @@ -3884,7 +3887,8 @@ cdef class Array(object):
return None

if start_size > 0 and end_size > 0:
results.append((start_buf.item(0), end_buf.item(0)))
results.append((start_buf.item(0).decode("UTF-8"),
end_buf.item(0).decode("UTF-8")))
else:
results.append((None, None))
else:
Expand Down Expand Up @@ -4918,7 +4922,7 @@ def index_domain_coords(dom: Domain, idx: tuple, check_ndim: bool):
# ensure strings contain only ASCII characters
domain_coords.append(np.array(sel, dtype=np.bytes_, ndmin=1))
except Exception as exc:
raise TileDBError(f'Dim\' strings may only contain ASCII characters')
raise TileDBError('Dimension strings may only contain ASCII characters')
else:
domain_coords.append(np.array(sel, dtype=dim.dtype, ndmin=1))

Expand Down
2 changes: 1 addition & 1 deletion tiledb/tests/test_filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ def test_dictionary_encoding(self):
schema = tiledb.ArraySchema(domain=dom, attrs=[attr], sparse=True)
tiledb.Array.create(path, schema)

data = [b"x" * i for i in np.random.randint(1, 10, size=10)]
data = ["x" * i for i in np.random.randint(1, 10, size=10)]

with tiledb.open(path, "w") as A:
A[np.arange(10)] = data
Expand Down
14 changes: 7 additions & 7 deletions tiledb/tests/test_fragments.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def test_array_fragments_var(self):

uri = self.path("test_array_fragments_var")
dom = tiledb.Domain(
tiledb.Dim(name="dim", domain=(None, None), tile=None, dtype=np.bytes_)
tiledb.Dim(name="dim", domain=(None, None), tile=None, dtype="ascii")
)
schema = tiledb.ArraySchema(
domain=dom,
Expand Down Expand Up @@ -285,22 +285,22 @@ def test_nonempty_domain_date(self):
def test_nonempty_domain_strings(self):
uri = self.path("test_nonempty_domain_strings")
dom = tiledb.Domain(
tiledb.Dim(name="x", domain=(None, None), dtype=np.bytes_),
tiledb.Dim(name="y", domain=(None, None), dtype=np.bytes_),
tiledb.Dim(name="x", domain=(None, None), dtype="ascii"),
tiledb.Dim(name="y", domain=(None, None), dtype="ascii"),
)
att = tiledb.Attr()
schema = tiledb.ArraySchema(sparse=True, domain=dom, attrs=(att,))

tiledb.SparseArray.create(uri, schema)

with tiledb.SparseArray(uri, mode="w") as T:
x_dims = [b"a", b"b", b"c", b"d"]
y_dims = [b"e", b"f", b"g", b"h"]
x_dims = ["a", "b", "c", "d"]
y_dims = ["e", "f", "g", "h"]
T[x_dims, y_dims] = np.array([1, 2, 3, 4])

with tiledb.SparseArray(uri, mode="w") as T:
x_dims = [b"a", b"b"]
y_dims = [b"e", b"f"]
x_dims = ["a", "b"]
y_dims = ["e", "f"]
T[x_dims, y_dims] = np.array([1, 2])

fragment_info = PyFragmentInfo(uri, schema, False, tiledb.default_ctx())
Expand Down
Loading