diff --git a/HISTORY.md b/HISTORY.md
index d1db608d81..3821256924 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,5 +1,8 @@
# In Progress
+## API Changes
+* `TILEDB_STRING_ASCII` displaying as `str` instead of `bytes` [#1304](https://github.com/TileDB-Inc/TileDB-Py/pull/1304)
+
## Misc Updates
* Wheels will no longer be supported for macOS 10.15 Catalina; the minimum supported macOS version is now 11 Big Sur [#1300](https://github.com/TileDB-Inc/TileDB-Py/pull/1300)
* Wheels will no longer supported for Python 3.6 [#1300](https://github.com/TileDB-Inc/TileDB-Py/pull/1300)
diff --git a/tiledb/core.cc b/tiledb/core.cc
index 75cb656973..ec13f26e70 100644
--- a/tiledb/core.cc
+++ b/tiledb/core.cc
@@ -246,10 +246,10 @@ py::dtype tiledb_dtype(tiledb_datatype_t type, uint32_t cell_val_num) {
std::string base_str;
switch (type) {
case TILEDB_CHAR:
- case TILEDB_STRING_ASCII:
base_str = "|S";
break;
case TILEDB_STRING_UTF8:
+ case TILEDB_STRING_ASCII:
base_str = "|U";
break;
default:
diff --git a/tiledb/dataframe_.py b/tiledb/dataframe_.py
index 034299200c..5794a541ec 100644
--- a/tiledb/dataframe_.py
+++ b/tiledb/dataframe_.py
@@ -273,9 +273,8 @@ def dim_for_column(name, values, dtype, tile, full_domain=False, dim_filters=Non
return tiledb.Dim(
name=name,
domain=(dim_min, dim_max),
- # libtiledb only supports TILEDB_ASCII dimensions, so we must use
- # nb.bytes_ which will force encoding on write
- dtype=np.bytes_ if dtype == np.str_ else dtype,
+ # TileDB only supports TILEDB_STRING_ASCII dimensions for strings
+ dtype="ascii" if dtype in (np.bytes_, np.str_) else dtype,
tile=tile,
filters=dim_filters,
)
diff --git a/tiledb/libmetadata.pyx b/tiledb/libmetadata.pyx
index fcbe62ab38..d644cd6440 100644
--- a/tiledb/libmetadata.pyx
+++ b/tiledb/libmetadata.pyx
@@ -82,10 +82,10 @@ cdef object unpack_metadata_val(
):
assert value_num != 0, "internal error: unexpected value_num==0"
- if value_type == TILEDB_STRING_UTF8:
+ if value_type == TILEDB_STRING_UTF8 or value_type == TILEDB_STRING_ASCII:
return value_ptr[:value_num].decode('UTF-8') if value_ptr != NULL else ''
- if value_type == TILEDB_CHAR or value_type == TILEDB_STRING_ASCII:
+ if value_type == TILEDB_CHAR:
return value_ptr[:value_num] if value_ptr != NULL else b''
if value_ptr == NULL:
diff --git a/tiledb/libtiledb.pyx b/tiledb/libtiledb.pyx
index 34b4cdfbc4..8af9445269 100644
--- a/tiledb/libtiledb.pyx
+++ b/tiledb/libtiledb.pyx
@@ -110,7 +110,7 @@ _tiledb_dtype_to_numpy_typeid_convert ={
TILEDB_INT16: np.NPY_INT16,
TILEDB_UINT16: np.NPY_UINT16,
TILEDB_CHAR: np.NPY_STRING,
- TILEDB_STRING_ASCII: np.NPY_STRING,
+ TILEDB_STRING_ASCII: np.NPY_UNICODE,
TILEDB_STRING_UTF8: np.NPY_UNICODE,
}
IF LIBTILEDB_VERSION_MAJOR >= 2:
@@ -133,7 +133,7 @@ _tiledb_dtype_to_numpy_dtype_convert = {
TILEDB_INT16: np.int16,
TILEDB_UINT16: np.uint16,
TILEDB_CHAR: np.dtype('S1'),
- TILEDB_STRING_ASCII: np.dtype('S'),
+ TILEDB_STRING_ASCII: np.dtype('U'),
TILEDB_STRING_UTF8: np.dtype('U1'),
}
IF LIBTILEDB_VERSION_MAJOR >= 2:
@@ -1654,14 +1654,11 @@ cdef class Attr(object):
:rtype: numpy.dtype
"""
- cdef tiledb_datatype_t typ
- check_error(self.ctx,
- tiledb_attribute_get_type(self.ctx.ptr, self.ptr, &typ))
cdef uint32_t ncells = 0
check_error(self.ctx,
tiledb_attribute_get_cell_val_num(self.ctx.ptr, self.ptr, &ncells))
- return np.dtype(_numpy_dtype(typ, ncells))
+ return np.dtype(_numpy_dtype(self._get_type(), ncells))
@property
def name(self):
@@ -1824,12 +1821,11 @@ cdef class Attr(object):
filters_str += repr(f) + ", "
filters_str += "])"
- attr_dtype = "ascii" if self.isascii else self.dtype
+ dtype = "ascii" if self.isascii else self.dtype
# filters_str must be last with no spaces
- return (f"""Attr(name={repr(self.name)}, dtype='{attr_dtype!s}', """
- f"""var={self.isvar!s}, nullable={self.isnullable!s}"""
- f"""{filters_str})""")
+ return (f"Attr(name={self.name!r}, dtype='{dtype}', var={self.isvar}, "
+ f"nullable={self.isnullable}{filters_str})")
def _repr_html_(self):
output = io.StringIO()
@@ -1852,7 +1848,7 @@ cdef class Attr(object):
output.write("
")
output.write(f"{self.name} | ")
- output.write(f"{'ascii' if self.isascii else self.dtype} | ")
+ output.write(f"{self.isascii} | ")
output.write(f"{self.isvar} | ")
output.write(f"{self.isnullable} | ")
output.write(f"{self.filters._repr_html_()} | ")
@@ -1903,8 +1899,12 @@ cdef class Dim(object):
if not ctx:
ctx = default_ctx()
+ is_string = (
+ isinstance(dtype, str) and dtype == "ascii"
+ ) or np.dtype(dtype) in (np.str_, np.bytes_)
+
if var is not None:
- if var and np.dtype(dtype) not in (np.str_, np.bytes_):
+ if var and not is_string:
raise TypeError("'var=True' specified for non-str/bytes dtype")
if domain is not None and len(domain) != 2:
@@ -1919,12 +1919,14 @@ cdef class Dim(object):
cdef void* tile_size_ptr = NULL
cdef np.dtype domain_dtype
- if ((isinstance(dtype, str) and dtype == "ascii") or
- dtype == np.dtype('S')):
+ if is_string:
# Handle var-len domain type
# (currently only TILEDB_STRING_ASCII)
# The dimension's domain is implicitly formed as
# coordinates are written.
+ if dtype != "ascii":
+ warnings.warn("Use 'ascii' for string dimensions.")
+ dtype = np.dtype("|U0")
dim_datatype = TILEDB_STRING_ASCII
else:
if domain is None or len(domain) != 2:
@@ -1985,17 +1987,19 @@ cdef class Dim(object):
self.ptr = dim_ptr
def __repr__(self):
- filters_str = ""
+ filters = ""
if self.filters:
- filters_str = ", filters=FilterList(["
+ filters = ", filters=FilterList(["
for f in self.filters:
- filters_str += repr(f) + ", "
- filters_str += "])"
+ filters += repr(f) + ", "
+ filters += "])"
+
+ dtype = "ascii" if self._get_type() == TILEDB_STRING_ASCII else self.dtype
# for consistency, print `var=True` for string-like types
- varlen = "" if not self.dtype in (np.str_, np.bytes_) else ", var=True"
- return "Dim(name={0!r}, domain={1!s}, tile={2!r}, dtype='{3!s}'{4}{5})" \
- .format(self.name, self.domain, self.tile, self.dtype, varlen, filters_str)
+ varlen = "" if dtype != "ascii" else ", var=True"
+ return f"Dim(name={self.name!r}, domain={self.domain}, tile={self.tile!r}, dtype='{dtype}'{varlen}{filters})"
+
def _repr_html_(self) -> str:
output = io.StringIO()
@@ -2022,7 +2026,7 @@ cdef class Dim(object):
output.write(f"{self.domain} | ")
output.write(f"{self.tile} | ")
output.write(f"{self.dtype} | ")
- output.write(f"{self.dtype in (np.str_, np.bytes_)} | ")
+ output.write(f"{self.dtype == 'ascii'} | ")
output.write(f"{self.filters._repr_html_()} | ")
output.write("
")
@@ -2222,7 +2226,7 @@ cdef class Dim(object):
:rtype: tuple(numpy scalar, numpy scalar)
"""
- if self.dtype == np.dtype('S'):
+ if self.dtype == np.dtype('U'):
return None, None
cdef const void* domain_ptr = NULL
check_error(self.ctx,
@@ -3864,9 +3868,8 @@ cdef class Array(object):
results.append((None, None))
continue
- buf_dtype = 'S'
- start_buf = np.empty(start_size, 'S' + str(start_size))
- end_buf = np.empty(end_size, 'S' + str(end_size))
+ start_buf = np.empty(start_size, f"S{start_size}")
+ end_buf = np.empty(end_size, f"S{end_size}")
start_buf_ptr = np.PyArray_DATA(start_buf)
end_buf_ptr = np.PyArray_DATA(end_buf)
else:
@@ -3884,7 +3887,8 @@ cdef class Array(object):
return None
if start_size > 0 and end_size > 0:
- results.append((start_buf.item(0), end_buf.item(0)))
+ results.append((start_buf.item(0).decode("UTF-8"),
+ end_buf.item(0).decode("UTF-8")))
else:
results.append((None, None))
else:
@@ -4918,7 +4922,7 @@ def index_domain_coords(dom: Domain, idx: tuple, check_ndim: bool):
# ensure strings contain only ASCII characters
domain_coords.append(np.array(sel, dtype=np.bytes_, ndmin=1))
except Exception as exc:
- raise TileDBError(f'Dim\' strings may only contain ASCII characters')
+ raise TileDBError('Dimension strings may only contain ASCII characters')
else:
domain_coords.append(np.array(sel, dtype=dim.dtype, ndmin=1))
diff --git a/tiledb/tests/test_filters.py b/tiledb/tests/test_filters.py
index 18371ba08c..7d2ccdf882 100644
--- a/tiledb/tests/test_filters.py
+++ b/tiledb/tests/test_filters.py
@@ -97,7 +97,7 @@ def test_dictionary_encoding(self):
schema = tiledb.ArraySchema(domain=dom, attrs=[attr], sparse=True)
tiledb.Array.create(path, schema)
- data = [b"x" * i for i in np.random.randint(1, 10, size=10)]
+ data = ["x" * i for i in np.random.randint(1, 10, size=10)]
with tiledb.open(path, "w") as A:
A[np.arange(10)] = data
diff --git a/tiledb/tests/test_fragments.py b/tiledb/tests/test_fragments.py
index 469a8afe90..4c58f87980 100644
--- a/tiledb/tests/test_fragments.py
+++ b/tiledb/tests/test_fragments.py
@@ -75,7 +75,7 @@ def test_array_fragments_var(self):
uri = self.path("test_array_fragments_var")
dom = tiledb.Domain(
- tiledb.Dim(name="dim", domain=(None, None), tile=None, dtype=np.bytes_)
+ tiledb.Dim(name="dim", domain=(None, None), tile=None, dtype="ascii")
)
schema = tiledb.ArraySchema(
domain=dom,
@@ -285,8 +285,8 @@ def test_nonempty_domain_date(self):
def test_nonempty_domain_strings(self):
uri = self.path("test_nonempty_domain_strings")
dom = tiledb.Domain(
- tiledb.Dim(name="x", domain=(None, None), dtype=np.bytes_),
- tiledb.Dim(name="y", domain=(None, None), dtype=np.bytes_),
+ tiledb.Dim(name="x", domain=(None, None), dtype="ascii"),
+ tiledb.Dim(name="y", domain=(None, None), dtype="ascii"),
)
att = tiledb.Attr()
schema = tiledb.ArraySchema(sparse=True, domain=dom, attrs=(att,))
@@ -294,13 +294,13 @@ def test_nonempty_domain_strings(self):
tiledb.SparseArray.create(uri, schema)
with tiledb.SparseArray(uri, mode="w") as T:
- x_dims = [b"a", b"b", b"c", b"d"]
- y_dims = [b"e", b"f", b"g", b"h"]
+ x_dims = ["a", "b", "c", "d"]
+ y_dims = ["e", "f", "g", "h"]
T[x_dims, y_dims] = np.array([1, 2, 3, 4])
with tiledb.SparseArray(uri, mode="w") as T:
- x_dims = [b"a", b"b"]
- y_dims = [b"e", b"f"]
+ x_dims = ["a", "b"]
+ y_dims = ["e", "f"]
T[x_dims, y_dims] = np.array([1, 2])
fragment_info = PyFragmentInfo(uri, schema, False, tiledb.default_ctx())
diff --git a/tiledb/tests/test_libtiledb.py b/tiledb/tests/test_libtiledb.py
index ee82339283..253fc50c29 100644
--- a/tiledb/tests/test_libtiledb.py
+++ b/tiledb/tests/test_libtiledb.py
@@ -306,13 +306,34 @@ def test_datetime_dimension(self):
)
def test_shape(self):
- dim = tiledb.Dim(name="", dtype="|S0", var=True)
+ dim = tiledb.Dim(name="", dtype="ascii", var=True)
with self.assertRaisesRegex(
TypeError,
"shape only valid for integer and datetime dimension domains",
):
dim.shape
+ def test_string_dtypes(self):
+ expected_repr = (
+ "Dim(name='__dim_0', domain=(None, None), "
+ "tile=None, dtype='ascii', var=True)"
+ )
+ expected_warn = "Use 'ascii' for string dimensions."
+
+ dim = tiledb.Dim(dtype="ascii")
+ assert dim.dtype == np.dtype("U")
+ assert repr(dim) == expected_repr
+
+ with pytest.warns(UserWarning, match=expected_warn):
+ dim = tiledb.Dim(dtype=np.bytes_)
+ assert dim.dtype == np.dtype("U")
+ assert repr(dim) == expected_repr
+
+ with pytest.warns(UserWarning, match=expected_warn):
+ dim = tiledb.Dim(dtype=np.str_)
+ assert dim.dtype == np.dtype("U")
+ assert repr(dim) == expected_repr
+
class DomainTest(DiskTestCase):
def test_domain(self, capfd):
@@ -368,7 +389,7 @@ def test_ascii_domain(self, capfd):
path = self.path("test_ascii_domain")
dim = tiledb.Dim(name="d", dtype="ascii")
- assert dim.dtype == np.bytes_
+ assert dim.dtype == np.str_
dom = tiledb.Domain(dim)
dom.dump()
@@ -545,9 +566,9 @@ def test_ascii_attribute(self, sparse, capfd):
assert A.schema.nattr == 1
A.schema.dump()
assert_captured(capfd, "Type: STRING_ASCII")
- assert A.schema.attr("A").dtype == np.bytes_
+ assert A.schema.attr("A").dtype == np.str_
assert A.schema.attr("A").isascii
- assert_array_equal(A[:]["A"], np.asarray(ascii_data, dtype=np.bytes_))
+ assert_array_equal(A[:]["A"], np.asarray(ascii_data, dtype=np.str_))
class ArraySchemaTest(DiskTestCase):
@@ -815,7 +836,7 @@ def test_mixed_string_schema(self):
dims = [
tiledb.Dim(name="dpos", domain=(-100.0, 100.0), tile=10, dtype=np.float64),
- tiledb.Dim(name="str_index", tile=None, dtype=np.bytes_),
+ tiledb.Dim(name="str_index", tile=None, dtype="ascii"),
]
dom = tiledb.Domain(*dims)
attrs = [tiledb.Attr(name="val", dtype=np.float64)]
@@ -827,12 +848,12 @@ def test_mixed_string_schema(self):
self.assertTrue(schema.domain.dim("str_index").isvar)
self.assertFalse(schema.domain.dim("dpos").isvar)
self.assertEqual(schema.domain.dim("dpos").dtype, np.double)
- self.assertEqual(schema.domain.dim("str_index").dtype, np.bytes_)
+ self.assertEqual(schema.domain.dim("str_index").dtype, np.str_)
self.assertFalse(schema.domain.homogeneous)
tiledb.Array.create(path, schema)
with tiledb.open(path, "r") as arr:
- assert_array_equal(arr[:]["str_index"], np.array([], dtype="|S1"))
+ assert_array_equal(arr[:]["str_index"], np.array([], dtype="|U1"))
class ArrayTest(DiskTestCase):
@@ -1035,7 +1056,7 @@ def test_nonempty_domain_empty_string(self):
A[""] = None
with tiledb.open(uri, "r") as A:
- assert_array_equal(A.nonempty_domain(), ((b"", b""),))
+ assert_array_equal(A.nonempty_domain(), (("", ""),))
def test_create_array_overwrite(self):
uri = self.path("test_create_array_overwrite")
@@ -2777,12 +2798,12 @@ def test_sparse_mixed_domain_uint_float64(self, sparse_cell_order):
sparse_cell_order != "hilbert",
)
a_nonempty = A.nonempty_domain()
- self.assertEqual(a_nonempty[0], (0, 49))
- self.assertEqual(a_nonempty[1], (-100.0, 100.0))
+ assert a_nonempty[0] == (0, 49)
+ assert a_nonempty[1] == (-100.0, 100.0)
def test_sparse_string_domain(self, sparse_cell_order):
path = self.path("sparse_string_domain")
- dom = tiledb.Domain(tiledb.Dim(name="d", domain=(None, None), dtype=np.bytes_))
+ dom = tiledb.Domain(tiledb.Dim(name="d", domain=(None, None), dtype="ascii"))
att = tiledb.Attr(name="a", dtype=np.int64)
schema = tiledb.ArraySchema(
domain=dom,
@@ -2794,7 +2815,7 @@ def test_sparse_string_domain(self, sparse_cell_order):
tiledb.SparseArray.create(path, schema)
data = [1, 2, 3, 4]
- coords = [b"aa", b"bbb", b"c", b"dddd"]
+ coords = ["aa", "bb", "c", "dddd"]
with tiledb.open(path, "w") as A:
A[coords] = data
@@ -2803,18 +2824,18 @@ def test_sparse_string_domain(self, sparse_cell_order):
ned = A.nonempty_domain()[0]
res = A[ned[0] : ned[1]]
assert_array_equal(res["a"], data)
- self.assertEqual(set(res["d"]), set(coords))
- self.assertEqual(A.nonempty_domain(), ((b"aa", b"dddd"),))
+ assert set(res["d"]) == set(coords)
+ assert A.nonempty_domain() == (("aa", "dddd"),)
def test_sparse_string_domain2(self, sparse_cell_order):
path = self.path("sparse_string_domain2")
with self.assertRaises(ValueError):
dims = [
tiledb.Dim(
- name="str", domain=(None, None, None), tile=None, dtype=np.bytes_
+ name="str", domain=(None, None, None), tile=None, dtype="ascii"
)
]
- dims = [tiledb.Dim(name="str", domain=(None, None), tile=None, dtype=np.bytes_)]
+ dims = [tiledb.Dim(name="str", domain=(None, None), tile=None, dtype="ascii")]
dom = tiledb.Domain(*dims)
attrs = [tiledb.Attr(name="val", dtype=np.float64)]
@@ -2824,7 +2845,7 @@ def test_sparse_string_domain2(self, sparse_cell_order):
tiledb.SparseArray.create(path, schema)
data = np.random.rand(10)
- coords = [rand_ascii_bytes(random.randint(5, 50)) for _ in range(10)]
+ coords = [rand_ascii(random.randint(5, 50)) for _ in range(10)]
with tiledb.open(path, "w") as A:
A[coords] = data
@@ -2832,7 +2853,7 @@ def test_sparse_string_domain2(self, sparse_cell_order):
with tiledb.open(path) as A:
ned = A.nonempty_domain()[0]
res = A[ned[0] : ned[1]]
- self.assertTrue(set(res["str"]) == set(coords))
+ assert set(res["str"]) == set(coords)
# must check data ordered by coords
assert_array_equal(res["val"], data[np.argsort(coords, kind="stable")])
@@ -2840,7 +2861,7 @@ def test_sparse_mixed_domain(self, sparse_cell_order):
uri = self.path("sparse_mixed_domain")
dims = [
tiledb.Dim(name="p", domain=(-100.0, 100.0), tile=10, dtype=np.float64),
- tiledb.Dim(name="str", domain=(None, None), tile=None, dtype=np.bytes_),
+ tiledb.Dim(name="str", domain=(None, None), tile=None, dtype="ascii"),
]
dom = tiledb.Domain(*dims)
attrs = [tiledb.Attr(name="val", dtype=np.float64)]
@@ -2852,7 +2873,7 @@ def test_sparse_mixed_domain(self, sparse_cell_order):
nrows = 5
idx_f64 = np.random.rand(nrows)
- idx_str = [rand_ascii(5).encode("utf-8") for _ in range(nrows)]
+ idx_str = [rand_ascii(5) for _ in range(nrows)]
data = np.random.rand(nrows)
with tiledb.SparseArray(uri, "w") as A:
@@ -2868,7 +2889,7 @@ def test_sparse_mixed_domain(self, sparse_cell_order):
def test_sparse_get_unique_dim_values(self, sparse_cell_order):
uri = self.path("get_non_empty_coords")
- dim1 = tiledb.Dim(name="dim1", domain=(None, None), tile=None, dtype=np.bytes_)
+ dim1 = tiledb.Dim(name="dim1", domain=(None, None), tile=None, dtype="ascii")
dim2 = tiledb.Dim(name="dim2", domain=(0, 1), tile=1, dtype=np.float64)
attr = tiledb.Attr(name="attr", dtype=np.float32)
dom = tiledb.Domain(dim1, dim2)
@@ -2886,12 +2907,10 @@ def test_sparse_get_unique_dim_values(self, sparse_cell_order):
with tiledb.open(uri, "r") as A:
self.assertEqual(
A.unique_dim_values(),
- OrderedDict(
- [("dim1", (b"a1", b"a2", b"a3")), ("dim2", (0.0, 0.25, 0.5))]
- ),
+ OrderedDict([("dim1", ("a1", "a2", "a3")), ("dim2", (0.0, 0.25, 0.5))]),
)
- self.assertEqual(A.unique_dim_values("dim1"), (b"a1", b"a2", b"a3"))
+ self.assertEqual(A.unique_dim_values("dim1"), ("a1", "a2", "a3"))
self.assertEqual(A.unique_dim_values("dim2"), (0, 0.25, 0.5))
with self.assertRaises(ValueError):
@@ -2919,7 +2938,7 @@ def test_sparse_write_for_zero_attrs(self):
def test_sparse_write_nullable_default(self):
uri = self.path("test_sparse_write_nullable_default")
- dim1 = tiledb.Dim(name="d1", dtype="|S0", var=True)
+ dim1 = tiledb.Dim(name="d1", dtype="ascii", var=True)
att = tiledb.Attr(name="a1", dtype="