diff --git a/.github/workflows/python-ci-single.yml b/.github/workflows/python-ci-single.yml
index 4803dfe260..a67076231b 100644
--- a/.github/workflows/python-ci-single.yml
+++ b/.github/workflows/python-ci-single.yml
@@ -122,13 +122,6 @@ jobs:
       - name: Run libtiledbsoma unit tests
         run: ctest --output-on-failure --test-dir build/libtiledbsoma -C Release --verbose

-      - name: Run pytests for C++
-        shell: bash
-        # Setting PYTHONPATH ensures the tests load the in-tree source code under apis/python/src
-        # instead of the copy we `pip install`ed to site-packages above. That's needed for the code
-        # coverage analysis to work.
-        run: PYTHONPATH=$(pwd)/apis/python/src python -m pytest --cov=apis/python/src --cov-report=xml libtiledbsoma/test -v --durations=20
-
       - name: Run pytests for Python
         shell: bash
         # Setting PYTHONPATH ensures the tests load the in-tree source code under apis/python/src
diff --git a/Makefile b/Makefile
index cdcebb3835..f033dce8d8 100644
--- a/Makefile
+++ b/Makefile
@@ -32,7 +32,7 @@ update:
 .PHONY: test
 test: data
 	ctest --test-dir build/libtiledbsoma -C Release --verbose --rerun-failed --output-on-failure
-	pytest apis/python/tests libtiledbsoma/test
+	pytest apis/python/tests

 .PHONY: data
 data:
diff --git a/apis/python/setup.py b/apis/python/setup.py
index 78f475870e..5640a9c820 100644
--- a/apis/python/setup.py
+++ b/apis/python/setup.py
@@ -186,9 +186,6 @@ def run(self):
         "dist_links/libtiledbsoma/external/include",
         "../../build/externals/install/include",
         str(libtiledbsoma_dir / "include"),
-        str(
-            "./src/tiledbsoma"
-        ),  # since pytiledbsoma.cc does #include of query_condition.cc
         str(libtiledbsoma_dir.parent / "build/externals/install/include"),
         str(tiledb_dir / "include"),
     ]
@@ -258,7 +255,14 @@ def run(self):
     ext_modules=[
         Pybind11Extension(
             "tiledbsoma.pytiledbsoma",
-            ["src/tiledbsoma/pytiledbsoma.cc"],
+            [
+                "src/tiledbsoma/common.cc",
+                "src/tiledbsoma/query_condition.cc",
+                "src/tiledbsoma/soma_array.cc",
+                "src/tiledbsoma/soma_object.cc",
+                "src/tiledbsoma/soma_dataframe.cc",
+                "src/tiledbsoma/pytiledbsoma.cc",
+            ],
             include_dirs=INC_DIRS,
             library_dirs=LIB_DIRS,
             libraries=["tiledbsoma"] + (["tiledb"] if os.name == "nt" else []),
diff --git a/apis/python/src/tiledbsoma/_collection.py b/apis/python/src/tiledbsoma/_collection.py
index 131435d973..09934d5811 100644
--- a/apis/python/src/tiledbsoma/_collection.py
+++ b/apis/python/src/tiledbsoma/_collection.py
@@ -75,6 +75,7 @@ class CollectionBase(  # type: ignore[misc]  # __eq__ false positive
     __slots__ = ("_contents", "_mutated_keys")
     _wrapper_type = _tdb_handles.GroupWrapper
+    _reader_wrapper_type = _tdb_handles.GroupWrapper

     # TODO: Implement additional creation of members on collection subclasses.
     @classmethod
@@ -426,13 +427,20 @@ def __getitem__(self, key: str) -> CollectionElementType:
         if entry.soma is None:
             from . import _factory  # Delayed binding to resolve circular import.

-            entry.soma = _factory._open_internal(
-                entry.entry.wrapper_type.open,
-                entry.entry.uri,
-                self.mode,
-                self.context,
-                self.tiledb_timestamp_ms,
-            )
+            uri = entry.entry.uri
+            mode = self.mode
+            context = self.context
+            timestamp = self.tiledb_timestamp_ms
+
+            try:
+                wrapper = _tdb_handles._open_with_clib_wrapper(
+                    uri, mode, context, timestamp
+                )
+                entry.soma = _factory.reify_handle(wrapper)
+            except SOMAError:
+                entry.soma = _factory._open_internal(
+                    entry.entry.wrapper_type.open, uri, mode, context, timestamp
+                )
             # Since we just opened this object, we own it and should close it.
             self._close_stack.enter_context(entry.soma)
         return cast(CollectionElementType, entry.soma)
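The `__getitem__` change above tries the pybind11-backed ("clib") wrapper first and falls back to the legacy tiledb-py path when the object type is not yet ported, mirroring `_tdb_handles.open` further down. A minimal sketch of that pattern, with hypothetical helper names (`open_with_clib`, `open_with_tiledb_py`):

    from tiledbsoma import SOMAError

    def open_preferring_clib(uri, mode, context, timestamp_ms):
        # Hypothetical helpers: the clib path raises SOMAError for object
        # types it does not support yet, which triggers the fallback.
        try:
            return open_with_clib(uri, mode, context, timestamp_ms)
        except SOMAError:
            return open_with_tiledb_py(uri, mode, context, timestamp_ms)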
diff --git a/apis/python/src/tiledbsoma/_dataframe.py b/apis/python/src/tiledbsoma/_dataframe.py
index 90ce28cdb3..fd3700ecb5 100644
--- a/apis/python/src/tiledbsoma/_dataframe.py
+++ b/apis/python/src/tiledbsoma/_dataframe.py
@@ -21,6 +21,7 @@
 from ._constants import SOMA_JOINID
 from ._query_condition import QueryCondition
 from ._read_iters import TableReadIter
+from ._tdb_handles import DataFrameWrapper
 from ._tiledb_array import TileDBArray
 from ._types import NPFloating, NPInteger, OpenTimestamp, Slice, is_slice_of
 from .options import SOMATileDBContext
@@ -121,6 +122,8 @@ class DataFrame(TileDBArray, somacore.DataFrame):
         it must be ``None``.
     """

+    _reader_wrapper_type = DataFrameWrapper
+
     @classmethod
     def create(
         cls,
@@ -261,18 +264,8 @@ def count(self) -> int:
             Experimental.
         """
         self._check_open_read()
-        return cast(int, self._soma_reader().nnz())
-
-    def enumeration(self, name: str) -> Tuple[Any, ...]:
-        """Doc place holder.
-
-        Returns:
-            Tuple[Any, ...]: _description_
-        """
-        return tuple(self._soma_reader().get_enum(name))
-
-    def column_to_enumeration(self, name: str) -> str:
-        return str(self._soma_reader().get_enum_label_on_attr(name))
+        # if it is in read open mode, then it is a DataFrameWrapper
+        return cast(DataFrameWrapper, self._handle).count

     def __len__(self) -> int:
         """Returns the number of rows in the dataframe. Same as ``df.count``."""
@@ -341,25 +334,28 @@ def read(
         Lifecycle:
             Experimental.
         """
-        del batch_size, platform_config  # Currently unused.
+        del batch_size  # Currently unused.
         _util.check_unpartitioned(partitions)
         self._check_open_read()

-        schema = self._handle.schema
-        query_condition = None
-        if value_filter is not None:
-            query_condition = QueryCondition(value_filter)
-
-        sr = self._soma_reader(
-            schema=schema,  # query_condition needs this
-            column_names=column_names,
-            query_condition=query_condition,
-            result_order=result_order,
+        ts = None
+        if self._handle._handle.timestamp is not None:
+            ts = (0, self._handle._handle.timestamp)
+
+        sr = clib.SOMADataFrame.open(
+            uri=self._handle._handle.uri,
+            mode=clib.OpenMode.read,
+            platform_config=platform_config or {},
+            column_names=column_names or [],
+            result_order=_util.to_clib_result_order(result_order),
+            timestamp=ts,
         )

+        if value_filter is not None:
+            sr.set_condition(QueryCondition(value_filter), self._handle.schema)
+
         self._set_reader_coords(sr, coords)

-        # TODO: platform_config
         # TODO: batch_size

         return TableReadIter(sr)
@@ -415,7 +411,7 @@ def write(
                 if not pa.types.is_dictionary(col_info.type):
                     raise ValueError(
                         "Expected dictionary type for enumerated attribute "
-                        f"{name} but saw {col_info.type}"
+                        f"{name} but saw {col.type}"
                     )
                 enmr = self._handle.enum(attr.name)
@@ -521,20 +517,23 @@ def _set_reader_coord(
         if self._set_reader_coord_by_numeric_slice(sr, dim_idx, dim, coord):
             return True

+        domain = self.domain[dim_idx]
+
         # Note: slice(None, None) matches the is_slice_of part, unless we also check the dim-type
         # part.
-        if (is_slice_of(coord, str) or is_slice_of(coord, bytes)) and (
-            dim.dtype == "str" or dim.dtype == "bytes"
-        ):
+        if (
+            is_slice_of(coord, str) or is_slice_of(coord, bytes)
+        ) and _util.pa_types_is_string_or_bytes(dim.type):
             _util.validate_slice(coord)
             # Figure out which one.
-            dim_type: Union[Type[str], Type[bytes]] = type(dim.domain[0])
+            dim_type: Union[Type[str], Type[bytes]] = type(domain[0])
             # A ``None`` or empty start is always equivalent to empty str/bytes.
             start = coord.start or dim_type()
             if coord.stop is None:
                 # There's no way to specify "to infinity" for strings.
                 # We have to get the nonempty domain and use that as the end.
-                _, stop = self._handle.reader.nonempty_domain()[dim_idx]
+                ned = self._handle.non_empty_domain()
+                _, stop = ned[dim_idx]
             else:
                 stop = coord.stop
             sr.set_dim_ranges_string_or_bytes(dim.name, [(start, stop)])
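For string and bytes index columns, the slice handling above uses the nonempty domain as the implicit upper bound when `coord.stop` is None. A hedged usage sketch (the URI is a placeholder; assumes the dataframe's index column is string-typed):

    import tiledbsoma

    with tiledbsoma.DataFrame.open("file:///path/to/sdf") as df:
        # An open stop, e.g. slice("AAA", None), would fall back to the
        # nonempty-domain upper bound per the code above.
        table = df.read(coords=(slice("AAA", "MZZ"),)).concat()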
@@ -542,16 +541,14 @@ def _set_reader_coord(

         # Note: slice(None, None) matches the is_slice_of part, unless we also check the dim-type
         # part.
-        if is_slice_of(coord, np.datetime64) and dim.dtype.name.startswith(
-            "datetime64"
-        ):
+        if is_slice_of(coord, np.datetime64) and pa.types.is_timestamp(dim.type):
             _util.validate_slice(coord)
             # These timestamp types are stored in Arrow as well as TileDB as 64-bit integers (with
             # distinguishing metadata of course). For purposes of the query logic they're just
             # int64.
-            istart = coord.start or dim.domain[0]
+            istart = coord.start or domain[0]
             istart = int(istart.astype("int64"))
-            istop = coord.stop or dim.domain[1]
+            istop = coord.stop or domain[1]
             istop = int(istop.astype("int64"))
             sr.set_dim_ranges_int64(dim.name, [(istart, istop)])
             return True
@@ -574,41 +571,19 @@ def _set_reader_coord_by_py_seq_or_np_array(
                 f"only 1D numpy arrays may be used to index; got {coord.ndim}"
             )

-        # See libtiledbsoma.cc for more context on why we need the
-        # explicit type-check here.
-
-        if dim.dtype == np.int64:
-            sr.set_dim_points_int64(dim.name, coord)
-        elif dim.dtype == np.int32:
-            sr.set_dim_points_int32(dim.name, coord)
-        elif dim.dtype == np.int16:
-            sr.set_dim_points_int16(dim.name, coord)
-        elif dim.dtype == np.int8:
-            sr.set_dim_points_int8(dim.name, coord)
-
-        elif dim.dtype == np.uint64:
-            sr.set_dim_points_uint64(dim.name, coord)
-        elif dim.dtype == np.uint32:
-            sr.set_dim_points_uint32(dim.name, coord)
-        elif dim.dtype == np.uint16:
-            sr.set_dim_points_uint16(dim.name, coord)
-        elif dim.dtype == np.uint8:
-            sr.set_dim_points_uint8(dim.name, coord)
-
-        elif dim.dtype == np.float64:
-            sr.set_dim_points_float64(dim.name, coord)
-        elif dim.dtype == np.float32:
-            sr.set_dim_points_float32(dim.name, coord)
-
-        elif dim.dtype == "str" or dim.dtype == "bytes":
-            sr.set_dim_points_string_or_bytes(dim.name, coord)
+        try:
+            set_dim_points = getattr(sr, f"set_dim_points_{dim.type}")
+        except AttributeError:
+            # We have to handle this type specially below
+            pass
+        else:
+            set_dim_points(dim.name, coord)
+            return True

-        elif (
-            dim.dtype == "datetime64[s]"
-            or dim.dtype == "datetime64[ms]"
-            or dim.dtype == "datetime64[us]"
-            or dim.dtype == "datetime64[ns]"
-        ):
+        if _util.pa_types_is_string_or_bytes(dim.type):
+            sr.set_dim_points_string_or_bytes(dim.name, coord)
+            return True
+        elif pa.types.is_timestamp(dim.type):
             if not isinstance(coord, (tuple, list, np.ndarray)):
                 raise ValueError(
                     f"unhandled coord type {type(coord)} for index column named {dim.name}"
                 )
             icoord = [
                 int(e.astype("int64"))
                 for e in coord
             ]
             sr.set_dim_points_int64(dim.name, icoord)
+            return True

         # TODO: bool
-        else:
-            raise ValueError(
-                f"unhandled type {dim.dtype} for index column named {dim.name}"
-            )
-
-        return True
+        raise ValueError(
+            f"unhandled type {dim.type} for index column named {dim.name}"
+        )

     def _set_reader_coord_by_numeric_slice(
-        self, sr: clib.SOMAArray, dim_idx: int, dim: tiledb.Dim, coord: Slice[Any]
+        self, sr: clib.SOMAArray, dim_idx: int, dim: pa.Field, coord: Slice[Any]
     ) -> bool:
         try:
-            lo_hi = _util.slice_to_numeric_range(coord, dim.domain)
+            lo_hi = _util.slice_to_numeric_range(coord, self.domain[dim_idx])
         except _util.NonNumericDimensionError:
             return False  # We only handle numeric dimensions here.

         if not lo_hi:
             return True
-        elif dim.dtype == np.int64:
-            sr.set_dim_ranges_int64(dim.name, [lo_hi])
-            return True
-        elif dim.dtype == np.int32:
-            sr.set_dim_ranges_int32(dim.name, [lo_hi])
-            return True
-        elif dim.dtype == np.int16:
-            sr.set_dim_ranges_int16(dim.name, [lo_hi])
-            return True
-        elif dim.dtype == np.int8:
-            sr.set_dim_ranges_int8(dim.name, [lo_hi])
-            return True
-
-        elif dim.dtype == np.uint64:
-            sr.set_dim_ranges_uint64(dim.name, [lo_hi])
-            return True
-        elif dim.dtype == np.uint32:
-            sr.set_dim_ranges_uint32(dim.name, [lo_hi])
-            return True
-        elif dim.dtype == np.uint16:
-            sr.set_dim_ranges_uint16(dim.name, [lo_hi])
-            return True
-        elif dim.dtype == np.uint8:
-            sr.set_dim_ranges_uint8(dim.name, [lo_hi])
-            return True
-
-        elif dim.dtype == np.float64:
-            sr.set_dim_ranges_float64(dim.name, [lo_hi])
-            return True
-        elif dim.dtype == np.float32:
-            sr.set_dim_ranges_float32(dim.name, [lo_hi])
+        try:
+            set_dim_range = getattr(sr, f"set_dim_ranges_{dim.type}")
+            set_dim_range(dim.name, [lo_hi])
             return True
-
-        # TODO:
-        # elif dim.dtype == np.bool_:
-
-        return False
+        except AttributeError:
+            return False

     def _canonicalize_schema(
diff --git a/apis/python/src/tiledbsoma/_dense_nd_array.py b/apis/python/src/tiledbsoma/_dense_nd_array.py
index 82030254ea..4617bd93b0 100644
--- a/apis/python/src/tiledbsoma/_dense_nd_array.py
+++ b/apis/python/src/tiledbsoma/_dense_nd_array.py
@@ -17,6 +17,7 @@
 from . import _util
 from ._common_nd_array import NDArray
 from ._exception import SOMAError
+from ._tdb_handles import ArrayWrapper
 from ._util import dense_indices_to_shape
 from .options._tiledb_create_options import TileDBCreateOptions

@@ -71,6 +72,8 @@ class DenseNDArray(NDArray, somacore.DenseNDArray):

     __slots__ = ()

+    _reader_wrapper_type = ArrayWrapper
+
     def read(
         self,
         coords: options.DenseNDCoords = (),
diff --git a/apis/python/src/tiledbsoma/_factory.py b/apis/python/src/tiledbsoma/_factory.py
index b0eb385bbf..2bcf5b084f 100644
--- a/apis/python/src/tiledbsoma/_factory.py
+++ b/apis/python/src/tiledbsoma/_factory.py
@@ -142,18 +142,18 @@ def _open_internal(
     """Lower-level open function for internal use only."""
     handle = opener(uri, mode, context, timestamp)
     try:
-        return _reify_handle(handle)
+        return reify_handle(handle)
     except Exception:
         handle.close()
         raise


 @typeguard_ignore
-def _reify_handle(hdl: _Wrapper) -> "_tiledb_object.TileDBObject[_Wrapper]":
+def reify_handle(hdl: _Wrapper) -> "_tiledb_object.TileDBObject[_Wrapper]":
     """Picks out the appropriate SOMA class for a handle and wraps it."""
     typename = _read_soma_type(hdl)
     cls = _type_name_to_cls(typename)
-    if cls._wrapper_type != type(hdl):
+    if type(hdl) not in (cls._wrapper_type, cls._reader_wrapper_type):
         raise SOMAError(
             f"cannot open {hdl.uri!r}: a {type(hdl._handle)}"
             f" cannot be converted to a {typename}"
diff --git a/apis/python/src/tiledbsoma/_query_condition.py b/apis/python/src/tiledbsoma/_query_condition.py
index 1717aa810f..6d1d62984f 100644
--- a/apis/python/src/tiledbsoma/_query_condition.py
+++ b/apis/python/src/tiledbsoma/_query_condition.py
@@ -11,10 +11,11 @@

 import attrs
 import numpy as np
-import tiledb
+import pyarrow as pa

 from . import pytiledbsoma as clib
 from ._exception import SOMAError
+from ._util import pa_types_is_string_or_bytes

 # In Python 3.7, a boolean literal like `True` is of type `ast.NameConstant`.
 # Above that, it's of type `ast.Constant`.
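Since `QueryCondition` now compiles against the dataframe's Arrow schema instead of a tiledb-py `ArraySchema`, here is how it is typically driven from the public API (a sketch; URI and column name are placeholders):

    import tiledbsoma

    with tiledbsoma.DataFrame.open("file:///path/to/sdf") as df:
        # value_filter is parsed into a PyQueryCondition and validated
        # against df's Arrow schema by init_query_condition.
        table = df.read(value_filter="quality > 0.5").concat()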
@@ -130,15 +131,17 @@ def __attrs_post_init__(self):

     def init_query_condition(
         self,
-        schema: tiledb.ArraySchema,
-        enum_to_dtype: dict,
+        schema: pa.Schema,
         query_attrs: Optional[List[str]],
     ):
-        qctree = QueryConditionTree(schema, enum_to_dtype, query_attrs)
-        self.c_obj = qctree.visit(self.tree.body)
+        try:
+            qctree = QueryConditionTree(schema, query_attrs)
+            self.c_obj = qctree.visit(self.tree.body)
+        except Exception as pex:
+            raise SOMAError(pex)

         if not isinstance(self.c_obj, clib.PyQueryCondition):
-            raise tiledb.TileDBError(
+            raise SOMAError(
                 "Malformed query condition statement. A query condition must "
                 "be made up of one or more Boolean expressions."
             )
@@ -148,8 +151,7 @@ def init_query_condition(

 @attrs.define
 class QueryConditionTree(ast.NodeVisitor):
-    schema: tiledb.ArraySchema
-    enum_to_dtype: dict
+    schema: pa.Schema
     query_attrs: List[str]

     def visit_BitOr(self, node):
@@ -219,25 +221,25 @@ def visit_Compare(self, node: ast.Compare) -> clib.PyQueryCondition:
         elif isinstance(operator, (ast.In, ast.NotIn)):
             rhs = node.comparators[0]
             if not isinstance(rhs, ast.List):
-                raise tiledb.TileDBError(
+                raise SOMAError(
                     "`in` operator syntax must be written as `attr in ['l', 'i', 's', 't']`"
                 )

             variable = node.left.id
             values = [self.get_val_from_node(val) for val in self.visit(rhs)]
             if len(values) == 0:
-                raise tiledb.TileDBError(
+                raise SOMAError(
                     "At least one value must be provided to the set membership"
                 )

-            if self.schema.has_attr(variable):
-                enum_label = self.schema.attr(variable).enum_label
-                if enum_label is not None:
-                    dt = self.enum_to_dtype[enum_label]
-                else:
-                    dt = self.schema.attr(variable).dtype
+            dt = self.schema.field(variable).type
+            if pa.types.is_dictionary(dt):
+                dt = dt.value_type
+
+            if pa_types_is_string_or_bytes(dt):
+                dtype = "string"
             else:
-                dt = self.schema.attr_or_dim_dtype(variable)
+                dtype = np.dtype(dt.to_pandas_dtype()).name

             # sdf.read(column_names=["foo"], value_filter='bar == 999') should
             # result in bar being added to the column names. See also
@@ -246,7 +248,6 @@ def visit_Compare(self, node: ast.Compare) -> clib.PyQueryCondition:
             if att not in self.query_attrs:
                 self.query_attrs.append(att)

-            dtype = "string" if dt.kind in "SUa" else dt.name
             op = clib.TILEDB_IN if isinstance(operator, ast.In) else clib.TILEDB_NOT_IN
             result = self.create_pyqc(dtype)(node.left.id, values, op)

@@ -262,12 +263,15 @@ def aux_visit_Compare(

         att = self.get_att_from_node(att)
         val = self.get_val_from_node(val)
-        enum_label = self.schema.attr(att).enum_label
-        if enum_label is not None:
-            dt = self.enum_to_dtype[enum_label]
+
+        dt = self.schema.field(att).type
+        if pa.types.is_dictionary(dt):
+            dt = dt.value_type
+
+        if pa_types_is_string_or_bytes(dt):
+            dtype = "string"
         else:
-            dt = self.schema.attr(att).dtype
-        dtype = "string" if dt.kind in "SUa" else dt.name
+            dtype = np.dtype(dt.to_pandas_dtype()).name

         val = self.cast_val_to_dtype(val, dtype)

         pyqc = clib.PyQueryCondition()
@@ -278,7 +282,7 @@ def is_att_node(self, att: QueryConditionNodeElem) -> bool:
         if isinstance(att, ast.Call):
             if not isinstance(att.func, ast.Name):
-                raise tiledb.TileDBError(f"Unrecognized expression {att.func}.")
+                raise SOMAError(f"Unrecognized expression {att.func}.")

             if att.func.id != "attr":
                 return False
@@ -323,9 +327,7 @@ def get_att_from_node(self, node: QueryConditionNodeElem) -> Any:

             if isinstance(att_node, ast.Call):
                 if not isinstance(att_node.func, ast.Name):
-                    raise tiledb.TileDBError(
-                        f"Unrecognized expression {att_node.func}."
-                    )
+                    raise SOMAError(f"Unrecognized expression {att_node.func}.")

                 att_node = att_node.args[0]

             if isinstance(att_node, ast.Name):
@@ -338,21 +340,14 @@ def get_att_from_node(self, node: QueryConditionNodeElem) -> Any:
                 # deprecated in 3.8
                 att = str(att_node.s)
             else:
-                raise tiledb.TileDBError(
+                raise SOMAError(
                     f"Incorrect type for attribute name: {ast.dump(att_node)}"
                 )
         else:
-            raise tiledb.TileDBError(
-                f"Incorrect type for attribute name: {ast.dump(node)}"
-            )
+            raise SOMAError(f"Incorrect type for attribute name: {ast.dump(node)}")

-        if not self.schema.has_attr(att):
-            if self.schema.domain.has_dim(att):
-                raise tiledb.TileDBError(
-                    f"`{att}` is a dimension. QueryConditions currently only "
-                    "work on attributes."
-                )
-            raise tiledb.TileDBError(f"Attribute `{att}` not found in schema.")
+        if att not in self.schema.names:
+            raise SOMAError(f"`{att}` not found in schema.")

         # sdf.read(column_names=["foo"], value_filter='bar == 999') should
         # result in bar being added to the column names. See also
@@ -367,14 +362,12 @@ def get_val_from_node(self, node: QueryConditionNodeElem) -> Any:

         if isinstance(node, ast.Call):
             if not isinstance(node.func, ast.Name):
-                raise tiledb.TileDBError(f"Unrecognized expression {node.func}.")
+                raise SOMAError(f"Unrecognized expression {node.func}.")

             if node.func.id == "val":
                 val_node = node.args[0]
             else:
-                raise tiledb.TileDBError(
-                    f"Incorrect type for cast value: {node.func.id}"
-                )
+                raise SOMAError(f"Incorrect type for cast value: {node.func.id}")

         if isinstance(val_node, ast.Constant) or isinstance(val_node, ast.NameConstant):
             val = val_node.value
@@ -385,7 +378,7 @@ def get_val_from_node(self, node: QueryConditionNodeElem) -> Any:
             # deprecated in 3.8
             val = val_node.s
         else:
-            raise tiledb.TileDBError(
+            raise SOMAError(
                 f"Incorrect type for comparison value: {ast.dump(val_node)}"
             )

@@ -399,7 +392,7 @@ def cast_val_to_dtype(
         # this prevents numeric strings ("1", '123.32') from getting
         # casted to numeric types
         if isinstance(val, str):
-            raise tiledb.TileDBError(f"Cannot cast `{val}` to {dtype}.")
+            raise SOMAError(f"Cannot cast `{val}` to {dtype}.")
         if np.issubdtype(dtype, np.datetime64):
             cast = getattr(np, "int64")
         # silence DeprecationWarning: `np.bool`
@@ -409,7 +402,7 @@ def cast_val_to_dtype(
             cast = getattr(np, dtype)
             val = cast(val)
         except ValueError:
-            raise tiledb.TileDBError(f"Cannot cast `{val}` to {dtype}.")
+            raise SOMAError(f"Cannot cast `{val}` to {dtype}.")

         return val

@@ -420,7 +413,7 @@ def init_pyqc(self, pyqc: clib.PyQueryCondition, dtype: str) -> Callable:

         init_fn_name = f"init_{dtype}"

         if not hasattr(pyqc, init_fn_name):
-            raise tiledb.TileDBError(f"PyQueryCondition.{init_fn_name}() not found.")
+            raise SOMAError(f"PyQueryCondition.{init_fn_name}() not found.")

         return getattr(pyqc, init_fn_name)

@@ -436,14 +429,13 @@ def create_pyqc(self, dtype: str) -> Callable:
         try:
             return getattr(clib.PyQueryCondition, create_fn_name)
         except AttributeError as ae:
-            raise tiledb.TileDBError(
-                f"PyQueryCondition.{create_fn_name}() not found."
-            ) from ae
+            raise SOMAError(f"PyQueryCondition.{create_fn_name}() not found.") from ae

     def visit_BinOp(self, node: ast.BinOp) -> clib.PyQueryCondition:
-        op = self.visit(node.op)
-        if op is None:
-            raise tiledb.TileDBError(
+        try:
+            op = self.visit(node.op)
+        except KeyError:
+            raise SOMAError(
                 f"Unsupported binary operator: {ast.dump(node.op)}. Only & is currently supported."
             )

@@ -458,9 +450,7 @@ def visit_BoolOp(self, node: ast.BoolOp) -> clib.PyQueryCondition:
         try:
             op = self.visit(node.op)
         except KeyError:
-            raise tiledb.TileDBError(
-                f"Unsupported Boolean operator: {ast.dump(node.op)}."
-            )
+            raise SOMAError(f"Unsupported Boolean operator: {ast.dump(node.op)}.")

         result = self.visit(node.values[0])
         for value in node.values[1:]:
@@ -470,13 +460,13 @@ def visit_Call(self, node: ast.Call) -> ast.Call:
         if not isinstance(node.func, ast.Name):
-            raise tiledb.TileDBError(f"Unrecognized expression {node.func}.")
+            raise SOMAError(f"Unrecognized expression {node.func}.")

         if node.func.id not in ["attr", "val"]:
-            raise tiledb.TileDBError("Valid casts are attr() or val().")
+            raise SOMAError("Valid casts are attr() or val().")

         if len(node.args) != 1:
-            raise tiledb.TileDBError(
+            raise SOMAError(
                 f"Exactly one argument must be provided to {node.func.id}()."
             )
@@ -497,7 +487,7 @@ def visit_UnaryOp(self, node: ast.UnaryOp, sign: int = 1):
         elif isinstance(node.op, ast.USub):
             sign *= -1
         else:
-            raise tiledb.TileDBError(f"Unsupported UnaryOp type. Saw {ast.dump(node)}.")
+            raise SOMAError(f"Unsupported UnaryOp type. Saw {ast.dump(node)}.")

         if isinstance(node.operand, ast.UnaryOp):
             return self.visit_UnaryOp(node.operand, sign)
@@ -509,7 +499,7 @@ def visit_UnaryOp(self, node: ast.UnaryOp, sign: int = 1):
         elif isinstance(node.operand, ast.Num):
             node.operand.n *= sign
         else:
-            raise tiledb.TileDBError(
+            raise SOMAError(
                 f"Unexpected node type following UnaryOp. Saw {ast.dump(node)}."
             )

diff --git a/apis/python/src/tiledbsoma/_sparse_nd_array.py b/apis/python/src/tiledbsoma/_sparse_nd_array.py
index 880e5be720..f98a368b85 100644
--- a/apis/python/src/tiledbsoma/_sparse_nd_array.py
+++ b/apis/python/src/tiledbsoma/_sparse_nd_array.py
@@ -38,6 +38,7 @@
     SparseCOOTensorReadIter,
     TableReadIter,
 )
+from ._tdb_handles import ArrayWrapper
 from ._types import NTuple
 from .options._tiledb_create_options import TileDBCreateOptions

@@ -94,6 +95,8 @@ class SparseNDArray(NDArray, somacore.SparseNDArray):

     __slots__ = ()

+    _reader_wrapper_type = ArrayWrapper
+
     # Inherited from somacore
     # * ndim accessor
     # * is_sparse: Final = True
@@ -264,15 +267,15 @@ def write(
         )

     def _set_reader_coord(
-        self, sr: clib.SOMAArray, dim_idx: int, dim: tiledb.Dim, coord: object
+        self, sr: clib.SOMAArray, dim_idx: int, dim: pa.Field, coord: object
     ) -> bool:
         if super()._set_reader_coord(sr, dim_idx, dim, coord):
             return True
         if isinstance(coord, Sequence):
-            if dim.dtype == np.int64:
+            if pa.types.is_int64(dim.type):
                 sr.set_dim_points_int64(dim.name, coord)
                 return True
-            elif dim.dtype == "str" or dim.dtype == "bytes":
+            elif _util.pa_types_is_string_or_bytes(dim.type):
                 sr.set_dim_points_string_or_bytes(dim.name, coord)
                 return True
             else:
@@ -283,10 +286,10 @@ def _set_reader_coord(
             raise ValueError(
                 f"only 1D numpy arrays may be used to index; got {coord.ndim}"
             )
-            if dim.dtype == np.int64:
+            if pa.types.is_int64(dim.type):
                 sr.set_dim_points_int64(dim.name, coord)
                 return True
-            elif dim.dtype == "str" or dim.dtype == "bytes":
+            elif _util.pa_types_is_string_or_bytes(dim.type):
                 sr.set_dim_points_string_or_bytes(dim.name, coord)
                 return True

@@ -345,7 +348,7 @@ def used_shape(self) -> Tuple[Tuple[int, int], ...]:
         # In the unlikely event that a previous data update succeeded but the
         # subsequent metadata update did not, take the union of the core non-empty domain
         # (which is done as part of the data update) and the metadata bounding box.
-        ned = self.non_empty_domain()
+        ned = self.non_empty_domain() or ()
         for i, nedslot in enumerate(ned):
             ned_lower, ned_upper = nedslot
             bbox_lower, bbox_upper = retval[i]
diff --git a/apis/python/src/tiledbsoma/_tdb_handles.py b/apis/python/src/tiledbsoma/_tdb_handles.py
index f05a333b40..c28743240b 100644
--- a/apis/python/src/tiledbsoma/_tdb_handles.py
+++ b/apis/python/src/tiledbsoma/_tdb_handles.py
@@ -12,9 +12,11 @@
 import enum
 from typing import (
     Any,
+    Callable,
     Dict,
     Generic,
     Iterator,
+    List,
     Mapping,
     MutableMapping,
     Optional,
@@ -25,15 +27,19 @@
 )

 import attrs
+import numpy as np
+import pyarrow as pa
 import tiledb
+from numpy.typing import DTypeLike
 from somacore import options
 from typing_extensions import Literal, Self

+from . import pytiledbsoma as clib
 from ._exception import DoesNotExistError, SOMAError, is_does_not_exist_error
 from ._types import OpenTimestamp
 from .options._soma_tiledb_context import SOMATileDBContext

-RawHandle = Union[tiledb.Array, tiledb.Group]
+RawHandle = Union[tiledb.Array, tiledb.Group, clib.SOMADataFrame]
 _RawHdl_co = TypeVar("_RawHdl_co", bound=RawHandle, covariant=True)
 """A raw TileDB object. Covariant because Handles are immutable enough."""

@@ -48,13 +54,35 @@ def open(
     obj_type = tiledb.object_type(uri, ctx=context.tiledb_ctx)
     if not obj_type:
         raise DoesNotExistError(f"{uri!r} does not exist")
-    if obj_type == "array":
-        return ArrayWrapper.open(uri, mode, context, timestamp)
-    if obj_type == "group":
-        return GroupWrapper.open(uri, mode, context, timestamp)
+
+    try:
+        return _open_with_clib_wrapper(uri, mode, context, timestamp)
+    except SOMAError:
+        # This object still uses tiledb-py and must be handled below
+        if obj_type == "array":
+            return ArrayWrapper.open(uri, mode, context, timestamp)
+        if obj_type == "group":
+            return GroupWrapper.open(uri, mode, context, timestamp)
+
+    # Invalid object
     raise SOMAError(f"{uri!r} has unknown storage type {obj_type!r}")


+def _open_with_clib_wrapper(
+    uri: str,
+    mode: options.OpenMode,
+    context: SOMATileDBContext,
+    timestamp: Optional[OpenTimestamp] = None,
+) -> "DataFrameWrapper":
+    open_mode = clib.OpenMode.read if mode == "r" else clib.OpenMode.write
+    config = {k: str(v) for k, v in context.tiledb_config.items()}
+    timestamp_ms = context._open_timestamp_ms(timestamp)
+    obj = clib.SOMAObject.open(uri, open_mode, config, (0, timestamp_ms))
+    if obj.type == "SOMADataFrame":
+        return DataFrameWrapper._from_soma_object(obj, context)
+    raise SOMAError(f"clib.SOMAObject {obj.type!r} not yet supported")
+
+
 @attrs.define(eq=False, hash=False, slots=False)
 class Wrapper(Generic[_RawHdl_co], metaclass=abc.ABCMeta):
     """Wrapper for TileDB handles to manage lifecycle and metadata.
@@ -95,6 +123,26 @@ def open(
             raise
         return handle

+    @classmethod
+    def _from_soma_object(
+        cls, soma_object: clib.SOMAObject, context: SOMATileDBContext
+    ) -> Self:
+        uri = soma_object.uri
+        mode = soma_object.mode
+        timestamp = soma_object.timestamp
+        try:
+            handle = cls(uri, mode, context, timestamp, soma_object)
+            if handle.mode == "w":
+                with cls._opener(uri, mode, context, timestamp) as auxiliary_reader:
+                    handle._do_initial_reads(auxiliary_reader)
+            else:
+                handle._do_initial_reads(soma_object)
+        except tiledb.TileDBError as tdbe:
+            if is_does_not_exist_error(tdbe):
+                raise DoesNotExistError(f"{handle.uri!r} does not exist") from tdbe
+            raise
+        return handle
+
     @classmethod
     @abc.abstractmethod
     def _opener(
@@ -194,16 +242,30 @@ def _opener(
     def schema(self) -> tiledb.ArraySchema:
         return self._handle.schema

-    def non_empty_domain(self) -> Tuple[Tuple[int, int], ...]:
-        """
-        Retrieves the non-empty domain for each dimension, namely the smallest
-        and largest indices in each dimension for which the array/dataframe has
-        data occupied. This is nominally the same as the domain used at
-        creation time, but if for example only a portion of the available domain
-        has actually had data written, this function will return a tighter
-        range.
-        """
-        return self._handle.nonempty_domain()  # type: ignore
+    def non_empty_domain(self) -> Tuple[Tuple[object, object], ...]:
+        try:
+            return self._handle.nonempty_domain() or ()
+        except tiledb.TileDBError as e:
+            raise SOMAError(e)
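As `_open_with_clib_wrapper` above shows, the clib open calls take a `(start, end)` timestamp pair in milliseconds, so `(0, timestamp_ms)` reads everything written up to the open timestamp. A minimal sketch (`uri` and `timestamp_ms` are placeholders; config entries must be strings):

    from tiledbsoma import pytiledbsoma as clib

    obj = clib.SOMAObject.open(uri, clib.OpenMode.read, {}, (0, timestamp_ms))
    if obj.type == "SOMADataFrame":
        ...  # wrap it via DataFrameWrapper._from_soma_object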
- """ - return self._handle.nonempty_domain() # type: ignore + def non_empty_domain(self) -> Tuple[Tuple[object, object], ...]: + try: + return self._handle.nonempty_domain() or () + except tiledb.TileDBError as e: + raise SOMAError(e) + + @property + def domain(self) -> Tuple[Tuple[object, object], ...]: + dom = self._handle.schema.domain + return tuple(dom.dim(i).domain for i in range(dom.ndim)) + + @property + def ndim(self) -> int: + return int(self._handle.schema.domain.ndim) + + @property + def attr_names(self) -> Tuple[str, ...]: + schema = self._handle.schema + return tuple(schema.attr(i).name for i in range(schema.nattr)) + + @property + def dim_names(self) -> Tuple[str, ...]: + schema = self._handle.schema + return tuple(schema.domain.dim(i).name for i in range(schema.domain.ndim)) def enum(self, label: str) -> tiledb.Enumeration: return self._handle.enum(label) @@ -247,6 +309,106 @@ def _do_initial_reads(self, reader: tiledb.Group) -> None: } +class DataFrameWrapper(Wrapper[clib.SOMADataFrame]): + """Wrapper around a Pybind11 SOMADataFrame handle.""" + + @classmethod + def _opener( + cls, + uri: str, + mode: options.OpenMode, + context: SOMATileDBContext, + timestamp: int, + ) -> clib.SOMADataFrame: + open_mode = clib.OpenMode.read if mode == "r" else clib.OpenMode.write + config = {k: str(v) for k, v in context.tiledb_config.items()} + column_names: List[str] = [] + result_order = clib.ResultOrder.automatic + return clib.SOMADataFrame.open( + uri, + open_mode, + config, + column_names, + result_order, + (0, timestamp), + ) + + # Covariant types should normally not be in parameters, but this is for + # internal use only so it's OK. + def _do_initial_reads(self, reader: _RawHdl_co) -> None: # type: ignore[misc] + """Final setup step before returning the Handle. + + This is passed a raw TileDB object opened in read mode, since writers + will need to retrieve data from the backing store on setup. 
+ """ + # non–attrs-managed field + self.metadata = MetadataWrapper(self, dict(reader.meta)) + + @property + def schema(self) -> pa.Schema: + return self._handle.schema + + @property + def meta(self) -> "MetadataWrapper": + return MetadataWrapper(self, dict(self._handle.meta)) + + @property + def ndim(self) -> int: + return len(self._handle.index_column_names) + + @property + def count(self) -> int: + return int(self._handle.count) + + def _cast_domain( + self, domain: Callable[[str, DTypeLike], Tuple[object, object]] + ) -> Tuple[Tuple[object, object], ...]: + result = [] + for name in self._handle.index_column_names: + dtype = self._handle.schema.field(name).type + if pa.types.is_timestamp(dtype): + np_dtype = np.dtype(dtype.to_pandas_dtype()) + dom = domain(name, np_dtype) + result.append( + ( + np_dtype.type(dom[0], dtype.unit), + np_dtype.type(dom[1], dtype.unit), + ) + ) + else: + if pa.types.is_large_string(dtype) or pa.types.is_string(dtype): + dtype = np.dtype("U") + elif pa.types.is_large_binary(dtype) or pa.types.is_binary(dtype): + dtype = np.dtype("S") + else: + dtype = np.dtype(dtype.to_pandas_dtype()) + result.append(domain(name, dtype)) + return tuple(result) + + @property + def domain(self) -> Tuple[Tuple[object, object], ...]: + return self._cast_domain(self._handle.domain) + + def non_empty_domain(self) -> Tuple[Tuple[object, object], ...]: + return self._cast_domain(self._handle.non_empty_domain) or () + + @property + def attr_names(self) -> Tuple[str, ...]: + return tuple( + f.name for f in self.schema if f.name not in self._handle.index_column_names + ) + + @property + def dim_names(self) -> Tuple[str, ...]: + return tuple(self._handle.index_column_names) + + def enum(self, label: str) -> tiledb.Enumeration: + # The DataFrame handle may either be ArrayWrapper or DataFrameWrapper. + # enum is only used in the DataFrame write path and is implemented by + # ArrayWrapper. If enum is called in the read path, it is an error. + raise NotImplementedError + + class _DictMod(enum.Enum): """State machine to keep track of modifications to a dictionary. diff --git a/apis/python/src/tiledbsoma/_tiledb_array.py b/apis/python/src/tiledbsoma/_tiledb_array.py index 15c589b05a..a937ab00a3 100644 --- a/apis/python/src/tiledbsoma/_tiledb_array.py +++ b/apis/python/src/tiledbsoma/_tiledb_array.py @@ -65,9 +65,13 @@ def schema(self) -> pa.Schema: Lifecycle: Experimental. """ - return tiledb_schema_to_arrow(self._tiledb_array_schema(), self.uri, self._ctx) + if isinstance(self._tiledb_array_schema(), tiledb.ArraySchema): + return tiledb_schema_to_arrow( + self._tiledb_array_schema(), self.uri, self._ctx + ) + return self._tiledb_array_schema() - def non_empty_domain(self) -> Tuple[Tuple[int, int], ...]: + def non_empty_domain(self) -> Tuple[Tuple[Any, Any], ...]: """ Retrieves the non-empty domain for each dimension, namely the smallest and largest indices in each dimension for which the array/dataframe has @@ -88,19 +92,16 @@ def _tiledb_array_keys(self) -> Tuple[str, ...]: def _tiledb_dim_names(self) -> Tuple[str, ...]: """Reads the dimension names from the schema: for example, ['obs_id', 'var_id'].""" - schema = self._handle.schema - return tuple(schema.domain.dim(i).name for i in range(schema.domain.ndim)) + return self._handle.dim_names def _tiledb_attr_names(self) -> Tuple[str, ...]: """Reads the attribute names from the schema: for example, the list of column names in a dataframe. 
""" - schema = self._handle.schema - return tuple(schema.attr(i).name for i in range(schema.nattr)) + return self._handle.attr_names def _tiledb_domain(self) -> Tuple[Tuple[Any, Any], ...]: - schema = self._handle.schema - return tuple(schema.domain.dim(i).domain for i in range(0, schema.domain.ndim)) + return self._handle.domain def _soma_reader( self, @@ -147,14 +148,14 @@ def _set_reader_coords(self, sr: clib.SOMAArray, coords: Sequence[object]) -> No f"coords type {type(coords)} must be a regular sequence," " not str or bytes" ) - schema = self._handle.schema - if len(coords) > schema.domain.ndim: + + if len(coords) > self._handle.ndim: raise ValueError( f"coords ({len(coords)} elements) must be shorter than ndim" - f" ({schema.domain.ndim})" + f" ({self._handle.ndim})" ) for i, coord in enumerate(coords): - dim = self._handle.schema.domain.dim(i) + dim = self.schema.field(i) if not self._set_reader_coord(sr, i, dim, coord): raise TypeError( f"coord type {type(coord)} for dimension {dim.name}" @@ -162,7 +163,7 @@ def _set_reader_coords(self, sr: clib.SOMAArray, coords: Sequence[object]) -> No ) def _set_reader_coord( - self, sr: clib.SOMAArray, dim_idx: int, dim: tiledb.Dim, coord: object + self, sr: clib.SOMAArray, dim_idx: int, dim: pa.Field, coord: object ) -> bool: """Parses a single coordinate entry. @@ -173,7 +174,6 @@ def _set_reader_coord( Returns: True if successful, False if unrecognized. """ - del dim_idx # Unused. if coord is None: return True # No constraint; select all in this dimension @@ -183,7 +183,8 @@ def _set_reader_coord( if isinstance(coord, slice): _util.validate_slice(coord) try: - lo_hi = _util.slice_to_numeric_range(coord, dim.domain) + dom = self._handle.domain[dim_idx] + lo_hi = _util.slice_to_numeric_range(coord, dom) except _util.NonNumericDimensionError: return False # We only handle numeric dimensions here. 
diff --git a/apis/python/src/tiledbsoma/_tiledb_object.py b/apis/python/src/tiledbsoma/_tiledb_object.py
index 06de6a8c53..ccc72453ae 100644
--- a/apis/python/src/tiledbsoma/_tiledb_object.py
+++ b/apis/python/src/tiledbsoma/_tiledb_object.py
@@ -5,7 +5,7 @@

 import datetime
 from contextlib import ExitStack
-from typing import Any, Generic, MutableMapping, Optional, Type, TypeVar
+from typing import Any, Generic, MutableMapping, Optional, Type, TypeVar, Union

 import somacore
 import tiledb
@@ -81,7 +81,12 @@ def open(
         """
         del platform_config  # unused
         context = _validate_soma_tiledb_context(context)
-        handle = cls._wrapper_type.open(uri, mode, context, tiledb_timestamp)
+        try:
+            handle = _tdb_handles._open_with_clib_wrapper(
+                uri, mode, context, tiledb_timestamp
+            )
+        except SOMAError:
+            handle = cls._wrapper_type.open(uri, mode, context, tiledb_timestamp)
         return cls(
             handle,
             _dont_call_this_use_create_or_open_instead="tiledbsoma-internal-code",
@@ -89,7 +94,8 @@ def open(

     def __init__(
         self,
-        handle: _WrapperType_co,
+        # TODO DataFrameWrapper should be _WrapperType_co
+        handle: Union[_WrapperType_co, _tdb_handles.DataFrameWrapper],
         *,
         _dont_call_this_use_create_or_open_instead: str = "unset",
     ):
@@ -121,6 +127,9 @@ def __init__(
         self._close_stack.enter_context(self._handle)

     _wrapper_type: Type[_WrapperType_co]
+    _reader_wrapper_type: Union[
+        Type[_WrapperType_co], Type[_tdb_handles.DataFrameWrapper]
+    ]
     """Class variable of the Wrapper class used to open this object type."""

     @property
diff --git a/apis/python/src/tiledbsoma/_util.py b/apis/python/src/tiledbsoma/_util.py
index b29e022d85..d2ea2db55e 100644
--- a/apis/python/src/tiledbsoma/_util.py
+++ b/apis/python/src/tiledbsoma/_util.py
@@ -10,9 +10,11 @@
 from itertools import zip_longest
 from typing import Any, Optional, Tuple, Type, TypeVar

+import pyarrow as pa
 import somacore
 from somacore import options

+from . import pytiledbsoma as clib
 from ._types import OpenTimestamp, Slice, is_slice_of

@@ -260,3 +262,25 @@ def ms_to_datetime(millis: int) -> datetime.datetime:
     secs, millis = divmod(millis, 1000)
     dt = datetime.datetime.fromtimestamp(secs, tz=datetime.timezone.utc)
     return dt.replace(microsecond=millis * 1000)
+
+
+def to_clib_result_order(result_order: options.ResultOrderStr) -> clib.ResultOrder:
+    result_order = options.ResultOrder(result_order)
+    to_clib_result_order = {
+        options.ResultOrder.AUTO: clib.ResultOrder.automatic,
+        options.ResultOrder.ROW_MAJOR: clib.ResultOrder.rowmajor,
+        options.ResultOrder.COLUMN_MAJOR: clib.ResultOrder.colmajor,
+    }
+    try:
+        return to_clib_result_order[result_order]
+    except KeyError as ke:
+        raise ValueError(f"Invalid result_order: {result_order}") from ke
+
+
+def pa_types_is_string_or_bytes(dtype: pa.DataType) -> bool:
+    return bool(
+        pa.types.is_large_string(dtype)
+        or pa.types.is_large_binary(dtype)
+        or pa.types.is_string(dtype)
+        or pa.types.is_binary(dtype)
+    )
diff --git a/apis/python/src/tiledbsoma/common.cc b/apis/python/src/tiledbsoma/common.cc
new file mode 100644
index 0000000000..e2d0b94e99
--- /dev/null
+++ b/apis/python/src/tiledbsoma/common.cc
@@ -0,0 +1,181 @@
+#include "common.h"
+
+namespace tiledbsoma {
+
+std::unordered_map<tiledb_datatype_t, std::string> _tdb_to_np_name_dtype = {
+    {TILEDB_INT32, "int32"},
+    {TILEDB_INT64, "int64"},
+    {TILEDB_FLOAT32, "float32"},
+    {TILEDB_FLOAT64, "float64"},
+    {TILEDB_INT8, "int8"},
+    {TILEDB_UINT8, "uint8"},
+    {TILEDB_INT16, "int16"},
+    {TILEDB_UINT16, "uint16"},
+    {TILEDB_UINT32, "uint32"},
+    {TILEDB_UINT64, "uint64"},
+    {TILEDB_STRING_ASCII, "S"},
+    {TILEDB_STRING_UTF8, "U1"},
+    {TILEDB_CHAR, "S1"},
+    {TILEDB_DATETIME_YEAR, "M8[Y]"},
+    {TILEDB_DATETIME_MONTH, "M8[M]"},
+    {TILEDB_DATETIME_WEEK, "M8[W]"},
+    {TILEDB_DATETIME_DAY, "M8[D]"},
+    {TILEDB_DATETIME_HR, "M8[h]"},
+    {TILEDB_DATETIME_MIN, "M8[m]"},
+    {TILEDB_DATETIME_SEC, "M8[s]"},
+    {TILEDB_DATETIME_MS, "M8[ms]"},
+    {TILEDB_DATETIME_US, "M8[us]"},
+    {TILEDB_DATETIME_NS, "M8[ns]"},
+    {TILEDB_DATETIME_PS, "M8[ps]"},
+    {TILEDB_DATETIME_FS, "M8[fs]"},
+    {TILEDB_DATETIME_AS, "M8[as]"},
+    {TILEDB_TIME_HR, "m8[h]"},
+    {TILEDB_TIME_MIN, "m8[m]"},
+    {TILEDB_TIME_SEC, "m8[s]"},
+    {TILEDB_TIME_MS, "m8[ms]"},
+    {TILEDB_TIME_US, "m8[us]"},
+    {TILEDB_TIME_NS, "m8[ns]"},
+    {TILEDB_TIME_PS, "m8[ps]"},
+    {TILEDB_TIME_FS, "m8[fs]"},
+    {TILEDB_TIME_AS, "m8[as]"},
+    {TILEDB_BLOB, "byte"},
+    {TILEDB_BOOL, "bool"},
+};
+
+std::unordered_map<std::string, tiledb_datatype_t> _np_name_to_tdb_dtype = {
+    {"int32", TILEDB_INT32},
+    {"int64", TILEDB_INT64},
+    {"float32", TILEDB_FLOAT32},
+    {"float64", TILEDB_FLOAT64},
+    {"int8", TILEDB_INT8},
+    {"uint8", TILEDB_UINT8},
+    {"int16", TILEDB_INT16},
+    {"uint16", TILEDB_UINT16},
+    {"uint32", TILEDB_UINT32},
+    {"uint64", TILEDB_UINT64},
+    {"datetime64[Y]", TILEDB_DATETIME_YEAR},
+    {"datetime64[M]", TILEDB_DATETIME_MONTH},
+    {"datetime64[W]", TILEDB_DATETIME_WEEK},
+    {"datetime64[D]", TILEDB_DATETIME_DAY},
+    {"datetime64[h]", TILEDB_DATETIME_HR},
+    {"datetime64[m]", TILEDB_DATETIME_MIN},
+    {"datetime64[s]", TILEDB_DATETIME_SEC},
+    {"datetime64[ms]", TILEDB_DATETIME_MS},
+    {"datetime64[us]", TILEDB_DATETIME_US},
+    {"datetime64[ns]", TILEDB_DATETIME_NS},
+    {"datetime64[ps]", TILEDB_DATETIME_PS},
+    {"datetime64[fs]", TILEDB_DATETIME_FS},
+    {"datetime64[as]", TILEDB_DATETIME_AS},
+    /* duration types map to timedelta */
+    {"timedelta64[h]", TILEDB_TIME_HR},
+    {"timedelta64[m]", TILEDB_TIME_MIN},
+    {"timedelta64[s]", TILEDB_TIME_SEC},
+    {"timedelta64[ms]", TILEDB_TIME_MS},
{"timedelta64[us]", TILEDB_TIME_US}, + {"timedelta64[ns]", TILEDB_TIME_NS}, + {"timedelta64[ps]", TILEDB_TIME_PS}, + {"timedelta64[fs]", TILEDB_TIME_FS}, + {"timedelta64[as]", TILEDB_TIME_AS}, + {"bool", TILEDB_BOOL}, +}; + +py::dtype tdb_to_np_dtype(tiledb_datatype_t type, uint32_t cell_val_num) { + if (type == TILEDB_CHAR || type == TILEDB_STRING_UTF8 || + type == TILEDB_STRING_ASCII) { + std::string base_str = (type == TILEDB_STRING_UTF8) ? "|U" : "|S"; + if (cell_val_num < TILEDB_VAR_NUM) + base_str += std::to_string(cell_val_num); + return py::dtype(base_str); + } + + if (cell_val_num == 1) { + if (type == TILEDB_STRING_UTF16 || type == TILEDB_STRING_UTF32) + TPY_ERROR_LOC("Unimplemented UTF16 or UTF32 string conversion!"); + if (type == TILEDB_STRING_UCS2 || type == TILEDB_STRING_UCS4) + TPY_ERROR_LOC("Unimplemented UCS2 or UCS4 string conversion!"); + + if (_tdb_to_np_name_dtype.count(type) == 1) + return py::dtype(_tdb_to_np_name_dtype[type]); + } + + if (cell_val_num == 2) { + if (type == TILEDB_FLOAT32) + return py::dtype("complex64"); + if (type == TILEDB_FLOAT64) + return py::dtype("complex128"); + } + + if (cell_val_num == TILEDB_VAR_NUM) + return tdb_to_np_dtype(type, 1); + + if (cell_val_num > 1) { + py::dtype base_dtype = tdb_to_np_dtype(type, 1); + py::tuple rec_elem = py::make_tuple("", base_dtype); + py::list rec_list; + for (size_t i = 0; i < cell_val_num; i++) + rec_list.append(rec_elem); + // note: we call the 'dtype' constructor b/c py::dtype does not accept + // list + auto np = py::module::import("numpy"); + auto np_dtype = np.attr("dtype"); + return np_dtype(rec_list); + } + + TPY_ERROR_LOC("tiledb datatype not understood ('" + + tiledb::impl::type_to_str(type) + + "', cell_val_num: " + std::to_string(cell_val_num) + ")"); +} + +tiledb_datatype_t np_to_tdb_dtype(py::dtype type) { + auto name = py::str(py::getattr(type, "name")); + if (_np_name_to_tdb_dtype.count(name) == 1) + return _np_name_to_tdb_dtype[name]; + + auto kind = py::str(py::getattr(type, "kind")); + if (kind == py::str("S")) + return TILEDB_STRING_ASCII; + if (kind == py::str("U")) + return TILEDB_STRING_UTF8; + + TPY_ERROR_LOC("could not handle numpy dtype"); +} + +/** + * @brief Convert ArrayBuffers to Arrow table. 
+ *
+ * @param cbs ArrayBuffers
+ * @return py::object
+ */
+py::object _buffer_to_table(std::shared_ptr<ArrayBuffers> buffers) {
+    auto pa = py::module::import("pyarrow");
+    auto pa_table_from_arrays = pa.attr("Table").attr("from_arrays");
+    auto pa_array_import = pa.attr("Array").attr("_import_from_c");
+    auto pa_schema_import = pa.attr("Schema").attr("_import_from_c");
+
+    py::list array_list;
+    py::list names;
+
+    for (auto& name : buffers->names()) {
+        auto column = buffers->at(name);
+        auto [pa_array, pa_schema] = ArrowAdapter::to_arrow(column);
+        auto array = pa_array_import(py::capsule(pa_array.get()),
+                                     py::capsule(pa_schema.get()));
+        array_list.append(array);
+        names.append(name);
+    }
+
+    return pa_table_from_arrays(array_list, names);
+}
+
+std::optional<py::object> to_table(
+    std::optional<std::shared_ptr<ArrayBuffers>> buffers){
+    // If more data was read, convert it to an arrow table and return
+    if (buffers.has_value()) {
+        return _buffer_to_table(*buffers);
+    }
+
+    // No data was read, the query is complete, return nullopt
+    return std::nullopt;
+}
+
+}
\ No newline at end of file
diff --git a/apis/python/src/tiledbsoma/common.h b/apis/python/src/tiledbsoma/common.h
new file mode 100644
index 0000000000..42173acba4
--- /dev/null
+++ b/apis/python/src/tiledbsoma/common.h
@@ -0,0 +1,139 @@
+#include <stdexcept>
+
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/pytypes.h>
+#include <pybind11/stl.h>
+#include <pybind11/stl_bind.h>
+
+#include <tiledb/tiledb>  // C++
+#include <tiledbsoma/tiledbsoma>
+
+using namespace std;
+using namespace tiledb;
+namespace py = pybind11;
+
+#define TPY_ERROR_LOC(m) throw TileDBSOMAPyError(m);
+
+class TileDBSOMAPyError : std::runtime_error {
+public:
+    explicit TileDBSOMAPyError(const char *m) : std::runtime_error(m) {}
+    explicit TileDBSOMAPyError(std::string m) : std::runtime_error(m.c_str()) {}
+
+public:
+    virtual const char *what() const noexcept override {
+        return std::runtime_error::what();
+    }
+};
+
+namespace tiledbsoma {
+
+py::dtype tdb_to_np_dtype(tiledb_datatype_t type, uint32_t cell_val_num);
+tiledb_datatype_t np_to_tdb_dtype(py::dtype type);
+std::optional<py::object> to_table(
+    std::optional<std::shared_ptr<ArrayBuffers>> buffers);
+
+class PyQueryCondition {
+
+private:
+    Context ctx_;
+    shared_ptr<QueryCondition> qc_;
+
+public:
+    PyQueryCondition(){
+        try {
+            // create one global context for all query conditions
+            static Context context = Context();
+            ctx_ = context;
+            qc_ = shared_ptr<QueryCondition>(new QueryCondition(ctx_));
+        } catch (TileDBError &e) {
+            TPY_ERROR_LOC(e.what());
+        }
+    }
+
+    PyQueryCondition(py::object ctx) {
+        (void)ctx;
+        try {
+            // create one global context for all query conditions
+            static Context context = Context();
+            ctx_ = context;
+            qc_ = shared_ptr<QueryCondition>(new QueryCondition(ctx_));
+        } catch (TileDBError &e) {
+            TPY_ERROR_LOC(e.what());
+        }
+    }
+
+    void init(const string &attribute_name, const string &condition_value,
+              tiledb_query_condition_op_t op) {
+        try {
+            qc_->init(attribute_name, condition_value, op);
+        } catch (TileDBError &e) {
+            TPY_ERROR_LOC(e.what());
+        }
+    }
+
+    template <typename T>
+    void init(const string &attribute_name, T condition_value,
+              tiledb_query_condition_op_t op) {
+        try {
+            qc_->init(attribute_name, &condition_value, sizeof(condition_value), op);
+        } catch (TileDBError &e) {
+            TPY_ERROR_LOC(e.what());
+        }
+    }
+
+    shared_ptr<QueryCondition> ptr() { return qc_; }
+
+    py::capsule __capsule__() { return py::capsule(&qc_, "qc"); }
+
+    template <typename T>
+    static PyQueryCondition
+    create(const std::string &field_name,
+           const std::vector<T> &values, tiledb_query_condition_op_t op) {
+        auto pyqc = PyQueryCondition();
+
+        const Context ctx = std::as_const(pyqc.ctx_);
+
+        auto set_membership_qc =
+            QueryConditionExperimental::create(ctx, field_name, values, op);
+
+        pyqc.qc_ = std::make_shared<QueryCondition>(std::move(set_membership_qc));
+
+        return pyqc;
+    }
+
+    PyQueryCondition
+    combine(PyQueryCondition qc,
+            tiledb_query_condition_combination_op_t combination_op) const {
+
+        auto pyqc = PyQueryCondition(nullptr, ctx_.ptr().get());
+
+        tiledb_query_condition_t *combined_qc = nullptr;
+        ctx_.handle_error(
+            tiledb_query_condition_alloc(ctx_.ptr().get(), &combined_qc));
+
+        ctx_.handle_error(tiledb_query_condition_combine(
+            ctx_.ptr().get(), qc_->ptr().get(), qc.qc_->ptr().get(),
+            combination_op, &combined_qc));
+
+        pyqc.qc_ = std::shared_ptr<QueryCondition>(
+            new QueryCondition(pyqc.ctx_, combined_qc));
+
+        return pyqc;
+    }
+
+private:
+    PyQueryCondition(shared_ptr<QueryCondition> qc, tiledb_ctx_t *c_ctx)
+        : qc_(qc) {
+        ctx_ = Context(c_ctx, false);
+    }
+
+    void set_ctx(py::object ctx) {
+        tiledb_ctx_t *c_ctx;
+        if ((c_ctx = (py::capsule)ctx.attr("__capsule__")()) == nullptr)
+            TPY_ERROR_LOC("Invalid context pointer!")
+
+        ctx_ = Context(c_ctx, false);
+    }
+};
+}
diff --git a/apis/python/src/tiledbsoma/io/ingest.py b/apis/python/src/tiledbsoma/io/ingest.py
index b1abd5d298..67d5651fe3 100644
--- a/apis/python/src/tiledbsoma/io/ingest.py
+++ b/apis/python/src/tiledbsoma/io/ingest.py
@@ -1777,7 +1777,7 @@ def _write_matrix_to_denseNDArray(

 def _read_nonempty_domain(arr: TileDBArray) -> Any:
     try:
-        return arr._handle.reader.nonempty_domain()
+        return arr._handle.non_empty_domain()
     except SOMAError:
         # This means that we're open in write-only mode.
         # Reopen the array in read mode.
@@ -1785,7 +1785,7 @@ def _read_nonempty_domain(arr: TileDBArray) -> Any:
     cls = type(arr)

     with cls.open(arr.uri, "r", platform_config=None, context=arr.context) as readarr:
-        return readarr._handle.reader.nonempty_domain()
+        return readarr._handle.non_empty_domain()


 def _find_sparse_chunk_size(
@@ -2251,7 +2251,7 @@ def _coo_to_table(

 def _chunk_is_contained_in(
     chunk_bounds: Sequence[Tuple[int, int]],
-    storage_nonempty_domain: Optional[Sequence[Tuple[Optional[int], Optional[int]]]],
+    storage_nonempty_domain: Sequence[Tuple[Optional[int], Optional[int]]],
 ) -> bool:
     """
     Determines if a dim range is included within the array's non-empty domain. Ranges are inclusive
@@ -2269,7 +2269,7 @@ def _chunk_is_contained_in(
     user that they declare they are retrying the exact same input file -- and we do our
     best to fulfill their ask by checking the dimension being strided on.
     """
-    if storage_nonempty_domain is None:
+    if len(storage_nonempty_domain) == 0:
         return False

     if len(chunk_bounds) != len(storage_nonempty_domain):
@@ -2288,6 +2288,9 @@ def _chunk_is_contained_in_axis(
     stride_axis: int,
 ) -> bool:
     """Helper function for ``_chunk_is_contained_in``."""
+    if len(storage_nonempty_domain) == 0:
+        return False
+
     storage_lo, storage_hi = storage_nonempty_domain[stride_axis]
     if storage_lo is None or storage_hi is None:
         # E.g. an array has had its schema created but no data written yet
diff --git a/apis/python/src/tiledbsoma/pytiledbsoma.cc b/apis/python/src/tiledbsoma/pytiledbsoma.cc
index 9d25d018aa..eee25aad4e 100644
--- a/apis/python/src/tiledbsoma/pytiledbsoma.cc
+++ b/apis/python/src/tiledbsoma/pytiledbsoma.cc
@@ -1,231 +1,49 @@
-/**
- * @file pytiledbsoma.cc
- *
- * @section LICENSE
- *
- * The MIT License
- *
- * @copyright Copyright (c) 2022 TileDB, Inc.
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - * - * @section DESCRIPTION - * - * This file defines the a pybind11 api into SOMA C++ library. - */ +#include +#include #include #include #include #include -#include -#include -#include +#include "common.h" -#include "query_condition.cc" - -#define DENUM(x) .value(#x, TILEDB_##x) - -using namespace tiledbsoma; +namespace libtiledbsomacpp { namespace py = pybind11; using namespace py::literals; +using namespace tiledbsoma; -namespace tiledbsoma { - -std::unordered_map _tdb_to_np_name_dtype = { - {TILEDB_INT32, "int32"}, - {TILEDB_INT64, "int64"}, - {TILEDB_FLOAT32, "float32"}, - {TILEDB_FLOAT64, "float64"}, - {TILEDB_INT8, "int8"}, - {TILEDB_UINT8, "uint8"}, - {TILEDB_INT16, "int16"}, - {TILEDB_UINT16, "uint16"}, - {TILEDB_UINT32, "uint32"}, - {TILEDB_UINT64, "uint64"}, - {TILEDB_STRING_ASCII, "S"}, - {TILEDB_STRING_UTF8, "U1"}, - {TILEDB_CHAR, "S1"}, - {TILEDB_DATETIME_YEAR, "M8[Y]"}, - {TILEDB_DATETIME_MONTH, "M8[M]"}, - {TILEDB_DATETIME_WEEK, "M8[W]"}, - {TILEDB_DATETIME_DAY, "M8[D]"}, - {TILEDB_DATETIME_HR, "M8[h]"}, - {TILEDB_DATETIME_MIN, "M8[m]"}, - {TILEDB_DATETIME_SEC, "M8[s]"}, - {TILEDB_DATETIME_MS, "M8[ms]"}, - {TILEDB_DATETIME_US, "M8[us]"}, - {TILEDB_DATETIME_NS, "M8[ns]"}, - {TILEDB_DATETIME_PS, "M8[ps]"}, - {TILEDB_DATETIME_FS, "M8[fs]"}, - {TILEDB_DATETIME_AS, "M8[as]"}, - {TILEDB_TIME_HR, "m8[h]"}, - {TILEDB_TIME_MIN, "m8[m]"}, - {TILEDB_TIME_SEC, "m8[s]"}, - {TILEDB_TIME_MS, "m8[ms]"}, - {TILEDB_TIME_US, "m8[us]"}, - {TILEDB_TIME_NS, "m8[ns]"}, - {TILEDB_TIME_PS, "m8[ps]"}, - {TILEDB_TIME_FS, "m8[fs]"}, - {TILEDB_TIME_AS, "m8[as]"}, - {TILEDB_BLOB, "byte"}, - {TILEDB_BOOL, "bool"}, -}; - -py::dtype tdb_to_np_dtype(tiledb_datatype_t type, uint32_t cell_val_num) { - if (type == TILEDB_CHAR || type == TILEDB_STRING_UTF8 || - type == TILEDB_STRING_ASCII) { - std::string base_str = (type == TILEDB_STRING_UTF8) ? 
"|U" : "|S"; - if (cell_val_num < TILEDB_VAR_NUM) - base_str += std::to_string(cell_val_num); - return py::dtype(base_str); - } - - if (cell_val_num == 1) { - if (type == TILEDB_STRING_UTF16 || type == TILEDB_STRING_UTF32) - TileDBSOMAError("Unimplemented UTF16 or UTF32 string conversion!"); - if (type == TILEDB_STRING_UCS2 || type == TILEDB_STRING_UCS4) - TileDBSOMAError("Unimplemented UCS2 or UCS4 string conversion!"); - - if (_tdb_to_np_name_dtype.count(type) == 1) - return py::dtype(_tdb_to_np_name_dtype[type]); - } - - if (cell_val_num == 2) { - if (type == TILEDB_FLOAT32) - return py::dtype("complex64"); - if (type == TILEDB_FLOAT64) - return py::dtype("complex128"); - } - - if (cell_val_num == TILEDB_VAR_NUM) - return tdb_to_np_dtype(type, 1); - - if (cell_val_num > 1) { - py::dtype base_dtype = tdb_to_np_dtype(type, 1); - py::tuple rec_elem = py::make_tuple("", base_dtype); - py::list rec_list; - for (size_t i = 0; i < cell_val_num; i++) - rec_list.append(rec_elem); - // note: we call the 'dtype' constructor b/c py::dtype does not accept - // list - auto np = py::module::import("numpy"); - auto np_dtype = np.attr("dtype"); - return np_dtype(rec_list); - } - - TileDBSOMAError("tiledb datatype not understood ('" + - tiledb::impl::type_to_str(type) + - "', cell_val_num: " + std::to_string(cell_val_num) + ")"); -} - -py::tuple get_enum(SOMAArray& sr, std::string attr_name){ - auto attr_to_enmrs = sr.get_attr_to_enum_mapping(); - if(attr_to_enmrs.count(attr_name) == 0) - throw TileDBSOMAError("Given attribute does not have enumeration"); - - Enumeration enmr(attr_to_enmrs.at(attr_name)); - - switch (enmr.type()) { - case TILEDB_UINT8: - return py::tuple(py::cast(enmr.as_vector())); - case TILEDB_INT8: - return py::tuple(py::cast(enmr.as_vector())); - case TILEDB_UINT16: - return py::tuple(py::cast(enmr.as_vector())); - case TILEDB_INT16: - return py::tuple(py::cast(enmr.as_vector())); - case TILEDB_UINT32: - return py::tuple(py::cast(enmr.as_vector())); - case TILEDB_INT32: - return py::tuple(py::cast(enmr.as_vector())); - case TILEDB_UINT64: - return py::tuple(py::cast(enmr.as_vector())); - case TILEDB_INT64: - return py::tuple(py::cast(enmr.as_vector())); - case TILEDB_FLOAT32: - return py::tuple(py::cast(enmr.as_vector())); - case TILEDB_FLOAT64: - return py::tuple(py::cast(enmr.as_vector())); - case TILEDB_STRING_ASCII: - case TILEDB_STRING_UTF8: - case TILEDB_CHAR: - return py::tuple(py::cast(enmr.as_vector())); - case TILEDB_BOOL: - return py::tuple(py::cast(enmr.as_vector())); - default: - throw TileDBSOMAError("Unsupported enumeration type."); - } -} - -bool get_enum_is_ordered(SOMAArray& sr, std::string attr_name){ - auto attr_to_enmrs = sr.get_attr_to_enum_mapping(); - if(attr_to_enmrs.count(attr_name) == 0) - throw TileDBSOMAError("Given attribute does not have enumeration"); - return attr_to_enmrs.at(attr_name).ordered(); -} - -/** - * @brief Convert ArrayBuffers to Arrow table. 
- * - * @param cbs ArrayBuffers - * @return py::object - */ -py::object _buffer_to_table(std::shared_ptr buffers) { - auto pa = py::module::import("pyarrow"); - auto pa_table_from_arrays = pa.attr("Table").attr("from_arrays"); - auto pa_array_import = pa.attr("Array").attr("_import_from_c"); - auto pa_schema_import = pa.attr("Schema").attr("_import_from_c"); - - py::list array_list; - py::list names; - - for (auto& name : buffers->names()) { - auto column = buffers->at(name); - auto [pa_array, pa_schema] = ArrowAdapter::to_arrow(column); - auto array = pa_array_import(py::capsule(pa_array.get()), - py::capsule(pa_schema.get())); - array_list.append(array); - names.append(name); - } - - return pa_table_from_arrays(array_list, names); -} - -std::optional to_table( - std::optional> buffers){ - // If more data was read, convert it to an arrow table and return - if (buffers.has_value()) { - return _buffer_to_table(*buffers); - } +template +using overload_cast_ = pybind11::detail::overload_cast_impl; - // No data was read, the query is complete, return nullopt - return std::nullopt; -} +void load_soma_array(py::module &); +void load_soma_object(py::module &); +void load_soma_dataframe(py::module &); +void load_query_condition(py::module &); -/** - * @brief pybind11 bindings - * - */ PYBIND11_MODULE(pytiledbsoma, m) { + py::register_exception(m, "SOMAError"); + + /* We need to make sure C++ TileDBSOMAError is translated to a correctly-typed + * Python error + */ + py::register_exception_translator([](std::exception_ptr p) { + auto tiledb_soma_error = + (py::object)py::module::import("tiledbsoma").attr("SOMAError"); + + try { + if (p) + std::rethrow_exception(p); + } catch (const TileDBSOMAError &e) { + PyErr_SetString(tiledb_soma_error.ptr(), e.what()); + } catch (const TileDBSOMAPyError &e) { + PyErr_SetString(tiledb_soma_error.ptr(), e.what()); + } catch (py::builtin_exception &e) { + throw; + }; + }); + py::enum_(m, "OpenMode") .value("read", OpenMode::read) .value("write", OpenMode::write); @@ -235,8 +53,6 @@ PYBIND11_MODULE(pytiledbsoma, m) { .value("rowmajor", ResultOrder::rowmajor) .value("colmajor", ResultOrder::colmajor); - tiledbpy::load_query_condition(m); - m.doc() = "SOMA acceleration library"; m.def("version", []() { return tiledbsoma::version::as_string(); }); @@ -273,427 +89,6 @@ PYBIND11_MODULE(pytiledbsoma, m) { }, "Print TileDB internal statistics. 
Lifecycle: experimental."); - py::class_(m, "SOMAArray") - .def( - py::init( - [](std::string_view uri, - std::string_view name, - std::optional> column_names_in, - std::string_view batch_size, - ResultOrder result_order, - std::map platform_config, - std::optional> timestamp) { - // Handle optional args - std::vector column_names; - if (column_names_in) { - column_names = *column_names_in; - } - - return SOMAArray::open( - OpenMode::read, - uri, - name, - platform_config, - column_names, - batch_size, - result_order, - timestamp); - }), - "uri"_a, - py::kw_only(), - "name"_a = "unnamed", - "column_names"_a = py::none(), - "batch_size"_a = "auto", - "result_order"_a = ResultOrder::automatic, - "platform_config"_a = py::dict(), - "timestamp"_a = py::none()) - - .def( - "set_condition", - [](SOMAArray& reader, - py::object py_query_condition, - py::object py_schema){ - auto attr_to_enum = reader.get_attr_to_enum_mapping(); - std::map enum_to_dtype; - for(auto const& [attr, enmr] : attr_to_enum){ - enum_to_dtype[attr] = tdb_to_np_dtype( - enmr.type(), enmr.cell_val_num()); - } - auto column_names = reader.column_names(); - // Handle query condition based on - // TileDB-Py::PyQuery::set_attr_cond() - QueryCondition* qc = nullptr; - if (!py_query_condition.is(py::none())) { - py::object init_pyqc = py_query_condition.attr( - "init_query_condition"); - try { - // Column names will be updated with columns present - // in the query condition - auto new_column_names = - init_pyqc(py_schema, enum_to_dtype, column_names) - .cast>(); - // Update the column_names list if it was not empty, - // otherwise continue selecting all columns with an - // empty column_names list - if (!column_names.empty()) { - column_names = new_column_names; - } - } catch (const std::exception& e) { - throw TileDBSOMAError(e.what()); - } - qc = py_query_condition.attr("c_obj") - .cast() - .ptr() - .get(); - } - reader.reset(column_names); - - // Release python GIL after we're done accessing python - // objects - py::gil_scoped_release release; - // Set query condition if present - if (qc) { - reader.set_condition(*qc); - } - }, - "py_query_condition"_a, - "py_schema"_a) - - .def( - "reset", - [](SOMAArray& reader, - std::optional> column_names_in, - std::string_view batch_size, - ResultOrder result_order) { - // Handle optional args - std::vector column_names; - if (column_names_in) { - column_names = *column_names_in; - } - - // Reset state of the existing SOMAArray object - reader.reset(column_names, batch_size, result_order); - }, - py::kw_only(), - "column_names"_a = py::none(), - "batch_size"_a = "auto", - "result_order"_a = ResultOrder::automatic) - - // After this are short functions expected to be invoked when the coords - // are Python list/tuple, or NumPy arrays. Arrow arrays are in this - // long if-else-if function. 
- .def( - "set_dim_points_arrow", - [](SOMAArray& reader, - const std::string& dim, - py::object py_arrow_array, - int partition_index, - int partition_count) { - // Create a list of array chunks - py::list array_chunks; - if (py::hasattr(py_arrow_array, "chunks")) { - array_chunks = py_arrow_array.attr("chunks") - .cast(); - } else { - array_chunks.append(py_arrow_array); - } - - for (const pybind11::handle array : array_chunks) { - ArrowSchema arrow_schema; - ArrowArray arrow_array; - uintptr_t arrow_schema_ptr = (uintptr_t)(&arrow_schema); - uintptr_t arrow_array_ptr = (uintptr_t)(&arrow_array); - - // Call array._export_to_c to get arrow array and schema - // - // If ever a NumPy array gets in here, there will be an - // exception like "AttributeError: 'numpy.ndarray' object - // has no attribute '_export_to_c'". - array.attr("_export_to_c")( - arrow_array_ptr, arrow_schema_ptr); - - auto coords = array.attr("tolist")(); - - if (!strcmp(arrow_schema.format, "l")) { - reader.set_dim_points( - dim, coords.cast>()); - } else if (!strcmp(arrow_schema.format, "i")) { - reader.set_dim_points( - dim, coords.cast>()); - } else if (!strcmp(arrow_schema.format, "s")) { - reader.set_dim_points( - dim, coords.cast>()); - } else if (!strcmp(arrow_schema.format, "c")) { - reader.set_dim_points( - dim, coords.cast>()); - } else if (!strcmp(arrow_schema.format, "L")) { - reader.set_dim_points( - dim, coords.cast>()); - } else if (!strcmp(arrow_schema.format, "I")) { - reader.set_dim_points( - dim, coords.cast>()); - } else if (!strcmp(arrow_schema.format, "S")) { - reader.set_dim_points( - dim, coords.cast>()); - } else if (!strcmp(arrow_schema.format, "C")) { - reader.set_dim_points( - dim, coords.cast>()); - } else if (!strcmp(arrow_schema.format, "f")) { - reader.set_dim_points( - dim, coords.cast>()); - } else if (!strcmp(arrow_schema.format, "g")) { - reader.set_dim_points( - dim, coords.cast>()); - } else if ( - !strcmp(arrow_schema.format, "u") || - !strcmp(arrow_schema.format, "z")) { - reader.set_dim_points( - dim, coords.cast>()); - } else if ( - !strcmp(arrow_schema.format, "tss:") || - !strcmp(arrow_schema.format, "tsm:") || - !strcmp(arrow_schema.format, "tsu:") || - !strcmp(arrow_schema.format, "tsn:")) { - // convert the Arrow Array to int64 - auto pa = py::module::import("pyarrow"); - coords = array.attr("cast")(pa.attr("int64")()).attr("tolist")(); - reader.set_dim_points( - dim, coords.cast>()); - } else if ( - !strcmp(arrow_schema.format, "U") || - !strcmp(arrow_schema.format, "Z")) { - reader.set_dim_points( - dim, coords.cast>()); - } else { - throw TileDBSOMAError( - "[pytiledbsoma] set_dim_points: type=" + std::string(arrow_schema.format) + " not " - "supported"); - } - - // Release arrow schema - arrow_schema.release(&arrow_schema); - } - }, - "dim"_a, - "py_arrow_array"_a, - "partition_index"_a = 0, - "partition_count"_a = 1) - - // The following short functions are expected to be invoked when the - // coords are Python list/tuple, or NumPy arrays. Arrow arrays are in - // the long if-else-if function above. - // - // Binding overloaded methods to templated member functions requires - // more effort, see: - // https://pybind11.readthedocs.io/en/stable/classes.html#overloaded-methods - - // In an initial version of this file we had `set_dim_ranges` relying - // solely on type-overloading. This worked since we supported only int - // and string indices. 
In a subsequent version we are now supporting - // various NumPy/PyArrow types including float32, float64, int8, uint16, - // etc. It is an unfortunate fact that pybind11 does _not_ successfully - // disambiguate between float32 and float64, or between int8 and int64, - // etc. given that we ask it to disambiguate using not just types but - // std::vector of types or std::vector of std::pair of types. - // Experiments have shown that when both float32 and float64 are - // implemented with overloaded names to be differentiated solely by - // type, pybind11 uses the _first found_. Therefore it is necessary for - // us to no longer use common overloaded names. - - .def( - "set_dim_points_string_or_bytes", - static_cast&)>( - &SOMAArray::set_dim_points)) - - .def( - "set_dim_points_float64", - static_cast&)>( - &SOMAArray::set_dim_points)) - - .def( - "set_dim_points_float32", - static_cast&)>( - &SOMAArray::set_dim_points)) - - .def( - "set_dim_points_int64", - static_cast&)>( - &SOMAArray::set_dim_points)) - - .def( - "set_dim_points_int32", - static_cast&)>( - &SOMAArray::set_dim_points)) - - .def( - "set_dim_points_int16", - static_cast&)>( - &SOMAArray::set_dim_points)) - - .def( - "set_dim_points_int8", - static_cast&)>( - &SOMAArray::set_dim_points)) - - .def( - "set_dim_points_uint64", - static_cast&)>( - &SOMAArray::set_dim_points)) - - .def( - "set_dim_points_uint32", - static_cast&)>( - &SOMAArray::set_dim_points)) - - .def( - "set_dim_points_uint16", - static_cast&)>( - &SOMAArray::set_dim_points)) - - .def( - "set_dim_points_uint8", - static_cast&)>( - &SOMAArray::set_dim_points)) - - // In an initial version of this file we had `set_dim_ranges` relying - // solely on type-overloading. This worked since we supported only int - // and string indices. In a subsequent version we are now supporting - // various NumPy/PyArrow types including float32, float64, int8, uint16, - // etc. It is an unfortunate fact that pybind11 does _not_ successfully - // disambiguate between float32 and float64, or between int8 and int64, - // etc. given that we ask it to disambiguate using not just types but - // std::vector of types or std::vector of std::pair of types. - // Experiments have shown that when both float32 and float64 are - // implemented with overloaded names to be differentiated solely by - // type, pybind11 uses the _first found_. Therefore it is necessary for - // us to no longer use common overloaded names. 
- - .def( - "set_dim_ranges_string_or_bytes", - static_cast>&)>( - &SOMAArray::set_dim_ranges)) - - .def( - "set_dim_ranges_int64", - static_cast>&)>( - &SOMAArray::set_dim_ranges)) - - .def( - "set_dim_ranges_int32", - static_cast>&)>( - &SOMAArray::set_dim_ranges)) - - .def( - "set_dim_ranges_int16", - static_cast>&)>( - &SOMAArray::set_dim_ranges)) - - .def( - "set_dim_ranges_int8", - static_cast>&)>( - &SOMAArray::set_dim_ranges)) - - .def( - "set_dim_ranges_uint64", - static_cast>&)>( - &SOMAArray::set_dim_ranges)) - - .def( - "set_dim_ranges_uint32", - static_cast>&)>( - &SOMAArray::set_dim_ranges)) - - .def( - "set_dim_ranges_uint16", - static_cast>&)>( - &SOMAArray::set_dim_ranges)) - - .def( - "set_dim_ranges_uint8", - static_cast>&)>( - &SOMAArray::set_dim_ranges)) - - .def( - "set_dim_ranges_float64", - static_cast>&)>( - &SOMAArray::set_dim_ranges)) - - .def( - "set_dim_ranges_float32", - static_cast>&)>( - &SOMAArray::set_dim_ranges)) - - .def("results_complete", &SOMAArray::results_complete) - - .def( - "read_next", - [](SOMAArray& reader) -> std::optional { - // Release python GIL before reading data - py::gil_scoped_release release; - - // Try to read more data - auto buffers = reader.read_next(); - - // If more data was read, convert it to an arrow table and - // return - if (buffers.has_value()) { - // Acquire python GIL before accessing python objects - py::gil_scoped_acquire acquire; - return to_table(*buffers); - } - - // No data was read, the query is complete, return nullopt - return std::nullopt; - }) - - .def("nnz", &SOMAArray::nnz, py::call_guard()) - - .def_property_readonly("shape", &SOMAArray::shape) - - .def_property_readonly("uri", &SOMAArray::uri) - - .def_property_readonly("column_names", &SOMAArray::column_names) - - .def_property_readonly("result_order", &SOMAArray::result_order) - - .def("get_enum", get_enum) - - .def("get_enum_is_ordered", get_enum_is_ordered) - - .def("get_enum_label_on_attr", &SOMAArray::get_enum_label_on_attr); // Efficient C++ re-indexing (aka hashing unique key values to an index // between 0 and number of keys - 1) based on khash py::class_(m, "IntIndexer") @@ -702,8 +97,8 @@ PYBIND11_MODULE(pytiledbsoma, m) { .def( "map_locations", [](IntIndexer& indexer, - py::array_t keys, - int num_threads) { + py::array_t keys, + int num_threads) { auto buffer = keys.request(); int64_t* data = static_cast(buffer.ptr); size_t length = buffer.shape[0]; @@ -712,8 +107,8 @@ PYBIND11_MODULE(pytiledbsoma, m) { .def( "map_locations", [](IntIndexer& indexer, - std::vector keys, - int num_threads) { + std::vector keys, + int num_threads) { indexer.map_locations(keys.data(), keys.size(), num_threads); }) // Perform lookup for a large input array of keys and return the looked @@ -740,8 +135,8 @@ PYBIND11_MODULE(pytiledbsoma, m) { .def( "get_indexer", [](IntIndexer& indexer, - py::array_t lookups, - py::array_t& results) { + py::array_t lookups, + py::array_t& results) { auto input_buffer = lookups.request(); int64_t* input_ptr = static_cast(input_buffer.ptr); size_t size = input_buffer.shape[0]; @@ -752,5 +147,11 @@ PYBIND11_MODULE(pytiledbsoma, m) { size_t results_size = input_buffer.shape[0]; indexer.lookup(input_ptr, input_ptr, size); }); + + load_soma_array(m); + load_soma_object(m); + load_soma_dataframe(m); + load_query_condition(m); } -} // namespace tiledbsoma + +}; diff --git a/apis/python/src/tiledbsoma/query_condition.cc b/apis/python/src/tiledbsoma/query_condition.cc index c8f1134c6a..2cf021c435 100644 --- 
a/apis/python/src/tiledbsoma/query_condition.cc +++ b/apis/python/src/tiledbsoma/query_condition.cc @@ -30,137 +30,20 @@ * This file implements the TileDB-Py query condition. */ -// clang-format off -#include -#include - -#include - -// #define TILEDB_DEPRECATED -// #define TILEDB_DEPRECATED_EXPORT - -// #include "util.h" -#include // C++ -#include +#include "common.h" #define TPY_ERROR_LOC(m) throw tiledbsoma::TileDBSOMAError(m); #if TILEDB_VERSION_MAJOR == 2 && TILEDB_VERSION_MINOR >= 2 - #if !defined(NDEBUG) -//#include "debug.cc" #endif - -namespace tiledbpy { +namespace libtiledbsomacpp { using namespace std; using namespace tiledb; namespace py = pybind11; -using namespace pybind11::literals; - -class PyQueryCondition { - -private: - Context ctx_; - shared_ptr qc_; - -public: - PyQueryCondition(){ - try { - // create one global context for all query conditions - static Context context = Context(); - ctx_ = context; - qc_ = shared_ptr(new QueryCondition(ctx_)); - } catch (TileDBError &e) { - TPY_ERROR_LOC(e.what()); - } - } - - PyQueryCondition(py::object ctx) { - (void)ctx; - try { - // create one global context for all query conditions - static Context context = Context(); - ctx_ = context; - qc_ = shared_ptr(new QueryCondition(ctx_)); - } catch (TileDBError &e) { - TPY_ERROR_LOC(e.what()); - } - } - - void init(const string &attribute_name, const string &condition_value, - tiledb_query_condition_op_t op) { - try { - qc_->init(attribute_name, condition_value, op); - } catch (TileDBError &e) { - TPY_ERROR_LOC(e.what()); - } - } - - template - void init(const string &attribute_name, T condition_value, - tiledb_query_condition_op_t op) { - try { - qc_->init(attribute_name, &condition_value, sizeof(condition_value), op); - } catch (TileDBError &e) { - TPY_ERROR_LOC(e.what()); - } - } - - shared_ptr ptr() { return qc_; } - - py::capsule __capsule__() { return py::capsule(&qc_, "qc"); } - - template - static PyQueryCondition - create(const std::string &field_name, - const std::vector &values, tiledb_query_condition_op_t op) { - auto pyqc = PyQueryCondition(); - - const Context ctx = std::as_const(pyqc.ctx_); - - auto set_membership_qc = - QueryConditionExperimental::create(ctx, field_name, values, op); - - pyqc.qc_ = std::make_shared(std::move(set_membership_qc)); - - return pyqc; - } - - PyQueryCondition - combine(PyQueryCondition qc, - tiledb_query_condition_combination_op_t combination_op) const { - - auto pyqc = PyQueryCondition(nullptr, ctx_.ptr().get()); - - tiledb_query_condition_t *combined_qc = nullptr; - ctx_.handle_error( - tiledb_query_condition_alloc(ctx_.ptr().get(), &combined_qc)); - - ctx_.handle_error(tiledb_query_condition_combine( - ctx_.ptr().get(), qc_->ptr().get(), qc.qc_->ptr().get(), - combination_op, &combined_qc)); - - pyqc.qc_ = std::shared_ptr( - new QueryCondition(pyqc.ctx_, combined_qc)); - - return pyqc; - } - -private: - PyQueryCondition(shared_ptr qc, tiledb_ctx_t *c_ctx) - : qc_(qc) { - ctx_ = Context(c_ctx, false); - } - - void set_ctx(py::object ctx) { - tiledb_ctx_t *c_ctx; - if ((c_ctx = (py::capsule)ctx.attr("__capsule__")()) == nullptr) - TPY_ERROR_LOC("Invalid context pointer!") - - ctx_ = Context(c_ctx, false); - } -}; // namespace tiledbpy +using namespace py::literals; +using namespace tiledbsoma; void load_query_condition(py::module &m) { py::class_(m, "PyQueryCondition", py::module_local()) @@ -299,7 +182,6 @@ void load_query_condition(py::module &m) { .value("TILEDB_AND", TILEDB_AND) .value("TILEDB_OR", TILEDB_OR) .export_values(); -} -}; // 
namespace tiledbpy
+}}
 
 #endif
diff --git a/apis/python/src/tiledbsoma/soma_array.cc b/apis/python/src/tiledbsoma/soma_array.cc
new file mode 100644
index 0000000000..2434c0b4ae
--- /dev/null
+++ b/apis/python/src/tiledbsoma/soma_array.cc
@@ -0,0 +1,506 @@
+/**
+ * @file   soma_array.cc
+ *
+ * @section LICENSE
+ *
+ * The MIT License
+ *
+ * @copyright Copyright (c) 2022 TileDB, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ * @section DESCRIPTION
+ *
+ * This file defines the SOMAArray bindings.
+ */
+
+#include "common.h"
+
+#define DENUM(x) .value(#x, TILEDB_##x)
+namespace libtiledbsomacpp {
+
+namespace py = pybind11;
+using namespace py::literals;
+using namespace tiledbsoma;
+
+py::tuple get_enum(SOMAArray& sr, std::string attr_name){
+    auto attr_to_enmrs = sr.get_attr_to_enum_mapping();
+    if(attr_to_enmrs.count(attr_name) == 0)
+        TPY_ERROR_LOC("Given attribute does not have enumeration");
+
+    Enumeration enmr(attr_to_enmrs.at(attr_name));
+
+    switch (enmr.type()) {
+        case TILEDB_UINT8:
+            return py::tuple(py::cast(enmr.as_vector<uint8_t>()));
+        case TILEDB_INT8:
+            return py::tuple(py::cast(enmr.as_vector<int8_t>()));
+        case TILEDB_UINT16:
+            return py::tuple(py::cast(enmr.as_vector<uint16_t>()));
+        case TILEDB_INT16:
+            return py::tuple(py::cast(enmr.as_vector<int16_t>()));
+        case TILEDB_UINT32:
+            return py::tuple(py::cast(enmr.as_vector<uint32_t>()));
+        case TILEDB_INT32:
+            return py::tuple(py::cast(enmr.as_vector<int32_t>()));
+        case TILEDB_UINT64:
+            return py::tuple(py::cast(enmr.as_vector<uint64_t>()));
+        case TILEDB_INT64:
+            return py::tuple(py::cast(enmr.as_vector<int64_t>()));
+        case TILEDB_FLOAT32:
+            return py::tuple(py::cast(enmr.as_vector<float>()));
+        case TILEDB_FLOAT64:
+            return py::tuple(py::cast(enmr.as_vector<double>()));
+        case TILEDB_STRING_ASCII:
+        case TILEDB_STRING_UTF8:
+        case TILEDB_CHAR:
+            return py::tuple(py::cast(enmr.as_vector<std::string>()));
+        case TILEDB_BOOL:
+            return py::tuple(py::cast(enmr.as_vector<bool>()));
+        default:
+            TPY_ERROR_LOC("Unsupported enumeration type.");
+    }
+}
+
+bool get_enum_is_ordered(SOMAArray& sr, std::string attr_name){
+    auto attr_to_enmrs = sr.get_attr_to_enum_mapping();
+    if(attr_to_enmrs.count(attr_name) == 0)
+        TPY_ERROR_LOC("Given attribute does not have enumeration");
+    return attr_to_enmrs.at(attr_name).ordered();
+}
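
The enumeration helpers above surface on the Python side as plain methods of `SOMAArray` (bound further down in this file). A minimal sketch of how they might be exercised, assuming a hypothetical local array whose attribute `cell_type` carries an enumeration; URI and names are illustrative:

    import tiledbsoma.pytiledbsoma as clib

    sr = clib.SOMAArray("/tmp/soma_example/obs")      # hypothetical URI
    values = sr.get_enum("cell_type")                 # tuple of enum values, e.g. ("B cell", "T cell")
    ordered = sr.get_enum_is_ordered("cell_type")     # TileDB ordered-enumeration flag
    label = sr.get_enum_label_on_attr("cell_type")    # name of the enumeration on the attribute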
+void load_soma_array(py::module &m) {
+    py::class_<SOMAArray>(m, "SOMAArray")
+        .def(
+            py::init(
+                [](std::string_view uri,
+                   std::string_view name,
+                   std::optional<std::vector<std::string>> column_names_in,
+                   std::string_view batch_size,
+                   ResultOrder result_order,
+                   std::map<std::string, std::string> platform_config,
+                   std::optional<std::pair<uint64_t, uint64_t>> timestamp) {
+                    // Handle optional args
+                    std::vector<std::string> column_names;
+                    if (column_names_in) {
+                        column_names = *column_names_in;
+                    }
+
+                    return SOMAArray::open(
+                        OpenMode::read,
+                        uri,
+                        name,
+                        platform_config,
+                        column_names,
+                        batch_size,
+                        result_order,
+                        timestamp);
+                }),
+            "uri"_a,
+            py::kw_only(),
+            "name"_a = "unnamed",
+            "column_names"_a = py::none(),
+            "batch_size"_a = "auto",
+            "result_order"_a = ResultOrder::automatic,
+            "platform_config"_a = py::dict(),
+            "timestamp"_a = py::none())
+
+        .def(
+            "set_condition",
+            [](SOMAArray& reader,
+               py::object py_query_condition,
+               py::object py_schema){
+                auto column_names = reader.column_names();
+                // Handle query condition based on
+                // TileDB-Py::PyQuery::set_attr_cond()
+                QueryCondition* qc = nullptr;
+                if (!py_query_condition.is(py::none())) {
+                    py::object init_pyqc = py_query_condition.attr(
+                        "init_query_condition");
+                    try {
+                        // Column names will be updated with columns present
+                        // in the query condition
+                        auto new_column_names =
+                            init_pyqc(py_schema, column_names)
+                                .cast<std::vector<std::string>>();
+                        // Update the column_names list if it was not empty,
+                        // otherwise continue selecting all columns with an
+                        // empty column_names list
+                        if (!column_names.empty()) {
+                            column_names = new_column_names;
+                        }
+                    } catch (const std::exception& e) {
+                        TPY_ERROR_LOC(e.what());
+                    }
+                    qc = py_query_condition.attr("c_obj")
+                             .cast<PyQueryCondition>()
+                             .ptr()
+                             .get();
+                }
+                reader.reset(column_names);
+
+                // Release python GIL after we're done accessing python
+                // objects
+                py::gil_scoped_release release;
+                // Set query condition if present
+                if (qc) {
+                    reader.set_condition(*qc);
+                }
+            },
+            "py_query_condition"_a,
+            "py_schema"_a)
+
+        .def(
+            "reset",
+            [](SOMAArray& reader,
+               std::optional<std::vector<std::string>> column_names_in,
+               std::string_view batch_size,
+               ResultOrder result_order) {
+                // Handle optional args
+                std::vector<std::string> column_names;
+                if (column_names_in) {
+                    column_names = *column_names_in;
+                }
+
+                // Reset state of the existing SOMAArray object
+                reader.reset(column_names, batch_size, result_order);
+            },
+            py::kw_only(),
+            "column_names"_a = py::none(),
+            "batch_size"_a = "auto",
+            "result_order"_a = ResultOrder::automatic)
+
+        // After this are short functions expected to be invoked when the
+        // coords are Python list/tuple, or NumPy arrays. Arrow arrays are in
+        // this long if-else-if function.
+        .def(
+            "set_dim_points_arrow",
+            [](SOMAArray& reader,
+               const std::string& dim,
+               py::object py_arrow_array,
+               int partition_index,
+               int partition_count) {
+                // Create a list of array chunks
+                py::list array_chunks;
+                if (py::hasattr(py_arrow_array, "chunks")) {
+                    array_chunks = py_arrow_array.attr("chunks")
+                                       .cast<py::list>();
+                } else {
+                    array_chunks.append(py_arrow_array);
+                }
+
+                for (const pybind11::handle array : array_chunks) {
+                    ArrowSchema arrow_schema;
+                    ArrowArray arrow_array;
+                    uintptr_t arrow_schema_ptr = (uintptr_t)(&arrow_schema);
+                    uintptr_t arrow_array_ptr = (uintptr_t)(&arrow_array);
+
+                    // Call array._export_to_c to get arrow array and schema
+                    //
+                    // If ever a NumPy array gets in here, there will be an
+                    // exception like "AttributeError: 'numpy.ndarray' object
+                    // has no attribute '_export_to_c'".
+                    array.attr("_export_to_c")(
+                        arrow_array_ptr, arrow_schema_ptr);
+
+                    auto coords = array.attr("tolist")();
+
+                    if (!strcmp(arrow_schema.format, "l")) {
+                        reader.set_dim_points(
+                            dim, coords.cast<std::vector<int64_t>>());
+                    } else if (!strcmp(arrow_schema.format, "i")) {
+                        reader.set_dim_points(
+                            dim, coords.cast<std::vector<int32_t>>());
+                    } else if (!strcmp(arrow_schema.format, "s")) {
+                        reader.set_dim_points(
+                            dim, coords.cast<std::vector<int16_t>>());
+                    } else if (!strcmp(arrow_schema.format, "c")) {
+                        reader.set_dim_points(
+                            dim, coords.cast<std::vector<int8_t>>());
+                    } else if (!strcmp(arrow_schema.format, "L")) {
+                        reader.set_dim_points(
+                            dim, coords.cast<std::vector<uint64_t>>());
+                    } else if (!strcmp(arrow_schema.format, "I")) {
+                        reader.set_dim_points(
+                            dim, coords.cast<std::vector<uint32_t>>());
+                    } else if (!strcmp(arrow_schema.format, "S")) {
+                        reader.set_dim_points(
+                            dim, coords.cast<std::vector<uint16_t>>());
+                    } else if (!strcmp(arrow_schema.format, "C")) {
+                        reader.set_dim_points(
+                            dim, coords.cast<std::vector<uint8_t>>());
+                    } else if (!strcmp(arrow_schema.format, "f")) {
+                        reader.set_dim_points(
+                            dim, coords.cast<std::vector<float>>());
+                    } else if (!strcmp(arrow_schema.format, "g")) {
+                        reader.set_dim_points(
+                            dim, coords.cast<std::vector<double>>());
+                    } else if (
+                        !strcmp(arrow_schema.format, "u") ||
+                        !strcmp(arrow_schema.format, "z")) {
+                        reader.set_dim_points(
+                            dim, coords.cast<std::vector<std::string>>());
+                    } else if (
+                        !strcmp(arrow_schema.format, "tss:") ||
+                        !strcmp(arrow_schema.format, "tsm:") ||
+                        !strcmp(arrow_schema.format, "tsu:") ||
+                        !strcmp(arrow_schema.format, "tsn:")) {
+                        // convert the Arrow Array to int64
+                        auto pa = py::module::import("pyarrow");
+                        coords = array.attr("cast")(pa.attr("int64")()).attr("tolist")();
+                        reader.set_dim_points(
+                            dim, coords.cast<std::vector<int64_t>>());
+                    } else if (
+                        !strcmp(arrow_schema.format, "U") ||
+                        !strcmp(arrow_schema.format, "Z")) {
+                        reader.set_dim_points(
+                            dim, coords.cast<std::vector<std::string>>());
+                    } else {
+                        TPY_ERROR_LOC(
+                            "[pytiledbsoma] set_dim_points: type=" +
+                            std::string(arrow_schema.format) +
+                            " not supported");
+                    }
+
+                    // Release arrow schema
+                    arrow_schema.release(&arrow_schema);
+                }
+            },
+            "dim"_a,
+            "py_arrow_array"_a,
+            "partition_index"_a = 0,
+            "partition_count"_a = 1)
+
+        // The following short functions are expected to be invoked when the
+        // coords are Python list/tuple, or NumPy arrays. Arrow arrays are in
+        // the long if-else-if function above.
+        //
+        // Binding overloaded methods to templated member functions requires
+        // more effort, see:
+        // https://pybind11.readthedocs.io/en/stable/classes.html#overloaded-methods
+
+        // In an initial version of this file we had `set_dim_points` relying
+        // solely on type-overloading. This worked since we supported only int
+        // and string indices. In a subsequent version we are now supporting
+        // various NumPy/PyArrow types including float32, float64, int8, uint16,
+        // etc. It is an unfortunate fact that pybind11 does _not_ successfully
+        // disambiguate between float32 and float64, or between int8 and int64,
+        // etc. given that we ask it to disambiguate using not just types but
+        // std::vector of types or std::vector of std::pair of types.
+        // Experiments have shown that when both float32 and float64 are
+        // implemented with overloaded names to be differentiated solely by
+        // type, pybind11 uses the _first found_. Therefore it is necessary for
+        // us to no longer use common overloaded names.
+
+        .def(
+            "set_dim_points_string_or_bytes",
+            static_cast<void (SOMAArray::*)(
+                const std::string&, const std::vector<std::string>&)>(
+                &SOMAArray::set_dim_points))
+
+        .def(
+            "set_dim_points_float64",
+            static_cast<void (SOMAArray::*)(
+                const std::string&, const std::vector<double>&)>(
+                &SOMAArray::set_dim_points))
+
+        .def(
+            "set_dim_points_float32",
+            static_cast<void (SOMAArray::*)(
+                const std::string&, const std::vector<float>&)>(
+                &SOMAArray::set_dim_points))
+
+        .def(
+            "set_dim_points_int64",
+            static_cast<void (SOMAArray::*)(
+                const std::string&, const std::vector<int64_t>&)>(
+                &SOMAArray::set_dim_points))
+
+        .def(
+            "set_dim_points_int32",
+            static_cast<void (SOMAArray::*)(
+                const std::string&, const std::vector<int32_t>&)>(
+                &SOMAArray::set_dim_points))
+
+        .def(
+            "set_dim_points_int16",
+            static_cast<void (SOMAArray::*)(
+                const std::string&, const std::vector<int16_t>&)>(
+                &SOMAArray::set_dim_points))
+
+        .def(
+            "set_dim_points_int8",
+            static_cast<void (SOMAArray::*)(
+                const std::string&, const std::vector<int8_t>&)>(
+                &SOMAArray::set_dim_points))
+
+        .def(
+            "set_dim_points_uint64",
+            static_cast<void (SOMAArray::*)(
+                const std::string&, const std::vector<uint64_t>&)>(
+                &SOMAArray::set_dim_points))
+
+        .def(
+            "set_dim_points_uint32",
+            static_cast<void (SOMAArray::*)(
+                const std::string&, const std::vector<uint32_t>&)>(
+                &SOMAArray::set_dim_points))
+
+        .def(
+            "set_dim_points_uint16",
+            static_cast<void (SOMAArray::*)(
+                const std::string&, const std::vector<uint16_t>&)>(
+                &SOMAArray::set_dim_points))
+
+        .def(
+            "set_dim_points_uint8",
+            static_cast<void (SOMAArray::*)(
+                const std::string&, const std::vector<uint8_t>&)>(
+                &SOMAArray::set_dim_points))
+
+        // In an initial version of this file we had `set_dim_ranges` relying
+        // solely on type-overloading. This worked since we supported only int
+        // and string indices. In a subsequent version we are now supporting
+        // various NumPy/PyArrow types including float32, float64, int8, uint16,
+        // etc. It is an unfortunate fact that pybind11 does _not_ successfully
+        // disambiguate between float32 and float64, or between int8 and int64,
+        // etc. given that we ask it to disambiguate using not just types but
+        // std::vector of types or std::vector of std::pair of types.
+        // Experiments have shown that when both float32 and float64 are
+        // implemented with overloaded names to be differentiated solely by
+        // type, pybind11 uses the _first found_. Therefore it is necessary for
+        // us to no longer use common overloaded names.
+
+        .def(
+            "set_dim_ranges_string_or_bytes",
+            static_cast<void (SOMAArray::*)(
+                const std::string&,
+                const std::vector<std::pair<std::string, std::string>>&)>(
+                &SOMAArray::set_dim_ranges))
+
+        .def(
+            "set_dim_ranges_int64",
+            static_cast<void (SOMAArray::*)(
+                const std::string&,
+                const std::vector<std::pair<int64_t, int64_t>>&)>(
+                &SOMAArray::set_dim_ranges))
+
+        .def(
+            "set_dim_ranges_int32",
+            static_cast<void (SOMAArray::*)(
+                const std::string&,
+                const std::vector<std::pair<int32_t, int32_t>>&)>(
+                &SOMAArray::set_dim_ranges))
+
+        .def(
+            "set_dim_ranges_int16",
+            static_cast<void (SOMAArray::*)(
+                const std::string&,
+                const std::vector<std::pair<int16_t, int16_t>>&)>(
+                &SOMAArray::set_dim_ranges))
+
+        .def(
+            "set_dim_ranges_int8",
+            static_cast<void (SOMAArray::*)(
+                const std::string&,
+                const std::vector<std::pair<int8_t, int8_t>>&)>(
+                &SOMAArray::set_dim_ranges))
+
+        .def(
+            "set_dim_ranges_uint64",
+            static_cast<void (SOMAArray::*)(
+                const std::string&,
+                const std::vector<std::pair<uint64_t, uint64_t>>&)>(
+                &SOMAArray::set_dim_ranges))
+
+        .def(
+            "set_dim_ranges_uint32",
+            static_cast<void (SOMAArray::*)(
+                const std::string&,
+                const std::vector<std::pair<uint32_t, uint32_t>>&)>(
+                &SOMAArray::set_dim_ranges))
+
+        .def(
+            "set_dim_ranges_uint16",
+            static_cast<void (SOMAArray::*)(
+                const std::string&,
+                const std::vector<std::pair<uint16_t, uint16_t>>&)>(
+                &SOMAArray::set_dim_ranges))
+
+        .def(
+            "set_dim_ranges_uint8",
+            static_cast<void (SOMAArray::*)(
+                const std::string&,
+                const std::vector<std::pair<uint8_t, uint8_t>>&)>(
+                &SOMAArray::set_dim_ranges))
+
+        .def(
+            "set_dim_ranges_float64",
+            static_cast<void (SOMAArray::*)(
+                const std::string&,
+                const std::vector<std::pair<double, double>>&)>(
+                &SOMAArray::set_dim_ranges))
+
+        .def(
+            "set_dim_ranges_float32",
+            static_cast<void (SOMAArray::*)(
+                const std::string&,
+                const std::vector<std::pair<float, float>>&)>(
+                &SOMAArray::set_dim_ranges))
+
+        .def("results_complete", &SOMAArray::results_complete)
+
+        .def(
+            "read_next",
+            [](SOMAArray& reader) -> std::optional<py::object> {
+                // Release python GIL before reading data
+                py::gil_scoped_release release;
+
+                // Try to read more data
+                auto buffers = reader.read_next();
+
+                // If more data was read, convert it to an arrow table and
+                // return
+                if (buffers.has_value()) {
+                    // Acquire python GIL before accessing python objects
+                    py::gil_scoped_acquire acquire;
+                    return to_table(*buffers);
+                }
+
+                // No data was read, the query is complete, return nullopt
+                return std::nullopt;
+            })
+
+        .def("nnz", &SOMAArray::nnz, py::call_guard<py::gil_scoped_release>())
+
+        .def_property_readonly("shape", &SOMAArray::shape)
+
+        .def_property_readonly("uri", &SOMAArray::uri)
+
+        .def_property_readonly("column_names", &SOMAArray::column_names)
+
+        .def_property_readonly("result_order", &SOMAArray::result_order)
+
+        .def("get_enum", get_enum)
+
+        .def("get_enum_is_ordered", get_enum_is_ordered)
+
+        .def("get_enum_label_on_attr", &SOMAArray::get_enum_label_on_attr);
+}
+} // namespace libtiledbsomacpp
\ No newline at end of file
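
Taken together, the bindings above support an open, filter, incremental-read workflow from Python. A sketch under assumed paths (the URI and column names are illustrative; `tiledb_schema_to_arrow` and `QueryCondition` are the same helpers exercised by the tests later in this change):

    import pyarrow as pa
    import tiledb
    import tiledbsoma
    import tiledbsoma.pytiledbsoma as clib
    from tiledbsoma._arrow_types import tiledb_schema_to_arrow
    from tiledbsoma._query_condition import QueryCondition

    uri = "/path/to/soco/pbmc3k_processed/obs"   # hypothetical test array
    sr = clib.SOMAArray(uri)

    # Optional value filter: set_condition resets the reader, then applies the condition
    schema = tiledb_schema_to_arrow(tiledb.open(uri).schema, uri, tiledb.default_ctx())
    sr.set_condition(QueryCondition("percent_mito > 0.02"), schema)

    # Optional coordinate filter via Arrow (format "l" routes to the int64 branch above)
    sr.set_dim_points_arrow("soma_joinid", pa.array([0, 2, 4], type=pa.int64()))

    # Incremental read loop; read_next yields pyarrow.Table chunks until None.
    # The C++ TileDBSOMAError surfaces as tiledbsoma.SOMAError via the module's
    # exception translator.
    try:
        while (tbl := sr.read_next()) is not None:
            print(tbl.num_rows)
    except tiledbsoma.SOMAError as e:
        print("query failed:", e)
    assert sr.results_complete()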
diff --git a/apis/python/src/tiledbsoma/soma_dataframe.cc b/apis/python/src/tiledbsoma/soma_dataframe.cc
new file mode 100644
index 0000000000..18717fc9ed
--- /dev/null
+++ b/apis/python/src/tiledbsoma/soma_dataframe.cc
@@ -0,0 +1,474 @@
+/**
+ * @file   soma_dataframe.cc
+ *
+ * @section LICENSE
+ *
+ * The MIT License
+ *
+ * @copyright Copyright (c) 2023 TileDB, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ * @section DESCRIPTION
+ *
+ * This file defines the SOMADataFrame bindings.
+ */
+
+#include <memory>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/pytypes.h>
+#include <pybind11/stl.h>
+
+#include <tiledbsoma/tiledbsoma>
+
+#include "common.h"
+
+namespace libtiledbsomacpp {
+
+namespace py = pybind11;
+using namespace py::literals;
+using namespace tiledbsoma;
+
+void load_soma_dataframe(py::module &m) {
+    py::class_<SOMADataFrame>(m, "SOMADataFrame")
+
+        .def_static(
+            "open",
+            py::overload_cast<
+                std::string_view,
+                OpenMode,
+                std::map<std::string, std::string>,
+                std::vector<std::string>,
+                ResultOrder,
+                std::optional<std::pair<uint64_t, uint64_t>>>(
+                &SOMADataFrame::open),
+            "uri"_a,
+            "mode"_a,
+            py::kw_only(),
+            "platform_config"_a = py::dict(),
+            "column_names"_a = py::none(),
+            "result_order"_a = ResultOrder::automatic,
+            "timestamp"_a = py::none())
+
+        .def_static("exists", &SOMADataFrame::exists)
+        .def("reopen",
+            py::overload_cast<
+                OpenMode,
+                std::optional<std::pair<uint64_t, uint64_t>>>(
+                &SOMADataFrame::open))
+        .def("close", &SOMADataFrame::close)
+        .def_property_readonly("closed", [](SOMADataFrame& soma_df) -> bool {
+            return not soma_df.is_open();
+        })
+        .def("reset", &SOMADataFrame::reset)
+        .def("set_condition",
+            [](SOMADataFrame& reader,
+               py::object py_query_condition,
+               py::object py_schema){
+                auto column_names = reader.column_names();
+                // Handle query condition based on
+                // TileDB-Py::PyQuery::set_attr_cond()
+                QueryCondition* qc = nullptr;
+                if (!py_query_condition.is(py::none())) {
+                    py::object init_pyqc = py_query_condition.attr(
+                        "init_query_condition");
+                    try {
+                        // Column names will be updated with columns present
+                        // in the query condition
+                        auto new_column_names =
+                            init_pyqc(py_schema, column_names)
+                                .cast<std::vector<std::string>>();
+                        // Update the column_names list if it was not empty,
+                        // otherwise continue selecting all columns with an
+                        // empty column_names list
+                        if (!column_names.empty()) {
+                            column_names = new_column_names;
+                        }
+                    } catch (const std::exception& e) {
+                        throw TileDBSOMAError(e.what());
+                    }
+                    qc = py_query_condition.attr("c_obj")
+                             .cast<PyQueryCondition>()
+                             .ptr()
+                             .get();
+                }
+                // Reset unconditionally (as in the SOMAArray binding), not
+                // only when a condition was given
+                reader.reset(column_names);
+
+                // Release python GIL after we're done accessing python
+                // objects
+                py::gil_scoped_release release;
+                // Set query condition if present
+                if (qc) {
+                    reader.set_condition(*qc);
+                }
+            },
+            "py_query_condition"_a,
+            "py_schema"_a)
+        .def_property_readonly("type", &SOMADataFrame::type)
+        .def_property_readonly("uri", &SOMADataFrame::uri)
+        .def_property_readonly("mode", [](SOMADataFrame& soma_df){
+            return soma_df.mode() == OpenMode::read ? "r" : "w";
+        })
"r" : "w"; + }) + .def_property_readonly("schema", [](SOMADataFrame& soma_df) -> py::object { + auto pa = py::module::import("pyarrow"); + auto pa_schema_import = pa.attr("Schema").attr("_import_from_c"); + return pa_schema_import(py::capsule(soma_df.schema().get())); + }) + .def_property_readonly("timestamp", [](SOMADataFrame& soma_df) -> py::object { + if(!soma_df.timestamp().has_value()) + return py::none(); + return py::cast(soma_df.timestamp()->second); + }) + .def_property_readonly("index_column_names", &SOMADataFrame::index_column_names) + .def("non_empty_domain", [](SOMADataFrame& soma_df, std::string name, py::dtype dtype){ + switch (np_to_tdb_dtype(dtype)) { + case TILEDB_UINT64: + return py::cast(soma_df.non_empty_domain(name)); + case TILEDB_DATETIME_YEAR: + case TILEDB_DATETIME_MONTH: + case TILEDB_DATETIME_WEEK: + case TILEDB_DATETIME_DAY: + case TILEDB_DATETIME_HR: + case TILEDB_DATETIME_MIN: + case TILEDB_DATETIME_SEC: + case TILEDB_DATETIME_MS: + case TILEDB_DATETIME_US: + case TILEDB_DATETIME_NS: + case TILEDB_DATETIME_PS: + case TILEDB_DATETIME_FS: + case TILEDB_DATETIME_AS: + case TILEDB_INT64: + return py::cast(soma_df.non_empty_domain(name)); + case TILEDB_UINT32: + return py::cast(soma_df.non_empty_domain(name)); + case TILEDB_INT32: + return py::cast(soma_df.non_empty_domain(name)); + case TILEDB_UINT16: + return py::cast(soma_df.non_empty_domain(name)); + case TILEDB_INT16: + return py::cast(soma_df.non_empty_domain(name)); + case TILEDB_UINT8: + return py::cast(soma_df.non_empty_domain(name)); + case TILEDB_INT8: + return py::cast(soma_df.non_empty_domain(name)); + case TILEDB_FLOAT64: + return py::cast(soma_df.non_empty_domain(name)); + case TILEDB_FLOAT32: + return py::cast(soma_df.non_empty_domain(name)); + case TILEDB_STRING_UTF8: + case TILEDB_STRING_ASCII: + return py::cast(soma_df.non_empty_domain_var(name)); + default: + throw TileDBSOMAError("Unsupported dtype for nonempty domain."); + } + }) + .def("domain", [](SOMADataFrame& soma_df, std::string name, py::dtype dtype) { + switch (np_to_tdb_dtype(dtype)) { + case TILEDB_UINT64: + return py::cast(soma_df.domain(name)); + case TILEDB_DATETIME_YEAR: + case TILEDB_DATETIME_MONTH: + case TILEDB_DATETIME_WEEK: + case TILEDB_DATETIME_DAY: + case TILEDB_DATETIME_HR: + case TILEDB_DATETIME_MIN: + case TILEDB_DATETIME_SEC: + case TILEDB_DATETIME_MS: + case TILEDB_DATETIME_US: + case TILEDB_DATETIME_NS: + case TILEDB_DATETIME_PS: + case TILEDB_DATETIME_FS: + case TILEDB_DATETIME_AS: + case TILEDB_INT64: + return py::cast(soma_df.domain(name)); + case TILEDB_UINT32: + return py::cast(soma_df.domain(name)); + case TILEDB_INT32: + return py::cast(soma_df.domain(name)); + case TILEDB_UINT16: + return py::cast(soma_df.domain(name)); + case TILEDB_INT16: + return py::cast(soma_df.domain(name)); + case TILEDB_UINT8: + return py::cast(soma_df.domain(name)); + case TILEDB_INT8: + return py::cast(soma_df.domain(name)); + case TILEDB_FLOAT64: + return py::cast(soma_df.domain(name)); + case TILEDB_FLOAT32: + return py::cast(soma_df.domain(name)); + case TILEDB_STRING_UTF8: + case TILEDB_STRING_ASCII: { + std::pair str_domain; + return py::cast(std::make_pair("", "")); + } + default: + throw TileDBSOMAError("Unsupported dtype for Dimension's domain"); + } + }) + .def_property_readonly("count", &SOMADataFrame::count) + .def("read_next", [](SOMADataFrame& dataframe){ + // Release GIL when reading data + py::gil_scoped_release release; + auto buffers = dataframe.read_next(); + py::gil_scoped_acquire acquire; + + return 
+        .def_property_readonly("count", &SOMADataFrame::count)
+        .def("read_next", [](SOMADataFrame& dataframe){
+            // Release GIL when reading data
+            py::gil_scoped_release release;
+            auto buffers = dataframe.read_next();
+            py::gil_scoped_acquire acquire;
+
+            return to_table(buffers);
+        })
+        .def("set_metadata", &SOMADataFrame::set_metadata)
+        .def("delete_metadata", &SOMADataFrame::delete_metadata)
+        .def("get_metadata",
+            py::overload_cast<const std::string&>(&SOMADataFrame::get_metadata))
+        .def_property_readonly("meta", [](SOMADataFrame& soma_dataframe) -> py::dict {
+            py::dict results;
+
+            for (auto const& [key, val] : soma_dataframe.get_metadata()){
+                tiledb_datatype_t tdb_type = std::get<0>(val);
+                uint32_t value_num = std::get<1>(val);
+                const void *value = std::get<2>(val);
+
+                if(tdb_type == TILEDB_STRING_UTF8){
+                    results[py::str(key)] = py::str(std::string((const char*)value, value_num));
+                }else if(tdb_type == TILEDB_STRING_ASCII){
+                    results[py::str(key)] = py::bytes(std::string((const char*)value, value_num));
+                }else{
+                    py::dtype value_type = tdb_to_np_dtype(tdb_type, 1);
+                    results[py::str(key)] = py::array(value_type, value_num, value);
+                }
+            }
+            return results;
+        })
+        .def("has_metadata", &SOMADataFrame::has_metadata)
+        .def("metadata_num", &SOMADataFrame::metadata_num)
+        .def(
+            "set_dim_points_arrow",
+            [](SOMADataFrame& reader,
+               const std::string& dim,
+               py::object py_arrow_array,
+               int partition_index,
+               int partition_count) {
+                // Create a list of array chunks
+                py::list array_chunks;
+                if (py::hasattr(py_arrow_array, "chunks")) {
+                    array_chunks = py_arrow_array.attr("chunks")
+                                       .cast<py::list>();
+                } else {
+                    array_chunks.append(py_arrow_array);
+                }
+
+                for (const pybind11::handle array : array_chunks) {
+                    ArrowSchema arrow_schema;
+                    ArrowArray arrow_array;
+                    uintptr_t arrow_schema_ptr = (uintptr_t)(&arrow_schema);
+                    uintptr_t arrow_array_ptr = (uintptr_t)(&arrow_array);
+
+                    // Call array._export_to_c to get arrow array and schema
+                    //
+                    // If ever a NumPy array gets in here, there will be an
+                    // exception like "AttributeError: 'numpy.ndarray' object
+                    // has no attribute '_export_to_c'".
+                    array.attr("_export_to_c")(
+                        arrow_array_ptr, arrow_schema_ptr);
+
+                    auto coords = array.attr("tolist")();
+
+                    if (!strcmp(arrow_schema.format, "l")) {
+                        reader.set_dim_points(
+                            dim, coords.cast<std::vector<int64_t>>());
+                    } else if (!strcmp(arrow_schema.format, "i")) {
+                        reader.set_dim_points(
+                            dim, coords.cast<std::vector<int32_t>>());
+                    } else if (!strcmp(arrow_schema.format, "s")) {
+                        reader.set_dim_points(
+                            dim, coords.cast<std::vector<int16_t>>());
+                    } else if (!strcmp(arrow_schema.format, "c")) {
+                        reader.set_dim_points(
+                            dim, coords.cast<std::vector<int8_t>>());
+                    } else if (!strcmp(arrow_schema.format, "L")) {
+                        reader.set_dim_points(
+                            dim, coords.cast<std::vector<uint64_t>>());
+                    } else if (!strcmp(arrow_schema.format, "I")) {
+                        reader.set_dim_points(
+                            dim, coords.cast<std::vector<uint32_t>>());
+                    } else if (!strcmp(arrow_schema.format, "S")) {
+                        reader.set_dim_points(
+                            dim, coords.cast<std::vector<uint16_t>>());
+                    } else if (!strcmp(arrow_schema.format, "C")) {
+                        reader.set_dim_points(
+                            dim, coords.cast<std::vector<uint8_t>>());
+                    } else if (!strcmp(arrow_schema.format, "f")) {
+                        reader.set_dim_points(
+                            dim, coords.cast<std::vector<float>>());
+                    } else if (!strcmp(arrow_schema.format, "g")) {
+                        reader.set_dim_points(
+                            dim, coords.cast<std::vector<double>>());
+                    } else if (
+                        !strcmp(arrow_schema.format, "u") ||
+                        !strcmp(arrow_schema.format, "z")) {
+                        reader.set_dim_points(
+                            dim, coords.cast<std::vector<std::string>>());
+                    } else if (
+                        !strcmp(arrow_schema.format, "tss:") ||
+                        !strcmp(arrow_schema.format, "tsm:") ||
+                        !strcmp(arrow_schema.format, "tsu:") ||
+                        !strcmp(arrow_schema.format, "tsn:")) {
+                        // convert the Arrow Array to int64
+                        auto pa = py::module::import("pyarrow");
+                        coords = array.attr("cast")(pa.attr("int64")()).attr("tolist")();
+                        reader.set_dim_points(
+                            dim, coords.cast<std::vector<int64_t>>());
+                    } else if (
+                        !strcmp(arrow_schema.format, "U") ||
+                        !strcmp(arrow_schema.format, "Z")) {
+                        reader.set_dim_points(
+                            dim, coords.cast<std::vector<std::string>>());
+                    } else {
+                        throw TileDBSOMAError(
+                            "[pytiledbsoma] set_dim_points: type=" +
+                            std::string(arrow_schema.format) +
+                            " not supported");
+                    }
+
+                    // Release arrow schema
+                    arrow_schema.release(&arrow_schema);
+                }
+            },
+            "dim"_a,
+            "py_arrow_array"_a,
+            "partition_index"_a = 0,
+            "partition_count"_a = 1)
+        .def(
+            "set_dim_points_string_or_bytes",
+            static_cast<void (SOMADataFrame::*)(
+                const std::string&, const std::vector<std::string>&)>(
+                &SOMADataFrame::set_dim_points))
+        .def(
+            "set_dim_points_double",
+            static_cast<void (SOMADataFrame::*)(
+                const std::string&, const std::vector<double>&)>(
+                &SOMADataFrame::set_dim_points))
+        .def(
+            "set_dim_points_float",
+            static_cast<void (SOMADataFrame::*)(
+                const std::string&, const std::vector<float>&)>(
+                &SOMADataFrame::set_dim_points))
+        .def(
+            "set_dim_points_int64",
+            static_cast<void (SOMADataFrame::*)(
+                const std::string&, const std::vector<int64_t>&)>(
+                &SOMADataFrame::set_dim_points))
+        .def(
+            "set_dim_points_int32",
+            static_cast<void (SOMADataFrame::*)(
+                const std::string&, const std::vector<int32_t>&)>(
+                &SOMADataFrame::set_dim_points))
+        .def(
+            "set_dim_points_int16",
+            static_cast<void (SOMADataFrame::*)(
+                const std::string&, const std::vector<int16_t>&)>(
+                &SOMADataFrame::set_dim_points))
+        .def(
+            "set_dim_points_int8",
+            static_cast<void (SOMADataFrame::*)(
+                const std::string&, const std::vector<int8_t>&)>(
+                &SOMADataFrame::set_dim_points))
+        .def(
+            "set_dim_points_uint64",
+            static_cast<void (SOMADataFrame::*)(
+                const std::string&, const std::vector<uint64_t>&)>(
+                &SOMADataFrame::set_dim_points))
+        .def(
+            "set_dim_points_uint32",
+            static_cast<void (SOMADataFrame::*)(
+                const std::string&, const std::vector<uint32_t>&)>(
+                &SOMADataFrame::set_dim_points))
+        .def(
+            "set_dim_points_uint16",
+            static_cast<void (SOMADataFrame::*)(
+                const std::string&, const std::vector<uint16_t>&)>(
+                &SOMADataFrame::set_dim_points))
+        .def(
+            "set_dim_points_uint8",
+            static_cast<void (SOMADataFrame::*)(
+                const std::string&, const std::vector<uint8_t>&)>(
+                &SOMADataFrame::set_dim_points))
+        .def(
+            "set_dim_ranges_string_or_bytes",
+            static_cast<void (SOMADataFrame::*)(
+                const std::string&,
+                const std::vector<std::pair<std::string, std::string>>&)>(
+                &SOMADataFrame::set_dim_ranges))
+        .def(
+            "set_dim_ranges_int64",
+            static_cast<void (SOMADataFrame::*)(
+                const std::string&,
+                const std::vector<std::pair<int64_t, int64_t>>&)>(
+                &SOMADataFrame::set_dim_ranges))
+        .def(
+            "set_dim_ranges_int32",
+            static_cast<void (SOMADataFrame::*)(
+                const std::string&,
+                const std::vector<std::pair<int32_t, int32_t>>&)>(
+                &SOMADataFrame::set_dim_ranges))
+        .def(
+            "set_dim_ranges_int16",
+            static_cast<void (SOMADataFrame::*)(
+                const std::string&,
+                const std::vector<std::pair<int16_t, int16_t>>&)>(
+                &SOMADataFrame::set_dim_ranges))
+        .def(
+            "set_dim_ranges_int8",
+            static_cast<void (SOMADataFrame::*)(
+                const std::string&,
+                const std::vector<std::pair<int8_t, int8_t>>&)>(
+                &SOMADataFrame::set_dim_ranges))
+        .def(
+            "set_dim_ranges_uint64",
+            static_cast<void (SOMADataFrame::*)(
+                const std::string&,
+                const std::vector<std::pair<uint64_t, uint64_t>>&)>(
+                &SOMADataFrame::set_dim_ranges))
+        .def(
+            "set_dim_ranges_uint32",
+            static_cast<void (SOMADataFrame::*)(
+                const std::string&,
+                const std::vector<std::pair<uint32_t, uint32_t>>&)>(
+                &SOMADataFrame::set_dim_ranges))
+        .def(
+            "set_dim_ranges_uint16",
+            static_cast<void (SOMADataFrame::*)(
+                const std::string&,
+                const std::vector<std::pair<uint16_t, uint16_t>>&)>(
+                &SOMADataFrame::set_dim_ranges))
+        .def(
+            "set_dim_ranges_uint8",
+            static_cast<void (SOMADataFrame::*)(
+                const std::string&,
+                const std::vector<std::pair<uint8_t, uint8_t>>&)>(
+                &SOMADataFrame::set_dim_ranges))
+        .def(
+            "set_dim_ranges_double",
+            static_cast<void (SOMADataFrame::*)(
+                const std::string&,
+                const std::vector<std::pair<double, double>>&)>(
+                &SOMADataFrame::set_dim_ranges))
+        .def(
+            "set_dim_ranges_float",
+            static_cast<void (SOMADataFrame::*)(
+                const std::string&,
+                const std::vector<std::pair<float, float>>&)>(
+                &SOMADataFrame::set_dim_ranges));
+}
+}
\ No newline at end of file
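
The `SOMADataFrame` class bound above can be driven directly from Python. A sketch of the read-side surface with a hypothetical URI and column names; `timestamp=None` reads the latest data:

    import numpy as np
    import tiledbsoma.pytiledbsoma as clib

    sdf = clib.SOMADataFrame.open(
        "/path/to/experiment/obs",            # hypothetical URI
        clib.OpenMode.read,
        platform_config={},
        column_names=["soma_joinid", "n_genes"],
        result_order=clib.ResultOrder.automatic,
        timestamp=None,
    )
    print(sdf.mode)                           # "r"
    print(sdf.count)                          # number of rows
    print(sdf.non_empty_domain("soma_joinid", np.dtype(np.int64)))
    tbl = sdf.read_next()                     # one pyarrow.Table chunk, or None
    sdf.close()
    assert sdf.closed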
diff --git a/apis/python/src/tiledbsoma/soma_object.cc b/apis/python/src/tiledbsoma/soma_object.cc
new file mode 100644
index 0000000000..6192961817
--- /dev/null
+++ b/apis/python/src/tiledbsoma/soma_object.cc
@@ -0,0 +1,70 @@
+/**
+ * @file   soma_object.cc
+ *
+ * @section LICENSE
+ *
+ * The MIT License
+ *
+ * @copyright Copyright (c) 2023 TileDB, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ * @section DESCRIPTION
+ *
+ * This file defines the SOMAObject bindings.
+ */
+
+#include <memory>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/pytypes.h>
+#include <pybind11/stl.h>
+
+#include <tiledbsoma/tiledbsoma>
+
+#include "common.h"
+
+namespace libtiledbsomacpp {
+
+namespace py = pybind11;
+using namespace py::literals;
+using namespace tiledbsoma;
+
+void load_soma_object(py::module &m) {
+    py::class_<SOMAObject>(m, "SOMAObject")
+
+        .def_static("open", [](std::string uri,
+                               OpenMode mode,
+                               std::map<std::string, std::string> config,
+                               std::optional<std::pair<uint64_t, uint64_t>> timestamp) -> py::object {
+            if(mode == OpenMode::write)
+                TPY_ERROR_LOC("SOMAObjects for write mode not handled in Python API yet.");
+
+            try{
+                auto obj = SOMAObject::open(uri, mode, config, timestamp);
+                if (obj->type() == "SOMADataFrame")
+                    return py::cast(dynamic_cast<SOMADataFrame&>(*obj));
+            }
+            catch(...){
+                TPY_ERROR_LOC("SOMAObject not handled in Python API yet.");
+            }
+            // Unknown SOMA type: raise here as well so the lambda always
+            // returns or throws
+            TPY_ERROR_LOC("SOMAObject not handled in Python API yet.");
+        });
+}
+}
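
`SOMAObject.open` gives type-driven dispatch: it opens the object, inspects its SOMA type, and returns a typed handle; currently only `SOMADataFrame` round-trips, and anything else raises `SOMAError`. A sketch with a hypothetical URI (arguments are positional, since the binding declares no keyword names):

    import tiledbsoma.pytiledbsoma as clib

    obj = clib.SOMAObject.open(
        "/path/to/experiment/obs",   # hypothetical URI
        clib.OpenMode.read,
        {},                          # TileDB config dict
        None,                        # timestamp: optional (start, end) pair
    )
    print(type(obj))                 # e.g. tiledbsoma.pytiledbsoma.SOMADataFrame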
diff --git a/apis/python/tests/test_collection.py b/apis/python/tests/test_collection.py
index 8564a30dbd..39a6918ff8 100644
--- a/apis/python/tests/test_collection.py
+++ b/apis/python/tests/test_collection.py
@@ -78,10 +78,10 @@ def test_collection_basic(tmp_path):
     assert len(readback_collection) == 2
 
     with readback_collection["sdf"] as sdf:
-        assert len(sdf._handle.reader.df[:]) == 5
+        assert len(sdf.read().concat()) == 5
 
     with readback_collection["snda"] as snda:
-        assert len(snda._handle.reader.df[:]) == 3
+        assert len(snda.read().tables().concat()) == 3
 
 
 @pytest.fixture(
diff --git a/apis/python/tests/test_dataframe.py b/apis/python/tests/test_dataframe.py
index 7fc9592504..dbc8ef945b 100644
--- a/apis/python/tests/test_dataframe.py
+++ b/apis/python/tests/test_dataframe.py
@@ -131,10 +131,7 @@ def test_dataframe_with_enumeration(tmp_path):
         ]
     )
     enums = {"enmr1": ("a", "bb", "ccc"), "enmr2": ("cat", "dog")}
-    with soma.DataFrame.create(
-        tmp_path.as_posix(),
-        schema=schema,
-    ) as sdf:
+    with soma.DataFrame.create(tmp_path.as_posix(), schema=schema) as sdf:
         data = {}
         data["soma_joinid"] = [0, 1, 2, 3, 4]
         data["foo"] = ["a", "bb", "ccc", "bb", "a"]
@@ -1359,3 +1356,7 @@ def test_enum_extend_past_numerical_limit(tmp_path):
     with pytest.raises(ValueError):
        with soma.open(uri, mode="w") as A:
            A.write(tbl)
+
+
+def test_write_str_empty_ned(tmp_path):
+    tmp_path.as_posix()
diff --git a/libtiledbsoma/test/test_indexer.py b/apis/python/tests/test_indexer.py
similarity index 94%
rename from libtiledbsoma/test/test_indexer.py
rename to apis/python/tests/test_indexer.py
index 3bbd9ec520..c2acb53fd7 100644
--- a/libtiledbsoma/test/test_indexer.py
+++ b/apis/python/tests/test_indexer.py
@@ -1,14 +1,10 @@
 import numpy as np
 import pandas as pd
-import tiledb
 
 from tiledbsoma.options import SOMATileDBContext
 from tiledbsoma.options._soma_tiledb_context import _validate_soma_tiledb_context
-from tiledbsoma.pytiledbsoma import config_logging
 from tiledbsoma.utils import build_index
 
-config_logging("debug")
-
 
 def indexer_test(keys: np.array, lookups: np.array, fail: bool):
     if fail:
@@ -19,7 +15,7 @@ def indexer_test(keys: np.array, lookups: np.array, fail: bool):
 
 def indexer_test_fail(keys: np.array, lookups: np.array):
     try:
-        context = _validate_soma_tiledb_context(SOMATileDBContext(tiledb.default_ctx()))
+        context = _validate_soma_tiledb_context(SOMATileDBContext())
         index = build_index(keys, context)
         index.get_indexer(lookups)
         raise AssertionError("should have failed")
@@ -35,7 +31,7 @@ def indexer_test_fail(keys: np.array, lookups: np.array):
 
 
 def indexer_test_pass(keys: np.array, lookups: np.array):
-    context = _validate_soma_tiledb_context(SOMATileDBContext(tiledb.default_ctx()))
+    context = _validate_soma_tiledb_context(SOMATileDBContext())
     indexer = build_index(keys, context)
     results = indexer.get_indexer(lookups)
     panda_indexer = pd.Index(keys)
diff --git a/apis/python/tests/test_platform_config.py b/apis/python/tests/test_platform_config.py
index 282504380e..3849e33b53 100644
--- a/apis/python/tests/test_platform_config.py
+++ b/apis/python/tests/test_platform_config.py
@@ -70,9 +70,20 @@ def test_platform_config(adata):
             assert list(x_arr.dim("soma_dim_1").filters) == [
                 tiledb.ZstdFilter(level=-1)
             ]
-            var_df = exp.ms["RNA"].var
-            var_arr = var_df._handle.reader
-            assert var_arr.dim("soma_joinid").filters == [tiledb.ZstdFilter(level=1)]
+            # TODO as we remove usage of TileDB-Py in favor of ArrowSchema, we
+            # need a new method to get which filters have been applied to the
+            # column rather than grabbing it from the ArraySchema. One
+            # consideration would be to store TileDB information in JSON format
+            # as a field in the ArraySchema metadata, very similar to how
+            # Pandas stores information within pa.Schema.pandas_metadata. This
+            # could hold not only which filters have been applied to the
+            # column, but other info that cannot be "directly" stored in the
+            # ArrowSchema, such as whether the column is a TileDB attribute or
+            # dimension, whether this represents a dense or sparse array, etc.
+            # This may be as easy as simply copying the platform_config by
+            # calling pa.Schema.with_metadata(platform_config).
+            # var_df = exp.ms["RNA"].var
+            # var_arr = var_df._handle.reader
+            # assert var_arr.dim("soma_joinid").filters == [tiledb.ZstdFilter(level=1)]
 
 
 def test__from_platform_config__admits_ignored_config_structure():
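
The TODO above suggests carrying TileDB-specific details in Arrow schema metadata, analogous to `pandas_metadata`. A hypothetical sketch of that idea — not current library behavior; the `tiledbsoma` metadata key and the embedded structure are invented for illustration:

    import json
    import pyarrow as pa

    # Assumed platform_config shape, mirroring the ZstdFilter settings above
    platform_config = {
        "tiledb": {"create": {"dims": {"soma_joinid": {
            "filters": [{"_type": "ZstdFilter", "level": 1}],
        }}}}
    }
    schema = pa.schema([pa.field("soma_joinid", pa.int64())])
    # Stash the config as a JSON blob in the schema's metadata
    schema = schema.with_metadata({b"tiledbsoma": json.dumps(platform_config).encode()})
    assert b"tiledbsoma" in schema.metadata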
"percent_mito > 0.02" qc = QueryCondition(condition) - schema = tiledb.open(uri).schema + schema = tiledb_schema_to_arrow(tiledb.open(uri).schema, uri, tiledb.default_ctx()) sr = clib.SOMAArray(uri) sr.set_condition(qc, schema) @@ -218,22 +218,17 @@ def test_parsing_error_conditions(malformed_condition): def test_eval_error_conditions(malformed_condition): """Conditions which should not evaluate (but WILL parse)""" uri = os.path.join(SOMA_URI, "obs") + schema = tiledb_schema_to_arrow(tiledb.open(uri).schema, uri, tiledb.default_ctx()) + qc = QueryCondition(malformed_condition) - # TODO: these raise the wrong error - it should be SOMAError. Change the test - # when https://github.com/single-cell-data/TileDB-SOMA/issues/783 is fixed - - with pytest.raises(RuntimeError): - qc = QueryCondition(malformed_condition) - schema = tiledb.open(uri).schema + with pytest.raises(SOMAError): sr = clib.SOMAArray(uri) sr.set_condition(qc, schema) - sr.read_next() - with pytest.raises(tiledb.TileDBError): - qc = QueryCondition(malformed_condition) - schema = tiledb.open(uri).schema + with pytest.raises(SOMAError): # test function directly for codecov - qc.init_query_condition(schema, {}, []) + qc.init_query_condition(schema, []) + qc.init_query_condition(schema, ["bad_query_attr"]) if __name__ == "__main__": diff --git a/libtiledbsoma/test/test_simple.py b/apis/python/tests/test_simple.py similarity index 100% rename from libtiledbsoma/test/test_simple.py rename to apis/python/tests/test_simple.py diff --git a/libtiledbsoma/test/test_soma_array.py b/apis/python/tests/test_soma_array.py similarity index 99% rename from libtiledbsoma/test/test_soma_array.py rename to apis/python/tests/test_soma_array.py index f6b9ef4519..e090cada8e 100644 --- a/libtiledbsoma/test/test_soma_array.py +++ b/apis/python/tests/test_soma_array.py @@ -9,7 +9,7 @@ VERBOSE = False TEST_DIR = os.path.dirname(__file__) -SOMA_URI = f"{TEST_DIR}/../../test/soco/pbmc3k_processed" +SOMA_URI = f"{TEST_DIR}/../../../test/soco/pbmc3k_processed" if VERBOSE: clib.config_logging("debug") diff --git a/apis/python/tests/test_unicode.py b/apis/python/tests/test_unicode.py index 4b59fd4646..68d252525d 100644 --- a/apis/python/tests/test_unicode.py +++ b/apis/python/tests/test_unicode.py @@ -57,7 +57,9 @@ def test_dataframe_unicode_columns(sample_dataframe_path, sample_arrow_table): sdf.write(sample_arrow_table) with soma.DataFrame.open(sample_dataframe_path) as sdf: - assert sample_arrow_table.schema == sdf.schema + # TODO when coverting from Pandas to Arrow, the schema has information + # stored in the pandas_metadata + # assert sample_arrow_table.schema == sdf.schema assert sdf.read().concat().equals(sample_arrow_table) diff --git a/apis/r/src/rinterface.cpp b/apis/r/src/rinterface.cpp index c279719a20..4a312bb4c3 100644 --- a/apis/r/src/rinterface.cpp +++ b/apis/r/src/rinterface.cpp @@ -87,7 +87,7 @@ Rcpp::List soma_array_reader(const std::string& uri, tdb_result_order); std::unordered_map> name2dim; - std::shared_ptr schema = sr->schema(); + std::shared_ptr schema = sr->tiledb_schema(); tiledb::Domain domain = schema->domain(); std::vector dims = domain.dimensions(); for (auto& dim: dims) { diff --git a/apis/r/src/riterator.cpp b/apis/r/src/riterator.cpp index f7da66555f..1cd170aacc 100644 --- a/apis/r/src/riterator.cpp +++ b/apis/r/src/riterator.cpp @@ -118,7 +118,7 @@ Rcpp::List sr_setup(const std::string& uri, tdb_result_order, std::make_pair(ts_start, ts_end)); std::unordered_map> name2dim; - std::shared_ptr schema = ptr->schema(); + 
std::shared_ptr schema = ptr->tiledb_schema(); tiledb::Domain domain = schema->domain(); std::vector dims = domain.dimensions(); for (auto& dim: dims) { diff --git a/libtiledbsoma/src/CMakeLists.txt b/libtiledbsoma/src/CMakeLists.txt index ea62affc5f..24b74fd040 100644 --- a/libtiledbsoma/src/CMakeLists.txt +++ b/libtiledbsoma/src/CMakeLists.txt @@ -54,6 +54,7 @@ add_library(TILEDB_SOMA_OBJECTS OBJECT ${CMAKE_CURRENT_SOURCE_DIR}/soma/managed_query.cc ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_array.cc ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_group.cc + ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_object.cc ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_collection.cc ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_experiment.cc ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_measurement.cc diff --git a/libtiledbsoma/src/external/khash/khash.h b/libtiledbsoma/src/external/khash/khash.h index f75f3474c1..9142c5df5a 100644 --- a/libtiledbsoma/src/external/khash/khash.h +++ b/libtiledbsoma/src/external/khash/khash.h @@ -29,35 +29,35 @@ #include "khash.h" KHASH_MAP_INIT_INT(32, char) int main() { - int ret, is_missing; - khiter_t k; - khash_t(32) *h = kh_init(32); - k = kh_put(32, h, 5, &ret); - kh_value(h, k) = 10; - k = kh_get(32, h, 10); - is_missing = (k == kh_end(h)); - k = kh_get(32, h, 5); - kh_del(32, h, k); - for (k = kh_begin(h); k != kh_end(h); ++k) - if (kh_exist(h, k)) kh_value(h, k) = 1; - kh_destroy(32, h); - return 0; + int ret, is_missing; + khiter_t k; + khash_t(32) *h = kh_init(32); + k = kh_put(32, h, 5, &ret); + kh_value(h, k) = 10; + k = kh_get(32, h, 10); + is_missing = (k == kh_end(h)); + k = kh_get(32, h, 5); + kh_del(32, h, k); + for (k = kh_begin(h); k != kh_end(h); ++k) + if (kh_exist(h, k)) kh_value(h, k) = 1; + kh_destroy(32, h); + return 0; } */ /* 2013-05-02 (0.2.8): - * Use quadratic probing. When the capacity is power of 2, stepping function - i*(i+1)/2 guarantees to traverse each bucket. It is better than double - hashing on cache performance and is more robust than linear probing. + * Use quadratic probing. When the capacity is power of 2, stepping + function i*(i+1)/2 guarantees to traverse each bucket. It is better than + double hashing on cache performance and is more robust than linear probing. - In theory, double hashing should be more robust than quadratic probing. - However, my implementation is probably not for large hash tables, because - the second hash function is closely tied to the first hash function, - which reduce the effectiveness of double hashing. + In theory, double hashing should be more robust than quadratic + probing. However, my implementation is probably not for large hash tables, + because the second hash function is closely tied to the first hash function, + which reduce the effectiveness of double hashing. - Reference: http://research.cs.vt.edu/AVresearch/hashing/quadratic.php + Reference: http://research.cs.vt.edu/AVresearch/hashing/quadratic.php 2011-12-29 (0.2.7): @@ -65,18 +65,18 @@ int main() { 2011-09-16 (0.2.6): - * The capacity is a power of 2. This seems to dramatically improve the - speed for simple keys. Thank Zilong Tan for the suggestion. Reference: + * The capacity is a power of 2. This seems to dramatically improve the + speed for simple keys. Thank Zilong Tan for the suggestion. Reference: - - http://code.google.com/p/ulib/ - - http://nothings.org/computer/judy/ + - http://code.google.com/p/ulib/ + - http://nothings.org/computer/judy/ - * Allow to optionally use linear probing which usually has better - performance for random input. 
Double hashing is still the default as it - is more robust to certain non-random input. + * Allow to optionally use linear probing which usually has better + performance for random input. Double hashing is still the default as + it is more robust to certain non-random input. - * Added Wang's integer hash function (not used by default). This hash - function is more robust to certain non-random input. + * Added Wang's integer hash function (not used by default). This hash + function is more robust to certain non-random input. 2011-02-14 (0.2.5): @@ -88,32 +88,31 @@ int main() { 2008-09-19 (0.2.3): - * Corrected the example - * Improved interfaces + * Corrected the example + * Improved interfaces 2008-09-11 (0.2.2): - * Improved speed a little in kh_put() + * Improved speed a little in kh_put() 2008-09-10 (0.2.1): - * Added kh_clear() - * Fixed a compiling error + * Added kh_clear() + * Fixed a compiling error 2008-09-02 (0.2.0): - * Changed to token concatenation which increases flexibility. + * Changed to token concatenation which increases flexibility. 2008-08-31 (0.1.2): - * Fixed a bug in kh_get(), which has not been tested previously. + * Fixed a bug in kh_get(), which has not been tested previously. 2008-08-31 (0.1.1): - * Added destructor + * Added destructor */ - #ifndef __AC_KHASH_H #define __AC_KHASH_H @@ -125,9 +124,9 @@ int main() { #define AC_VERSION_KHASH_H "0.2.8" +#include #include #include -#include /* compiler specific configuration */ @@ -152,8 +151,9 @@ typedef unsigned long long khint64_t; #endif /* kh_inline */ #ifndef klib_unused -#if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3) -#define klib_unused __attribute__ ((__unused__)) +#if (defined __clang__ && __clang_major__ >= 3) || \ + (defined __GNUC__ && __GNUC__ >= 3) +#define klib_unused __attribute__((__unused__)) #else #define klib_unused #endif @@ -162,28 +162,38 @@ typedef unsigned long long khint64_t; typedef khint32_t khint_t; typedef khint_t khiter_t; -#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2) -#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1) -#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3) -#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1))) -#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1))) -#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1))) -#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1)) +#define __ac_isempty(flag, i) ((flag[i >> 4] >> ((i & 0xfU) << 1)) & 2) +#define __ac_isdel(flag, i) ((flag[i >> 4] >> ((i & 0xfU) << 1)) & 1) +#define __ac_iseither(flag, i) ((flag[i >> 4] >> ((i & 0xfU) << 1)) & 3) +#define __ac_set_isdel_false(flag, i) \ + (flag[i >> 4] &= ~(1ul << ((i & 0xfU) << 1))) +#define __ac_set_isempty_false(flag, i) \ + (flag[i >> 4] &= ~(2ul << ((i & 0xfU) << 1))) +#define __ac_set_isboth_false(flag, i) \ + (flag[i >> 4] &= ~(3ul << ((i & 0xfU) << 1))) +#define __ac_set_isdel_true(flag, i) (flag[i >> 4] |= 1ul << ((i & 0xfU) << 1)) -#define __ac_fsize(m) ((m) < 16? 1 : (m)>>4) +#define __ac_fsize(m) ((m) < 16 ? 
1 : (m) >> 4) #ifndef kroundup32 -#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#define kroundup32(x) \ + (--(x), \ + (x) |= (x) >> 1, \ + (x) |= (x) >> 2, \ + (x) |= (x) >> 4, \ + (x) |= (x) >> 8, \ + (x) |= (x) >> 16, \ + ++(x)) #endif #ifndef kcalloc -#define kcalloc(N,Z) calloc(N,Z) +#define kcalloc(N, Z) calloc(N, Z) #endif #ifndef kmalloc #define kmalloc(Z) malloc(Z) #endif #ifndef krealloc -#define krealloc(P,Z) realloc(P,Z) +#define krealloc(P, Z) realloc(P, Z) #endif #ifndef kfree #define kfree(P) free(P) @@ -191,179 +201,240 @@ typedef khint_t khiter_t; static const double __ac_HASH_UPPER = 0.77; -#define __KHASH_TYPE(name, khkey_t, khval_t) \ - typedef struct kh_##name##_s { \ - khint_t n_buckets, size, n_occupied, upper_bound; \ - khint32_t *flags; \ - khkey_t *keys; \ - khval_t *vals; \ - } kh_##name##_t; - -#define __KHASH_PROTOTYPES(name, khkey_t, khval_t) \ - extern kh_##name##_t *kh_init_##name(void); \ - extern void kh_destroy_##name(kh_##name##_t *h); \ - extern void kh_clear_##name(kh_##name##_t *h); \ - extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \ - extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \ - extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \ - extern void kh_del_##name(kh_##name##_t *h, khint_t x); - -#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ - SCOPE kh_##name##_t *kh_init_##name(void) { \ - return (kh_##name##_t*)kcalloc(1, sizeof(kh_##name##_t)); \ - } \ - SCOPE void kh_destroy_##name(kh_##name##_t *h) \ - { \ - if (h) { \ - kfree((void *)h->keys); kfree(h->flags); \ - kfree((void *)h->vals); \ - kfree(h); \ - } \ - } \ - SCOPE void kh_clear_##name(kh_##name##_t *h) \ - { \ - if (h && h->flags) { \ - memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \ - h->size = h->n_occupied = 0; \ - } \ - } \ - SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ - { \ - if (h->n_buckets) { \ - khint_t k, i, last, mask, step = 0; \ - mask = h->n_buckets - 1; \ - k = __hash_func(key); i = k & mask; \ - last = i; \ - while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ - i = (i + (++step)) & mask; \ - if (i == last) return h->n_buckets; \ - } \ - return __ac_iseither(h->flags, i)? h->n_buckets : i; \ - } else return 0; \ - } \ - SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \ - { /* This function uses 0.25*n_buckets bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. 
*/ \ - khint32_t *new_flags = 0; \ - khint_t j = 1; \ - { \ - kroundup32(new_n_buckets); \ - if (new_n_buckets < 4) new_n_buckets = 4; \ - if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \ - else { /* hash table size to be changed (shrink or expand); rehash */ \ - new_flags = (khint32_t*)kmalloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ - if (!new_flags) return -1; \ - memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ - if (h->n_buckets < new_n_buckets) { /* expand */ \ - khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ - if (!new_keys) { kfree(new_flags); return -1; } \ - h->keys = new_keys; \ - if (kh_is_map) { \ - khval_t *new_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \ - if (!new_vals) { kfree(new_flags); return -1; } \ - h->vals = new_vals; \ - } \ - } /* otherwise shrink */ \ - } \ - } \ - if (j) { /* rehashing is needed */ \ - for (j = 0; j != h->n_buckets; ++j) { \ - if (__ac_iseither(h->flags, j) == 0) { \ - khkey_t key = h->keys[j]; \ - khval_t val; \ - khint_t new_mask; \ - new_mask = new_n_buckets - 1; \ - if (kh_is_map) val = h->vals[j]; \ - __ac_set_isdel_true(h->flags, j); \ - while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \ - khint_t k, i, step = 0; \ - k = __hash_func(key); \ - i = k & new_mask; \ - while (!__ac_isempty(new_flags, i)) i = (i + (++step)) & new_mask; \ - __ac_set_isempty_false(new_flags, i); \ - if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \ - { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \ - if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \ - __ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \ - } else { /* write the element and jump out of the loop */ \ - h->keys[i] = key; \ - if (kh_is_map) h->vals[i] = val; \ - break; \ - } \ - } \ - } \ - } \ - if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \ - h->keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ - if (kh_is_map) h->vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \ - } \ - kfree(h->flags); /* free the working space */ \ - h->flags = new_flags; \ - h->n_buckets = new_n_buckets; \ - h->n_occupied = h->size; \ - h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \ - } \ - return 0; \ - } \ - SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ - { \ - khint_t x; \ - if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \ - if (h->n_buckets > (h->size<<1)) { \ - if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \ - *ret = -1; return h->n_buckets; \ - } \ - } else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \ - *ret = -1; return h->n_buckets; \ - } \ - } /* TODO: to implement automatically shrinking; resize() already support shrinking */ \ - { \ - khint_t k, i, site, last, mask = h->n_buckets - 1, step = 0; \ - x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \ - if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \ - else { \ - last = i; \ - while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ - if (__ac_isdel(h->flags, i)) site = i; \ - i = (i + (++step)) & mask; \ - if (i == last) { x = site; break; } \ - } \ - if (x == h->n_buckets) { \ - if 
(__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \ - else x = i; \ - } \ - } \ - } \ - if (__ac_isempty(h->flags, x)) { /* not present at all */ \ - h->keys[x] = key; \ - __ac_set_isboth_false(h->flags, x); \ - ++h->size; ++h->n_occupied; \ - *ret = 1; \ - } else if (__ac_isdel(h->flags, x)) { /* deleted */ \ - h->keys[x] = key; \ - __ac_set_isboth_false(h->flags, x); \ - ++h->size; \ - *ret = 2; \ - } else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \ - return x; \ - } \ - SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \ - { \ - if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \ - __ac_set_isdel_true(h->flags, x); \ - --h->size; \ - } \ - } - -#define KHASH_DECLARE(name, khkey_t, khval_t) \ - __KHASH_TYPE(name, khkey_t, khval_t) \ - __KHASH_PROTOTYPES(name, khkey_t, khval_t) - -#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ - __KHASH_TYPE(name, khkey_t, khval_t) \ - __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) - -#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ - KHASH_INIT2(name, static kh_inline klib_unused, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) +#define __KHASH_TYPE(name, khkey_t, khval_t) \ + typedef struct kh_##name##_s { \ + khint_t n_buckets, size, n_occupied, upper_bound; \ + khint32_t* flags; \ + khkey_t* keys; \ + khval_t* vals; \ + } kh_##name##_t; + +#define __KHASH_PROTOTYPES(name, khkey_t, khval_t) \ + extern kh_##name##_t* kh_init_##name(void); \ + extern void kh_destroy_##name(kh_##name##_t* h); \ + extern void kh_clear_##name(kh_##name##_t* h); \ + extern khint_t kh_get_##name(const kh_##name##_t* h, khkey_t key); \ + extern int kh_resize_##name(kh_##name##_t* h, khint_t new_n_buckets); \ + extern khint_t kh_put_##name(kh_##name##_t* h, khkey_t key, int* ret); \ + extern void kh_del_##name(kh_##name##_t* h, khint_t x); + +#define __KHASH_IMPL( \ + name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + SCOPE kh_##name##_t* kh_init_##name(void) { \ + return (kh_##name##_t*)kcalloc(1, sizeof(kh_##name##_t)); \ + } \ + SCOPE void kh_destroy_##name(kh_##name##_t* h) { \ + if (h) { \ + kfree((void*)h->keys); \ + kfree(h->flags); \ + kfree((void*)h->vals); \ + kfree(h); \ + } \ + } \ + SCOPE void kh_clear_##name(kh_##name##_t* h) { \ + if (h && h->flags) { \ + memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \ + h->size = h->n_occupied = 0; \ + } \ + } \ + SCOPE khint_t kh_get_##name(const kh_##name##_t* h, khkey_t key) { \ + if (h->n_buckets) { \ + khint_t k, i, last, mask, step = 0; \ + mask = h->n_buckets - 1; \ + k = __hash_func(key); \ + i = k & mask; \ + last = i; \ + while (!__ac_isempty(h->flags, i) && \ + (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ + i = (i + (++step)) & mask; \ + if (i == last) \ + return h->n_buckets; \ + } \ + return __ac_iseither(h->flags, i) ? h->n_buckets : i; \ + } else \ + return 0; \ + } \ + SCOPE int kh_resize_##name( \ + kh_##name##_t* h, \ + khint_t new_n_buckets) { /* This function uses 0.25*n_buckets bytes of \ + working space instead of \ + [sizeof(key_t+val_t)+.25]*n_buckets. 
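+                                    (two flag bits per bucket, i.e. 0.25 bytes each) \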
*/ \ + khint32_t* new_flags = 0; \ + khint_t j = 1; \ + { \ + kroundup32(new_n_buckets); \ + if (new_n_buckets < 4) \ + new_n_buckets = 4; \ + if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) \ + j = 0; /* requested size is too small */ \ + else { /* hash table size to be changed (shrink or expand); rehash */ \ + new_flags = (khint32_t*)kmalloc( \ + __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ + if (!new_flags) \ + return -1; \ + memset( \ + new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ + if (h->n_buckets < new_n_buckets) { /* expand */ \ + khkey_t* new_keys = (khkey_t*)krealloc( \ + (void*)h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (!new_keys) { \ + kfree(new_flags); \ + return -1; \ + } \ + h->keys = new_keys; \ + if (kh_is_map) { \ + khval_t* new_vals = (khval_t*)krealloc( \ + (void*)h->vals, new_n_buckets * sizeof(khval_t)); \ + if (!new_vals) { \ + kfree(new_flags); \ + return -1; \ + } \ + h->vals = new_vals; \ + } \ + } /* otherwise shrink */ \ + } \ + } \ + if (j) { /* rehashing is needed */ \ + for (j = 0; j != h->n_buckets; ++j) { \ + if (__ac_iseither(h->flags, j) == 0) { \ + khkey_t key = h->keys[j]; \ + khval_t val; \ + khint_t new_mask; \ + new_mask = new_n_buckets - 1; \ + if (kh_is_map) \ + val = h->vals[j]; \ + __ac_set_isdel_true(h->flags, j); \ + while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \ + khint_t k, i, step = 0; \ + k = __hash_func(key); \ + i = k & new_mask; \ + while (!__ac_isempty(new_flags, i)) \ + i = (i + (++step)) & new_mask; \ + __ac_set_isempty_false(new_flags, i); \ + if (i < h->n_buckets && \ + __ac_iseither(h->flags, i) == \ + 0) { /* kick out the existing element */ \ + { \ + khkey_t tmp = h->keys[i]; \ + h->keys[i] = key; \ + key = tmp; \ + } \ + if (kh_is_map) { \ + khval_t tmp = h->vals[i]; \ + h->vals[i] = val; \ + val = tmp; \ + } \ + __ac_set_isdel_true( \ + h->flags, i); /* mark it as deleted in the old hash table */ \ + } else { /* write the element and jump out of the loop */ \ + h->keys[i] = key; \ + if (kh_is_map) \ + h->vals[i] = val; \ + break; \ + } \ + } \ + } \ + } \ + if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \ + h->keys = (khkey_t*)krealloc( \ + (void*)h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (kh_is_map) \ + h->vals = (khval_t*)krealloc( \ + (void*)h->vals, new_n_buckets * sizeof(khval_t)); \ + } \ + kfree(h->flags); /* free the working space */ \ + h->flags = new_flags; \ + h->n_buckets = new_n_buckets; \ + h->n_occupied = h->size; \ + h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \ + } \ + return 0; \ + } \ + SCOPE khint_t kh_put_##name(kh_##name##_t* h, khkey_t key, int* ret) { \ + khint_t x; \ + if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \ + if (h->n_buckets > (h->size << 1)) { \ + if (kh_resize_##name(h, h->n_buckets - 1) < \ + 0) { /* clear "deleted" elements */ \ + *ret = -1; \ + return h->n_buckets; \ + } \ + } else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the \ + hash table */ \ + *ret = -1; \ + return h->n_buckets; \ + } \ + } /* TODO: to implement automatically shrinking; resize() already support \ + shrinking */ \ + { \ + khint_t k, i, site, last, mask = h->n_buckets - 1, step = 0; \ + x = site = h->n_buckets; \ + k = __hash_func(key); \ + i = k & mask; \ + if (__ac_isempty(h->flags, i)) \ + x = i; /* for speed up */ \ + else { \ + last = i; \ + while (!__ac_isempty(h->flags, i) && \ + (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ + if 
(__ac_isdel(h->flags, i)) \ + site = i; \ + i = (i + (++step)) & mask; \ + if (i == last) { \ + x = site; \ + break; \ + } \ + } \ + if (x == h->n_buckets) { \ + if (__ac_isempty(h->flags, i) && site != h->n_buckets) \ + x = site; \ + else \ + x = i; \ + } \ + } \ + } \ + if (__ac_isempty(h->flags, x)) { /* not present at all */ \ + h->keys[x] = key; \ + __ac_set_isboth_false(h->flags, x); \ + ++h->size; \ + ++h->n_occupied; \ + *ret = 1; \ + } else if (__ac_isdel(h->flags, x)) { /* deleted */ \ + h->keys[x] = key; \ + __ac_set_isboth_false(h->flags, x); \ + ++h->size; \ + *ret = 2; \ + } else \ + *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \ + return x; \ + } \ + SCOPE void kh_del_##name(kh_##name##_t* h, khint_t x) { \ + if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \ + __ac_set_isdel_true(h->flags, x); \ + --h->size; \ + } \ + } + +#define KHASH_DECLARE(name, khkey_t, khval_t) \ + __KHASH_TYPE(name, khkey_t, khval_t) \ + __KHASH_PROTOTYPES(name, khkey_t, khval_t) + +#define KHASH_INIT2( \ + name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + __KHASH_TYPE(name, khkey_t, khval_t) \ + __KHASH_IMPL( \ + name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) + +#define KHASH_INIT( \ + name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + KHASH_INIT2( \ + name, \ + static kh_inline klib_unused, \ + khkey_t, \ + khval_t, \ + kh_is_map, \ + __hash_func, \ + __hash_equal) /* --- BEGIN OF HASH FUNCTIONS --- */ @@ -382,7 +453,7 @@ static const double __ac_HASH_UPPER = 0.77; @param key The integer [khint64_t] @return The hash value [khint_t] */ -#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11) +#define kh_int64_hash_func(key) (khint32_t)((key) >> 33 ^ (key) ^ (key) << 11) /*! @function @abstract 64-bit integer comparison function */ @@ -392,11 +463,12 @@ static const double __ac_HASH_UPPER = 0.77; @param s Pointer to a null terminated string @return The hash value */ -static kh_inline khint_t __ac_X31_hash_string(const char *s) -{ - khint_t h = (khint_t)*s; - if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s; - return h; +static kh_inline khint_t __ac_X31_hash_string(const char* s) { + khint_t h = (khint_t)*s; + if (h) + for (++s; *s; ++s) + h = (h << 5) - h + (khint_t)*s; + return h; } /*! 
@function @abstract Another interface to const char* hash function @@ -409,15 +481,14 @@ static kh_inline khint_t __ac_X31_hash_string(const char *s) */ #define kh_str_hash_equal(a, b) (strcmp(a, b) == 0) -static kh_inline khint_t __ac_Wang_hash(khint_t key) -{ - key += ~(key << 15); - key ^= (key >> 10); - key += (key << 3); - key ^= (key >> 6); - key += ~(key << 11); - key ^= (key >> 16); - return key; +static kh_inline khint_t __ac_Wang_hash(khint_t key) { + key += ~(key << 15); + key ^= (key >> 10); + key += (key << 3); + key ^= (key >> 6); + key += ~(key << 11); + key ^= (key >> 16); + return key; } #define kh_int_hash_func2(key) __ac_Wang_hash((khint_t)key) @@ -468,7 +539,7 @@ static kh_inline khint_t __ac_Wang_hash(khint_t key) @param r Extra return code: -1 if the operation failed; 0 if the key is present in the hash table; 1 if the bucket is empty (never used); 2 if the element in - the bucket has been deleted [int*] + the bucket has been deleted [int*] @return Iterator to the inserted element [khint_t] */ #define kh_put(name, h, k, r) kh_put_##name(h, k, r) @@ -478,7 +549,8 @@ static kh_inline khint_t __ac_Wang_hash(khint_t key) @param name Name of the hash table [symbol] @param h Pointer to the hash table [khash_t(name)*] @param k Key [type of keys] - @return Iterator to the found element, or kh_end(h) if the element is absent [khint_t] + @return Iterator to the found element, or kh_end(h) if the element is + absent [khint_t] */ #define kh_get(name, h, k) kh_get_##name(h, k) @@ -555,13 +627,17 @@ static kh_inline khint_t __ac_Wang_hash(khint_t key) @param vvar Variable to which value will be assigned @param code Block of code to execute */ -#define kh_foreach(h, kvar, vvar, code) { khint_t __i; \ - for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ - if (!kh_exist(h,__i)) continue; \ - (kvar) = kh_key(h,__i); \ - (vvar) = kh_val(h,__i); \ - code; \ - } } +#define kh_foreach(h, kvar, vvar, code) \ + { \ + khint_t __i; \ + for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ + if (!kh_exist(h, __i)) \ + continue; \ + (kvar) = kh_key(h, __i); \ + (vvar) = kh_val(h, __i); \ + code; \ + } \ + } /*! @function @abstract Iterate over the values in the hash table @@ -569,12 +645,16 @@ static kh_inline khint_t __ac_Wang_hash(khint_t key) @param vvar Variable to which value will be assigned @param code Block of code to execute */ -#define kh_foreach_value(h, vvar, code) { khint_t __i; \ - for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ - if (!kh_exist(h,__i)) continue; \ - (vvar) = kh_val(h,__i); \ - code; \ - } } +#define kh_foreach_value(h, vvar, code) \ + { \ + khint_t __i; \ + for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ + if (!kh_exist(h, __i)) \ + continue; \ + (vvar) = kh_val(h, __i); \ + code; \ + } \ + } /* More convenient interfaces */ @@ -582,46 +662,47 @@ static kh_inline khint_t __ac_Wang_hash(khint_t key) @abstract Instantiate a hash set containing integer keys @param name Name of the hash table [symbol] */ -#define KHASH_SET_INIT_INT(name) \ - KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal) +#define KHASH_SET_INIT_INT(name) \ + KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal) /*! 
@function @abstract Instantiate a hash map containing integer keys @param name Name of the hash table [symbol] @param khval_t Type of values [type] */ -#define KHASH_MAP_INIT_INT(name, khval_t) \ - KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) +#define KHASH_MAP_INIT_INT(name, khval_t) \ + KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) /*! @function @abstract Instantiate a hash set containing 64-bit integer keys @param name Name of the hash table [symbol] */ -#define KHASH_SET_INIT_INT64(name) \ - KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) +#define KHASH_SET_INIT_INT64(name) \ + KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) /*! @function @abstract Instantiate a hash map containing 64-bit integer keys @param name Name of the hash table [symbol] @param khval_t Type of values [type] */ -#define KHASH_MAP_INIT_INT64(name, khval_t) \ - KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) +#define KHASH_MAP_INIT_INT64(name, khval_t) \ + KHASH_INIT( \ + name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) -typedef const char *kh_cstr_t; +typedef const char* kh_cstr_t; /*! @function @abstract Instantiate a hash map containing const char* keys @param name Name of the hash table [symbol] */ -#define KHASH_SET_INIT_STR(name) \ - KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal) +#define KHASH_SET_INIT_STR(name) \ + KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal) /*! @function @abstract Instantiate a hash map containing const char* keys @param name Name of the hash table [symbol] @param khval_t Type of values [type] */ -#define KHASH_MAP_INIT_STR(name, khval_t) \ - KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal) +#define KHASH_MAP_INIT_STR(name, khval_t) \ + KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal) #endif /* __AC_KHASH_H */ diff --git a/libtiledbsoma/src/external/khash/khashl.h b/libtiledbsoma/src/external/khash/khashl.h index 93ce31354c..dcbffa704b 100644 --- a/libtiledbsoma/src/external/khash/khashl.h +++ b/libtiledbsoma/src/external/khash/khashl.h @@ -28,9 +28,9 @@ #define AC_VERSION_KHASHL_H "0.1" +#include #include #include -#include /************************************ * Compiler specific configurations * @@ -57,8 +57,9 @@ typedef int64_t khint64_t; #endif /* kh_inline */ #ifndef klib_unused -#if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3) -#define klib_unused __attribute__ ((__unused__)) +#if (defined __clang__ && __clang_major__ >= 3) || \ + (defined __GNUC__ && __GNUC__ >= 3) +#define klib_unused __attribute__((__unused__)) #else #define klib_unused #endif @@ -73,13 +74,13 @@ typedef khint32_t khint_t; ******************/ #ifndef kcalloc -#define kcalloc(N,Z) calloc(N,Z) +#define kcalloc(N, Z) calloc(N, Z) #endif #ifndef kmalloc #define kmalloc(Z) malloc(Z) #endif #ifndef krealloc -#define krealloc(P,Z) realloc(P,Z) +#define krealloc(P, Z) realloc(P, Z) #endif #ifndef kfree #define kfree(P) free(P) @@ -89,216 +90,364 @@ typedef khint32_t khint_t; * Simple private functions * ****************************/ -#define __kh_used(flag, i) (flag[i>>5] >> (i&0x1fU) & 1U) -#define __kh_set_used(flag, i) (flag[i>>5] |= 1U<<(i&0x1fU)) -#define __kh_set_unused(flag, i) (flag[i>>5] &= ~(1U<<(i&0x1fU))) +#define __kh_used(flag, i) (flag[i >> 5] >> (i & 0x1fU) & 1U) +#define 
__kh_set_used(flag, i) (flag[i >> 5] |= 1U << (i & 0x1fU)) +#define __kh_set_unused(flag, i) (flag[i >> 5] &= ~(1U << (i & 0x1fU))) -#define __kh_fsize(m) ((m) < 32? 1 : (m)>>5) +#define __kh_fsize(m) ((m) < 32 ? 1 : (m) >> 5) -static kh_inline khint_t __kh_h2b(khint_t hash, khint_t bits) { return hash * 2654435769U >> (32 - bits); } +static kh_inline khint_t __kh_h2b(khint_t hash, khint_t bits) { + return hash * 2654435769U >> (32 - bits); +} /******************* * Hash table base * *******************/ #define __KHASHL_TYPE(HType, khkey_t) \ - typedef struct HType { \ - khint_t bits, count; \ - khint32_t *used; \ - khkey_t *keys; \ - } HType; - -#define __KHASHL_PROTOTYPES(HType, prefix, khkey_t) \ - extern HType *prefix##_init(void); \ - extern void prefix##_destroy(HType *h); \ - extern void prefix##_clear(HType *h); \ - extern khint_t prefix##_getp(const HType *h, const khkey_t *key); \ - extern int prefix##_resize(HType *h, khint_t new_n_buckets); \ - extern khint_t prefix##_putp(HType *h, const khkey_t *key, int *absent); \ - extern void prefix##_del(HType *h, khint_t k); - -#define __KHASHL_IMPL_BASIC(SCOPE, HType, prefix) \ - SCOPE HType *prefix##_init(void) { \ - return (HType*)kcalloc(1, sizeof(HType)); \ - } \ - SCOPE void prefix##_destroy(HType *h) { \ - if (!h) return; \ - kfree((void *)h->keys); kfree(h->used); \ - kfree(h); \ - } \ - SCOPE void prefix##_clear(HType *h) { \ - if (h && h->used) { \ - uint32_t n_buckets = 1U << h->bits; \ - memset(h->used, 0, __kh_fsize(n_buckets) * sizeof(khint32_t)); \ - h->count = 0; \ - } \ - } + typedef struct HType { \ + khint_t bits, count; \ + khint32_t* used; \ + khkey_t* keys; \ + } HType; + +#define __KHASHL_PROTOTYPES(HType, prefix, khkey_t) \ + extern HType* prefix##_init(void); \ + extern void prefix##_destroy(HType* h); \ + extern void prefix##_clear(HType* h); \ + extern khint_t prefix##_getp(const HType* h, const khkey_t* key); \ + extern int prefix##_resize(HType* h, khint_t new_n_buckets); \ + extern khint_t prefix##_putp(HType* h, const khkey_t* key, int* absent); \ + extern void prefix##_del(HType* h, khint_t k); + +#define __KHASHL_IMPL_BASIC(SCOPE, HType, prefix) \ + SCOPE HType* prefix##_init(void) { \ + return (HType*)kcalloc(1, sizeof(HType)); \ + } \ + SCOPE void prefix##_destroy(HType* h) { \ + if (!h) \ + return; \ + kfree((void*)h->keys); \ + kfree(h->used); \ + kfree(h); \ + } \ + SCOPE void prefix##_clear(HType* h) { \ + if (h && h->used) { \ + uint32_t n_buckets = 1U << h->bits; \ + memset(h->used, 0, __kh_fsize(n_buckets) * sizeof(khint32_t)); \ + h->count = 0; \ + } \ + } #define __KHASHL_IMPL_GET(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \ - SCOPE khint_t prefix##_getp(const HType *h, const khkey_t *key) { \ - khint_t i, last, n_buckets, mask; \ - if (h->keys == 0) return 0; \ - n_buckets = 1U << h->bits; \ - mask = n_buckets - 1U; \ - i = last = __kh_h2b(__hash_fn(*key), h->bits); \ - while (__kh_used(h->used, i) && !__hash_eq(h->keys[i], *key)) { \ - i = (i + 1U) & mask; \ - if (i == last) return n_buckets; \ - } \ - return !__kh_used(h->used, i)? 
n_buckets : i; \ - } \ - SCOPE khint_t prefix##_get(const HType *h, khkey_t key) { return prefix##_getp(h, &key); } - -#define __KHASHL_IMPL_RESIZE(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \ - SCOPE int prefix##_resize(HType *h, khint_t new_n_buckets) { \ - khint32_t *new_used = 0; \ - khint_t j = 0, x = new_n_buckets, n_buckets, new_bits, new_mask; \ - while ((x >>= 1) != 0) ++j; \ - if (new_n_buckets & (new_n_buckets - 1)) ++j; \ - new_bits = j > 2? j : 2; \ - new_n_buckets = 1U << new_bits; \ - if (h->count > (new_n_buckets>>1) + (new_n_buckets>>2)) return 0; /* requested size is too small */ \ - new_used = (khint32_t*)kmalloc(__kh_fsize(new_n_buckets) * sizeof(khint32_t)); \ - memset(new_used, 0, __kh_fsize(new_n_buckets) * sizeof(khint32_t)); \ - if (!new_used) return -1; /* not enough memory */ \ - n_buckets = h->keys? 1U<bits : 0U; \ - if (n_buckets < new_n_buckets) { /* expand */ \ - khkey_t *new_keys = (khkey_t*)krealloc((void*)h->keys, new_n_buckets * sizeof(khkey_t)); \ - if (!new_keys) { kfree(new_used); return -1; } \ - h->keys = new_keys; \ - } /* otherwise shrink */ \ - new_mask = new_n_buckets - 1; \ - for (j = 0; j != n_buckets; ++j) { \ - khkey_t key; \ - if (!__kh_used(h->used, j)) continue; \ - key = h->keys[j]; \ - __kh_set_unused(h->used, j); \ - while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \ - khint_t i; \ - i = __kh_h2b(__hash_fn(key), new_bits); \ - while (__kh_used(new_used, i)) i = (i + 1) & new_mask; \ - __kh_set_used(new_used, i); \ - if (i < n_buckets && __kh_used(h->used, i)) { /* kick out the existing element */ \ - { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \ - __kh_set_unused(h->used, i); /* mark it as deleted in the old hash table */ \ - } else { /* write the element and jump out of the loop */ \ - h->keys[i] = key; \ - break; \ - } \ - } \ - } \ - if (n_buckets > new_n_buckets) /* shrink the hash table */ \ - h->keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ - kfree(h->used); /* free the working space */ \ - h->used = new_used, h->bits = new_bits; \ - return 0; \ - } + SCOPE khint_t prefix##_getp(const HType* h, const khkey_t* key) { \ + khint_t i, last, n_buckets, mask; \ + if (h->keys == 0) \ + return 0; \ + n_buckets = 1U << h->bits; \ + mask = n_buckets - 1U; \ + i = last = __kh_h2b(__hash_fn(*key), h->bits); \ + while (__kh_used(h->used, i) && !__hash_eq(h->keys[i], *key)) { \ + i = (i + 1U) & mask; \ + if (i == last) \ + return n_buckets; \ + } \ + return !__kh_used(h->used, i) ? n_buckets : i; \ + } \ + SCOPE khint_t prefix##_get(const HType* h, khkey_t key) { \ + return prefix##_getp(h, &key); \ + } + +#define __KHASHL_IMPL_RESIZE( \ + SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \ + SCOPE int prefix##_resize(HType* h, khint_t new_n_buckets) { \ + khint32_t* new_used = 0; \ + khint_t j = 0, x = new_n_buckets, n_buckets, new_bits, new_mask; \ + while ((x >>= 1) != 0) \ + ++j; \ + if (new_n_buckets & (new_n_buckets - 1)) \ + ++j; \ + new_bits = j > 2 ? j : 2; \ + new_n_buckets = 1U << new_bits; \ + if (h->count > (new_n_buckets >> 1) + (new_n_buckets >> 2)) \ + return 0; /* requested size is too small */ \ + new_used = \ + (khint32_t*)kmalloc(__kh_fsize(new_n_buckets) * sizeof(khint32_t)); \ + memset(new_used, 0, __kh_fsize(new_n_buckets) * sizeof(khint32_t)); \ + if (!new_used) \ + return -1; /* not enough memory */ \ + n_buckets = h->keys ? 
1U << h->bits : 0U; \ + if (n_buckets < new_n_buckets) { /* expand */ \ + khkey_t* new_keys = \ + (khkey_t*)krealloc((void*)h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (!new_keys) { \ + kfree(new_used); \ + return -1; \ + } \ + h->keys = new_keys; \ + } /* otherwise shrink */ \ + new_mask = new_n_buckets - 1; \ + for (j = 0; j != n_buckets; ++j) { \ + khkey_t key; \ + if (!__kh_used(h->used, j)) \ + continue; \ + key = h->keys[j]; \ + __kh_set_unused(h->used, j); \ + while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \ + khint_t i; \ + i = __kh_h2b(__hash_fn(key), new_bits); \ + while (__kh_used(new_used, i)) \ + i = (i + 1) & new_mask; \ + __kh_set_used(new_used, i); \ + if (i < n_buckets && \ + __kh_used(h->used, i)) { /* kick out the existing element */ \ + { \ + khkey_t tmp = h->keys[i]; \ + h->keys[i] = key; \ + key = tmp; \ + } \ + __kh_set_unused( \ + h->used, i); /* mark it as deleted in the old hash table */ \ + } else { /* write the element and jump out of the loop */ \ + h->keys[i] = key; \ + break; \ + } \ + } \ + } \ + if (n_buckets > new_n_buckets) /* shrink the hash table */ \ + h->keys = \ + (khkey_t*)krealloc((void*)h->keys, new_n_buckets * sizeof(khkey_t)); \ + kfree(h->used); /* free the working space */ \ + h->used = new_used, h->bits = new_bits; \ + return 0; \ + } #define __KHASHL_IMPL_PUT(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \ - SCOPE khint_t prefix##_putp(HType *h, const khkey_t *key, int *absent) { \ - khint_t n_buckets, i, last, mask; \ - n_buckets = h->keys? 1U<bits : 0U; \ - *absent = -1; \ - if (h->count >= (n_buckets>>1) + (n_buckets>>2)) { /* rehashing */ \ - if (prefix##_resize(h, n_buckets + 1U) < 0) \ - return n_buckets; \ - n_buckets = 1U<bits; \ - } /* TODO: to implement automatically shrinking; resize() already support shrinking */ \ - mask = n_buckets - 1; \ - i = last = __kh_h2b(__hash_fn(*key), h->bits); \ - while (__kh_used(h->used, i) && !__hash_eq(h->keys[i], *key)) { \ - i = (i + 1U) & mask; \ - if (i == last) break; \ - } \ - if (!__kh_used(h->used, i)) { /* not present at all */ \ - h->keys[i] = *key; \ - __kh_set_used(h->used, i); \ - ++h->count; \ - *absent = 1; \ - } else *absent = 0; /* Don't touch h->keys[i] if present */ \ - return i; \ - } \ - SCOPE khint_t prefix##_put(HType *h, khkey_t key, int *absent) { return prefix##_putp(h, &key, absent); } - -#define __KHASHL_IMPL_DEL(SCOPE, HType, prefix, khkey_t, __hash_fn) \ - SCOPE int prefix##_del(HType *h, khint_t i) { \ - khint_t j = i, k, mask, n_buckets; \ - if (h->keys == 0) return 0; \ - n_buckets = 1U<bits; \ - mask = n_buckets - 1U; \ - while (1) { \ - j = (j + 1U) & mask; \ - if (j == i || !__kh_used(h->used, j)) break; /* j==i only when the table is completely full */ \ - k = __kh_h2b(__hash_fn(h->keys[j]), h->bits); \ - if ((j > i && (k <= i || k > j)) || (j < i && (k <= i && k > j))) \ - h->keys[i] = h->keys[j], i = j; \ - } \ - __kh_set_unused(h->used, i); \ - --h->count; \ - return 1; \ - } + SCOPE khint_t prefix##_putp(HType* h, const khkey_t* key, int* absent) { \ + khint_t n_buckets, i, last, mask; \ + n_buckets = h->keys ? 
1U << h->bits : 0U; \ + *absent = -1; \ + if (h->count >= (n_buckets >> 1) + (n_buckets >> 2)) { /* rehashing */ \ + if (prefix##_resize(h, n_buckets + 1U) < 0) \ + return n_buckets; \ + n_buckets = 1U << h->bits; \ + } /* TODO: to implement automatically shrinking; resize() already support \ + shrinking */ \ + mask = n_buckets - 1; \ + i = last = __kh_h2b(__hash_fn(*key), h->bits); \ + while (__kh_used(h->used, i) && !__hash_eq(h->keys[i], *key)) { \ + i = (i + 1U) & mask; \ + if (i == last) \ + break; \ + } \ + if (!__kh_used(h->used, i)) { /* not present at all */ \ + h->keys[i] = *key; \ + __kh_set_used(h->used, i); \ + ++h->count; \ + *absent = 1; \ + } else \ + *absent = 0; /* Don't touch h->keys[i] if present */ \ + return i; \ + } \ + SCOPE khint_t prefix##_put(HType* h, khkey_t key, int* absent) { \ + return prefix##_putp(h, &key, absent); \ + } + +#define __KHASHL_IMPL_DEL(SCOPE, HType, prefix, khkey_t, __hash_fn) \ + SCOPE int prefix##_del(HType* h, khint_t i) { \ + khint_t j = i, k, mask, n_buckets; \ + if (h->keys == 0) \ + return 0; \ + n_buckets = 1U << h->bits; \ + mask = n_buckets - 1U; \ + while (1) { \ + j = (j + 1U) & mask; \ + if (j == i || !__kh_used(h->used, j)) \ + break; /* j==i only when the table is completely full */ \ + k = __kh_h2b(__hash_fn(h->keys[j]), h->bits); \ + if ((j > i && (k <= i || k > j)) || (j < i && (k <= i && k > j))) \ + h->keys[i] = h->keys[j], i = j; \ + } \ + __kh_set_unused(h->used, i); \ + --h->count; \ + return 1; \ + } #define KHASHL_DECLARE(HType, prefix, khkey_t) \ - __KHASHL_TYPE(HType, khkey_t) \ - __KHASHL_PROTOTYPES(HType, prefix, khkey_t) + __KHASHL_TYPE(HType, khkey_t) \ + __KHASHL_PROTOTYPES(HType, prefix, khkey_t) -#define KHASHL_INIT(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \ - __KHASHL_TYPE(HType, khkey_t) \ - __KHASHL_IMPL_BASIC(SCOPE, HType, prefix) \ - __KHASHL_IMPL_GET(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \ - __KHASHL_IMPL_RESIZE(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \ - __KHASHL_IMPL_PUT(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \ - __KHASHL_IMPL_DEL(SCOPE, HType, prefix, khkey_t, __hash_fn) +#define KHASHL_INIT(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \ + __KHASHL_TYPE(HType, khkey_t) \ + __KHASHL_IMPL_BASIC(SCOPE, HType, prefix) \ + __KHASHL_IMPL_GET(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \ + __KHASHL_IMPL_RESIZE(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \ + __KHASHL_IMPL_PUT(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \ + __KHASHL_IMPL_DEL(SCOPE, HType, prefix, khkey_t, __hash_fn) /***************************** * More convenient interface * *****************************/ -#define __kh_packed __attribute__ ((__packed__)) +#define __kh_packed __attribute__((__packed__)) #define __kh_cached_hash(x) ((x).hash) #define KHASHL_SET_INIT(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \ - typedef struct { khkey_t key; } __kh_packed HType##_s_bucket_t; \ - static kh_inline khint_t prefix##_s_hash(HType##_s_bucket_t x) { return __hash_fn(x.key); } \ - static kh_inline int prefix##_s_eq(HType##_s_bucket_t x, HType##_s_bucket_t y) { return __hash_eq(x.key, y.key); } \ - KHASHL_INIT(KH_LOCAL, HType, prefix##_s, HType##_s_bucket_t, prefix##_s_hash, prefix##_s_eq) \ - SCOPE HType *prefix##_init(void) { return prefix##_s_init(); } \ - SCOPE void prefix##_destroy(HType *h) { prefix##_s_destroy(h); } \ - SCOPE void prefix##_resize(HType *h, khint_t new_n_buckets) { prefix##_s_resize(h, new_n_buckets); } \ - SCOPE khint_t 
prefix##_get(const HType *h, khkey_t key) { HType##_s_bucket_t t; t.key = key; return prefix##_s_getp(h, &t); } \ - SCOPE int prefix##_del(HType *h, khint_t k) { return prefix##_s_del(h, k); } \ - SCOPE khint_t prefix##_put(HType *h, khkey_t key, int *absent) { HType##_s_bucket_t t; t.key = key; return prefix##_s_putp(h, &t, absent); } - -#define KHASHL_MAP_INIT(SCOPE, HType, prefix, khkey_t, kh_val_t, __hash_fn, __hash_eq) \ - typedef struct { khkey_t key; kh_val_t val; } __kh_packed HType##_m_bucket_t; \ - static kh_inline khint_t prefix##_m_hash(HType##_m_bucket_t x) { return __hash_fn(x.key); } \ - static kh_inline int prefix##_m_eq(HType##_m_bucket_t x, HType##_m_bucket_t y) { return __hash_eq(x.key, y.key); } \ - KHASHL_INIT(KH_LOCAL, HType, prefix##_m, HType##_m_bucket_t, prefix##_m_hash, prefix##_m_eq) \ - SCOPE HType *prefix##_init(void) { return prefix##_m_init(); } \ - SCOPE void prefix##_destroy(HType *h) { prefix##_m_destroy(h); } \ - SCOPE khint_t prefix##_get(const HType *h, khkey_t key) { HType##_m_bucket_t t; t.key = key; return prefix##_m_getp(h, &t); } \ - SCOPE int prefix##_del(HType *h, khint_t k) { return prefix##_m_del(h, k); } \ - SCOPE khint_t prefix##_put(HType *h, khkey_t key, int *absent) { HType##_m_bucket_t t; t.key = key; return prefix##_m_putp(h, &t, absent); } + typedef struct { \ + khkey_t key; \ + } __kh_packed HType##_s_bucket_t; \ + static kh_inline khint_t prefix##_s_hash(HType##_s_bucket_t x) { \ + return __hash_fn(x.key); \ + } \ + static kh_inline int prefix##_s_eq( \ + HType##_s_bucket_t x, HType##_s_bucket_t y) { \ + return __hash_eq(x.key, y.key); \ + } \ + KHASHL_INIT( \ + KH_LOCAL, \ + HType, \ + prefix##_s, \ + HType##_s_bucket_t, \ + prefix##_s_hash, \ + prefix##_s_eq) \ + SCOPE HType* prefix##_init(void) { \ + return prefix##_s_init(); \ + } \ + SCOPE void prefix##_destroy(HType* h) { \ + prefix##_s_destroy(h); \ + } \ + SCOPE void prefix##_resize(HType* h, khint_t new_n_buckets) { \ + prefix##_s_resize(h, new_n_buckets); \ + } \ + SCOPE khint_t prefix##_get(const HType* h, khkey_t key) { \ + HType##_s_bucket_t t; \ + t.key = key; \ + return prefix##_s_getp(h, &t); \ + } \ + SCOPE int prefix##_del(HType* h, khint_t k) { \ + return prefix##_s_del(h, k); \ + } \ + SCOPE khint_t prefix##_put(HType* h, khkey_t key, int* absent) { \ + HType##_s_bucket_t t; \ + t.key = key; \ + return prefix##_s_putp(h, &t, absent); \ + } + +#define KHASHL_MAP_INIT( \ + SCOPE, HType, prefix, khkey_t, kh_val_t, __hash_fn, __hash_eq) \ + typedef struct { \ + khkey_t key; \ + kh_val_t val; \ + } __kh_packed HType##_m_bucket_t; \ + static kh_inline khint_t prefix##_m_hash(HType##_m_bucket_t x) { \ + return __hash_fn(x.key); \ + } \ + static kh_inline int prefix##_m_eq( \ + HType##_m_bucket_t x, HType##_m_bucket_t y) { \ + return __hash_eq(x.key, y.key); \ + } \ + KHASHL_INIT( \ + KH_LOCAL, \ + HType, \ + prefix##_m, \ + HType##_m_bucket_t, \ + prefix##_m_hash, \ + prefix##_m_eq) \ + SCOPE HType* prefix##_init(void) { \ + return prefix##_m_init(); \ + } \ + SCOPE void prefix##_destroy(HType* h) { \ + prefix##_m_destroy(h); \ + } \ + SCOPE khint_t prefix##_get(const HType* h, khkey_t key) { \ + HType##_m_bucket_t t; \ + t.key = key; \ + return prefix##_m_getp(h, &t); \ + } \ + SCOPE int prefix##_del(HType* h, khint_t k) { \ + return prefix##_m_del(h, k); \ + } \ + SCOPE khint_t prefix##_put(HType* h, khkey_t key, int* absent) { \ + HType##_m_bucket_t t; \ + t.key = key; \ + return prefix##_m_putp(h, &t, absent); \ + } #define KHASHL_CSET_INIT(SCOPE, HType, prefix, 
khkey_t, __hash_fn, __hash_eq) \ - typedef struct { khkey_t key; khint_t hash; } __kh_packed HType##_cs_bucket_t; \ - static kh_inline int prefix##_cs_eq(HType##_cs_bucket_t x, HType##_cs_bucket_t y) { return x.hash == y.hash && __hash_eq(x.key, y.key); } \ - KHASHL_INIT(KH_LOCAL, HType, prefix##_cs, HType##_cs_bucket_t, __kh_cached_hash, prefix##_cs_eq) \ - SCOPE HType *prefix##_init(void) { return prefix##_cs_init(); } \ - SCOPE void prefix##_destroy(HType *h) { prefix##_cs_destroy(h); } \ - SCOPE khint_t prefix##_get(const HType *h, khkey_t key) { HType##_cs_bucket_t t; t.key = key; t.hash = __hash_fn(key); return prefix##_cs_getp(h, &t); } \ - SCOPE int prefix##_del(HType *h, khint_t k) { return prefix##_cs_del(h, k); } \ - SCOPE khint_t prefix##_put(HType *h, khkey_t key, int *absent) { HType##_cs_bucket_t t; t.key = key, t.hash = __hash_fn(key); return prefix##_cs_putp(h, &t, absent); } - -#define KHASHL_CMAP_INIT(SCOPE, HType, prefix, khkey_t, kh_val_t, __hash_fn, __hash_eq) \ - typedef struct { khkey_t key; kh_val_t val; khint_t hash; } __kh_packed HType##_cm_bucket_t; \ - static kh_inline int prefix##_cm_eq(HType##_cm_bucket_t x, HType##_cm_bucket_t y) { return x.hash == y.hash && __hash_eq(x.key, y.key); } \ - KHASHL_INIT(KH_LOCAL, HType, prefix##_cm, HType##_cm_bucket_t, __kh_cached_hash, prefix##_cm_eq) \ - SCOPE HType *prefix##_init(void) { return prefix##_cm_init(); } \ - SCOPE void prefix##_destroy(HType *h) { prefix##_cm_destroy(h); } \ - SCOPE khint_t prefix##_get(const HType *h, khkey_t key) { HType##_cm_bucket_t t; t.key = key; t.hash = __hash_fn(key); return prefix##_cm_getp(h, &t); } \ - SCOPE int prefix##_del(HType *h, khint_t k) { return prefix##_cm_del(h, k); } \ - SCOPE khint_t prefix##_put(HType *h, khkey_t key, int *absent) { HType##_cm_bucket_t t; t.key = key, t.hash = __hash_fn(key); return prefix##_cm_putp(h, &t, absent); } + typedef struct { \ + khkey_t key; \ + khint_t hash; \ + } __kh_packed HType##_cs_bucket_t; \ + static kh_inline int prefix##_cs_eq( \ + HType##_cs_bucket_t x, HType##_cs_bucket_t y) { \ + return x.hash == y.hash && __hash_eq(x.key, y.key); \ + } \ + KHASHL_INIT( \ + KH_LOCAL, \ + HType, \ + prefix##_cs, \ + HType##_cs_bucket_t, \ + __kh_cached_hash, \ + prefix##_cs_eq) \ + SCOPE HType* prefix##_init(void) { \ + return prefix##_cs_init(); \ + } \ + SCOPE void prefix##_destroy(HType* h) { \ + prefix##_cs_destroy(h); \ + } \ + SCOPE khint_t prefix##_get(const HType* h, khkey_t key) { \ + HType##_cs_bucket_t t; \ + t.key = key; \ + t.hash = __hash_fn(key); \ + return prefix##_cs_getp(h, &t); \ + } \ + SCOPE int prefix##_del(HType* h, khint_t k) { \ + return prefix##_cs_del(h, k); \ + } \ + SCOPE khint_t prefix##_put(HType* h, khkey_t key, int* absent) { \ + HType##_cs_bucket_t t; \ + t.key = key, t.hash = __hash_fn(key); \ + return prefix##_cs_putp(h, &t, absent); \ + } + +#define KHASHL_CMAP_INIT( \ + SCOPE, HType, prefix, khkey_t, kh_val_t, __hash_fn, __hash_eq) \ + typedef struct { \ + khkey_t key; \ + kh_val_t val; \ + khint_t hash; \ + } __kh_packed HType##_cm_bucket_t; \ + static kh_inline int prefix##_cm_eq( \ + HType##_cm_bucket_t x, HType##_cm_bucket_t y) { \ + return x.hash == y.hash && __hash_eq(x.key, y.key); \ + } \ + KHASHL_INIT( \ + KH_LOCAL, \ + HType, \ + prefix##_cm, \ + HType##_cm_bucket_t, \ + __kh_cached_hash, \ + prefix##_cm_eq) \ + SCOPE HType* prefix##_init(void) { \ + return prefix##_cm_init(); \ + } \ + SCOPE void prefix##_destroy(HType* h) { \ + prefix##_cm_destroy(h); \ + } \ + SCOPE khint_t prefix##_get(const 
HType* h, khkey_t key) { \ + HType##_cm_bucket_t t; \ + t.key = key; \ + t.hash = __hash_fn(key); \ + return prefix##_cm_getp(h, &t); \ + } \ + SCOPE int prefix##_del(HType* h, khint_t k) { \ + return prefix##_cm_del(h, k); \ + } \ + SCOPE khint_t prefix##_put(HType* h, khkey_t key, int* absent) { \ + HType##_cm_bucket_t t; \ + t.key = key, t.hash = __hash_fn(key); \ + return prefix##_cm_putp(h, &t, absent); \ + } /************************** * Public macro functions * @@ -306,7 +455,7 @@ static kh_inline khint_t __kh_h2b(khint_t hash, khint_t bits) { return hash * 26 #define kh_bucket(h, x) ((h)->keys[x]) #define kh_size(h) ((h)->count) -#define kh_capacity(h) ((h)->keys? 1U<<(h)->bits : 0U) +#define kh_capacity(h) ((h)->keys ? 1U << (h)->bits : 0U) #define kh_end(h) kh_capacity(h) #define kh_key(h, x) ((h)->keys[x].key) @@ -322,30 +471,32 @@ static kh_inline khint_t __kh_h2b(khint_t hash, khint_t bits) { return hash * 26 #define kh_hash_dummy(x) ((khint_t)(x)) static kh_inline khint_t kh_hash_uint32(khint_t key) { - key += ~(key << 15); - key ^= (key >> 10); - key += (key << 3); - key ^= (key >> 6); - key += ~(key << 11); - key ^= (key >> 16); - return key; + key += ~(key << 15); + key ^= (key >> 10); + key += (key << 3); + key ^= (key >> 6); + key += ~(key << 11); + key ^= (key >> 16); + return key; } static kh_inline khint_t kh_hash_uint64(khint64_t key) { - key = ~key + (key << 21); - key = key ^ key >> 24; - key = (key + (key << 3)) + (key << 8); - key = key ^ key >> 14; - key = (key + (key << 2)) + (key << 4); - key = key ^ key >> 28; - key = key + (key << 31); - return (khint_t)key; + key = ~key + (key << 21); + key = key ^ key >> 24; + key = (key + (key << 3)) + (key << 8); + key = key ^ key >> 14; + key = (key + (key << 2)) + (key << 4); + key = key ^ key >> 28; + key = key + (key << 31); + return (khint_t)key; } -static kh_inline khint_t kh_hash_str(const char *s) { - khint_t h = (khint_t)*s; - if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s; - return h; +static kh_inline khint_t kh_hash_str(const char* s) { + khint_t h = (khint_t)*s; + if (h) + for (++s; *s; ++s) + h = (h << 5) - h + (khint_t)*s; + return h; } #endif /* __AC_KHASHL_H */ diff --git a/libtiledbsoma/src/soma/soma_array.cc b/libtiledbsoma/src/soma/soma_array.cc index e3e8005b6a..c35315632f 100644 --- a/libtiledbsoma/src/soma/soma_array.cc +++ b/libtiledbsoma/src/soma/soma_array.cc @@ -468,12 +468,12 @@ std::vector SOMAArray::shape() { } uint64_t SOMAArray::ndim() const { - return this->schema().get()->domain().ndim(); + return tiledb_schema()->domain().ndim(); } std::vector SOMAArray::dimension_names() const { std::vector result; - auto dimensions = this->schema().get()->domain().dimensions(); + auto dimensions = tiledb_schema()->domain().dimensions(); for (const auto& dim : dimensions) { result.push_back(dim.name()); } diff --git a/libtiledbsoma/src/soma/soma_array.h b/libtiledbsoma/src/soma/soma_array.h index 0845cd694b..2814c2c0b8 100644 --- a/libtiledbsoma/src/soma/soma_array.h +++ b/libtiledbsoma/src/soma/soma_array.h @@ -39,6 +39,7 @@ #include #include +#include "../utils/arrow_adapter.h" #include "enums.h" #include "logger_public.h" #include "managed_query.h" @@ -185,6 +186,19 @@ class SOMAArray { */ std::shared_ptr ctx(); + std::optional soma_object_type() { + auto soma_object_type = this->get_metadata("soma_object_type"); + + if (!soma_object_type.has_value()) + return std::nullopt; + + const char* dtype = (const char*)std::get( + *soma_object_type); + uint32_t sz = std::get(*soma_object_type); + 
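+        // get_metadata() yields the raw value as a (datatype, length, pointer)
+        // triple; rebuild the std::string from the buffer and its length.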
+ return std::string(dtype, sz); + } + /** * Open the SOMAArray object. * @@ -209,6 +223,11 @@ class SOMAArray { return arr_->is_open(); } + OpenMode mode() const { + return mq_->query_type() == TILEDB_READ ? OpenMode::read : + OpenMode::write; + } + /** * @brief Reset the state of this SOMAArray object to prepare for a * new query, while holding the array open. @@ -472,14 +491,24 @@ class SOMAArray { uint64_t nnz(); /** - * @brief Get the schema of the array. + * @brief Get the TileDB ArraySchema. This should eventually + * be removed in lieu of arrow_schema below. * * @return std::shared_ptr Schema */ - std::shared_ptr schema() const { + std::shared_ptr tiledb_schema() const { return mq_->schema(); } + /** + * @brief Get the Arrow schema of the array. + * + * @return std::unique_ptr Schema + */ + std::unique_ptr arrow_schema() const { + return ArrowAdapter::arrow_schema_from_tiledb_array(ctx_, arr_); + } + /** * @brief Get the capacity of each dimension. * @@ -495,6 +524,36 @@ class SOMAArray { */ uint64_t ndim() const; + /** + * Retrieves the non-empty domain from the array. This is the union of the + * non-empty domains of the array fragments. + */ + template + std::pair non_empty_domain(const std::string& name) { + return arr_->non_empty_domain(name); + }; + + /** + * Retrieves the non-empty domain from the array on the given dimension. + * This is the union of the non-empty domains of the array fragments. + * Applicable only to var-sized dimensions. + */ + std::pair non_empty_domain_var( + const std::string& name) { + return arr_->non_empty_domain_var(name); + }; + + /** + * Returns the domain of the given dimension. + * + * @tparam T Domain datatype + * @return Pair of [lower, upper] inclusive bounds. + */ + template + std::pair domain(const std::string& name) const { + return arr_->schema().domain().dimension(name).domain(); + } + /** * @brief Get the name of each dimensions. 
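     * A rough usage sketch for the domain accessors added above, assuming
     * `arr` is a std::shared_ptr<SOMAArray> already opened for read and that
     * the array has an int64 "soma_joinid" dimension (both hypothetical here):
     *
     *   auto [lo, hi] = arr->non_empty_domain<int64_t>("soma_joinid");
     *   auto full = arr->domain<int64_t>("soma_joinid");
     *   // The non-empty domain (the union over fragments) always sits inside
     *   // the full domain: full.first <= lo and hi <= full.second.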
* diff --git a/libtiledbsoma/src/soma/soma_dataframe.cc b/libtiledbsoma/src/soma/soma_dataframe.cc index 00edd05b1a..20ef2bd4f1 100644 --- a/libtiledbsoma/src/soma/soma_dataframe.cc +++ b/libtiledbsoma/src/soma/soma_dataframe.cc @@ -105,19 +105,37 @@ SOMADataFrame::SOMADataFrame( "auto", // batch_size, result_order, timestamp); - array_->reset(); } void SOMADataFrame::open( OpenMode mode, std::optional> timestamp) { array_->open(mode, timestamp); - array_->reset(); } void SOMADataFrame::close() { array_->close(); } +bool SOMADataFrame::exists(std::string_view uri) { + try { + auto soma_dataframe = SOMADataFrame::open(uri, OpenMode::read); + auto soma_object_type = soma_dataframe->get_metadata( + "soma_object_type"); + + if (!soma_object_type.has_value()) + return false; + + const char* dtype = (const char*)std::get( + *soma_object_type); + + uint32_t sz = std::get(*soma_object_type); + + return std::string(dtype, sz) == "SOMADataFrame"; + } catch (std::exception& e) { + return false; + } +} + bool SOMADataFrame::is_open() const { return array_->is_open(); } @@ -130,8 +148,12 @@ std::shared_ptr SOMADataFrame::ctx() { return array_->ctx(); } -std::shared_ptr SOMADataFrame::schema() const { - return array_->schema(); +std::unique_ptr SOMADataFrame::schema() const { + return array_->arrow_schema(); +} + +std::shared_ptr SOMADataFrame::tiledb_schema() const { + return array_->tiledb_schema(); } const std::vector SOMADataFrame::index_column_names() const { @@ -139,7 +161,7 @@ const std::vector SOMADataFrame::index_column_names() const { } int64_t SOMADataFrame::count() const { - return array_->ndim(); + return array_->nnz(); } std::optional> SOMADataFrame::read_next() { diff --git a/libtiledbsoma/src/soma/soma_dataframe.h b/libtiledbsoma/src/soma/soma_dataframe.h index c96a829495..9fb0871b14 100644 --- a/libtiledbsoma/src/soma/soma_dataframe.h +++ b/libtiledbsoma/src/soma/soma_dataframe.h @@ -33,7 +33,6 @@ #ifndef SOMA_DATAFRAME #define SOMA_DATAFRAME -#include #include "enums.h" #include "soma_array.h" #include "soma_object.h" @@ -146,8 +145,11 @@ class SOMADataFrame : public SOMAObject { ResultOrder result_order, std::optional> timestamp = std::nullopt); + SOMADataFrame(std::shared_ptr array) + : array_(array){}; + SOMADataFrame() = delete; - SOMADataFrame(const SOMADataFrame&) = delete; + SOMADataFrame(const SOMADataFrame&) = default; SOMADataFrame(SOMADataFrame&&) = default; ~SOMADataFrame() = default; @@ -166,6 +168,18 @@ class SOMADataFrame : public SOMAObject { */ void close(); + void reset( + std::vector column_names = {}, + std::string_view batch_size = "auto", + ResultOrder result_order = ResultOrder::automatic) { + array_->reset(column_names, batch_size, result_order); + } + + /** + * @brief Check if the SOMADataFrame exists at the URI. + */ + static bool exists(std::string_view uri); + /** * Check if the SOMADataFrame is open. * @@ -173,6 +187,10 @@ class SOMADataFrame : public SOMAObject { */ bool is_open() const; + OpenMode mode() const { + return array_->mode(); + } + /** * Return the constant "SOMADataFrame". * @@ -196,12 +214,26 @@ class SOMADataFrame : public SOMAObject { */ std::shared_ptr ctx(); + /** + * Return optional timestamp pair SOMADataFrame was opened with. + */ + std::optional> timestamp() { + return array_->timestamp(); + } + + /** + * Return the data schema, in the form of a ArrowSchema. + * + * @return std::unique_ptr + */ + std::unique_ptr schema() const; + /** * Return the data schema, in the form of a TileDB ArraySchema. 
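     * Unlike schema(), which now returns an Arrow schema, this exposes the
     * underlying TileDB schema directly.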
* * @return std::shared_ptr */ - std::shared_ptr schema() const; + std::shared_ptr tiledb_schema() const; /** * Return the index (dimension) column names. @@ -211,18 +243,126 @@ class SOMADataFrame : public SOMAObject { const std::vector index_column_names() const; /** - * Return the number of rows in the SOMADataFrame. + * Return the number of rows. * * @return int64_t */ int64_t count() const; + /** + * Retrieves the non-empty domain of the column index. + * + * @tparam T Domain datatype + * @return Pair of [lower, upper] inclusive bounds. + */ + template + std::pair non_empty_domain(const std::string& column_index_name) { + return array_->non_empty_domain(column_index_name); + }; + + /** + * Retrieves the non-empty domain of the column index. + * Applicable only to var-sized dimensions. + */ + std::pair non_empty_domain_var( + const std::string& column_index_name) { + return array_->non_empty_domain_var(column_index_name); + }; + + /** + * Returns the domain of the given column index. + * + * @tparam T Domain datatype + * @return Pair of [lower, upper] inclusive bounds. + */ + template + std::pair domain(const std::string& column_index_name) const { + return array_->domain(column_index_name); + } + /** * @brief Read the next chunk of results from the query. If all results have * already been read, std::nullopt is returned. */ std::optional> read_next(); + /** + * @brief Set the dimension slice using one point + * + * @note Partitioning is not supported + * + * @tparam T + * @param dim + * @param point + */ + template + void set_dim_point(const std::string& dim, const T& point) { + array_->set_dim_point(dim, point); + } + + /** + * @brief Set the dimension slice using multiple points, with support + * for partitioning. + * + * @tparam T + * @param dim + * @param points + */ + template + void set_dim_points( + const std::string& dim, + const tcb::span points, + int partition_index, + int partition_count) { + array_->set_dim_points(dim, points, partition_index, partition_count); + } + + /** + * @brief Set the dimension slice using multiple points + * + * @note Partitioning is not supported + * + * @tparam T + * @param dim + * @param points + */ + template + void set_dim_points(const std::string& dim, const std::vector& points) { + array_->set_dim_points(dim, points); + } + + /** + * @brief Set the dimension slice using multiple ranges + * + * @note Partitioning is not supported + * + * @tparam T + * @param dim + * @param ranges + */ + template + void set_dim_ranges( + const std::string& dim, const std::vector>& ranges) { + array_->set_dim_ranges(dim, ranges); + } + + /** + * @brief Set a query condition. + * + * @param qc Query condition + */ + void set_condition(QueryCondition& qc) { + array_->set_condition(qc); + } + + /** + * @brief Returns the column names set by the query. + * + * @return std::vector + */ + std::vector column_names() { + return array_->column_names(); + } + /** + * @brief Write data to the dataframe.
diff --git a/libtiledbsoma/src/soma/soma_dense_ndarray.cc b/libtiledbsoma/src/soma/soma_dense_ndarray.cc
index 5f66ba8abb..93d33f0095 100644
--- a/libtiledbsoma/src/soma/soma_dense_ndarray.cc
+++ b/libtiledbsoma/src/soma/soma_dense_ndarray.cc
@@ -131,8 +131,12 @@ std::shared_ptr<Context> SOMADenseNDArray::ctx() {
     return array_->ctx();
 }
 
-std::shared_ptr<ArraySchema> SOMADenseNDArray::schema() const {
-    return array_->schema();
+std::unique_ptr<ArrowSchema> SOMADenseNDArray::schema() const {
+    return array_->arrow_schema();
+}
+
+std::shared_ptr<ArraySchema> SOMADenseNDArray::tiledb_schema() const {
+    return array_->tiledb_schema();
 }
 
 std::vector<int64_t> SOMADenseNDArray::shape() const {
diff --git a/libtiledbsoma/src/soma/soma_dense_ndarray.h b/libtiledbsoma/src/soma/soma_dense_ndarray.h
index e4c771a064..8efd776966 100644
--- a/libtiledbsoma/src/soma/soma_dense_ndarray.h
+++ b/libtiledbsoma/src/soma/soma_dense_ndarray.h
@@ -33,7 +33,6 @@
 #ifndef SOMA_DENSE_NDARRAY
 #define SOMA_DENSE_NDARRAY
 
-#include <tiledb/tiledb>
 #include "enums.h"
 #include "soma_array.h"
 #include "soma_object.h"
@@ -200,11 +199,18 @@ class SOMADenseNDArray : public SOMAObject {
     const std::string uri() const;
 
     /**
-     * Return data schema, in the form of a TileDB ArraySchema.
+     * Return the data schema, in the form of an ArrowSchema.
+     *
+     * @return std::unique_ptr<ArrowSchema>
+     */
+    std::unique_ptr<ArrowSchema> schema() const;
+
+    /**
+     * Return the data schema, in the form of a TileDB ArraySchema.
      *
      * @return std::shared_ptr<ArraySchema>
      */
-    std::shared_ptr<ArraySchema> schema() const;
+    std::shared_ptr<ArraySchema> tiledb_schema() const;
 
     /**
      * @brief Get the capacity of each dimension.
diff --git a/libtiledbsoma/src/soma/soma_object.cc b/libtiledbsoma/src/soma/soma_object.cc
new file mode 100644
index 0000000000..655adbf30e
--- /dev/null
+++ b/libtiledbsoma/src/soma/soma_object.cc
@@ -0,0 +1,42 @@
+#include <map>
+#include <memory>
+#include <string>
+
+#include "soma_array.h"
+#include "soma_dataframe.h"
+
+namespace tiledbsoma {
+
+using namespace tiledb;
+
+std::unique_ptr<SOMAObject> SOMAObject::open(
+    std::string uri,
+    OpenMode mode,
+    std::map<std::string, std::string> platform_config,
+    std::optional<std::pair<uint64_t, uint64_t>> timestamp) {
+    auto ctx = std::make_shared<Context>(Config(platform_config));
+    return SOMAObject::open(uri, mode, ctx, timestamp);
+}
+
+std::unique_ptr<SOMAObject> SOMAObject::open(
+    std::string uri,
+    OpenMode mode,
+    std::shared_ptr<Context> ctx,
+    std::optional<std::pair<uint64_t, uint64_t>> timestamp) {
+    auto obj = tiledb::Object::object(*ctx, uri);
+
+    if (obj.type() == tiledb::Object::Type::Array) {
+        auto array_ = SOMAArray::open(
+            mode, ctx, uri, "", {}, "auto", ResultOrder::automatic, timestamp);
+
+        if (array_->soma_object_type() == "SOMADataFrame")
+            return std::make_unique<SOMADataFrame>(std::move(array_));
+        else
+            throw TileDBSOMAError(
+                "Invalid SOMAObject passed to SOMAObject::open");
+    }
+
+    throw TileDBSOMAError("Invalid TileDB object passed to SOMAObject::open");
+}
+
+}  // namespace tiledbsoma
diff --git a/libtiledbsoma/src/soma/soma_object.h b/libtiledbsoma/src/soma/soma_object.h
index 47beeb6047..43a209885e 100644
--- a/libtiledbsoma/src/soma/soma_object.h
+++ b/libtiledbsoma/src/soma/soma_object.h
@@ -41,6 +41,8 @@
 
 namespace tiledbsoma {
 
+class SOMADataFrame;
+
 using namespace tiledb;
 
 class SOMAObject {
   public:
@@ -49,6 +51,18 @@ class SOMAObject {
     //===================================================================
     virtual ~SOMAObject() = default;
 
+    static std::unique_ptr<SOMAObject> open(
+        std::string uri,
+        OpenMode mode,
+        std::map<std::string, std::string> platform_config = {},
+        std::optional<std::pair<uint64_t, uint64_t>> timestamp = std::nullopt);
+
+    static std::unique_ptr<SOMAObject> open(
+        std::string uri,
+        OpenMode mode,
+        std::shared_ptr<Context> ctx,
+        std::optional<std::pair<uint64_t, uint64_t>> timestamp =
+            std::nullopt);
+
     /**
      * @brief Return a constant string describing the type of the object.
     */
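A sketch of the intent behind the new polymorphic factory (again, not in the patch itself). As of this change only SOMADataFrame is dispatched; any other object makes SOMAObject::open throw TileDBSOMAError:

    #include "soma_dataframe.h"
    #include "soma_object.h"
    using namespace tiledbsoma;

    void open_generic() {
        auto obj = SOMAObject::open("file:///tmp/df", OpenMode::read);
        if (obj->type() == "SOMADataFrame") {
            // Safe downcast: SOMAObject has a virtual destructor.
            auto* df = dynamic_cast<SOMADataFrame*>(obj.get());
            // ... use df->count(), df->read_next(), ...
        }
    }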
diff --git a/libtiledbsoma/src/soma/soma_sparse_ndarray.cc b/libtiledbsoma/src/soma/soma_sparse_ndarray.cc
index 29f4b16afe..308fb0ddde 100644
--- a/libtiledbsoma/src/soma/soma_sparse_ndarray.cc
+++ b/libtiledbsoma/src/soma/soma_sparse_ndarray.cc
@@ -131,8 +131,12 @@ std::shared_ptr<Context> SOMASparseNDArray::ctx() {
     return array_->ctx();
 }
 
-std::shared_ptr<ArraySchema> SOMASparseNDArray::schema() const {
-    return array_->schema();
+std::unique_ptr<ArrowSchema> SOMASparseNDArray::schema() const {
+    return array_->arrow_schema();
+}
+
+std::shared_ptr<ArraySchema> SOMASparseNDArray::tiledb_schema() const {
+    return array_->tiledb_schema();
 }
 
 std::vector<int64_t> SOMASparseNDArray::shape() const {
diff --git a/libtiledbsoma/src/soma/soma_sparse_ndarray.h b/libtiledbsoma/src/soma/soma_sparse_ndarray.h
index 6aaf2eb10b..749418b597 100644
--- a/libtiledbsoma/src/soma/soma_sparse_ndarray.h
+++ b/libtiledbsoma/src/soma/soma_sparse_ndarray.h
@@ -33,7 +33,6 @@
 #ifndef SOMA_SPARSE_NDARRAY
 #define SOMA_SPARSE_NDARRAY
 
-#include <tiledb/tiledb>
 #include "enums.h"
 #include "soma_array.h"
 #include "soma_object.h"
@@ -200,11 +199,18 @@ class SOMASparseNDArray : public SOMAObject {
     const std::string uri() const;
 
     /**
-     * Return data schema, in the form of a TileDB ArraySchema.
+     * Return the data schema, in the form of an ArrowSchema.
+     *
+     * @return std::unique_ptr<ArrowSchema>
+     */
+    std::unique_ptr<ArrowSchema> schema() const;
+
+    /**
+     * Return the data schema, in the form of a TileDB ArraySchema.
     *
     * @return std::shared_ptr<ArraySchema>
     */
-    std::shared_ptr<ArraySchema> schema() const;
+    std::shared_ptr<ArraySchema> tiledb_schema() const;
 
    /**
     * @brief Get the capacity of each dimension.
diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc
index 93058428b8..4739bd0ed2 100644
--- a/libtiledbsoma/src/utils/arrow_adapter.cc
+++ b/libtiledbsoma/src/utils/arrow_adapter.cc
@@ -101,6 +101,68 @@ void ArrowAdapter::release_array(struct ArrowArray* array) {
     array->release = nullptr;
 }
 
+std::unique_ptr<ArrowSchema> ArrowAdapter::arrow_schema_from_tiledb_array(
+    std::shared_ptr<Context> ctx, std::shared_ptr<Array> tiledb_array) {
+    auto tiledb_schema = tiledb_array->schema();
+    auto ndim = tiledb_schema.domain().ndim();
+    auto nattr = tiledb_schema.attribute_num();
+
+    std::unique_ptr<ArrowSchema> arrow_schema = std::make_unique<ArrowSchema>();
+    arrow_schema->format = "+s";
+    arrow_schema->n_children = ndim + nattr;
+    arrow_schema->release = &ArrowAdapter::release_schema;
+    arrow_schema->children = new ArrowSchema*[arrow_schema->n_children];
+
+    ArrowSchema* child = nullptr;
+
+    for (uint32_t i = 0; i < ndim; ++i) {
+        auto dim = tiledb_schema.domain().dimension(i);
+        child = arrow_schema->children[i] = new ArrowSchema;
+        child->format = ArrowAdapter::to_arrow_format(dim.type()).data();
+        child->name = strdup(dim.name().c_str());
+        child->metadata = nullptr;
+        child->flags = 0;
+        child->n_children = 0;
+        child->dictionary = nullptr;
+        child->children = nullptr;
+        child->release = &ArrowAdapter::release_schema;
+    }
+
+    for (uint32_t i = 0; i < nattr; ++i) {
+        auto attr = tiledb_schema.attribute(i);
+        child = arrow_schema->children[ndim + i] = new ArrowSchema;
+        child->format = ArrowAdapter::to_arrow_format(attr.type()).data();
+        child->name = strdup(attr.name().c_str());
+        child->metadata = nullptr;
+        child->flags = attr.nullable() ? ARROW_FLAG_NULLABLE : 0;
+        child->n_children = 0;
+        child->children = nullptr;
+        child->dictionary = nullptr;
+
+        auto enmr_name = AttributeExperimental::get_enumeration_name(
+            *ctx, attr);
+        if (enmr_name.has_value()) {
+            auto enmr = ArrayExperimental::get_enumeration(
+                *ctx, *tiledb_array, attr.name());
+            auto dict = new ArrowSchema;
+            dict->format = strdup(
+                ArrowAdapter::to_arrow_format(enmr.type(), false).data());
+            dict->name = strdup(enmr.name().c_str());
+            dict->metadata = nullptr;
+            dict->flags = 0;
+            dict->n_children = 0;
+            dict->children = nullptr;
+            dict->dictionary = nullptr;
+            dict->release = &ArrowAdapter::release_schema;
+            dict->private_data = nullptr;
+            child->dictionary = dict;
+        }
+        child->release = &ArrowAdapter::release_schema;
+    }
+
+    return arrow_schema;
+}
+
 std::pair<const void*, std::size_t> ArrowAdapter::_get_data_and_length(
     Enumeration& enmr, const void* dst) {
     switch (enmr.type()) {
@@ -117,7 +179,7 @@ std::pair<const void*, std::size_t> ArrowAdapter::_get_data_and_length(
 
             // Allocate a single byte to copy the bits into
             size_t sz = 1;
-            dst = (const void*)malloc(sz);
+            dst = new const void*[sz];
             std::memcpy((void*)dst, &src, sz);
 
             return std::pair(dst, data.size());
@@ -196,12 +258,15 @@ ArrowAdapter::to_arrow(std::shared_ptr<ColumnBuffer> column) {
     int n_buffers = column->is_var() ? 3 : 2;
 
     // Create an ArrowBuffer to manage the lifetime of `column`.
-    // - `arrow_buffer` holds a shared_ptr to `column`, which increments
+    // - `arrow_buffer` holds a shared_ptr to `column`, which
+    // increments
     //   the use count and keeps the ColumnBuffer data alive.
-    // - When the arrow array is released, `array->release()` is called with
-    //   `arrow_buffer` in `private_data`. `arrow_buffer` is deleted, which
-    //   decrements the the `column` use count. When the `column` use count
-    //   reaches 0, the ColumnBuffer data will be deleted.
+    // - When the arrow array is released, `array->release()` is
+    // called with
+    //   `arrow_buffer` in `private_data`. `arrow_buffer` is
+    //   deleted, which decrements the `column` use count. When
+    //   the `column` use count reaches 0, the ColumnBuffer data
+    //   will be deleted.
     auto arrow_buffer = new ArrowBuffer(column);
 
     array->length = column->size();
@@ -220,7 +285,7 @@ ArrowAdapter::to_arrow(std::shared_ptr<ColumnBuffer> column) {
             column->name(),
             column.use_count()));
 
-    array->buffers = (const void**)malloc(sizeof(void*) * n_buffers);
+    array->buffers = new const void*[n_buffers];
     assert(array->buffers != nullptr);
     array->buffers[0] = nullptr;  // validity
     array->buffers[n_buffers - 1] = column->data().data();  // data
@@ -244,18 +309,18 @@ ArrowAdapter::to_arrow(std::shared_ptr<ColumnBuffer> column) {
         schema->flags |= ARROW_FLAG_DICTIONARY_ORDERED;
     }
 
-    /* Workaround to cast TILEDB_BOOL from uint8 to 1-bit Arrow boolean. */
+    // Workaround to cast TILEDB_BOOL from uint8 to 1-bit Arrow boolean
     if (column->type() == TILEDB_BOOL) {
         column->data_to_bitmap();
     }
 
     if (column->has_enumeration()) {
-        ArrowSchema* dict_sch = new ArrowSchema;
-        ArrowArray* dict_arr = new ArrowArray;
+        auto dict_sch = new ArrowSchema;
+        auto dict_arr = new ArrowArray;
 
         auto enmr = column->get_enumeration_info();
         dict_sch->format = strdup(to_arrow_format(enmr->type(), false).data());
-        dict_sch->name = strdup(enmr->name().c_str());
+        dict_sch->name = nullptr;
         dict_sch->metadata = nullptr;
         dict_sch->flags = 0;
         dict_sch->n_children = 0;
@@ -275,18 +340,19 @@ ArrowAdapter::to_arrow(std::shared_ptr<ColumnBuffer> column) {
         dict_arr->release = &release_array;
         dict_arr->private_data = nullptr;
 
-        dict_arr->buffers = (const void**)malloc(sizeof(void*) * n_buf);
+        dict_arr->buffers = new const void*[n_buf];
         dict_arr->buffers[0] = nullptr;  // validity: none here
 
-        // TODO string types currently get the data and offset buffers from
-        // ColumnBuffer::enum_offsets and ColumnBuffer::enum_string which is
-        // retrieved via ColumnBuffer::convert_enumeration. This may be
-        // refactored to all use ColumnBuffer::get_enumeration_info. Note
-        // that ColumnBuffer::has_enumeration may also be removed in a
-        // future refactor as ColumnBuffer::get_enumeration_info returns
-        // std::optional where std::nullopt indicates the column does not
-        // contain enumerated values.
-        if (enmr->type() == TILEDB_STRING_ASCII ||
+        // TODO string types currently get the data and offset
+        // buffers from ColumnBuffer::enum_offsets and
+        // ColumnBuffer::enum_string which is retrieved via
+        // ColumnBuffer::convert_enumeration. This may be refactored
+        // to all use ColumnBuffer::get_enumeration_info. Note that
+        // ColumnBuffer::has_enumeration may also be removed in a
+        // future refactor as ColumnBuffer::get_enumeration_info
+        // returns std::optional where std::nullopt indicates the
+        // column does not contain enumerated values.
+        if (enmr->type() == TILEDB_STRING_ASCII or
             enmr->type() == TILEDB_STRING_UTF8) {
             auto dict_vec = enmr->as_vector<std::string>();
             column->convert_enumeration();
@@ -294,7 +360,7 @@ ArrowAdapter::to_arrow(std::shared_ptr<ColumnBuffer> column) {
             dict_arr->buffers[2] = column->enum_string().data();
             dict_arr->length = dict_vec.size();
         } else {
-            auto [dict_data, dict_length] = ArrowAdapter::_get_data_and_length(
+            auto [dict_data, dict_length] = _get_data_and_length(
                 *enmr, dict_arr->buffers[1]);
             dict_arr->buffers[1] = dict_data;
             dict_arr->length = dict_length;
@@ -312,12 +378,12 @@ std::string_view ArrowAdapter::to_arrow_format(
     switch (datatype) {
         case TILEDB_STRING_ASCII:
         case TILEDB_STRING_UTF8:
-            return use_large ? "U" :
-                               "u";  // large because TileDB uses 64bit offsets
+            return use_large ? "U" : "u";  // large because TileDB
+                                           // uses 64bit offsets
         case TILEDB_CHAR:
         case TILEDB_BLOB:
-            return use_large ? "Z" :
-                               "z";  // large because TileDB uses 64bit offsets
+            return use_large ? "Z" : "z";  // large because TileDB
+                                           // uses 64bit offsets
         case TILEDB_BOOL:
             return "b";
         case TILEDB_INT32:
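For reviewers of the malloc-to-new changes above, a sketch of the ownership contract (not in the patch): the Arrow C data interface frees everything through the release callbacks, which is what keeps the ColumnBuffer pinned until the consumer is done:

    #include "arrow_adapter.h"
    using namespace tiledbsoma;

    // `column` is assumed to be a populated ColumnBuffer from a read.
    void export_column(std::shared_ptr<ColumnBuffer> column) {
        auto [array, schema] = ArrowAdapter::to_arrow(column);
        // ... hand *array / *schema to any Arrow C-data-interface consumer ...
        array->release(array.get());    // deletes the ArrowBuffer, dropping
                                        // the shared_ptr that pins `column`
        schema->release(schema.get());
    }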
diff --git a/libtiledbsoma/src/utils/arrow_adapter.h b/libtiledbsoma/src/utils/arrow_adapter.h
index fd33bff882..a210aca77c 100644
--- a/libtiledbsoma/src/utils/arrow_adapter.h
+++ b/libtiledbsoma/src/utils/arrow_adapter.h
@@ -46,6 +46,9 @@ class ArrowAdapter {
     static std::pair<std::unique_ptr<ArrowArray>, std::unique_ptr<ArrowSchema>>
     to_arrow(std::shared_ptr<ColumnBuffer> column);
 
+    static std::unique_ptr<ArrowSchema> arrow_schema_from_tiledb_array(
+        std::shared_ptr<Context> ctx, std::shared_ptr<Array> tiledb_array);
+
     /**
      * @brief Get Arrow format string from TileDB datatype.
     *
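A sketch (not in the patch) of consuming the schema returned by the new adapter entry point; the context and opened tiledb::Array are assumed to come from the caller:

    #include "arrow_adapter.h"
    using namespace tiledbsoma;

    void inspect(
        std::shared_ptr<tiledb::Context> ctx,
        std::shared_ptr<tiledb::Array> arr) {
        auto sch = ArrowAdapter::arrow_schema_from_tiledb_array(ctx, arr);
        // Children are laid out dimensions-first, then attributes.
        for (int64_t i = 0; i < sch->n_children; ++i) {
            ArrowSchema* child = sch->children[i];
            if (child->dictionary != nullptr) {
                // Enumerated attribute: child->format is the index type,
                // child->dictionary->format the value type.
            }
        }
        sch->release(sch.get());  // recursively frees the children
    }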
diff --git a/libtiledbsoma/test/unit_soma_array.cc b/libtiledbsoma/test/unit_soma_array.cc
index 112669e4fc..1e6b2bb0c3 100644
--- a/libtiledbsoma/test/unit_soma_array.cc
+++ b/libtiledbsoma/test/unit_soma_array.cc
@@ -127,10 +127,6 @@ std::tuple<std::vector<int64_t>, std::vector<int32_t>> write_array(
         ResultOrder::automatic,
         std::pair<uint64_t, uint64_t>(timestamp + i, timestamp + i));
 
-    if (LOG_DEBUG_ENABLED()) {
-        soma_array->schema()->dump();
-    }
-
     std::vector<int64_t> d0(num_cells_per_fragment);
     for (int j = 0; j < num_cells_per_fragment; j++) {
         // Overlap odd fragments when generating overlaps
diff --git a/libtiledbsoma/test/unit_soma_collection.cc b/libtiledbsoma/test/unit_soma_collection.cc
index 1e79a41c7a..4066461403 100644
--- a/libtiledbsoma/test/unit_soma_collection.cc
+++ b/libtiledbsoma/test/unit_soma_collection.cc
@@ -107,8 +107,6 @@ TEST_CASE("SOMACollection: add SOMASparseNDArray") {
     REQUIRE(soma_sparse->ctx() == ctx);
     REQUIRE(soma_sparse->type() == "SOMASparseNDArray");
     REQUIRE(soma_sparse->is_sparse() == true);
-    REQUIRE(soma_sparse->schema()->has_attribute("a0"));
-    REQUIRE(soma_sparse->schema()->domain().has_dimension("d0"));
     REQUIRE(soma_sparse->ndim() == 1);
     REQUIRE(soma_sparse->nnz() == 0);
     soma_sparse->close();
@@ -137,8 +135,6 @@ TEST_CASE("SOMACollection: add SOMADenseNDArray") {
     REQUIRE(soma_dense->ctx() == ctx);
     REQUIRE(soma_dense->type() == "SOMADenseNDArray");
     REQUIRE(soma_dense->is_sparse() == false);
-    REQUIRE(soma_dense->schema()->has_attribute("a0"));
-    REQUIRE(soma_dense->schema()->domain().has_dimension("d0"));
     REQUIRE(soma_dense->ndim() == 1);
     REQUIRE(soma_dense->shape() == std::vector<int64_t>{1001});
     soma_collection->close();
@@ -154,7 +150,7 @@ TEST_CASE("SOMACollection: add SOMADataFrame") {
     std::string sub_uri = "mem://unit-test-add-dataframe/sub";
 
     SOMACollection::create(base_uri, ctx);
-    auto schema = create_schema(*ctx, false);
+    auto schema = create_schema(*ctx, true);
 
     std::map<std::string, std::string> expected_map{{"dataframe", sub_uri}};
 
@@ -165,16 +161,14 @@ TEST_CASE("SOMACollection: add SOMADataFrame") {
     REQUIRE(soma_dataframe->uri() == sub_uri);
     REQUIRE(soma_dataframe->ctx() == ctx);
     REQUIRE(soma_dataframe->type() == "SOMADataFrame");
-    REQUIRE(soma_dataframe->schema()->has_attribute("a0"));
-    REQUIRE(soma_dataframe->schema()->domain().has_dimension("d0"));
     std::vector<std::string> expected_index_column_names = {"d0"};
     REQUIRE(
         soma_dataframe->index_column_names() == expected_index_column_names);
-    REQUIRE(soma_dataframe->count() == 1);
     soma_collection->close();
 
     soma_collection = SOMACollection::open(base_uri, OpenMode::read, ctx);
     REQUIRE(soma_collection->member_to_uri_mapping() == expected_map);
+    REQUIRE(soma_dataframe->count() == 0);
     soma_collection->close();
 }
diff --git a/libtiledbsoma/test/unit_soma_dataframe.cc b/libtiledbsoma/test/unit_soma_dataframe.cc
index e54f34a805..174e2db124 100644
--- a/libtiledbsoma/test/unit_soma_dataframe.cc
+++ b/libtiledbsoma/test/unit_soma_dataframe.cc
@@ -59,7 +59,7 @@ const std::string src_path = TILEDBSOMA_SOURCE_ROOT;
 namespace {
 ArraySchema create_schema(Context& ctx, bool allow_duplicates = false) {
     // Create schema
-    ArraySchema schema(ctx, TILEDB_DENSE);
+    ArraySchema schema(ctx, TILEDB_SPARSE);
 
     auto dim = Dimension::create<int64_t>(ctx, "d0", {0, 1000});
@@ -86,16 +86,15 @@ TEST_CASE("SOMADataFrame: basic") {
     REQUIRE(soma_dataframe->uri() == uri);
     REQUIRE(soma_dataframe->ctx() == ctx);
     REQUIRE(soma_dataframe->type() == "SOMADataFrame");
-    auto schema = soma_dataframe->schema();
-    REQUIRE(schema->has_attribute("a0"));
-    REQUIRE(schema->domain().has_dimension("d0"));
     std::vector<std::string> expected_index_column_names = {"d0"};
     REQUIRE(
         soma_dataframe->index_column_names() == expected_index_column_names);
-    REQUIRE(soma_dataframe->count() == 1);
+    REQUIRE(soma_dataframe->count() == 0);
     soma_dataframe->close();
 
-    std::vector<int64_t> d0{1, 10};
+    std::vector<int64_t> d0(10);
+    for (int j = 0; j < 10; j++)
+        d0[j] = j;
     std::vector<int> a0(10, 1);
 
     auto array_buffer = std::make_shared<ArrayBuffers>();
@@ -103,21 +102,24 @@ TEST_CASE("SOMADataFrame: basic") {
     array_buffer->emplace("a0", ColumnBuffer::create(tdb_arr, "a0", a0));
     array_buffer->emplace("d0", ColumnBuffer::create(tdb_arr, "d0", d0));
 
-    soma_dataframe->open(OpenMode::write);
+    soma_dataframe = SOMADataFrame::open(uri, OpenMode::write, ctx);
     soma_dataframe->write(array_buffer);
     soma_dataframe->close();
 
-    soma_dataframe->open(OpenMode::read);
+    soma_dataframe = SOMADataFrame::open(uri, OpenMode::read, ctx);
     while (auto batch = soma_dataframe->read_next()) {
         auto arrbuf = batch.value();
         auto d0span = arrbuf->at("d0")->data<int64_t>();
         auto a0span = arrbuf->at("a0")->data<int>();
-        REQUIRE(
-            std::vector<int64_t>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10} ==
-            std::vector<int64_t>(d0span.begin(), d0span.end()));
+        REQUIRE(d0 == std::vector<int64_t>(d0span.begin(), d0span.end()));
         REQUIRE(a0 == std::vector<int>(a0span.begin(), a0span.end()));
     }
     soma_dataframe->close();
+
+    auto soma_object = SOMAObject::open(uri, OpenMode::read, ctx);
+    REQUIRE(soma_object->uri() == uri);
+    REQUIRE(soma_object->type() == "SOMADataFrame");
+    soma_object->close();
 }
 
 TEST_CASE("SOMADataFrame: metadata") {
diff --git a/libtiledbsoma/test/unit_soma_dense_ndarray.cc b/libtiledbsoma/test/unit_soma_dense_ndarray.cc
index 55f15f54f7..5770d91b54 100644
--- a/libtiledbsoma/test/unit_soma_dense_ndarray.cc
+++ b/libtiledbsoma/test/unit_soma_dense_ndarray.cc
@@ -87,7 +87,7 @@ TEST_CASE("SOMADenseNDArray: basic") {
     REQUIRE(soma_dense->ctx() == ctx);
     REQUIRE(soma_dense->type() == "SOMADenseNDArray");
     REQUIRE(soma_dense->is_sparse() == false);
-    auto schema = soma_dense->schema();
+    auto schema = soma_dense->tiledb_schema();
     REQUIRE(schema->has_attribute("a0"));
     REQUIRE(schema->domain().has_dimension("d0"));
     REQUIRE(soma_dense->ndim() == 1);
diff --git a/libtiledbsoma/test/unit_soma_sparse_ndarray.cc b/libtiledbsoma/test/unit_soma_sparse_ndarray.cc
index d4a418ef77..6b848ade19 100644
--- a/libtiledbsoma/test/unit_soma_sparse_ndarray.cc
+++ b/libtiledbsoma/test/unit_soma_sparse_ndarray.cc
@@ -87,7 +87,7 @@ TEST_CASE("SOMASparseNDArray: basic") {
     REQUIRE(soma_sparse->ctx() == ctx);
     REQUIRE(soma_sparse->type() == "SOMASparseNDArray");
     REQUIRE(soma_sparse->is_sparse() == true);
-    auto schema = soma_sparse->schema();
+    auto schema = soma_sparse->tiledb_schema();
     REQUIRE(schema->has_attribute("a0"));
     REQUIRE(schema->domain().has_dimension("d0"));
     REQUIRE(soma_sparse->ndim() == 1);
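A test the patch does not include, sketched here as a possible follow-up. It assumes the create() helper signature used elsewhere in this file, and uses a filesystem URI because exists() opens with a fresh default Context, so a mem:// array created under another context would not be visible to it:

    TEST_CASE("SOMADataFrame: exists") {
        auto ctx = std::make_shared<Context>();
        // Hypothetical temp path; see note above on mem:// URIs.
        std::string uri = "/tmp/unit-test-dataframe-exists";
        SOMADataFrame::create(uri, create_schema(*ctx, true), ctx);
        // exists() opens the array and checks its soma_object_type metadata.
        REQUIRE(SOMADataFrame::exists(uri));
    }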