diff --git a/apis/python/src/tiledbsoma/io/_common.py b/apis/python/src/tiledbsoma/io/_common.py index dc33136b42..41e0c7053a 100644 --- a/apis/python/src/tiledbsoma/io/_common.py +++ b/apis/python/src/tiledbsoma/io/_common.py @@ -4,19 +4,31 @@ # Licensed under the MIT License. """Common constants and types used during ingestion/outgestion.""" - -from typing import Any, Mapping, Union +from typing import Dict, List, Mapping, Optional, Union import h5py +import numpy as np +import pandas as pd import scipy.sparse as sp from anndata._core.sparse_dataset import SparseDataset -from tiledbsoma._types import NPNDArray +from tiledbsoma._types import Metadatum, NPNDArray SparseMatrix = Union[sp.csr_matrix, sp.csc_matrix, SparseDataset] DenseMatrix = Union[NPNDArray, h5py.Dataset] Matrix = Union[DenseMatrix, SparseMatrix] -UnsMapping = Mapping[str, Any] + +UnsScalar = Union[str, int, float, np.generic] +# TODO: support sparse matrices in `uns` +UnsLeaf = Union[UnsScalar, List[UnsScalar], pd.DataFrame, NPNDArray] +UnsNode = Union[UnsLeaf, Mapping[str, "UnsNode"]] +UnsMapping = Mapping[str, UnsNode] +# Specialize `UnsNode` to `Dict` instead of `Mapping` +# `Mapping` doesn't expose `__set__`, so this is useful for building `uns` dictionaries. +UnsDictNode = Union[UnsLeaf, Dict[str, "UnsDictNode"]] +UnsDict = Dict[str, UnsDictNode] + +AdditionalMetadata = Optional[Dict[str, Metadatum]] # Arrays of strings from AnnData's uns are stored in SOMA as SOMADataFrame, # since SOMA ND arrays are necessarily arrays *of numbers*. This is okay since diff --git a/apis/python/src/tiledbsoma/io/ingest.py b/apis/python/src/tiledbsoma/io/ingest.py index 600bad859b..90f2c6e40f 100644 --- a/apis/python/src/tiledbsoma/io/ingest.py +++ b/apis/python/src/tiledbsoma/io/ingest.py @@ -70,7 +70,6 @@ _INGEST_MODES, INGEST_MODES, IngestMode, - Metadatum, NPNDArray, Path, _IngestMode, @@ -89,9 +88,11 @@ _UNS_OUTGEST_HINT_1D, _UNS_OUTGEST_HINT_2D, _UNS_OUTGEST_HINT_KEY, + AdditionalMetadata, Matrix, SparseMatrix, UnsMapping, + UnsNode, ) from ._registration import ( AxisIDMapping, @@ -107,9 +108,6 @@ _TDBO = TypeVar("_TDBO", bound=SOMAObject[RawHandle]) -AdditionalMetadata = Optional[Dict[str, Metadatum]] - - def add_metadata(obj: SOMAObject[Any], additional_metadata: AdditionalMetadata) -> None: if additional_metadata: obj.verify_open_for_writing() @@ -1184,6 +1182,10 @@ def _write_dataframe_impl( platform_config: Optional[PlatformConfig] = None, context: Optional[SOMATileDBContext] = None, ) -> DataFrame: + """Save a Pandas DataFrame as a SOMA DataFrame. + + Expects the required ``soma_joinid`` index to have already been added to the ``pd.DataFrame``. + """ s = _util.get_start_stamp() logging.log_io(None, f"START WRITING {df_uri}") @@ -2485,7 +2487,7 @@ def _maybe_ingest_uns( def _ingest_uns_dict( parent: AnyTileDBCollection, parent_key: str, - dct: Mapping[str, object], + dct: UnsMapping, *, platform_config: Optional[PlatformConfig], context: Optional[SOMATileDBContext], @@ -2524,9 +2526,9 @@ def _ingest_uns_dict( def _ingest_uns_node( - coll: Any, - key: Any, - value: Any, + coll: AnyTileDBCollection, + key: str, + value: UnsNode, *, platform_config: Optional[PlatformConfig], context: Optional[SOMATileDBContext], @@ -2579,30 +2581,13 @@ def _ingest_uns_node( if isinstance(value, list) or "numpy" in str(type(value)): value = np.asarray(value) if isinstance(value, np.ndarray): - if value.dtype.names is not None: - msg = f"Skipped {coll.uri}[{key!r}]" " (uns): unsupported structured array" - # This is a structured array, which we do not support. - logging.log_io(msg, msg) - return - - if value.dtype.char in ("U", "O"): - # In the wild it's quite common to see arrays of strings in uns data. - # Frequent example: uns["louvain_colors"]. - _ingest_uns_string_array( - coll, - key, - value, - use_relative_uri=use_relative_uri, - **ingest_platform_ctx, - ) - else: - _ingest_uns_ndarray( - coll, - key, - value, - use_relative_uri=use_relative_uri, - **ingest_platform_ctx, - ) + _ingest_uns_array( + coll, + key, + value, + use_relative_uri=use_relative_uri, + ingest_platform_ctx=ingest_platform_ctx, + ) return msg = ( @@ -2611,6 +2596,44 @@ def _ingest_uns_node( logging.log_io(msg, msg) +def _ingest_uns_array( + coll: AnyTileDBCollection, + key: str, + value: NPNDArray, + use_relative_uri: Optional[bool], + ingest_platform_ctx: IngestPlatformCtx, +) -> None: + """Ingest an uns Numpy array. + + Delegates to :func:`_ingest_uns_string_array` for string arrays, and to + :func:`_ingest_uns_ndarray` for numeric arrays. + """ + if value.dtype.names is not None: + # This is a structured array, which we do not support. + logging.log_io_same( + f"Skipped {coll.uri}[{key!r}]" " (uns): unsupported structured array" + ) + + if value.dtype.char in ("U", "O"): + # In the wild it's quite common to see arrays of strings in uns data. + # Frequent example: uns["louvain_colors"]. + _ingest_uns_string_array( + coll, + key, + value, + use_relative_uri=use_relative_uri, + **ingest_platform_ctx, + ) + else: + _ingest_uns_ndarray( + coll, + key, + value, + use_relative_uri=use_relative_uri, + **ingest_platform_ctx, + ) + + def _ingest_uns_string_array( coll: AnyTileDBCollection, key: str, diff --git a/apis/python/src/tiledbsoma/io/outgest.py b/apis/python/src/tiledbsoma/io/outgest.py index 6a649ceac4..510c28a79f 100644 --- a/apis/python/src/tiledbsoma/io/outgest.py +++ b/apis/python/src/tiledbsoma/io/outgest.py @@ -46,6 +46,7 @@ _UNS_OUTGEST_HINT_2D, _UNS_OUTGEST_HINT_KEY, Matrix, + UnsDict, UnsMapping, ) @@ -432,12 +433,12 @@ def _extract_uns( collection: Collection[Any], uns_keys: Optional[Sequence[str]] = None, level: int = 0, -) -> UnsMapping: +) -> UnsDict: """ This is a helper function for ``to_anndata`` of ``uns`` elements. """ - extracted: Dict[str, Any] = {} + extracted: UnsDict = {} for key, element in collection.items(): if level == 0 and uns_keys is not None and key not in uns_keys: continue diff --git a/apis/python/src/tiledbsoma/io/update_uns.py b/apis/python/src/tiledbsoma/io/update_uns.py new file mode 100644 index 0000000000..f16b73682a --- /dev/null +++ b/apis/python/src/tiledbsoma/io/update_uns.py @@ -0,0 +1,225 @@ +from os.path import join +from typing import Literal, Optional + +import numpy as np +import pandas as pd +from somacore.options import PlatformConfig + +from tiledbsoma import Experiment, SOMATileDBContext +from tiledbsoma._collection import AnyTileDBCollection, Collection +from tiledbsoma.io._common import AdditionalMetadata, UnsMapping +from tiledbsoma.io._registration import AxisIDMapping +from tiledbsoma.io.ingest import ( + IngestionParams, + IngestPlatformCtx, + _ingest_uns_array, + _maybe_set, + _write_dataframe, +) +from tiledbsoma.logging import logger + +Strict = Literal[True, "raise", "warn", "info", "debug", "dry_run"] + + +def update_uns_by_uri( + uri: str, + uns: UnsMapping, + measurement_name: str, + *, + use_relative_uri: Optional[bool] = None, + context: Optional[SOMATileDBContext] = None, + additional_metadata: AdditionalMetadata = None, + platform_config: Optional[PlatformConfig] = None, + default_index_name: Optional[str] = None, + strict: Strict = True, +) -> None: + """Wrapper around :func:`_update_uns` that opens the experiment at the given URI. + + Some "update uns" operations, that we may want to support in the future, will require opening + the ``Experiment`` more than once (e.g. to modify a DataFrame or array in ways that can't be + achieved "in-place" / at one TileDB timestamp). This function API is a sketch of what that + might look like, though it just calls :func:`update_uns` for now (overwriting DataFrames and + arrays is currently not supported, and will either ``raise`` or "INFO" log, based on the value + of the ``strict`` arg). + """ + with Experiment.open(uri, "w") as exp: + _update_uns( + exp, + uns, + measurement_name, + use_relative_uri=use_relative_uri, + context=context, + additional_metadata=additional_metadata, + platform_config=platform_config, + default_index_name=default_index_name, + strict=strict, + ) + + +def _update_uns( + exp: Experiment, + uns: UnsMapping, + measurement_name: str, + *, + use_relative_uri: Optional[bool] = None, + context: Optional[SOMATileDBContext] = None, + additional_metadata: AdditionalMetadata = None, + platform_config: Optional[PlatformConfig] = None, + default_index_name: Optional[str] = None, + strict: Strict = True, +) -> None: + """ + Update the given experiment/measurement's ``uns`` Collection. + + ``uns`` (short for unstructured data) is conceptually a dictionary; values can be scalars, + DataFrames, arrays (dense or sparse), or recursively-nested dictionaries. + + This function makes a best effort at changing the existing ``uns`` Collection to match the + provided ``uns`` dictionary. It refuses to update DataFrame and array nodes, with errors + either raised or logged, based on the ``strict`` argument. + + Args: + exp: The :class:`SOMAExperiment` whose ``uns`` is to be updated. Must + be opened for write. + + measurement_name: Specifies which measurement's ``uns`` within the experiment + is to be updated. + + uns: a Pandas dataframe with the desired contents. + + use_relative_uri: If True, store the URI relative to the experiment's URI. + + context: Optional :class:`SOMATileDBContext` containing storage parameters, etc. + + additional_metadata: Additional metadata to be added to the collection. + + platform_config: Platform-specific options used to update this array, provided + in the form ``{"tiledb": {"create": {"dataframe_dim_zstd_level": 7}}}`` + + default_index_name: Fallback name to use for columns representing ``pd.DataFrame`` indices. + + strict: How to handle conflicts with existing nodes. By default (``strict=True``), a first + pass checks for conflicts, ``raise``ing if any are found. If none are found, a second pass + then performs the updates. This is equivalent to running once with ``strict="dry_run"``, + then again with ``dry_run="raise"`` (though the expectation is the latter will never + actually ``raise``). If ``strict={debug,info,warn}`` instead, conflicting nodes are logged + at the corresponding level, and the update is skipped. + """ + if measurement_name not in exp.ms: + raise ValueError( + f"cannot find measurement name {measurement_name} within experiment at {exp.uri}" + ) + + ingest_platform_ctx = IngestPlatformCtx( + context=context, + ingestion_params=IngestionParams("write", label_mapping=None), + additional_metadata=additional_metadata, + platform_config=platform_config, + ) + if strict is True: + # Surface any errors (e.g. unsupported SOMA object overwrites) without writing anything + _update_uns_dict( + exp.ms[measurement_name]["uns"], # type: ignore[arg-type] + uns, + use_relative_uri=use_relative_uri, + ingest_platform_ctx=ingest_platform_ctx, + default_index_name=default_index_name, + strict="dry_run", + ) + strict = "raise" + _update_uns_dict( + exp.ms[measurement_name]["uns"], # type: ignore[arg-type] + uns, + use_relative_uri=use_relative_uri, + ingest_platform_ctx=ingest_platform_ctx, + default_index_name=default_index_name, + strict=strict, + ) + + +def _update_uns_dict( + coll: AnyTileDBCollection, + uns: UnsMapping, + *, + ingest_platform_ctx: IngestPlatformCtx, + use_relative_uri: Optional[bool], + default_index_name: Optional[str], + strict: Strict, +) -> None: + for k, v in uns.items(): + # Any existing scalar will be found in metadata, not as a child of the Collection. + cur = None + if k in coll: + cur = coll[k] + if k in coll.metadata: + if cur is not None: + logger.warn(f"{coll.uri}[{k}] exists as both metadata and child") + else: + cur = coll.metadata[k] + exists = cur is not None + + def can_write() -> bool: + if exists: + msg = f"{coll.uri}[{k}]: already exists (type {type(cur).__name__}), refusing to overwrite with {v}" + if strict in ["dry_run", "raise"]: + raise ValueError(msg) + else: + msg = f"Skipping {msg}" + if strict == "warn": + logger.warn(msg) + elif strict == "info": + logger.info(msg) + elif strict == "debug": + logger.debug(msg) + return False + else: + return strict != "dry_run" + + if isinstance(v, (str, int, float, np.generic)): + if k in coll: + raise ValueError( + f"can't overwrite {type(cur).__name__} at {coll.uri}/{k} with scalar {v}" + ) + if isinstance(v, np.generic): + # Unwrap numpy scalar + v = v.item() + if strict != "dry_run": + coll.metadata[k] = v + elif isinstance(v, pd.DataFrame): + if can_write(): + with _write_dataframe( + df_uri=join(coll.uri, k), + df=v.copy(), # `_write_dataframe` modifies the `pd.DataFrame` it's passed + id_column_name=default_index_name, + ingestion_params=ingest_platform_ctx["ingestion_params"], + additional_metadata=ingest_platform_ctx["additional_metadata"], + platform_config=ingest_platform_ctx["platform_config"], + context=ingest_platform_ctx["context"], + axis_mapping=AxisIDMapping.identity(v.shape[0]), + ) as df: + _maybe_set(coll, k, df, use_relative_uri=use_relative_uri) + elif isinstance(v, dict): + if exists: + if not isinstance(cur, Collection): + raise ValueError( + f"{coll.uri}/{k}: expected Collection, found {type(cur).__name__}" + ) + _update_uns_dict( + coll[k], + v, + use_relative_uri=use_relative_uri, + ingest_platform_ctx=ingest_platform_ctx, + default_index_name=default_index_name, + strict=strict, + ) + elif isinstance(v, np.ndarray): + if can_write(): + _ingest_uns_array( + coll, + k, + v, + use_relative_uri=use_relative_uri, + ingest_platform_ctx=ingest_platform_ctx, + ) + else: + raise ValueError(f"unsupported uns type {type(v)}") diff --git a/apis/python/tests/test_basic_anndata_io.py b/apis/python/tests/test_basic_anndata_io.py index 42de74644d..b110bdc0e0 100644 --- a/apis/python/tests/test_basic_anndata_io.py +++ b/apis/python/tests/test_basic_anndata_io.py @@ -3,7 +3,7 @@ import tempfile from copy import deepcopy from pathlib import Path -from typing import Optional +from typing import Optional, Tuple import anndata import numpy as np @@ -11,13 +11,14 @@ import pytest import scipy import somacore +from anndata import AnnData from scipy.sparse import csr_matrix import tiledbsoma import tiledbsoma.io from tiledbsoma import Experiment, _constants, _factory from tiledbsoma._soma_object import SOMAObject -from tiledbsoma.io._common import _TILEDBSOMA_TYPE +from tiledbsoma.io._common import _TILEDBSOMA_TYPE, UnsDict, UnsMapping import tiledb from ._util import TESTDATA, assert_adata_equal, make_pd_df @@ -978,10 +979,33 @@ def test_id_names(tmp_path, obs_id_name, var_id_name, indexify_obs, indexify_var assert list(bdata.var.index) == list(soma_var[var_id_name]) -@pytest.mark.parametrize( - "outgest_uns_keys", [["int_scalar", "strings", "np_ndarray_2d"], None] -) -def test_uns_io(tmp_path, outgest_uns_keys): +TEST_UNS: UnsDict = { + # Scalars are stored in SOMA as metadata + "int_scalar": 7, + "float_scalar": 8.5, + "string_scalar": "hello", + # pd.DataFrames are stored in SOMA as `DataFrame`s + "pd_df_indexed": make_pd_df("0,1,2", column_1="d,e,f"), + "pd_df_nonindexed": make_pd_df(column_1="g,h,i"), + # `np.ndarray`s are stored in SOMA as `DenseArray`s + "np_ndarray_1d": np.asarray([1, 2, 3]), + "np_ndarray_2d": np.asarray([[1, 2, 3], [4, 5, 6]]), + # Nested dicts are stored in SOMA as `SOMACollection`s + "strings": { + # `np.ndarray`s of strings are stored in SOMA as `DataFrame`s, since SOMA ND arrays are + # necessarily arrays *of numbers*. This is okay since the one and only job of SOMA uns is + # to faithfully ingest from AnnData and outgest back. + "string_np_ndarray_1d": np.asarray(["j", "k", "l"]), + "string_np_ndarray_2d": np.asarray([["m", "n", "o"], ["p", "q", "r"]]), + }, +} + + +def make_uns_adata( + tmp_path: Path, + measurement_name: str = "RNA", + uns: Optional[UnsMapping] = None, +) -> Tuple[str, AnnData]: obs = pd.DataFrame( data={"obs_id": np.asarray(["a", "b", "c"])}, index=np.arange(3).astype(str), @@ -992,26 +1016,9 @@ def test_uns_io(tmp_path, outgest_uns_keys): ) X = csr_matrix(np.zeros([3, 2])) - uns = { - # These are stored in SOMA as metadata - "int_scalar": 7, - "float_scalar": 8.5, - "string_scalar": "hello", - # These are stored in SOMA as SOMADataFrame - "pd_df_indexed": make_pd_df("0,1,2", column_1="d,e,f"), - "pd_df_nonindexed": make_pd_df(column_1="g,h,i"), - # These are stored in SOMA as SOMA ND arrays - "np_ndarray_1d": np.asarray([1, 2, 3]), - "np_ndarray_2d": np.asarray([[1, 2, 3], [4, 5, 6]]), - # This are stored in SOMA as a SOMACollection - "strings": { - # This are stored in SOMA as a SOMADataFrame, since SOMA ND arrays are necessarily - # arrays *of numbers*. This is okay since the one and only job of SOMA uns is to - # faithfully ingest from AnnData and outgest back. - "string_np_ndarray_1d": np.asarray(["j", "k", "l"]), - "string_np_ndarray_2d": np.asarray([["m", "n", "o"], ["p", "q", "r"]]), - }, - } + if uns is None: + uns = TEST_UNS + adata = anndata.AnnData( obs=obs, var=var, @@ -1023,9 +1030,18 @@ def test_uns_io(tmp_path, outgest_uns_keys): soma_uri = tmp_path.as_posix() - tiledbsoma.io.from_anndata(soma_uri, adata, measurement_name="RNA") + tiledbsoma.io.from_anndata(soma_uri, adata, measurement_name=measurement_name) assert_adata_equal(adata0, adata) + return soma_uri, adata + + +@pytest.mark.parametrize( + "outgest_uns_keys", [["int_scalar", "strings", "np_ndarray_2d"], None] +) +def test_uns_io(tmp_path, outgest_uns_keys): + soma_uri, adata = make_uns_adata(tmp_path) + with tiledbsoma.Experiment.open(soma_uri) as exp: adata2 = tiledbsoma.io.to_anndata( exp, @@ -1033,7 +1049,7 @@ def test_uns_io(tmp_path, outgest_uns_keys): uns_keys=outgest_uns_keys, ) - expected_adata = deepcopy(adata0) + expected_adata = deepcopy(adata) # Outgest also fails to restore `obs` and `var` correctly, in this case because the ingested # `obs`/`var` had columns named "obs_id"/"var_id", which get mistaken for "default index" diff --git a/apis/python/tests/test_dataframe.py b/apis/python/tests/test_dataframe.py index 514503f1fa..615a6ba4cd 100644 --- a/apis/python/tests/test_dataframe.py +++ b/apis/python/tests/test_dataframe.py @@ -1618,48 +1618,3 @@ def test_only_evolve_schema_when_enmr_is_extended(tmp_path): # subtract 1 for the __schema/__enumerations directory; # only looking at fragment files assert len(vfs.ls(os.path.join(uri, "__schema"))) - 1 == 3 - - -def test_timestamped_schema_evolve(tmp_path): - uri = tmp_path.as_posix() - - asch = pa.schema([("myenum", pa.dictionary(pa.int8(), pa.large_string()))]) - - # Create at time 1 - soma.DataFrame.create(uri, schema=asch, tiledb_timestamp=1).close() - - # Write at time 2 - atbl = pa.Table.from_pydict( - { - "soma_joinid": [0, 1, 2], - "myenum": pd.Series(["a", "b", "a"], dtype="category"), - } - ) - with soma.DataFrame.open(uri, "w", tiledb_timestamp=2) as sdf: - sdf.write(atbl) - - # Write at time 3 - atbl = pa.Table.from_pydict( - { - "soma_joinid": [3, 4], - "myenum": pd.Series(["b", "b"], dtype="category"), - } - ) - with soma.DataFrame.open(uri, "w", tiledb_timestamp=3) as sdf: - sdf.write(atbl) - - with soma.DataFrame.open(uri, tiledb_timestamp=1) as sdf: - table = sdf.read().concat() - assert table["myenum"].to_pylist() == [] - - with soma.DataFrame.open(uri, tiledb_timestamp=2) as sdf: - table = sdf.read().concat() - assert table["myenum"].to_pylist() == ["a", "b", "a"] - - with soma.DataFrame.open(uri, tiledb_timestamp=3) as sdf: - table = sdf.read().concat() - assert table["myenum"].to_pylist() == ["a", "b", "a", "b", "b"] - - with soma.DataFrame.open(uri) as sdf: - table = sdf.read().concat() - assert table["myenum"].to_pylist() == ["a", "b", "a", "b", "b"] diff --git a/apis/python/tests/test_update_uns.py b/apis/python/tests/test_update_uns.py new file mode 100644 index 0000000000..56ffecfe68 --- /dev/null +++ b/apis/python/tests/test_update_uns.py @@ -0,0 +1,238 @@ +import logging +from copy import deepcopy +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List, Optional, Union + +import numpy as np +from _pytest.logging import LogCaptureFixture + +import tiledbsoma +from tiledbsoma import Experiment +from tiledbsoma.io._common import UnsDict, UnsMapping +from tiledbsoma.io.update_uns import Strict, _update_uns + +from tests._util import Err, assert_uns_equal, make_pd_df, maybe_raises, verify_logs +from tests.parametrize_cases import parametrize_cases +from tests.test_basic_anndata_io import TEST_UNS, make_uns_adata + +ValidUpdates = Union[None, str, List[str], Dict[str, "ValidUpdates"]] +Logs = Optional[List[str]] + + +@dataclass +class Case: + """A test case for `update_uns`. + + :param id: Human-readable identifier for the test case. + :param uns_updates: Updates to attempt to apply (to the default ``TEST_UNS`` dict). + :param strict: How to handle conflicts with existing ``uns`` keys. Default (``True``) performs + an initial "dry run" pass to verify all updates can be applied, then a second pass actually + applying the updates. Pass ``"debug"``, ``"info"``, or ``"warn"`` to instead skip + conflicting keys, and log a message at the corresponding level. See :func:`update_uns` for + more details. + :param err: If present, verify that :func:`update_uns` raises an ``Exception`` containing this + regex. + :param valid_updates: If present, only these updates are expected to have been applied. If + ``strict=True`` (the default), all updates must be applied (``valid_updates=None``), or + no updates (``valid_updates=[]``), based on presence or absence of an expected ``err``. + :param logs: If present, verify that these log messages are present in the captured logs. Used + only when ``strict`` is a log level (``"debug"``, ``"info"``, ``"warn"``). + """ + + id: str + uns_updates: UnsMapping + strict: Strict = True + err: Optional[Err] = None + valid_updates: ValidUpdates = None + logs: Logs = None + + +def case( + id: str, + err: Optional[Err] = None, + valid_updates: ValidUpdates = None, + logs: Logs = None, + strict: Strict = True, + **uns_updates, +) -> Case: + """Helper for construction a :class:`Case`. + + ``err``s are verified to be ``ValueError``s, by default, and "kwargs" become ``uns_updates``. + """ + if isinstance(err, str): + err = (ValueError, err) + return Case( + id=id, + uns_updates=uns_updates, + err=err, + strict=strict, + valid_updates=valid_updates, + logs=logs, + ) + + +# fmt: off +@parametrize_cases([ + case( + "Update one scalar", + int_scalar=11, + ), + case( + "Update scalar, change type", + int_scalar="aaa", + ), + case( + "Update multiple scalar values, add a new DataFrame", + int_scalar=11, + float_scalar=2.2, + string_scalar="HELLO 2", + new_df=make_pd_df(a="1,2,3"), + ), + case( + "Update multiple scalar values, including inside a Collection", + int_scalar=11, + float_scalar=2.2, + string_scalar="HELLO 2", + new_df=make_pd_df(a="1,2,3"), + strings=dict( + aaa="AAA", + nnn=111, + ) + ), + case( + "Overwrite np.array inside collection (raise)", + strings=dict( + string_np_ndarray_1d=np.asarray(list("abc")), + ), + err=r"ms/RNA/uns/strings\[string_np_ndarray_1d]: already exists \(type DataFrame\), refusing to overwrite with \['a' 'b' 'c']" + ), + case( + "Overwrite np.array inside collection (skip)", + strings=dict( + string_np_ndarray_1d=np.asarray(list("abc")), + ), + strict="info", + valid_updates=[], + logs=[r"ms/RNA/uns/strings\[string_np_ndarray_1d]: already exists \(type DataFrame\), refusing to overwrite with \['a' 'b' 'c']"] + ), + case( + "No partial updates inside collection", + strings=dict( + foo="FOO", + string_np_ndarray_1d=np.asarray(list("abc")), + ), + err=r"ms/RNA/uns/strings\[string_np_ndarray_1d]: already exists \(type DataFrame\), refusing to overwrite with \['a' 'b' 'c']" + ), + case( + "Partial update inside collection", + strings=dict( + foo="FOO", + string_np_ndarray_1d=np.asarray(list("abc")), + ), + strict="info", + valid_updates={"strings": "foo"}, + logs=[r"ms/RNA/uns/strings\[string_np_ndarray_1d]: already exists \(type DataFrame\), refusing to overwrite with \['a' 'b' 'c']"] + ), + case( + "Overwrite scalar with DataFrame", + int_scalar=make_pd_df(a="1,2,3"), + err=( + r"ms/RNA/uns\[int_scalar]: already exists \(type int\), refusing to overwrite with a\n" + r"0 1\n" + r"1 2\n" + r"2 3" + ), + ), + case( + "Overwrite existing DataFrame (raise)", + int_scalar=22, + pd_df_indexed=TEST_UNS["pd_df_indexed"].copy(), + err=( + r"ms/RNA/uns\[pd_df_indexed]: already exists \(type DataFrame\), refusing to overwrite with column_1\n" + r"0 d\n" + r"1 e\n" + r"2 f" + ), + ), + case( + "Overwrite existing DataFrame (skip)", + int_scalar=22, + pd_df_indexed=TEST_UNS["pd_df_indexed"].assign(column_1=list("ghi")), + strict="info", + valid_updates=['int_scalar'], # `int_scalar` is updated, `pd_df_indexed` is not ("info" msg logged below) + logs=[ + r"ms/RNA/uns\[pd_df_indexed]: already exists \(type DataFrame\), refusing to overwrite with column_1\n" + r"0 g\n" + r"1 h\n" + r"2 i" + ], + ), +]) +# fmt: on +def test_update_uns( + caplog: LogCaptureFixture, + tmp_path: Path, + uns_updates: UnsDict, + strict: Strict, + err: Optional[Err], + valid_updates: ValidUpdates, + logs: Logs, +): + caplog.set_level(logging.INFO) + soma_uri, adata = make_uns_adata(tmp_path) + + with Experiment.open(soma_uri, "w") as exp: + with maybe_raises(err): + _update_uns(exp, uns_updates, measurement_name="RNA", strict=strict) + + verify_logs(caplog, logs) + + if err: + # In all cases, an error during `update_uns` should result in no updates being applied (nor + # should any non-empty `valid_updates` have been provided as part of the test-case "spec"). + assert valid_updates is None or valid_updates == [] + valid_updates = [] + + with Experiment.open(soma_uri) as exp: + adata2 = tiledbsoma.io.to_anndata(exp, measurement_name="RNA") + + expected = deepcopy(TEST_UNS) + merge_updates(expected, uns_updates, valid_updates) + assert_uns_equal(adata2.uns, expected) + + +def merge_updates( + uns: UnsDict, + updates: UnsDict, + valid_updates: ValidUpdates, +) -> None: + """Apply a subset of ``updates`` to an ``uns`` dict, filtering based on ``valid_updates``. + + - If ``valid_updates`` is ``None``, all updates are applied. + - If ``valid_updates`` is a ``list`` of strings, ``updates`` must be a ``dict``, and only keys + from the ``list`` are applied. + - If ``valid_updates`` is a ``dict``, its keys must be present in ``updates``, and filtering is + applied recursively, with ``valid_updates``' values applying to corresponding items in + ``updates``. + """ + if isinstance(valid_updates, str): + valid_updates = [valid_updates] + for k, v in updates.items(): + if isinstance(v, dict): + if valid_updates is None: + merge_updates(uns[k], v, None) + elif isinstance(valid_updates, list): + if k in valid_updates: + merge_updates(uns[k], v, None) + else: + assert isinstance(valid_updates, dict) + if k in valid_updates: + merge_updates(uns[k], v, valid_updates[k]) + else: + if valid_updates is None: + uns[k] = v + else: + assert isinstance(valid_updates, list) + if k in valid_updates: + uns[k] = v diff --git a/apis/r/DESCRIPTION b/apis/r/DESCRIPTION index 94b71968ce..72dcbf2b0c 100644 --- a/apis/r/DESCRIPTION +++ b/apis/r/DESCRIPTION @@ -6,7 +6,7 @@ Description: Interface for working with 'TileDB'-based Stack of Matrices, like those commonly used for single cell data analysis. It is documented at ; a formal specification available is at . -Version: 1.13.99.2 +Version: 1.13.99.5 Authors@R: c( person(given = "Aaron", family = "Wolen", role = c("cre", "aut"), email = "aaron@tiledb.com", diff --git a/apis/r/NEWS.md b/apis/r/NEWS.md index 9396e99ca2..4707c5fcc3 100644 --- a/apis/r/NEWS.md +++ b/apis/r/NEWS.md @@ -2,6 +2,10 @@ ## Changes +* Make use of timestamp ranges in libtiledbsoma +* Simplify timestamp ranges; strengthen assumptions about `tiledb_timestamp` +* Use cached timestamps in `$write()` and `$create()` + # tiledbsoma 1.13.0 ## Changes diff --git a/apis/r/R/Factory.R b/apis/r/R/Factory.R index 92706f8eed..fdb6c2ed1b 100644 --- a/apis/r/R/Factory.R +++ b/apis/r/R/Factory.R @@ -75,6 +75,7 @@ SOMADataFrameOpen <- function( tiledbsoma_ctx = NULL, tiledb_timestamp = NULL ) { + spdl::debug("[SOMADataFrameOpen] uri {} ts ({})", uri, tiledb_timestamp %||% "now") sdf <- SOMADataFrame$new( uri, platform_config, @@ -173,6 +174,7 @@ SOMADenseNDArrayCreate <- function( tiledbsoma_ctx = NULL, tiledb_timestamp = NULL ) { + spdl::debug("[SOMADenseNDArrayCreate] tstamp ({})", tiledb_timestamp %||% "now") dnda <- SOMADenseNDArray$new( uri, platform_config, diff --git a/apis/r/R/RcppExports.R b/apis/r/R/RcppExports.R index 271411287c..4f20189acf 100644 --- a/apis/r/R/RcppExports.R +++ b/apis/r/R/RcppExports.R @@ -5,12 +5,12 @@ createSOMAContext <- function(config = NULL) { .Call(`_tiledbsoma_createSOMAContext`, config) } -createSchemaFromArrow <- function(uri, nasp, nadimap, nadimsp, sparse, datatype, pclst, ctxxp) { - invisible(.Call(`_tiledbsoma_createSchemaFromArrow`, uri, nasp, nadimap, nadimsp, sparse, datatype, pclst, ctxxp)) +createSchemaFromArrow <- function(uri, nasp, nadimap, nadimsp, sparse, datatype, pclst, ctxxp, tsvec = NULL) { + invisible(.Call(`_tiledbsoma_createSchemaFromArrow`, uri, nasp, nadimap, nadimsp, sparse, datatype, pclst, ctxxp, tsvec)) } -writeArrayFromArrow <- function(uri, naap, nasp, arraytype = "", config = NULL) { - invisible(.Call(`_tiledbsoma_writeArrayFromArrow`, uri, naap, nasp, arraytype, config)) +writeArrayFromArrow <- function(uri, naap, nasp, arraytype = "", config = NULL, tsvec = NULL) { + invisible(.Call(`_tiledbsoma_writeArrayFromArrow`, uri, naap, nasp, arraytype, config, tsvec)) } #' Get nnumber of metadata items @@ -66,12 +66,14 @@ delete_metadata <- function(uri, key, is_array, ctxxp) { #' #' @param uri The array URI #' @param key The array metadata key -#' @param value The metadata value -#' @ +#' @param valuesxp The metadata value +#' @param type The datatype +#' @param is_array A boolean to indicate array or group #' @param ctxxp An external pointer to the SOMAContext wrapper +#' @param tsvec An optional two-element datetime vector #' @export -set_metadata <- function(uri, key, valuesxp, type, is_array, ctxxp) { - invisible(.Call(`_tiledbsoma_set_metadata`, uri, key, valuesxp, type, is_array, ctxxp)) +set_metadata <- function(uri, key, valuesxp, type, is_array, ctxxp, tsvec = NULL) { + invisible(.Call(`_tiledbsoma_set_metadata`, uri, key, valuesxp, type, is_array, ctxxp, tsvec)) } reindex_create <- function() { @@ -87,8 +89,8 @@ reindex_lookup <- function(idx, kvec) { } #' @noRd -soma_array_reader_impl <- function(uri, colnames = NULL, qc = NULL, dim_points = NULL, dim_ranges = NULL, batch_size = "auto", result_order = "auto", loglevel = "auto", config = NULL) { - .Call(`_tiledbsoma_soma_array_reader`, uri, colnames, qc, dim_points, dim_ranges, batch_size, result_order, loglevel, config) +soma_array_reader_impl <- function(uri, colnames = NULL, qc = NULL, dim_points = NULL, dim_ranges = NULL, batch_size = "auto", result_order = "auto", loglevel = "auto", config = NULL, timestamprange = NULL) { + .Call(`_tiledbsoma_soma_array_reader`, uri, colnames, qc, dim_points, dim_ranges, batch_size, result_order, loglevel, config, timestamprange) } #' Set the logging level for the R package and underlying C++ library @@ -148,8 +150,8 @@ shape <- function(uri, config = NULL) { #' @param result_order Optional argument for query result order, defaults to \sQuote{auto} #' @param loglevel Character value with the desired logging level, defaults to \sQuote{auto} #' which lets prior setting prevail, any other value is set as new logging level. -#' @param timestamp_end Optional POSIXct (i.e. Datetime) type for end of interval for which -#' data is considered. +#' @param timestamprange Optional POSIXct (i.e. Datetime) vector with start and end of +#' interval for which data is considered. #' @param sr An external pointer to a TileDB SOMAArray object #' #' @return \code{sr_setup} returns an external pointer to a SOMAArray. \code{sr_complete} @@ -169,8 +171,8 @@ shape <- function(uri, config = NULL) { #' summary(rl) #' } #' @noRd -sr_setup <- function(uri, config, colnames = NULL, qc = NULL, dim_points = NULL, dim_ranges = NULL, batch_size = "auto", result_order = "auto", timestamp_end = NULL, loglevel = "auto") { - .Call(`_tiledbsoma_sr_setup`, uri, config, colnames, qc, dim_points, dim_ranges, batch_size, result_order, timestamp_end, loglevel) +sr_setup <- function(uri, config, colnames = NULL, qc = NULL, dim_points = NULL, dim_ranges = NULL, batch_size = "auto", result_order = "auto", timestamprange = NULL, loglevel = "auto") { + .Call(`_tiledbsoma_sr_setup`, uri, config, colnames, qc, dim_points, dim_ranges, batch_size, result_order, timestamprange, loglevel) } sr_complete <- function(sr) { diff --git a/apis/r/R/SOMAArrayBase.R b/apis/r/R/SOMAArrayBase.R index ca9f241838..4d9082c0f5 100644 --- a/apis/r/R/SOMAArrayBase.R +++ b/apis/r/R/SOMAArrayBase.R @@ -30,11 +30,12 @@ SOMAArrayBase <- R6::R6Class( }, write_object_type_metadata = function() { - private$check_open_for_write() + #private$check_open_for_write() meta <- list() meta[[SOMA_OBJECT_TYPE_METADATA_KEY]] <- self$class() meta[[SOMA_ENCODING_VERSION_METADATA_KEY]] <- SOMA_ENCODING_VERSION + spdl::debug("[SOMAArrayBase::write_object_metadata] calling set metadata") self$set_metadata(meta) } ) diff --git a/apis/r/R/SOMADataFrame.R b/apis/r/R/SOMADataFrame.R index 3eb24aed2d..62877e6a5f 100644 --- a/apis/r/R/SOMADataFrame.R +++ b/apis/r/R/SOMADataFrame.R @@ -23,8 +23,12 @@ SOMADataFrame <- R6::R6Class( #' @template param-platform-config #' @param internal_use_only Character value to signal this is a 'permitted' call, #' as `create()` is considered internal and should not be called directly. - create = function(schema, index_column_names = c("soma_joinid"), - platform_config = NULL, internal_use_only = NULL) { + create = function( + schema, + index_column_names = c("soma_joinid"), + platform_config = NULL, + internal_use_only = NULL + ) { if (is.null(internal_use_only) || internal_use_only != "allowed_use") { stop(paste("Use of the create() method is for internal use only. Consider using a", "factory method as e.g. 'SOMADataFrameCreate()'."), call. = FALSE) @@ -42,7 +46,11 @@ SOMADataFrame <- R6::R6Class( ## we (currently pass domain and extent values in an arrow table (i.e. data.frame alike) ## where each dimension is one column (of the same type as in the schema) followed by three ## values for the domain pair and the extent - dom_ext_tbl <- get_domain_and_extent_dataframe(schema, index_column_names, tiledb_create_options) + dom_ext_tbl <- get_domain_and_extent_dataframe( + schema, + ind_col_names = index_column_names, + tdco = tiledb_create_options + ) ## we transfer to the arrow table via a pair of array and schema pointers dnaap <- nanoarrow::nanoarrow_allocate_array() @@ -53,12 +61,24 @@ SOMADataFrame <- R6::R6Class( nasp <- nanoarrow::nanoarrow_allocate_schema() schema$export_to_c(nasp) + spdl::debug("[SOMADataFrame$create] about to create schema from arrow") ctxptr <- super$tiledbsoma_ctx$context() - createSchemaFromArrow(uri = self$uri, nasp, dnaap, dnasp, TRUE, "SOMADataFrame", - tiledb_create_options$to_list(FALSE), soma_context()) + createSchemaFromArrow( + uri = self$uri, + nasp = nasp, + nadimap = dnaap, + nadimsp = dnasp, + sparse = TRUE, + datatype = "SOMADataFrame", + pclst = tiledb_create_options$to_list(FALSE), + ctxxp = soma_context(), + tsvec = self$.tiledb_timestamp_range + ) - self$open("WRITE", internal_use_only = "allowed_use") + spdl::debug("[SOMADataFrame$create] about to call write_object_type_metadata") private$write_object_type_metadata() + + self$open("WRITE", internal_use_only = "allowed_use") self }, @@ -97,8 +117,14 @@ SOMADataFrame <- R6::R6Class( df <- as.data.frame(values)[schema_names] arr <- self$object - - writeArrayFromArrow(self$uri, naap, nasp, "SOMADataFrame") + writeArrayFromArrow( + uri = self$uri, + naap = naap, + nasp = nasp, + arraytype = "SOMADataFrame", + config = NULL, + tsvec = self$.tiledb_timestamp_range + ) invisible(self) }, @@ -152,14 +178,15 @@ SOMADataFrame <- R6::R6Class( args = list(expr = str2lang(value_filter), ta = arr)) value_filter <- parsed@ptr } - + spdl::debug("[SOMADataFrame$read] calling sr_setup for {} at ({},{})", self$uri, + private$tiledb_timestamp[1], private$tiledb_timestamp[2]) cfg <- as.character(tiledb::config(self$tiledbsoma_ctx$context())) rl <- sr_setup(uri = self$uri, config = cfg, colnames = column_names, qc = value_filter, dim_points = coords, - timestamp_end = private$tiledb_timestamp, + timestamprange = self$.tiledb_timestamp_range, # NULL or two-elem vector loglevel = log_level) private$ctx_ptr <- rl$ctx TableReadIter$new(rl$sr) diff --git a/apis/r/R/SOMADenseNDArray.R b/apis/r/R/SOMADenseNDArray.R index e259bb24ee..11a24677ff 100644 --- a/apis/r/R/SOMADenseNDArray.R +++ b/apis/r/R/SOMADenseNDArray.R @@ -52,10 +52,16 @@ SOMADenseNDArray <- R6::R6Class( coords <- private$.convert_coords(coords) cfg <- as.character(tiledb::config(self$tiledbsoma_ctx$context())) + spdl::debug( + "[SOMADenseNDArray$read_arrow_table] timestamp ({})", + self$tiledb_timestamp %||% "now" + ) + rl <- soma_array_reader(uri = uri, dim_points = coords, result_order = result_order, - loglevel = log_level, # NULL dealt with by soma_array_reader() + timestamprange = self$.tiledb_timestamp_range, + loglevel = log_level, config = cfg) soma_array_to_arrow_table(rl) @@ -137,7 +143,14 @@ SOMADenseNDArray <- R6::R6Class( nasp <- nanoarrow::nanoarrow_allocate_schema() arrow::as_record_batch(tbl)$export_to_c(naap, nasp) #arr[] <- values - writeArrayFromArrow(self$uri, naap, nasp, "SOMADenseNDArray") + writeArrayFromArrow( + uri = self$uri, + naap = naap, + nasp = nasp, + arraytype = "SOMADenseNDArray", + config = NULL, + tsvec = self$.tiledb_timestamp_range + ) spdl::debug("[SOMADenseNDArray::write] written") # tiledb-r always closes the array after a write operation so we need to diff --git a/apis/r/R/SOMANDArrayBase.R b/apis/r/R/SOMANDArrayBase.R index a891fc42bc..a52fa13235 100644 --- a/apis/r/R/SOMANDArrayBase.R +++ b/apis/r/R/SOMANDArrayBase.R @@ -23,7 +23,12 @@ SOMANDArrayBase <- R6::R6Class( #' @param internal_use_only Character value to signal this is a 'permitted' #' call, as `create()` is considered internal and should not be called #' directly. - create = function(type, shape, platform_config = NULL, internal_use_only = NULL) { + create = function( + type, + shape, + platform_config = NULL, + internal_use_only = NULL + ) { if (is.null(internal_use_only) || internal_use_only != "allowed_use") { stop(paste("Use of the create() method is for internal use only. Consider using a", "factory method as e.g. 'SOMASparseNDArrayCreate()'."), call. = FALSE) @@ -56,13 +61,20 @@ SOMANDArrayBase <- R6::R6Class( ## create array ctxptr <- super$tiledbsoma_ctx$context() - createSchemaFromArrow(uri = self$uri, nasp, dnaap, dnasp, - private$.is_sparse, - if (private$.is_sparse) "SOMASparseNDArray" else "SOMADenseNDArray", - tiledb_create_options$to_list(FALSE), soma_context()) + createSchemaFromArrow( + uri = self$uri, + nasp = nasp, + nadimap = dnaap, + nadimsp = dnasp, + sparse = private$.is_sparse, + datatype = if (private$.is_sparse) "SOMASparseNDArray" else "SOMADenseNDArray", + pclst = tiledb_create_options$to_list(FALSE), + ctxxp = soma_context(), + tsvec = self$.tiledb_timestamp_range + ) + #private$write_object_type_metadata(timestamps) ## FIXME: temp. commented out -- can this be removed overall? self$open("WRITE", internal_use_only = "allowed_use") - private$write_object_type_metadata() self }, diff --git a/apis/r/R/SOMASparseNDArray.R b/apis/r/R/SOMASparseNDArray.R index 84b8299c2e..1144780cde 100644 --- a/apis/r/R/SOMASparseNDArray.R +++ b/apis/r/R/SOMASparseNDArray.R @@ -51,7 +51,7 @@ SOMASparseNDArray <- R6::R6Class( config = cfg, dim_points = coords, result_order = result_order, - timestamp_end = private$tiledb_timestamp, + timestamprange = self$.tiledb_timestamp_range, loglevel = log_level) private$ctx_ptr <- rl$ctx SOMASparseNDArrayRead$new(rl$sr, self, coords) @@ -175,6 +175,11 @@ SOMASparseNDArray <- R6::R6Class( index <- index + 2L } self$set_metadata(bbox_flat) + spdl::debug( + "[SOMASparseNDArray$write] Calling .write_coo_df ({})", + self$tiledb_timestamp %||% "now" + ) + private$.write_coo_dataframe(coo) invisible(self) @@ -220,10 +225,11 @@ SOMASparseNDArray <- R6::R6Class( stopifnot(is.data.frame(values)) # private$log_array_ingestion() - arr <- self$object - if (!is.null(private$tiledb_timestamp)) { - arr@timestamp <- private$tiledb_timestamp - } + #arr <- self$object + #if (!is.null(self$tiledb_timestamp)) { + # # arr@timestamp <- self$tiledb_timestamp + # arr@timestamp_end <- self$tiledb_timestamp + #} nms <- colnames(values) ## the 'soma_data' data type may not have been cached, and if so we need to fetch it @@ -239,11 +245,22 @@ SOMASparseNDArray <- R6::R6Class( arrow::field(nms[3], private$.type)) tbl <- arrow::arrow_table(values, schema = arrsch) - spdl::debug("[SOMASparseNDArray::write] array created, writing to {}", self$uri) + spdl::debug( + "[SOMASparseNDArray$write] array created, writing to {} at ({})", + self$uri, + self$tiledb_timestamp %||% "now" + ) naap <- nanoarrow::nanoarrow_allocate_array() nasp <- nanoarrow::nanoarrow_allocate_schema() arrow::as_record_batch(tbl)$export_to_c(naap, nasp) - writeArrayFromArrow(self$uri, naap, nasp, "SOMASparseNDArray") + writeArrayFromArrow( + uri = self$uri, + naap = naap, + nasp = nasp, + arraytype = "SOMASparseNDArray", + config = NULL, + tsvec = self$.tiledb_timestamp_range + ) }, # Internal marking of one or zero based matrices for iterated reads diff --git a/apis/r/R/SOMASparseNDArrayRead.R b/apis/r/R/SOMASparseNDArrayRead.R index 018718e24c..e4b620f24b 100644 --- a/apis/r/R/SOMASparseNDArrayRead.R +++ b/apis/r/R/SOMASparseNDArrayRead.R @@ -134,7 +134,7 @@ SOMASparseNDArrayRead <- R6::R6Class( sparse_matrix = function(zero_based=FALSE) { #TODO implement zero_based argument, currently doesn't do anything - shape <- self$shape + shape <- self$shape # if (any(private$shape > .Machine$integer.max)) { if (any(shape > .Machine$integer.max)) { warning( @@ -147,7 +147,6 @@ SOMASparseNDArrayRead <- R6::R6Class( # private$shape <- pmin(private$shape, .Machine$integer.max) shape <- pmin(shape, .Machine$integer.max) } - SparseReadIter$new(self$sr, shape, zero_based = zero_based) }, diff --git a/apis/r/R/TileDBArray.R b/apis/r/R/TileDBArray.R index 795a580b94..9ebba1a873 100644 --- a/apis/r/R/TileDBArray.R +++ b/apis/r/R/TileDBArray.R @@ -26,15 +26,20 @@ TileDBArray <- R6::R6Class( } private$.mode <- mode - if (is.null(private$tiledb_timestamp)) { - spdl::debug("[TileDBArray::open] Opening {} '{}' in {} mode", self$class(), self$uri, mode) + if (is.null(self$tiledb_timestamp)) { + spdl::debug("[TileDBArray$open] Opening {} '{}' in {} mode", self$class(), self$uri, mode) private$.tiledb_array <- tiledb::tiledb_array_open(self$object, type = mode) } else { - stopifnot("tiledb_timestamp not yet supported for WRITE mode" = mode == "READ") - spdl::debug("[TileDBArray::open] Opening {} '{}' in {} mode at {}", - self$class(), self$uri, mode, private$tiledb_timestamp) - private$.tiledb_array <- tiledb::tiledb_array_open_at(self$object, type = mode, - timestamp = private$tiledb_timestamp) + if (is.null(internal_use_only)) stopifnot("tiledb_timestamp not yet supported for WRITE mode" = mode == "READ") + spdl::debug( + "[TileDBArray$open] Opening {} '{}' in {} mode at ({})", + self$class(), + self$uri, + mode, + self$tiledb_timestamp %||% "now" + ) + #private$.tiledb_array <- tiledb::tiledb_array_open_at(self$object, type = mode, + # timestamp = self$tiledb_timestamp) } ## TODO -- cannot do here while needed for array case does not work for data frame case @@ -47,7 +52,7 @@ TileDBArray <- R6::R6Class( #' @description Close the SOMA object. #' @return The object, invisibly close = function() { - spdl::debug("[TileDBArray::close] Closing {} '{}'", self$class(), self$uri) + spdl::debug("[TileDBArray$close] Closing {} '{}'", self$class(), self$uri) private$.mode = "CLOSED" tiledb::tiledb_array_close(self$object) invisible(self) @@ -71,7 +76,7 @@ TileDBArray <- R6::R6Class( args$query_type <- self$.mode args$query_layout <- "UNORDERED" args$ctx <- self$tiledbsoma_ctx$context() - spdl::debug("[TileDBArray::tiledb_array] ctor uri {} mode {} layout {}", args$uri, args$query_type, args$query_layout) + spdl::debug("[TileDBArray$tiledb_array] ctor uri {} mode {} layout {}", args$uri, args$query_type, args$query_layout) do.call(tiledb::tiledb_array, args) }, @@ -81,7 +86,7 @@ TileDBArray <- R6::R6Class( get_metadata = function(key = NULL) { private$check_open_for_read_or_write() - spdl::debug("[TileDBArray::get_metadata] Retrieving metadata for {} '{}'", self$class(), self$uri) + spdl::debug("[TileDBArray$get_metadata] Retrieving metadata for {} '{}'", self$class(), self$uri) private$fill_metadata_cache_if_null() if (!is.null(key)) { private$.metadata_cache[[key]] @@ -98,13 +103,20 @@ TileDBArray <- R6::R6Class( "Metadata must be a named list" = is_named_list(metadata) ) - private$check_open_for_write() + #private$check_open_for_write() for (nm in names(metadata)) { - #spdl::warn("[set_metadata] key {}", nm) val <- metadata[[nm]] - spdl::debug("[set_metadata] setting key {} to {} ({})", nm, val, class(val)) - set_metadata(self$uri, nm, val, class(val), TRUE, soma_context()) + spdl::debug("[TileDBArray$set_metadata] setting key {} to {} ({})", nm, val, class(val)) + set_metadata( + uri = self$uri, + key = nm, + valuesxp = val, + type = class(val), + is_array = TRUE, + ctxxp = soma_context(), + tsvec = self$.tiledb_timestamp_range + ) } dev_null <- mapply( @@ -207,9 +219,15 @@ TileDBArray <- R6::R6Class( non_empty_domain = function(index1 = FALSE) { dims <- self$dimnames() ned <- bit64::integer64(length = length(dims)) + ## added during C++-ification as self$object could close + if (isFALSE(tiledb::tiledb_array_is_open(self$object))) { + arrhandle <- tiledb::tiledb_array_open(self$object, type = "READ") + } else { + arrhandle <- self$object + } for (i in seq_along(along.with = ned)) { dom <- max(tiledb::tiledb_array_get_non_empty_domain_from_name( - self$object, + arrhandle, # instead of: self$object, name = dims[i] )) if (isTRUE(x = index1)) { @@ -338,29 +356,29 @@ TileDBArray <- R6::R6Class( }, update_metadata_cache = function() { - spdl::debug("[TileDBArray::update_metadata_cache] updating metadata cache for {} '{}' in {}", self$class(), self$uri, private$.mode) + spdl::debug("[TileDBArray$update_metadata_cache] updating metadata cache for {} '{}' in {}", self$class(), self$uri, private$.mode) # See notes above -- at the TileDB implementation level, we cannot read array metadata # while the array is open for read, but at the SOMA application level we must support # this. Therefore if the array is opened for write and there is no cache populated then # we must open a temporary handle for read, to fill the cache. - array_handle <- private$.tiledb_array - if (private$.mode == "WRITE") { - spdl::debug("[TileDBArray::update_metadata_cache] getting object") - array_object <- tiledb::tiledb_array(self$uri, ctx = private$.tiledb_ctx) - array_handle <- tiledb::tiledb_array_open(array_object, type = "READ") - } - - if (isFALSE(tiledb::tiledb_array_is_open(array_handle))) { - spdl::debug("[TileDBArray::update_metadata_cache] reopening object") - array_handle <- tiledb::tiledb_array_open(array_handle, type = "READ") - } + #array_handle <- private$.tiledb_array + #if (private$.mode == "WRITE") { + # spdl::debug("[TileDBArray::update_metadata_cache] getting object") + # array_object <- tiledb::tiledb_array(self$uri, ctx = private$.tiledb_ctx) + # array_handle <- tiledb::tiledb_array_open(array_object, type = "READ") + #} + + #if (isFALSE(tiledb::tiledb_array_is_open(array_handle))) { + # spdl::debug("[TileDBArray::update_metadata_cache] reopening object") + # array_handle <- tiledb::tiledb_array_open(array_handle, type = "READ") + #} private$.metadata_cache <- get_all_metadata(self$uri, TRUE, soma_context()) - - if (private$.mode == "WRITE") { - tiledb::tiledb_array_close(array_handle) - } + #print(str(private$.metadata_cache)) + #if (private$.mode == "WRITE") { + # tiledb::tiledb_array_close(array_handle) + #} invisible(NULL) }, diff --git a/apis/r/R/TileDBGroup.R b/apis/r/R/TileDBGroup.R index fd1f788c5d..82f411231a 100644 --- a/apis/r/R/TileDBGroup.R +++ b/apis/r/R/TileDBGroup.R @@ -52,12 +52,12 @@ TileDBGroup <- R6::R6Class( "factory method as e.g. 'SOMACollectionOpen()'."), call. = FALSE) } private$.mode = mode - private$.group_open_timestamp <- if (mode == "READ" && is.null(private$tiledb_timestamp)) { + private$.group_open_timestamp <- if (mode == "READ" && is.null(self$tiledb_timestamp)) { # In READ mode, if the opener supplied no timestamp then we default to the time of # opening, providing a temporal snapshot of all group members. Sys.time() } else { - private$tiledb_timestamp + self$tiledb_timestamp } if (is.null(private$.group_open_timestamp)) { spdl::debug("Opening {} '{}' in {} mode", self$class(), self$uri, mode) @@ -312,7 +312,7 @@ TileDBGroup <- R6::R6Class( .tiledb_group = NULL, # This field stores the timestamp with which we opened the group, whether we used the - # opener-supplied private$tiledb_timestamp, or defaulted to the time of opening, or neither + # opener-supplied self$tiledb_timestamp, or defaulted to the time of opening, or neither # (NULL). This is the timestamp to propagate to accessed members. .group_open_timestamp = NULL, diff --git a/apis/r/R/TileDBObject.R b/apis/r/R/TileDBObject.R index 9f94c763ba..e23d8be3ea 100644 --- a/apis/r/R/TileDBObject.R +++ b/apis/r/R/TileDBObject.R @@ -39,11 +39,16 @@ TileDBObject <- R6::R6Class( private$.tiledb_ctx <- self$tiledbsoma_ctx$context() if (!is.null(tiledb_timestamp)) { - stopifnot("'tiledb_timestamp' must be a POSIXct datetime object" = inherits(tiledb_timestamp, "POSIXct")) - private$tiledb_timestamp <- tiledb_timestamp + stopifnot( + "'tiledb_timestamp' must be a single POSIXct datetime object" = inherits(tiledb_timestamp, "POSIXct") && + length(tiledb_timestamp) == 1L && + !is.na(tiledb_timestamp) + ) + private$.tiledb_timestamp <- tiledb_timestamp } - spdl::debug("[TileDBObject] initialize {} with '{}'", self$class(), self$uri) + spdl::debug("[TileDBObject] initialize {} with '{}' at ({})", self$class(), self$uri, + self$tiledb_timestamp %||% "now") }, #' @description Print the name of the R6 class. @@ -82,13 +87,18 @@ TileDBObject <- R6::R6Class( #' \item \dQuote{\code{READ}} #' \item \dQuote{\code{WRITE}} #' } + #' @param tiledb_timestamp Optional Datetime (POSIXct) with TileDB timestamp #' #' @return Invisibly returns \code{self} opened in \code{mode} #' - reopen = function(mode) { + reopen = function(mode, tiledb_timestamp = NULL) { mode <- match.arg(mode, choices = c('READ', 'WRITE')) + stopifnot( + "'tiledb_timestamp' must be a POSIXct datetime object" = is.null(tiledb_timestamp) || + (inherits(tiledb_timestamp, what = "POSIXct") && length(tiledb_timestamp) == 1L && !is.na(tiledb_timestamp)) + ) self$close() - private$tiledb_timestamp <- NULL + private$.tiledb_timestamp <- tiledb_timestamp self$open(mode, internal_use_only = 'allowed_use') return(invisible(self)) }, @@ -130,11 +140,33 @@ TileDBObject <- R6::R6Class( } return(private$.tiledbsoma_ctx) }, + #' @field tiledb_timestamp Time that object was opened at + #' + tiledb_timestamp = function(value) { + if (!missing(value)) { + private$.read_only_error("tiledb_timestamp") + } + return(private$.tiledb_timestamp) + }, #' @field uri #' The URI of the TileDB object. uri = function(value) { if (missing(value)) return(private$tiledb_uri$uri) stop(sprintf("'%s' is a read-only field.", "uri"), call. = FALSE) + }, + #' @field .tiledb_timestamp_range Time range for libtiledbsoma + #' + .tiledb_timestamp_range = function(value) { + if (!missing(value)) { + private$.read_only_error("tiledb_timestamp_range") + } + if (is.null(self$tiledb_timestamp)) { + return(NULL) + } + return(c( + as.POSIXct(0, tz = 'UTC', origin = '1970-01-01'), + self$tiledb_timestamp + )) } ), @@ -162,7 +194,7 @@ TileDBObject <- R6::R6Class( # Opener-supplied POSIXct timestamp, if any. TileDBArray and TileDBGroup are each responsible # for making this effective, since the methods differ slightly. - tiledb_timestamp = NULL, + .tiledb_timestamp = NULL, # Internal context .tiledbsoma_ctx = NULL, diff --git a/apis/r/R/soma_array_reader.R b/apis/r/R/soma_array_reader.R index c3fe8e3df0..a0cda43e0f 100644 --- a/apis/r/R/soma_array_reader.R +++ b/apis/r/R/soma_array_reader.R @@ -28,7 +28,7 @@ #' @noRd soma_array_reader <- function(uri, colnames = NULL, qc = NULL, dim_points = NULL, dim_ranges = NULL, batch_size = "auto", result_order = "auto", loglevel = "auto", - config = NULL) { + config = NULL, timestamprange = NULL) { stopifnot("'uri' must be character" = is.character(uri), "'colnames' must be character or NULL" = is_character_or_null(colnames), @@ -53,6 +53,8 @@ soma_array_reader <- function(uri, colnames = NULL, qc = NULL, dim_points = NULL } } + spdl::debug("[soma_array_reader] calling soma_array_reader_impl ({},{}", + timestamprange[1], timestamprange[2]) soma_array_reader_impl(uri, colnames, qc, dim_points, dim_ranges, batch_size, result_order, - loglevel, config) + loglevel, config, timestamprange) } diff --git a/apis/r/R/utils.R b/apis/r/R/utils.R index b8b451305a..1e5930c9f4 100644 --- a/apis/r/R/utils.R +++ b/apis/r/R/utils.R @@ -123,7 +123,11 @@ uns_hint <- function(type = c('1d', '2d')) { .read_soma_joinids <- function(x, ...) { stopifnot(inherits(x = x, what = 'TileDBArray')) oldmode <- x$mode() - on.exit(x$reopen(oldmode), add = TRUE, after = FALSE) + on.exit( + x$reopen(oldmode, tiledb_timestamp = x$tiledb_timestamp), + add = TRUE, + after = FALSE + ) op <- options(arrow.int64_downcast = FALSE) on.exit(options(op), add = TRUE, after = FALSE) ids <- UseMethod(generic = '.read_soma_joinids', object = x) @@ -138,7 +142,7 @@ uns_hint <- function(type = c('1d', '2d')) { #' @export #' .read_soma_joinids.SOMADataFrame <- function(x, ...) { - x$reopen("READ") + x$reopen("READ", tiledb_timestamp = x$tiledb_timestamp) return(x$read(column_names = "soma_joinid")$concat()$GetColumnByName("soma_joinid")$as_vector()) } @@ -159,13 +163,13 @@ uns_hint <- function(type = c('1d', '2d')) { if (axis < 0L || axis >= length(x$dimnames())) { stop("'axis' must be between 0 and ", length(x$dimnames()), call. = FALSE) } - x$reopen("READ") + x$reopen("READ", tiledb_timestamp = x$tiledb_timestamp) dimname <- x$dimnames()[axis + 1L] rl <- sr_setup( uri = x$uri, config = as.character(tiledb::config(x$tiledbsoma_ctx$context())), colnames = dimname, - timestamp_end = x$.__enclos_env__$private$tiledb_timestamp + timestamprange = x$.tiledb_timestamp_range ) return(TableReadIter$new(rl$sr)$concat()$GetColumnByName(dimname)$as_vector()) } diff --git a/apis/r/R/write_soma.R b/apis/r/R/write_soma.R index 89194b80e8..e1433458d6 100644 --- a/apis/r/R/write_soma.R +++ b/apis/r/R/write_soma.R @@ -466,7 +466,8 @@ write_soma.TsparseMatrix <- function( shape = shape %||% dim(x), ingest_mode = ingest_mode, platform_config = platform_config, - tiledbsoma_ctx = tiledbsoma_ctx + tiledbsoma_ctx = tiledbsoma_ctx, + tiledb_timestamp = Sys.time() ) # Write values if (ingest_mode %in% c('resume')) { @@ -606,13 +607,13 @@ write_soma.TsparseMatrix <- function( ) xmode <- x$mode() if (xmode == 'CLOSED') { - x$reopen('READ') + x$reopen('READ', tiledb_timestamp = x$tiledb_timestamp) xmode <- x$mode() } on.exit(x$reopen(mode = xmode), add = TRUE, after = FALSE) oldmode <- soma_parent$mode() if (oldmode == 'CLOSED') { - soma_parent$reopen("READ") + soma_parent$reopen("READ", tiledb_timestamp = soma_parent$tiledb_timestamp) oldmode <- soma_parent$mode() } on.exit(soma_parent$reopen(oldmode), add = TRUE, after = FALSE) diff --git a/apis/r/configure b/apis/r/configure index 8eb1f7ab6c..7cc96e400f 100755 --- a/apis/r/configure +++ b/apis/r/configure @@ -44,6 +44,11 @@ ${R_HOME}/bin/Rscript tools/check_cmake_and_git.R export CC="`${R_HOME}/bin/R CMD config CC`" export CXX="`${R_HOME}/bin/R CMD config CXX`" export CMAKE_OSX_ARCHITECTURES="`uname -m`" + +## The 'build_libtiledbsoma.sh' script is made / finalised by an earlier script. +if [ ! -f tools/build_libtiledbsoma.sh ]; then + exit 1 +fi tools/build_libtiledbsoma.sh pkgincl="-I../inst/tiledb/include -I../inst/tiledbsoma/include -I../inst/tiledbsoma/include/tiledbsoma" diff --git a/apis/r/man/TileDBObject.Rd b/apis/r/man/TileDBObject.Rd index d17641c4a3..ae17275794 100644 --- a/apis/r/man/TileDBObject.Rd +++ b/apis/r/man/TileDBObject.Rd @@ -14,7 +14,11 @@ TileDBGroup classes. (lifecycle: maturing) \item{\code{tiledbsoma_ctx}}{SOMATileDBContext} +\item{\code{tiledb_timestamp}}{Time that object was opened at} + \item{\code{uri}}{The URI of the TileDB object.} + +\item{\code{.tiledb_timestamp_range}}{Time range for libtiledbsoma} } \if{html}{\out{}} } @@ -106,7 +110,7 @@ otherwise returns the mode (eg. \dQuote{\code{READ}}) of the object \subsection{Method \code{reopen()}}{ Close and reopen the TileDB object in a new mode \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{TileDBObject$reopen(mode)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{TileDBObject$reopen(mode, tiledb_timestamp = NULL)}\if{html}{\out{
}} } \subsection{Arguments}{ @@ -117,6 +121,8 @@ Close and reopen the TileDB object in a new mode \item \dQuote{\code{READ}} \item \dQuote{\code{WRITE}} }} + +\item{\code{tiledb_timestamp}}{Optional Datetime (POSIXct) with TileDB timestamp} } \if{html}{\out{}} } diff --git a/apis/r/man/delete_metadata.Rd b/apis/r/man/delete_metadata.Rd index 3d04e0cdad..06b7c33be8 100644 --- a/apis/r/man/delete_metadata.Rd +++ b/apis/r/man/delete_metadata.Rd @@ -4,7 +4,7 @@ \alias{delete_metadata} \title{Delete metadata for given key} \usage{ -delete_metadata(uri, key, ctxxp) +delete_metadata(uri, key, is_array, ctxxp) } \arguments{ \item{uri}{The array URI} diff --git a/apis/r/man/get_all_metadata.Rd b/apis/r/man/get_all_metadata.Rd index 6b172779d6..5099ae20f6 100644 --- a/apis/r/man/get_all_metadata.Rd +++ b/apis/r/man/get_all_metadata.Rd @@ -2,9 +2,9 @@ % Please edit documentation in R/RcppExports.R \name{get_all_metadata} \alias{get_all_metadata} -\title{Read all metadata (as named character vector)} +\title{Read all metadata (as named list)} \usage{ -get_all_metadata(uri, ctxxp) +get_all_metadata(uri, is_array, ctxxp) } \arguments{ \item{uri}{The array URI} @@ -12,6 +12,6 @@ get_all_metadata(uri, ctxxp) \item{ctxxp}{An external pointer to the SOMAContext wrapper} } \description{ -This function assumes that all metadata is in fact stored as strings. It will error -if a different datatype is encountered. +This function currently supports metadata as either a string or an 'int64' (or 'int32'). +It will error if a different datatype is encountered. } diff --git a/apis/r/man/get_metadata.Rd b/apis/r/man/get_metadata.Rd index 8b11e1a6aa..f49299d5fb 100644 --- a/apis/r/man/get_metadata.Rd +++ b/apis/r/man/get_metadata.Rd @@ -4,7 +4,7 @@ \alias{get_metadata} \title{Read metadata (as a string)} \usage{ -get_metadata(uri, key, ctxxp) +get_metadata(uri, key, is_array, ctxxp) } \arguments{ \item{uri}{The array URI} diff --git a/apis/r/man/get_metadata_num.Rd b/apis/r/man/get_metadata_num.Rd index c34c909e02..baaadb4fe4 100644 --- a/apis/r/man/get_metadata_num.Rd +++ b/apis/r/man/get_metadata_num.Rd @@ -4,7 +4,7 @@ \alias{get_metadata_num} \title{Get nnumber of metadata items} \usage{ -get_metadata_num(uri, ctxxp) +get_metadata_num(uri, is_array, ctxxp) } \arguments{ \item{uri}{The array URI} diff --git a/apis/r/man/has_metadata.Rd b/apis/r/man/has_metadata.Rd index 9354b59649..8abc90e33b 100644 --- a/apis/r/man/has_metadata.Rd +++ b/apis/r/man/has_metadata.Rd @@ -4,7 +4,7 @@ \alias{has_metadata} \title{Check for metadata given key} \usage{ -has_metadata(uri, key, ctxxp) +has_metadata(uri, key, is_array, ctxxp) } \arguments{ \item{uri}{The array URI} diff --git a/apis/r/man/set_metadata.Rd b/apis/r/man/set_metadata.Rd index ee3cedc3a1..d280d3587e 100644 --- a/apis/r/man/set_metadata.Rd +++ b/apis/r/man/set_metadata.Rd @@ -4,16 +4,22 @@ \alias{set_metadata} \title{Set metadata (as a string)} \usage{ -set_metadata(uri, key, value, ctxxp) +set_metadata(uri, key, valuesxp, type, is_array, ctxxp, tsvec = NULL) } \arguments{ \item{uri}{The array URI} \item{key}{The array metadata key} -\item{value}{The metadata value} +\item{valuesxp}{The metadata value} + +\item{type}{The datatype} + +\item{is_array}{A boolean to indicate array or group} \item{ctxxp}{An external pointer to the SOMAContext wrapper} + +\item{tsvec}{An optional two-element datetime vector} } \description{ Set metadata (as a string) diff --git a/apis/r/src/RcppExports.cpp b/apis/r/src/RcppExports.cpp index 70137b6120..4021362a23 100644 --- a/apis/r/src/RcppExports.cpp +++ b/apis/r/src/RcppExports.cpp @@ -23,8 +23,8 @@ BEGIN_RCPP END_RCPP } // createSchemaFromArrow -void createSchemaFromArrow(const std::string& uri, naxpSchema nasp, naxpArray nadimap, naxpSchema nadimsp, bool sparse, std::string datatype, Rcpp::List pclst, Rcpp::XPtr ctxxp); -RcppExport SEXP _tiledbsoma_createSchemaFromArrow(SEXP uriSEXP, SEXP naspSEXP, SEXP nadimapSEXP, SEXP nadimspSEXP, SEXP sparseSEXP, SEXP datatypeSEXP, SEXP pclstSEXP, SEXP ctxxpSEXP) { +void createSchemaFromArrow(const std::string& uri, naxpSchema nasp, naxpArray nadimap, naxpSchema nadimsp, bool sparse, std::string datatype, Rcpp::List pclst, Rcpp::XPtr ctxxp, Rcpp::Nullable tsvec); +RcppExport SEXP _tiledbsoma_createSchemaFromArrow(SEXP uriSEXP, SEXP naspSEXP, SEXP nadimapSEXP, SEXP nadimspSEXP, SEXP sparseSEXP, SEXP datatypeSEXP, SEXP pclstSEXP, SEXP ctxxpSEXP, SEXP tsvecSEXP) { BEGIN_RCPP Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< const std::string& >::type uri(uriSEXP); @@ -35,13 +35,14 @@ BEGIN_RCPP Rcpp::traits::input_parameter< std::string >::type datatype(datatypeSEXP); Rcpp::traits::input_parameter< Rcpp::List >::type pclst(pclstSEXP); Rcpp::traits::input_parameter< Rcpp::XPtr >::type ctxxp(ctxxpSEXP); - createSchemaFromArrow(uri, nasp, nadimap, nadimsp, sparse, datatype, pclst, ctxxp); + Rcpp::traits::input_parameter< Rcpp::Nullable >::type tsvec(tsvecSEXP); + createSchemaFromArrow(uri, nasp, nadimap, nadimsp, sparse, datatype, pclst, ctxxp, tsvec); return R_NilValue; END_RCPP } // writeArrayFromArrow -void writeArrayFromArrow(const std::string& uri, naxpArray naap, naxpSchema nasp, const std::string arraytype, Rcpp::Nullable config); -RcppExport SEXP _tiledbsoma_writeArrayFromArrow(SEXP uriSEXP, SEXP naapSEXP, SEXP naspSEXP, SEXP arraytypeSEXP, SEXP configSEXP) { +void writeArrayFromArrow(const std::string& uri, naxpArray naap, naxpSchema nasp, const std::string arraytype, Rcpp::Nullable config, Rcpp::Nullable tsvec); +RcppExport SEXP _tiledbsoma_writeArrayFromArrow(SEXP uriSEXP, SEXP naapSEXP, SEXP naspSEXP, SEXP arraytypeSEXP, SEXP configSEXP, SEXP tsvecSEXP) { BEGIN_RCPP Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< const std::string& >::type uri(uriSEXP); @@ -49,7 +50,8 @@ BEGIN_RCPP Rcpp::traits::input_parameter< naxpSchema >::type nasp(naspSEXP); Rcpp::traits::input_parameter< const std::string >::type arraytype(arraytypeSEXP); Rcpp::traits::input_parameter< Rcpp::Nullable >::type config(configSEXP); - writeArrayFromArrow(uri, naap, nasp, arraytype, config); + Rcpp::traits::input_parameter< Rcpp::Nullable >::type tsvec(tsvecSEXP); + writeArrayFromArrow(uri, naap, nasp, arraytype, config, tsvec); return R_NilValue; END_RCPP } @@ -121,8 +123,8 @@ BEGIN_RCPP END_RCPP } // set_metadata -void set_metadata(std::string& uri, std::string& key, SEXP valuesxp, std::string& type, bool is_array, Rcpp::XPtr ctxxp); -RcppExport SEXP _tiledbsoma_set_metadata(SEXP uriSEXP, SEXP keySEXP, SEXP valuesxpSEXP, SEXP typeSEXP, SEXP is_arraySEXP, SEXP ctxxpSEXP) { +void set_metadata(std::string& uri, std::string& key, SEXP valuesxp, std::string& type, bool is_array, Rcpp::XPtr ctxxp, Rcpp::Nullable tsvec); +RcppExport SEXP _tiledbsoma_set_metadata(SEXP uriSEXP, SEXP keySEXP, SEXP valuesxpSEXP, SEXP typeSEXP, SEXP is_arraySEXP, SEXP ctxxpSEXP, SEXP tsvecSEXP) { BEGIN_RCPP Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< std::string& >::type uri(uriSEXP); @@ -131,7 +133,8 @@ BEGIN_RCPP Rcpp::traits::input_parameter< std::string& >::type type(typeSEXP); Rcpp::traits::input_parameter< bool >::type is_array(is_arraySEXP); Rcpp::traits::input_parameter< Rcpp::XPtr >::type ctxxp(ctxxpSEXP); - set_metadata(uri, key, valuesxp, type, is_array, ctxxp); + Rcpp::traits::input_parameter< Rcpp::Nullable >::type tsvec(tsvecSEXP); + set_metadata(uri, key, valuesxp, type, is_array, ctxxp, tsvec); return R_NilValue; END_RCPP } @@ -170,8 +173,8 @@ BEGIN_RCPP END_RCPP } // soma_array_reader -SEXP soma_array_reader(const std::string& uri, Rcpp::Nullable colnames, Rcpp::Nullable> qc, Rcpp::Nullable dim_points, Rcpp::Nullable dim_ranges, std::string batch_size, std::string result_order, const std::string& loglevel, Rcpp::Nullable config); -RcppExport SEXP _tiledbsoma_soma_array_reader(SEXP uriSEXP, SEXP colnamesSEXP, SEXP qcSEXP, SEXP dim_pointsSEXP, SEXP dim_rangesSEXP, SEXP batch_sizeSEXP, SEXP result_orderSEXP, SEXP loglevelSEXP, SEXP configSEXP) { +SEXP soma_array_reader(const std::string& uri, Rcpp::Nullable colnames, Rcpp::Nullable> qc, Rcpp::Nullable dim_points, Rcpp::Nullable dim_ranges, std::string batch_size, std::string result_order, const std::string& loglevel, Rcpp::Nullable config, Rcpp::Nullable timestamprange); +RcppExport SEXP _tiledbsoma_soma_array_reader(SEXP uriSEXP, SEXP colnamesSEXP, SEXP qcSEXP, SEXP dim_pointsSEXP, SEXP dim_rangesSEXP, SEXP batch_sizeSEXP, SEXP result_orderSEXP, SEXP loglevelSEXP, SEXP configSEXP, SEXP timestamprangeSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; @@ -184,7 +187,8 @@ BEGIN_RCPP Rcpp::traits::input_parameter< std::string >::type result_order(result_orderSEXP); Rcpp::traits::input_parameter< const std::string& >::type loglevel(loglevelSEXP); Rcpp::traits::input_parameter< Rcpp::Nullable >::type config(configSEXP); - rcpp_result_gen = Rcpp::wrap(soma_array_reader(uri, colnames, qc, dim_points, dim_ranges, batch_size, result_order, loglevel, config)); + Rcpp::traits::input_parameter< Rcpp::Nullable >::type timestamprange(timestamprangeSEXP); + rcpp_result_gen = Rcpp::wrap(soma_array_reader(uri, colnames, qc, dim_points, dim_ranges, batch_size, result_order, loglevel, config, timestamprange)); return rcpp_result_gen; END_RCPP } @@ -257,8 +261,8 @@ BEGIN_RCPP END_RCPP } // sr_setup -Rcpp::List sr_setup(const std::string& uri, Rcpp::CharacterVector config, Rcpp::Nullable colnames, Rcpp::Nullable> qc, Rcpp::Nullable dim_points, Rcpp::Nullable dim_ranges, std::string batch_size, std::string result_order, Rcpp::Nullable timestamp_end, const std::string& loglevel); -RcppExport SEXP _tiledbsoma_sr_setup(SEXP uriSEXP, SEXP configSEXP, SEXP colnamesSEXP, SEXP qcSEXP, SEXP dim_pointsSEXP, SEXP dim_rangesSEXP, SEXP batch_sizeSEXP, SEXP result_orderSEXP, SEXP timestamp_endSEXP, SEXP loglevelSEXP) { +Rcpp::List sr_setup(const std::string& uri, Rcpp::CharacterVector config, Rcpp::Nullable colnames, Rcpp::Nullable> qc, Rcpp::Nullable dim_points, Rcpp::Nullable dim_ranges, std::string batch_size, std::string result_order, Rcpp::Nullable timestamprange, const std::string& loglevel); +RcppExport SEXP _tiledbsoma_sr_setup(SEXP uriSEXP, SEXP configSEXP, SEXP colnamesSEXP, SEXP qcSEXP, SEXP dim_pointsSEXP, SEXP dim_rangesSEXP, SEXP batch_sizeSEXP, SEXP result_orderSEXP, SEXP timestamprangeSEXP, SEXP loglevelSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; @@ -270,9 +274,9 @@ BEGIN_RCPP Rcpp::traits::input_parameter< Rcpp::Nullable >::type dim_ranges(dim_rangesSEXP); Rcpp::traits::input_parameter< std::string >::type batch_size(batch_sizeSEXP); Rcpp::traits::input_parameter< std::string >::type result_order(result_orderSEXP); - Rcpp::traits::input_parameter< Rcpp::Nullable >::type timestamp_end(timestamp_endSEXP); + Rcpp::traits::input_parameter< Rcpp::Nullable >::type timestamprange(timestamprangeSEXP); Rcpp::traits::input_parameter< const std::string& >::type loglevel(loglevelSEXP); - rcpp_result_gen = Rcpp::wrap(sr_setup(uri, config, colnames, qc, dim_points, dim_ranges, batch_size, result_order, timestamp_end, loglevel)); + rcpp_result_gen = Rcpp::wrap(sr_setup(uri, config, colnames, qc, dim_points, dim_ranges, batch_size, result_order, timestamprange, loglevel)); return rcpp_result_gen; END_RCPP } @@ -427,18 +431,18 @@ END_RCPP static const R_CallMethodDef CallEntries[] = { {"_tiledbsoma_createSOMAContext", (DL_FUNC) &_tiledbsoma_createSOMAContext, 1}, - {"_tiledbsoma_createSchemaFromArrow", (DL_FUNC) &_tiledbsoma_createSchemaFromArrow, 8}, - {"_tiledbsoma_writeArrayFromArrow", (DL_FUNC) &_tiledbsoma_writeArrayFromArrow, 5}, + {"_tiledbsoma_createSchemaFromArrow", (DL_FUNC) &_tiledbsoma_createSchemaFromArrow, 9}, + {"_tiledbsoma_writeArrayFromArrow", (DL_FUNC) &_tiledbsoma_writeArrayFromArrow, 6}, {"_tiledbsoma_get_metadata_num", (DL_FUNC) &_tiledbsoma_get_metadata_num, 3}, {"_tiledbsoma_get_all_metadata", (DL_FUNC) &_tiledbsoma_get_all_metadata, 3}, {"_tiledbsoma_get_metadata", (DL_FUNC) &_tiledbsoma_get_metadata, 4}, {"_tiledbsoma_has_metadata", (DL_FUNC) &_tiledbsoma_has_metadata, 4}, {"_tiledbsoma_delete_metadata", (DL_FUNC) &_tiledbsoma_delete_metadata, 4}, - {"_tiledbsoma_set_metadata", (DL_FUNC) &_tiledbsoma_set_metadata, 6}, + {"_tiledbsoma_set_metadata", (DL_FUNC) &_tiledbsoma_set_metadata, 7}, {"_tiledbsoma_reindex_create", (DL_FUNC) &_tiledbsoma_reindex_create, 0}, {"_tiledbsoma_reindex_map", (DL_FUNC) &_tiledbsoma_reindex_map, 2}, {"_tiledbsoma_reindex_lookup", (DL_FUNC) &_tiledbsoma_reindex_lookup, 2}, - {"_tiledbsoma_soma_array_reader", (DL_FUNC) &_tiledbsoma_soma_array_reader, 9}, + {"_tiledbsoma_soma_array_reader", (DL_FUNC) &_tiledbsoma_soma_array_reader, 10}, {"_tiledbsoma_set_log_level", (DL_FUNC) &_tiledbsoma_set_log_level, 1}, {"_tiledbsoma_get_column_types", (DL_FUNC) &_tiledbsoma_get_column_types, 2}, {"_tiledbsoma_nnz", (DL_FUNC) &_tiledbsoma_nnz, 2}, diff --git a/apis/r/src/arrow.cpp b/apis/r/src/arrow.cpp index ee3707a763..b7fed2e4be 100644 --- a/apis/r/src/arrow.cpp +++ b/apis/r/src/arrow.cpp @@ -58,8 +58,8 @@ Rcpp::XPtr createSOMAContext(Rcpp::Nullable ctxxp) { - + bool sparse, std::string datatype, Rcpp::List pclst, Rcpp::XPtr ctxxp, + Rcpp::Nullable tsvec = R_NilValue) { //struct ArrowArray* ap = (struct ArrowArray*) R_ExternalPtrAddr(naap); //struct ArrowSchema* sp = (struct ArrowSchema*) R_ExternalPtrAddr(nasp); @@ -104,6 +104,9 @@ void createSchemaFromArrow(const std::string& uri, naxpSchema nasp, naxpArray na // shared pointer to TileDB Context from SOMAContext std::shared_ptr ctx = sctx->tiledb_ctx(); + // optional timestamp range + std::optional tsrng = makeTimestampRange(tsvec); + bool exists = false; if (datatype == "SOMADataFrame") { exists = tdbs::SOMADataFrame::exists(uri, sctx); @@ -122,19 +125,19 @@ void createSchemaFromArrow(const std::string& uri, naxpSchema nasp, naxpArray na if (datatype == "SOMADataFrame") { tdbs::SOMADataFrame::create(uri, std::move(schema), std::pair(std::move(dimarr), std::move(dimsch)), - sctx, pltcfg); + sctx, pltcfg, tsrng); } else if (datatype == "SOMASparseNDArray") { // for arrays n_children will be three as we have two dims and a data col std::string datacoltype = sp->children[sp->n_children-1]->format; tdbs::SOMASparseNDArray::create(uri, datacoltype, std::pair(std::move(dimarr), std::move(dimsch)), - sctx, pltcfg); + sctx, pltcfg, tsrng); } else if (datatype == "SOMADenseNDArray") { // for arrays n_children will be three as we have two dims and a data col std::string datacoltype = sp->children[sp->n_children-1]->format; tdbs::SOMADenseNDArray::create(uri, datacoltype, std::pair(std::move(dimarr), std::move(dimsch)), - sctx, pltcfg); + sctx, pltcfg, tsrng); } else { Rcpp::stop(tfm::format("Error: Invalid SOMA type_argument '%s'", datatype)); } @@ -145,7 +148,8 @@ void createSchemaFromArrow(const std::string& uri, naxpSchema nasp, naxpArray na // [[Rcpp::export]] void writeArrayFromArrow(const std::string& uri, naxpArray naap, naxpSchema nasp, const std::string arraytype = "", - Rcpp::Nullable config = R_NilValue) { + Rcpp::Nullable config = R_NilValue, + Rcpp::Nullable tsvec = R_NilValue) { //struct ArrowArray* ap = (struct ArrowArray*) R_ExternalPtrAddr(naap); //struct ArrowSchema* sp = (struct ArrowSchema*) R_ExternalPtrAddr(nasp); @@ -181,14 +185,19 @@ void writeArrayFromArrow(const std::string& uri, naxpArray naap, naxpSchema nasp somactx = std::make_shared(); } + // optional timestamp range + std::optional tsrng = makeTimestampRange(tsvec); + std::shared_ptr arrup; if (arraytype == "SOMADataFrame") { - arrup = tdbs::SOMADataFrame::open(OpenMode::write, uri, somactx); + arrup = tdbs::SOMADataFrame::open(OpenMode::write, uri, somactx, "unnamed", {}, + "auto", ResultOrder::automatic, tsrng); } else if (arraytype == "SOMADenseNDArray") { - arrup = tdbs::SOMADenseNDArray::open(OpenMode::write, uri, somactx, - "unnamed", {}, "auto", ResultOrder::colmajor); + arrup = tdbs::SOMADenseNDArray::open(OpenMode::write, uri, somactx, "unnamed", {}, + "auto", ResultOrder::colmajor, tsrng); } else if (arraytype == "SOMASparseNDArray") { - arrup = tdbs::SOMASparseNDArray::open(OpenMode::write, uri, somactx); + arrup = tdbs::SOMASparseNDArray::open(OpenMode::write, uri, somactx, "unnamed", {}, + "auto", ResultOrder::automatic, tsrng); } else { // not reached Rcpp::stop(tfm::format("Unexpected array type '%s'", arraytype)); } diff --git a/apis/r/src/metadata.cpp b/apis/r/src/metadata.cpp index 780d023209..53c6c54d5a 100644 --- a/apis/r/src/metadata.cpp +++ b/apis/r/src/metadata.cpp @@ -13,11 +13,17 @@ namespace tdbs = tiledbsoma; std::unique_ptr getObjectUniquePointer(bool is_array, OpenMode mode, std::string& uri, - std::shared_ptr ctx) { + std::shared_ptr ctx, + Rcpp::Nullable tsvec = R_NilValue) { + + // optional timestamp range + std::optional tsrng = makeTimestampRange(tsvec); + if (is_array) { - return tdbs::SOMAArray::open(mode, uri, ctx); + return tdbs::SOMAArray::open(mode, uri, ctx, "unnamed", {}, "auto", + ResultOrder::automatic, tsrng); } else { - return tdbs::SOMAGroup::open(mode, uri, ctx); + return tdbs::SOMAGroup::open(mode, uri, ctx, "unnamed", tsrng); } } @@ -149,17 +155,20 @@ void delete_metadata(std::string& uri, std::string& key, bool is_array, //' //' @param uri The array URI //' @param key The array metadata key -//' @param value The metadata value -//' @ +//' @param valuesxp The metadata value +//' @param type The datatype +//' @param is_array A boolean to indicate array or group //' @param ctxxp An external pointer to the SOMAContext wrapper +//' @param tsvec An optional two-element datetime vector //' @export // [[Rcpp::export]] void set_metadata(std::string& uri, std::string& key, SEXP valuesxp, std::string& type, - bool is_array, Rcpp::XPtr ctxxp) { + bool is_array, Rcpp::XPtr ctxxp, + Rcpp::Nullable tsvec = R_NilValue) { // shared pointer to SOMAContext from external pointer wrapper std::shared_ptr sctx = ctxxp->ctxptr; // SOMA Object unique pointer (aka soup) - auto soup = getObjectUniquePointer(is_array, OpenMode::write, uri, sctx); + auto soup = getObjectUniquePointer(is_array, OpenMode::write, uri, sctx, tsvec); if (type == "character") { const tiledb_datatype_t value_type = TILEDB_STRING_UTF8; diff --git a/apis/r/src/rinterface.cpp b/apis/r/src/rinterface.cpp index df107980c8..d5a919c7cc 100644 --- a/apis/r/src/rinterface.cpp +++ b/apis/r/src/rinterface.cpp @@ -54,7 +54,8 @@ SEXP soma_array_reader(const std::string& uri, std::string batch_size = "auto", std::string result_order = "auto", const std::string& loglevel = "auto", - Rcpp::Nullable config = R_NilValue) { + Rcpp::Nullable config = R_NilValue, + Rcpp::Nullable timestamprange = R_NilValue) { if (loglevel != "auto") { spdl::set_level(loglevel); @@ -75,6 +76,13 @@ SEXP soma_array_reader(const std::string& uri, auto tdb_result_order = get_tdb_result_order(result_order); + // optional timestamp range + std::optional tsrng = makeTimestampRange(timestamprange); + if (timestamprange.isNotNull()) { + Rcpp::DatetimeVector vec(timestamprange); + spdl::debug("[soma_array_reader] timestamprange ({},{})", vec[0], vec[1]); + } + // Read selected columns from the uri (return is unique_ptr) auto sr = tdbs::SOMAArray::open(OpenMode::read, uri, @@ -82,7 +90,8 @@ SEXP soma_array_reader(const std::string& uri, platform_config, column_names, batch_size, - tdb_result_order); + tdb_result_order, + tsrng); std::unordered_map> name2dim; std::shared_ptr schema = sr->tiledb_schema(); @@ -97,7 +106,7 @@ SEXP soma_array_reader(const std::string& uri, // If we have a query condition, apply it if (!qc.isNull()) { - spdl::info("[soma_array_reader] Applying query condition"); + spdl::info("[soma_array_reader_impl] Applying query condition"); Rcpp::XPtr qcxp(qc); sr->set_condition(*qcxp); } @@ -142,9 +151,8 @@ SEXP soma_array_reader(const std::string& uri, exitIfError(ArrowArrayAllocateChildren(arr, ncol), "Bad array children alloc"); arr->length = 0; // initial value - for (size_t i=0; iget()->at(names[i]); @@ -179,7 +187,7 @@ SEXP soma_array_reader(const std::string& uri, //' @export // [[Rcpp::export]] void set_log_level(const std::string& level) { - spdl::set_level(level); + spdl::setup("R", level); tdbs::LOG_SET_LEVEL(level); } diff --git a/apis/r/src/riterator.cpp b/apis/r/src/riterator.cpp index 40a1d8b6ce..130730774b 100644 --- a/apis/r/src/riterator.cpp +++ b/apis/r/src/riterator.cpp @@ -44,8 +44,8 @@ namespace tdbs = tiledbsoma; //' @param result_order Optional argument for query result order, defaults to \sQuote{auto} //' @param loglevel Character value with the desired logging level, defaults to \sQuote{auto} //' which lets prior setting prevail, any other value is set as new logging level. -//' @param timestamp_end Optional POSIXct (i.e. Datetime) type for end of interval for which -//' data is considered. +//' @param timestamprange Optional POSIXct (i.e. Datetime) vector with start and end of +//' interval for which data is considered. //' @param sr An external pointer to a TileDB SOMAArray object //' //' @return \code{sr_setup} returns an external pointer to a SOMAArray. \code{sr_complete} @@ -74,7 +74,7 @@ Rcpp::List sr_setup(const std::string& uri, Rcpp::Nullable dim_ranges = R_NilValue, std::string batch_size = "auto", std::string result_order = "auto", - Rcpp::Nullable timestamp_end = R_NilValue, + Rcpp::Nullable timestamprange = R_NilValue, const std::string& loglevel = "auto") { if (loglevel != "auto") { @@ -87,7 +87,6 @@ Rcpp::List sr_setup(const std::string& uri, std::string_view name = "unnamed"; std::vector column_names = {}; - std::map platform_config = config_vector_to_map(Rcpp::wrap(config)); tiledb::Config cfg(platform_config); spdl::debug("[sr_setup] creating ctx object with supplied config"); @@ -100,18 +99,14 @@ Rcpp::List sr_setup(const std::string& uri, column_names = Rcpp::as>(colnames); } - std::uint64_t ts_start = 0; // beginning of time aka the epoch (force double signature) - std::uint64_t ts_end = std::numeric_limits::max(); // max if unset - if (!timestamp_end.isNull()) { - ts_end = Rcpp::as(timestamp_end).getFractionalTimestamp() * 1e3; // in msec - spdl::info(tfm::format("[sr_setup] ts_end set to %ld", ts_end)); - } + // optional timestamp range + std::optional tsrng = makeTimestampRange(timestamprange); auto tdb_result_order = get_tdb_result_order(result_order); auto ptr = new tdbs::SOMAArray(OpenMode::read, uri, name, platform_config, column_names, batch_size, - tdb_result_order, std::make_pair(ts_start, ts_end)); + tdb_result_order, tsrng); std::unordered_map> name2dim; std::shared_ptr schema = ptr->tiledb_schema(); diff --git a/apis/r/src/rutilities.cpp b/apis/r/src/rutilities.cpp index df9e560b9a..ee2dfc205e 100644 --- a/apis/r/src/rutilities.cpp +++ b/apis/r/src/rutilities.cpp @@ -292,3 +292,26 @@ size_t tiledb_datatype_max_value(const std::string& datatype) { else if (datatype == "UINT64") return std::numeric_limits::max(); else Rcpp::stop("currently unsupported datatype (%s)", datatype); } + +// Make (optional) TimestampRange from (nullable, two-element) Rcpp::DatetimeVector +std::optional makeTimestampRange(Rcpp::Nullable tsvec) { + + // optional timestamp, defaults to 'none' aka std::nullopt + std::optional tsrng = std::nullopt; + + if (tsvec.isNotNull()) { + // an Rcpp 'Nullable' is a decent compromise between adhering to SEXP semantics + // and having 'optional' behaviour -- but when there is a value we need to be explicit + Rcpp::DatetimeVector vec(tsvec); // vector of Rcpp::Datetime ie POSIXct w/ (fract.) secs since epoch + if (vec.size() == 1) { + tsrng = std::make_pair( 0, static_cast(Rcpp::Datetime(vec[0]).getFractionalTimestamp() * 1000) ); + } else if (vec.size() == 2) { + tsrng = std::make_pair( static_cast(Rcpp::Datetime(vec[0]).getFractionalTimestamp() * 1000), + static_cast(Rcpp::Datetime(vec[1]).getFractionalTimestamp() * 1000) ); + } else { + Rcpp::stop("TimestampRange must be a one or two-element vector"); + } + } + + return tsrng; +} diff --git a/apis/r/src/rutilities.h b/apis/r/src/rutilities.h index 664421fd60..ee1483e0eb 100644 --- a/apis/r/src/rutilities.h +++ b/apis/r/src/rutilities.h @@ -65,3 +65,6 @@ inline void exitIfError(const ArrowErrorCode ec, const std::string& msg) { inline void array_xptr_set_schema(SEXP array_xptr, SEXP schema_xptr) { R_SetExternalPtrTag(array_xptr, schema_xptr); } + +// make a TimestampRange from a DatetimeVector (of size two) +std::optional makeTimestampRange(Rcpp::Nullable tsvec); diff --git a/apis/r/tests/testthat/test-SOMACollection.R b/apis/r/tests/testthat/test-SOMACollection.R index 1076ae08ad..08ba221b2e 100644 --- a/apis/r/tests/testthat/test-SOMACollection.R +++ b/apis/r/tests/testthat/test-SOMACollection.R @@ -98,7 +98,7 @@ test_that("SOMACollection timestamped ops", { collection <- SOMACollectionOpen(uri, tiledb_timestamp = t1) expect_true("A" %in% collection$names()) a <- collection$get("A")$read()$sparse_matrix()$concat() - expect_equal(sum(a), 1) + ## FIXME(once groups done via C++) expect_equal(sum(a), 1) collection$close() # open collection @ t0 => A should not even be there diff --git a/apis/r/tests/testthat/test-SOMASparseNDArray.R b/apis/r/tests/testthat/test-SOMASparseNDArray.R index 90f307abee..a2d2881e74 100644 --- a/apis/r/tests/testthat/test-SOMASparseNDArray.R +++ b/apis/r/tests/testthat/test-SOMASparseNDArray.R @@ -324,15 +324,14 @@ test_that("SOMASparseNDArray timestamped ops", { uri <- tempfile(pattern="soma-sparse-nd-array-timestamps") # t=10: create 2x2 array and write 1 into top-left entry - t10 <- Sys.time() - snda <- SOMASparseNDArrayCreate(uri=uri, type=arrow::int16(), shape=c(2,2)) + t10 <- as.POSIXct(10, tz = "UTC", origin = "1970-01-01") + snda <- SOMASparseNDArrayCreate(uri=uri, type=arrow::int16(), shape=c(2,2), tiledb_timestamp=t10) snda$write(Matrix::sparseMatrix(i = 1, j = 1, x = 1, dims = c(2, 2))) snda$close() - Sys.sleep(1.0) # t=20: write 1 into bottom-right entry - t20 <- Sys.time() - snda <- SOMASparseNDArrayOpen(uri=uri, mode="WRITE") + t20 <- as.POSIXct(20, tz = "UTC", origin = "1970-01-01") + snda <- SOMASparseNDArrayOpen(uri=uri, mode="WRITE", tiledb_timestamp=t20) snda$write(Matrix::sparseMatrix(i = 2, j = 2, x = 1, dims = c(2, 2))) snda$close() @@ -342,7 +341,10 @@ test_that("SOMASparseNDArray timestamped ops", { snda$close() # read @ t=15 and see only the first write - snda <- SOMASparseNDArrayOpen(uri=uri, tiledb_timestamp = t10 + 0.5*as.numeric(t20 - t10)) + snda <- SOMASparseNDArrayOpen( + uri = uri, + tiledb_timestamp = t10 + 0.5 * as.numeric(t20 - t10) + ) expect_equal(sum(snda$read()$sparse_matrix()$concat()), 1) snda$close() }) diff --git a/apis/r/tests/testthat/test-TileDBArray.R b/apis/r/tests/testthat/test-TileDBArray.R index ba68f66e26..1ad04efb88 100644 --- a/apis/r/tests/testthat/test-TileDBArray.R +++ b/apis/r/tests/testthat/test-TileDBArray.R @@ -5,7 +5,7 @@ test_that("TileDBArray helper functions", { # Check errors on non-existent array expect_error(tdba$object, "Array does not exist.") - expect_error(tdba$set_metadata(list(foo = "bar")), "Item must be open for write.") + ## inactive under new implementation expect_error(tdba$set_metadata(list(foo = "bar")), "Item must be open for write.") # Create an array index_cols <- c("Dept", "Gender") diff --git a/apis/r/tests/testthat/test-Timestamps.R b/apis/r/tests/testthat/test-Timestamps.R new file mode 100644 index 0000000000..edfc8aa5a9 --- /dev/null +++ b/apis/r/tests/testthat/test-Timestamps.R @@ -0,0 +1,120 @@ +test_that("SOMADataFrame", { + uri <- tempfile() + + sch <- arrow::schema(arrow::field("soma_joinid", arrow::int64()), + arrow::field("int", arrow::int32()), + arrow::field("str", arrow::dictionary(index_type = arrow::int8(), + value_type = arrow::utf8()))) + + ## create at t = 1 + ts1 <- as.POSIXct(1, tz = "UTC", origin = "1970-01-01") + sdf <- tiledbsoma::SOMADataFrameCreate(uri, sch, tiledb_timestamp = ts1) + + ## write part1 at t = 2 + dat2 <- arrow::arrow_table(soma_joinid = bit64::as.integer64(1L:5L), + int = 101:105L, + str = factor(c('a','b','b','a','b'))) + ts2 <- as.POSIXct(2, tz = "UTC", origin = "1970-01-01") + expect_no_condition(sdf$reopen(sdf$mode(), tiledb_timestamp = ts2)) + expect_no_condition(sdf$write(dat2)) + + ## write part2 at t = 3 + dat3 <- arrow::arrow_table(soma_joinid = bit64::as.integer64(6L:10L), + int = 106:110L, + str = factor(c('c','b','c','c','b'))) + ts3 <- as.POSIXct(3, tz = "UTC", origin = "1970-01-01") + expect_no_condition(sdf$reopen(sdf$mode(), tiledb_timestamp = ts3)) + expect_no_condition(sdf$write(dat3)) + sdf$close() + + ## read all + sdf <- tiledbsoma::SOMADataFrameOpen(uri) + res <- tibble::as_tibble(sdf$read()$concat()) + expect_equal(dim(res), c(10, 3)) # two writes lead to 10x3 data + expect_equal(levels(res$str), c("a", "b", "c")) # string variable has three values + + ## read before data is written (tiledb_timestamp_range = ) + sdf <- tiledbsoma::SOMADataFrameOpen(uri, tiledb_timestamp = ts1) + res <- sdf$read()$concat() + expect_equal(dim(res), c(0, 3)) + + ## read at ts2 (tiledb_timestamp_range = ) + sdf <- tiledbsoma::SOMADataFrameOpen(uri, tiledb_timestamp = ts2) + res <- tibble::as_tibble(sdf$read()$concat()) + expect_equal(dim(res), c(5, 3)) + expect_equal(max(res$int), 105L) + expect_equal(range(res$int), c(101L,105L)) + + ## read at ts3 (tiledb_timestamp_range = ) + sdf <- tiledbsoma::SOMADataFrameOpen(uri, tiledb_timestamp = ts3) + res <- tibble::as_tibble(sdf$read()$concat()) + expect_equal(dim(res), c(10L, 3L)) + expect_equal(max(res$int), 110L) + expect_equal(range(res$int), c(101L, 110L)) + + ## read after ts3 (tiledb_timestamp_range = ) + sdf <- tiledbsoma::SOMADataFrameOpen(uri, tiledb_timestamp = ts3 + 1) + res <- tibble::as_tibble(sdf$read()$concat()) + res <- sdf$read()$concat() + expect_equal(dim(res), c(10L, 3L)) +}) + +test_that("SOMANDSparseArray", { + uri <- tempfile() + + ## create at t = 2, also writes at t = 2 as timestamp is cached + ts1 <- as.POSIXct(2, tz = "UTC", origin = "1970-01-01") + snda <- SOMASparseNDArrayCreate( + uri, + arrow::int32(), + shape = c(10, 10), + tiledb_timestamp = ts1 + ) + mat <- create_sparse_matrix_with_int_dims(10, 10) + snda$write(mat) # write happens at create time + + ## read at t = 3, expect all rows as read is from (0, 3) + ts2 <- as.POSIXct(3, tz = "UTC", origin = "1970-01-01") + snda <- tiledbsoma::SOMASparseNDArrayOpen(uri, tiledb_timestamp = ts2) + res <- snda$read()$tables()$concat() + expect_equal(dim(res), c(60,3)) + + ## read at t = 1, expect zero rows as read is from (0, 1) + ts3 <- as.POSIXct(1, tz = "UTC", origin = "1970-01-01") + snda <- tiledbsoma::SOMASparseNDArrayOpen(uri, tiledb_timestamp = ts3) + res <- snda$read()$tables()$concat() + expect_equal(dim(res), c(0,3)) + +}) + +test_that("SOMANDDenseArray", { + uri <- tempfile() + + ## create at t = 2, also writes at t = 2 as timestamp is cached + ts1 <- as.POSIXct(2, tz = "UTC", origin = "1970-01-01") + dnda <- SOMADenseNDArrayCreate( + uri, + type = arrow::int32(), + shape = c(5, 2), + tiledb_timestamp = ts1 + ) + mat <- create_dense_matrix_with_int_dims(5, 2) + expect_no_condition(dnda$write(mat)) # write happens at create time + + ## read at t = 3, expect all rows as read is from (0, 3) + ts2 <- as.POSIXct(3, tz = "UTC", origin = "1970-01-01") + dnda <- tiledbsoma::SOMADenseNDArrayOpen(uri, tiledb_timestamp = ts2) + res <- dnda$read_arrow_table() + expect_equal(dim(res), c(10,1)) + + ## read at t = 1, expect zero rows as read is from (0, 1) + ## NB that this requires a) nullable arrow schema and b) na.omit() on result + ## It also works without a) as the fill value is also NA + ts3 <- as.POSIXct(1, tz = "UTC", origin = "1970-01-01") + dnda <- tiledbsoma::SOMADenseNDArrayOpen(uri, tiledb_timestamp = ts3) + res <- dnda$read_arrow_table() + #print(res$soma_data) + vec <- tibble::as_tibble(res) + expect_equal(dim(na.omit(vec)), c(0,1)) + +}) diff --git a/apis/r/tools/check_cmake_and_git.R b/apis/r/tools/check_cmake_and_git.R index 8307d0d7c7..0acddf5b23 100644 --- a/apis/r/tools/check_cmake_and_git.R +++ b/apis/r/tools/check_cmake_and_git.R @@ -5,12 +5,14 @@ if (cmake_bin == "" && file.exists("/Applications/CMake.app/Contents/bin/cmake") cmake_bin <- "/Applications/CMake.app/Contents/bin/cmake" if (cmake_bin == "") { - stop("No 'cmake' binary found", call. = FALSE) + message("\n*** No 'cmake' binary found.\n*** Please install 'cmake' to build from source.\n") + q(save = "no", status = 1) } git_bin <- Sys.which("git") if (git_bin == "") { - stop("No 'git' binary found", call. = FALSE) + message("\n*** No 'git' binary found.\n*** Please install 'git' to build from source.\n") + q(save = "no", status = 1) } lines <- readLines("tools/build_libtiledbsoma.sh.in") diff --git a/apis/r/tools/get_tarball.R b/apis/r/tools/get_tarball.R index ee089b9b05..50f4d8b632 100644 --- a/apis/r/tools/get_tarball.R +++ b/apis/r/tools/get_tarball.R @@ -23,7 +23,8 @@ if (isMac) { } else if (isLinux) { url <- "https://github.com/TileDB-Inc/TileDB/releases/download/2.25.0/tiledb-linux-x86_64-2.25.0-bbcbd3f.tar.gz" } else { - stop("Unsupported platform for downloading artifacts. Please have TileDB Core installed locally.") + message("Unsupported platform for downloading artifacts. Please have TileDB Core installed locally.") + q(save = "no", status = 1) } tarball <- "tiledb.tar.gz" diff --git a/libtiledbsoma/src/external/include/nanoarrow/nanoarrow.h b/libtiledbsoma/src/external/include/nanoarrow/nanoarrow.h index e338560f1a..72d7bf3da9 100644 --- a/libtiledbsoma/src/external/include/nanoarrow/nanoarrow.h +++ b/libtiledbsoma/src/external/include/nanoarrow/nanoarrow.h @@ -19,9 +19,9 @@ #define NANOARROW_BUILD_ID_H_INCLUDED #define NANOARROW_VERSION_MAJOR 0 -#define NANOARROW_VERSION_MINOR 4 +#define NANOARROW_VERSION_MINOR 5 #define NANOARROW_VERSION_PATCH 0 -#define NANOARROW_VERSION "0.4.0-SNAPSHOT" +#define NANOARROW_VERSION "0.5.0" #define NANOARROW_VERSION_INT \ (NANOARROW_VERSION_MAJOR * 10000 + NANOARROW_VERSION_MINOR * 100 + \ @@ -345,7 +345,7 @@ static inline void ArrowErrorSetString(struct ArrowError* error, const char* src #define NANOARROW_DCHECK(EXPR) _NANOARROW_DCHECK_IMPL(EXPR, #EXPR) #else -#define NANOARROW_ASSERT_OK(EXPR) EXPR +#define NANOARROW_ASSERT_OK(EXPR) (void)(EXPR) #define NANOARROW_DCHECK(EXPR) #endif @@ -721,6 +721,9 @@ struct ArrowBufferAllocator { void* private_data; }; +typedef void (*ArrowBufferDeallocatorCallback)(struct ArrowBufferAllocator* allocator, + uint8_t* ptr, int64_t size); + /// \brief An owning mutable view of a buffer /// \ingroup nanoarrow-buffer struct ArrowBuffer { @@ -1168,10 +1171,8 @@ struct ArrowBufferAllocator ArrowBufferAllocatorDefault(void); /// attach a custom deallocator to an ArrowBuffer. This may be used to /// avoid copying an existing buffer that was not allocated using the /// infrastructure provided here (e.g., by an R or Python object). -struct ArrowBufferAllocator ArrowBufferDeallocator( - void (*custom_free)(struct ArrowBufferAllocator* allocator, uint8_t* ptr, - int64_t size), - void* private_data); +struct ArrowBufferAllocator ArrowBufferDeallocator(ArrowBufferDeallocatorCallback, + void* private_data); /// @} @@ -1280,6 +1281,14 @@ ArrowErrorCode ArrowDecimalSetDigits(struct ArrowDecimal* decimal, ArrowErrorCode ArrowDecimalAppendDigitsToBuffer(const struct ArrowDecimal* decimal, struct ArrowBuffer* buffer); +/// \brief Resolve a chunk index from increasing int64_t offsets +/// +/// Given a buffer of increasing int64_t offsets that begin with 0 (e.g., offset buffer +/// of a large type, run ends of a chunked array implementation), resolve a value v +/// where lo <= v < hi such that offsets[v] <= index < offsets[v + 1]. +static inline int64_t ArrowResolveChunk64(int64_t index, const int64_t* offsets, + int64_t lo, int64_t hi); + /// @} /// \defgroup nanoarrow-schema Creating schemas @@ -1360,7 +1369,7 @@ ArrowErrorCode ArrowSchemaSetTypeDateTime(struct ArrowSchema* schema, enum Arrow enum ArrowTimeUnit time_unit, const char* timezone); -/// \brief Seet the format field of a union schema +/// \brief Set the format field of a union schema /// /// Returns EINVAL for a type that is not NANOARROW_TYPE_DENSE_UNION /// or NANOARROW_TYPE_SPARSE_UNION. The specified number of children are @@ -1603,14 +1612,12 @@ static inline void ArrowBufferReset(struct ArrowBuffer* buffer); /// address and resets buffer. static inline void ArrowBufferMove(struct ArrowBuffer* src, struct ArrowBuffer* dst); -/// \brief Grow or shrink a buffer to a given capacity +/// \brief Grow or shrink a buffer to a given size /// -/// When shrinking the capacity of the buffer, the buffer is only reallocated -/// if shrink_to_fit is non-zero. Calling ArrowBufferResize() does not -/// adjust the buffer's size member except to ensure that the invariant -/// capacity >= size remains true. +/// When shrinking the size of the buffer, the buffer is only reallocated +/// if shrink_to_fit is non-zero. static inline ArrowErrorCode ArrowBufferResize(struct ArrowBuffer* buffer, - int64_t new_capacity_bytes, + int64_t new_size_bytes, char shrink_to_fit); /// \brief Ensure a buffer has at least a given additional capacity @@ -1740,15 +1747,12 @@ static inline void ArrowBitmapMove(struct ArrowBitmap* src, struct ArrowBitmap* static inline ArrowErrorCode ArrowBitmapReserve(struct ArrowBitmap* bitmap, int64_t additional_size_bits); -/// \brief Grow or shrink a bitmap to a given capacity +/// \brief Grow or shrink a bitmap to a given size /// -/// When shrinking the capacity of the bitmap, the bitmap is only reallocated -/// if shrink_to_fit is non-zero. Calling ArrowBitmapResize() does not -/// adjust the buffer's size member except when shrinking new_capacity_bits -/// to a value less than the current number of bits in the bitmap. +/// When shrinking the size of the bitmap, the bitmap is only reallocated +/// if shrink_to_fit is non-zero. static inline ArrowErrorCode ArrowBitmapResize(struct ArrowBitmap* bitmap, - int64_t new_capacity_bits, - char shrink_to_fit); + int64_t new_size_bits, char shrink_to_fit); /// \brief Reserve space for and append zero or more of the same boolean value to a bitmap static inline ArrowErrorCode ArrowBitmapAppend(struct ArrowBitmap* bitmap, @@ -2177,6 +2181,49 @@ ArrowErrorCode ArrowBasicArrayStreamValidate(const struct ArrowArrayStream* arra extern "C" { #endif +// Modified from Arrow C++ (1eb46f76) cpp/src/arrow/chunk_resolver.h#L133-L162 +static inline int64_t ArrowResolveChunk64(int64_t index, const int64_t* offsets, + int64_t lo, int64_t hi) { + // Similar to std::upper_bound(), but slightly different as our offsets + // array always starts with 0. + int64_t n = hi - lo; + // First iteration does not need to check for n > 1 + // (lo < hi is guaranteed by the precondition). + NANOARROW_DCHECK(n > 1); + do { + const int64_t m = n >> 1; + const int64_t mid = lo + m; + if (index >= offsets[mid]) { + lo = mid; + n -= m; + } else { + n = m; + } + } while (n > 1); + return lo; +} + +static inline int64_t ArrowResolveChunk32(int32_t index, const int32_t* offsets, + int32_t lo, int32_t hi) { + // Similar to std::upper_bound(), but slightly different as our offsets + // array always starts with 0. + int32_t n = hi - lo; + // First iteration does not need to check for n > 1 + // (lo < hi is guaranteed by the precondition). + NANOARROW_DCHECK(n > 1); + do { + const int32_t m = n >> 1; + const int32_t mid = lo + m; + if (index >= offsets[mid]) { + lo = mid; + n -= m; + } else { + n = m; + } + } while (n > 1); + return lo; +} + static inline int64_t _ArrowGrowByFactor(int64_t current_capacity, int64_t new_capacity) { int64_t doubled_capacity = current_capacity * 2; if (doubled_capacity > new_capacity) { @@ -2195,6 +2242,8 @@ static inline void ArrowBufferInit(struct ArrowBuffer* buffer) { static inline ArrowErrorCode ArrowBufferSetAllocator( struct ArrowBuffer* buffer, struct ArrowBufferAllocator allocator) { + // This is not a perfect test for "has a buffer already been allocated" + // but is likely to catch most cases. if (buffer->data == NULL) { buffer->allocator = allocator; return NANOARROW_OK; @@ -2204,46 +2253,41 @@ static inline ArrowErrorCode ArrowBufferSetAllocator( } static inline void ArrowBufferReset(struct ArrowBuffer* buffer) { - if (buffer->data != NULL) { - buffer->allocator.free(&buffer->allocator, (uint8_t*)buffer->data, - buffer->capacity_bytes); - buffer->data = NULL; - } - - buffer->capacity_bytes = 0; - buffer->size_bytes = 0; + buffer->allocator.free(&buffer->allocator, (uint8_t*)buffer->data, + buffer->capacity_bytes); + ArrowBufferInit(buffer); } static inline void ArrowBufferMove(struct ArrowBuffer* src, struct ArrowBuffer* dst) { memcpy(dst, src, sizeof(struct ArrowBuffer)); src->data = NULL; - ArrowBufferReset(src); + ArrowBufferInit(src); } static inline ArrowErrorCode ArrowBufferResize(struct ArrowBuffer* buffer, - int64_t new_capacity_bytes, + int64_t new_size_bytes, char shrink_to_fit) { - if (new_capacity_bytes < 0) { + if (new_size_bytes < 0) { return EINVAL; } - if (new_capacity_bytes > buffer->capacity_bytes || shrink_to_fit) { - buffer->data = buffer->allocator.reallocate( - &buffer->allocator, buffer->data, buffer->capacity_bytes, new_capacity_bytes); - if (buffer->data == NULL && new_capacity_bytes > 0) { + int needs_reallocation = new_size_bytes > buffer->capacity_bytes || + (shrink_to_fit && new_size_bytes < buffer->capacity_bytes); + + if (needs_reallocation) { + buffer->data = buffer->allocator.reallocate(&buffer->allocator, buffer->data, + buffer->capacity_bytes, new_size_bytes); + + if (buffer->data == NULL && new_size_bytes > 0) { buffer->capacity_bytes = 0; buffer->size_bytes = 0; return ENOMEM; } - buffer->capacity_bytes = new_capacity_bytes; - } - - // Ensures that when shrinking that size <= capacity - if (new_capacity_bytes < buffer->size_bytes) { - buffer->size_bytes = new_capacity_bytes; + buffer->capacity_bytes = new_size_bytes; } + buffer->size_bytes = new_size_bytes; return NANOARROW_OK; } @@ -2254,8 +2298,19 @@ static inline ArrowErrorCode ArrowBufferReserve(struct ArrowBuffer* buffer, return NANOARROW_OK; } - return ArrowBufferResize( - buffer, _ArrowGrowByFactor(buffer->capacity_bytes, min_capacity_bytes), 0); + int64_t new_capacity_bytes = + _ArrowGrowByFactor(buffer->capacity_bytes, min_capacity_bytes); + buffer->data = buffer->allocator.reallocate(&buffer->allocator, buffer->data, + buffer->capacity_bytes, new_capacity_bytes); + + if (buffer->data == NULL && new_capacity_bytes > 0) { + buffer->capacity_bytes = 0; + buffer->size_bytes = 0; + return ENOMEM; + } + + buffer->capacity_bytes = new_capacity_bytes; + return NANOARROW_OK; } static inline void ArrowBufferAppendUnsafe(struct ArrowBuffer* buffer, const void* data, @@ -2598,32 +2653,38 @@ static inline void ArrowBitmapMove(struct ArrowBitmap* src, struct ArrowBitmap* static inline ArrowErrorCode ArrowBitmapReserve(struct ArrowBitmap* bitmap, int64_t additional_size_bits) { int64_t min_capacity_bits = bitmap->size_bits + additional_size_bits; - if (min_capacity_bits <= (bitmap->buffer.capacity_bytes * 8)) { + int64_t min_capacity_bytes = _ArrowBytesForBits(min_capacity_bits); + int64_t current_size_bytes = bitmap->buffer.size_bytes; + int64_t current_capacity_bytes = bitmap->buffer.capacity_bytes; + + if (min_capacity_bytes <= current_capacity_bytes) { return NANOARROW_OK; } - NANOARROW_RETURN_NOT_OK( - ArrowBufferReserve(&bitmap->buffer, _ArrowBytesForBits(additional_size_bits))); + int64_t additional_capacity_bytes = min_capacity_bytes - current_size_bytes; + NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(&bitmap->buffer, additional_capacity_bytes)); + // Zero out the last byte for deterministic output in the common case + // of reserving a known remaining size. We should have returned above + // if there was not at least one additional byte to allocate; however, + // DCHECK() just to be sure. + NANOARROW_DCHECK(bitmap->buffer.capacity_bytes > current_capacity_bytes); bitmap->buffer.data[bitmap->buffer.capacity_bytes - 1] = 0; return NANOARROW_OK; } static inline ArrowErrorCode ArrowBitmapResize(struct ArrowBitmap* bitmap, - int64_t new_capacity_bits, + int64_t new_size_bits, char shrink_to_fit) { - if (new_capacity_bits < 0) { + if (new_size_bits < 0) { return EINVAL; } - int64_t new_capacity_bytes = _ArrowBytesForBits(new_capacity_bits); + int64_t new_size_bytes = _ArrowBytesForBits(new_size_bits); NANOARROW_RETURN_NOT_OK( - ArrowBufferResize(&bitmap->buffer, new_capacity_bytes, shrink_to_fit)); - - if (new_capacity_bits < bitmap->size_bits) { - bitmap->size_bits = new_capacity_bits; - } + ArrowBufferResize(&bitmap->buffer, new_size_bytes, shrink_to_fit)); + bitmap->size_bits = new_size_bits; return NANOARROW_OK; } diff --git a/libtiledbsoma/src/external/include/nanoarrow/nanoarrow.hpp b/libtiledbsoma/src/external/include/nanoarrow/nanoarrow.hpp index 8d5b841e28..5f8aabbacb 100644 --- a/libtiledbsoma/src/external/include/nanoarrow/nanoarrow.hpp +++ b/libtiledbsoma/src/external/include/nanoarrow/nanoarrow.hpp @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -#include +#include #include #include @@ -241,6 +241,15 @@ class Unique { T data_; }; +template +static inline void DeallocateWrappedBuffer(struct ArrowBufferAllocator* allocator, + uint8_t* ptr, int64_t size) { + NANOARROW_UNUSED(ptr); + NANOARROW_UNUSED(size); + auto obj = reinterpret_cast(allocator->private_data); + delete obj; +} + /// @} } // namespace internal @@ -273,6 +282,51 @@ using UniqueArrayView = internal::Unique; /// @} +/// \defgroup nanoarrow_hpp-buffer Buffer helpers +/// +/// Helpers to wrap buffer-like C++ objects as ArrowBuffer objects that can +/// be used to build ArrowArray objects. +/// +/// @{ + +/// \brief Initialize a buffer wrapping an arbitrary C++ object +/// +/// Initializes a buffer with a release callback that deletes the moved obj +/// when ArrowBufferReset is called. This version is useful for wrapping +/// an object whose .data() member is missing or unrelated to the buffer +/// value that is destined for a the buffer of an ArrowArray. T must be movable. +template +static inline void BufferInitWrapped(struct ArrowBuffer* buffer, T obj, + const uint8_t* data, int64_t size_bytes) { + T* obj_moved = new T(std::move(obj)); + buffer->data = const_cast(data); + buffer->size_bytes = size_bytes; + buffer->capacity_bytes = 0; + buffer->allocator = + ArrowBufferDeallocator(&internal::DeallocateWrappedBuffer, obj_moved); +} + +/// \brief Initialize a buffer wrapping a C++ sequence +/// +/// Specifically, this uses obj.data() to set the buffer address and +/// obj.size() * sizeof(T::value_type) to set the buffer size. This works +/// for STL containers like std::vector, std::array, and std::string. +/// This function moves obj and ensures it is deleted when ArrowBufferReset +/// is called. +template +void BufferInitSequence(struct ArrowBuffer* buffer, T obj) { + // Move before calling .data() (matters sometimes). + T* obj_moved = new T(std::move(obj)); + buffer->data = + const_cast(reinterpret_cast(obj_moved->data())); + buffer->size_bytes = obj_moved->size() * sizeof(typename T::value_type); + buffer->capacity_bytes = 0; + buffer->allocator = + ArrowBufferDeallocator(&internal::DeallocateWrappedBuffer, obj_moved); +} + +/// @} + /// \defgroup nanoarrow_hpp-array-stream ArrayStream helpers /// /// These classes provide simple ArrowArrayStream implementations that @@ -496,6 +550,365 @@ class VectorArrayStream { /// @} +namespace internal { +struct Nothing {}; + +template +class Maybe { + public: + Maybe() : nothing_(Nothing()), is_something_(false) {} + Maybe(Nothing) : Maybe() {} + + Maybe(T something) // NOLINT(google-explicit-constructor) + : something_(something), is_something_(true) {} + + explicit constexpr operator bool() const { return is_something_; } + + const T& operator*() const { return something_; } + + friend inline bool operator==(Maybe l, Maybe r) { + if (l.is_something_ != r.is_something_) return false; + return l.is_something_ ? l.something_ == r.something_ : true; + } + friend inline bool operator!=(Maybe l, Maybe r) { return !(l == r); } + + T value_or(T val) const { return is_something_ ? something_ : val; } + + private: + // When support for gcc 4.8 is dropped, we should also assert + // is_trivially_copyable::value + static_assert(std::is_trivially_destructible::value, ""); + + union { + Nothing nothing_; + T something_; + }; + bool is_something_; +}; + +template +struct RandomAccessRange { + Get get; + int64_t size; + + using value_type = decltype(std::declval()(0)); + + struct const_iterator { + int64_t i; + const RandomAccessRange* range; + bool operator==(const_iterator other) const { return i == other.i; } + bool operator!=(const_iterator other) const { return i != other.i; } + const_iterator& operator++() { return ++i, *this; } + value_type operator*() const { return range->get(i); } + }; + + const_iterator begin() const { return {0, this}; } + const_iterator end() const { return {size, this}; } +}; + +template +struct InputRange { + Next next; + using ValueOrFalsy = decltype(std::declval()()); + + static_assert(std::is_constructible::value, ""); + static_assert(std::is_default_constructible::value, ""); + using value_type = decltype(*std::declval()); + + struct iterator { + InputRange* range; + ValueOrFalsy stashed; + + bool operator==(iterator other) const { + return static_cast(stashed) == static_cast(other.stashed); + } + bool operator!=(iterator other) const { return !(*this == other); } + + iterator& operator++() { + stashed = range->next(); + return *this; + } + value_type operator*() const { return *stashed; } + }; + + iterator begin() { return {this, next()}; } + iterator end() { return {this, ValueOrFalsy()}; } +}; +} // namespace internal + +/// \defgroup nanoarrow_hpp-range_for Range-for helpers +/// +/// The Arrow C Data interface and the Arrow C Stream interface represent +/// data which can be iterated through using C++'s range-for statement. +/// +/// @{ + +/// \brief An object convertible to any empty optional +constexpr internal::Nothing NA{}; + +/// \brief A range-for compatible wrapper for ArrowArray of fixed size type +/// +/// Provides a sequence of optional copied from each non-null +/// slot of the wrapped array (null slots result in empty optionals). +template +class ViewArrayAs { + private: + struct Get { + const uint8_t* validity; + const void* values; + int64_t offset; + + internal::Maybe operator()(int64_t i) const { + i += offset; + if (validity == nullptr || ArrowBitGet(validity, i)) { + if (std::is_same::value) { + return ArrowBitGet(static_cast(values), i); + } else { + return static_cast(values)[i]; + } + } + return NA; + } + }; + + internal::RandomAccessRange range_; + + public: + ViewArrayAs(const ArrowArrayView* array_view) + : range_{ + Get{ + array_view->buffer_views[0].data.as_uint8, + array_view->buffer_views[1].data.data, + array_view->offset, + }, + array_view->length, + } {} + + ViewArrayAs(const ArrowArray* array) + : range_{ + Get{ + static_cast(array->buffers[0]), + array->buffers[1], + /*offset=*/0, + }, + array->length, + } {} + + using value_type = typename internal::RandomAccessRange::value_type; + using const_iterator = typename internal::RandomAccessRange::const_iterator; + const_iterator begin() const { return range_.begin(); } + const_iterator end() const { return range_.end(); } + value_type operator[](int64_t i) const { return range_.get(i); } +}; + +/// \brief A range-for compatible wrapper for ArrowArray of binary or utf8 +/// +/// Provides a sequence of optional referencing each non-null +/// slot of the wrapped array (null slots result in empty optionals). Large +/// binary and utf8 arrays can be wrapped by specifying 64 instead of 32 for +/// the template argument. +template +class ViewArrayAsBytes { + private: + static_assert(OffsetSize == 32 || OffsetSize == 64, ""); + using OffsetType = typename std::conditional::type; + + struct Get { + const uint8_t* validity; + const void* offsets; + const char* data; + int64_t offset; + + internal::Maybe operator()(int64_t i) const { + i += offset; + auto* offsets = static_cast(this->offsets); + if (validity == nullptr || ArrowBitGet(validity, i)) { + return ArrowStringView{data + offsets[i], offsets[i + 1] - offsets[i]}; + } + return NA; + } + }; + + internal::RandomAccessRange range_; + + public: + ViewArrayAsBytes(const ArrowArrayView* array_view) + : range_{ + Get{ + array_view->buffer_views[0].data.as_uint8, + array_view->buffer_views[1].data.data, + array_view->buffer_views[2].data.as_char, + array_view->offset, + }, + array_view->length, + } {} + + ViewArrayAsBytes(const ArrowArray* array) + : range_{ + Get{ + static_cast(array->buffers[0]), + array->buffers[1], + static_cast(array->buffers[2]), + /*offset=*/0, + }, + array->length, + } {} + + using value_type = typename internal::RandomAccessRange::value_type; + using const_iterator = typename internal::RandomAccessRange::const_iterator; + const_iterator begin() const { return range_.begin(); } + const_iterator end() const { return range_.end(); } + value_type operator[](int64_t i) const { return range_.get(i); } +}; + +/// \brief A range-for compatible wrapper for ArrowArray of fixed size binary +/// +/// Provides a sequence of optional referencing each non-null +/// slot of the wrapped array (null slots result in empty optionals). +class ViewArrayAsFixedSizeBytes { + private: + struct Get { + const uint8_t* validity; + const char* data; + int64_t offset; + int fixed_size; + + internal::Maybe operator()(int64_t i) const { + i += offset; + if (validity == nullptr || ArrowBitGet(validity, i)) { + return ArrowStringView{data + i * fixed_size, fixed_size}; + } + return NA; + } + }; + + internal::RandomAccessRange range_; + + public: + ViewArrayAsFixedSizeBytes(const ArrowArrayView* array_view, int fixed_size) + : range_{ + Get{ + array_view->buffer_views[0].data.as_uint8, + array_view->buffer_views[1].data.as_char, + array_view->offset, + fixed_size, + }, + array_view->length, + } {} + + ViewArrayAsFixedSizeBytes(const ArrowArray* array, int fixed_size) + : range_{ + Get{ + static_cast(array->buffers[0]), + static_cast(array->buffers[1]), + /*offset=*/0, + fixed_size, + }, + array->length, + } {} + + using value_type = typename internal::RandomAccessRange::value_type; + using const_iterator = typename internal::RandomAccessRange::const_iterator; + const_iterator begin() const { return range_.begin(); } + const_iterator end() const { return range_.end(); } + value_type operator[](int64_t i) const { return range_.get(i); } +}; + +/// \brief A range-for compatible wrapper for ArrowArrayStream +/// +/// Provides a sequence of ArrowArray& referencing the most recent array drawn +/// from the wrapped stream. (Each array may be moved from if necessary.) +/// When streams terminate due to an error, the error code and message are +/// available for inspection through the code() and error() member functions +/// respectively. Failure to inspect the error code will result in +/// an assertion failure. The number of arrays drawn from the stream is also +/// available through the count() member function. +class ViewArrayStream { + public: + ViewArrayStream(ArrowArrayStream* stream, ArrowErrorCode* code, ArrowError* error) + : range_{Next{this, stream, UniqueArray()}}, code_{code}, error_{error} {} + + ViewArrayStream(ArrowArrayStream* stream, ArrowError* error) + : ViewArrayStream{stream, &internal_code_, error} {} + + ViewArrayStream(ArrowArrayStream* stream) + : ViewArrayStream{stream, &internal_code_, &internal_error_} {} + + // disable copy/move of this view, since its error references may point into itself + ViewArrayStream(ViewArrayStream&&) = delete; + ViewArrayStream& operator=(ViewArrayStream&&) = delete; + ViewArrayStream(const ViewArrayStream&) = delete; + ViewArrayStream& operator=(const ViewArrayStream&) = delete; + + // ensure the error code of this stream was accessed at least once + ~ViewArrayStream() { NANOARROW_DCHECK(code_was_accessed_); } + + private: + struct Next { + ViewArrayStream* self; + ArrowArrayStream* stream; + UniqueArray array; + + ArrowArray* operator()() { + array.reset(); + *self->code_ = ArrowArrayStreamGetNext(stream, array.get(), self->error_); + + if (array->release != nullptr) { + NANOARROW_DCHECK(*self->code_ == NANOARROW_OK); + ++self->count_; + return array.get(); + } + + return nullptr; + } + }; + + internal::InputRange range_; + ArrowErrorCode* code_; + ArrowError* error_; + ArrowError internal_error_ = {}; + ArrowErrorCode internal_code_; + bool code_was_accessed_ = false; + int count_ = 0; + + public: + using value_type = typename internal::InputRange::value_type; + using iterator = typename internal::InputRange::iterator; + iterator begin() { return range_.begin(); } + iterator end() { return range_.end(); } + + /// The error code which caused this stream to terminate, if any. + ArrowErrorCode code() { + code_was_accessed_ = true; + return *code_; + } + /// The error message which caused this stream to terminate, if any. + ArrowError* error() { return error_; } + + /// The number of arrays streamed so far. + int count() const { return count_; } +}; + +/// @} + } // namespace nanoarrow +/// \defgroup nanoarrow_hpp-string_view_helpers ArrowStringView helpers +/// +/// Factories and equality comparison for ArrowStringView. +/// +/// @{ + +/// \brief Equality comparison operator between ArrowStringView +inline bool operator==(ArrowStringView l, ArrowStringView r) { + if (l.size_bytes != r.size_bytes) return false; + return memcmp(l.data, r.data, l.size_bytes) == 0; +} + +/// \brief User literal operator allowing ArrowStringView construction like "str"_sv +inline ArrowStringView operator"" _v(const char* data, std::size_t size_bytes) { + return {data, static_cast(size_bytes)}; +} +/// @} + #endif diff --git a/libtiledbsoma/src/external/src/nanoarrow/nanoarrow.c b/libtiledbsoma/src/external/src/nanoarrow/nanoarrow.c index cdc97a4bba..9677a0e533 100644 --- a/libtiledbsoma/src/external/src/nanoarrow/nanoarrow.c +++ b/libtiledbsoma/src/external/src/nanoarrow/nanoarrow.c @@ -201,7 +201,9 @@ static void ArrowBufferAllocatorMallocFree(struct ArrowBufferAllocator* allocato uint8_t* ptr, int64_t size) { NANOARROW_UNUSED(allocator); NANOARROW_UNUSED(size); - ArrowFree(ptr); + if (ptr != NULL) { + ArrowFree(ptr); + } } static struct ArrowBufferAllocator ArrowBufferAllocatorMalloc = { @@ -211,13 +213,24 @@ struct ArrowBufferAllocator ArrowBufferAllocatorDefault(void) { return ArrowBufferAllocatorMalloc; } -static uint8_t* ArrowBufferAllocatorNeverReallocate( - struct ArrowBufferAllocator* allocator, uint8_t* ptr, int64_t old_size, - int64_t new_size) { - NANOARROW_UNUSED(allocator); - NANOARROW_UNUSED(ptr); - NANOARROW_UNUSED(old_size); +static uint8_t* ArrowBufferDeallocatorReallocate(struct ArrowBufferAllocator* allocator, + uint8_t* ptr, int64_t old_size, + int64_t new_size) { NANOARROW_UNUSED(new_size); + + // Attempting to reallocate a buffer with a custom deallocator is + // a programming error. In debug mode, crash here. +#if defined(NANOARROW_DEBUG) + NANOARROW_PRINT_AND_DIE(ENOMEM, + "It is an error to reallocate a buffer whose allocator is " + "ArrowBufferDeallocator()"); +#endif + + // In release mode, ensure the the deallocator is called exactly + // once using the pointer it was given and return NULL, which + // will trigger the caller to return ENOMEM. + allocator->free(allocator, ptr, old_size); + *allocator = ArrowBufferAllocatorDefault(); return NULL; } @@ -226,7 +239,7 @@ struct ArrowBufferAllocator ArrowBufferDeallocator( int64_t size), void* private_data) { struct ArrowBufferAllocator allocator; - allocator.reallocate = &ArrowBufferAllocatorNeverReallocate; + allocator.reallocate = &ArrowBufferDeallocatorReallocate; allocator.free = custom_free; allocator.private_data = private_data; return allocator; @@ -417,6 +430,13 @@ ArrowErrorCode ArrowDecimalAppendDigitsToBuffer(const struct ArrowDecimal* decim // The most significant segment should have no leading zeroes int n_chars = snprintf((char*)buffer->data + buffer->size_bytes, 21, "%lu", (unsigned long)segments[num_segments - 1]); + + // Ensure that an encoding error from snprintf() does not result + // in an out-of-bounds access. + if (n_chars < 0) { + return ERANGE; + } + buffer->size_bytes += n_chars; // Subsequent output needs to be left-padded with zeroes such that each segment @@ -665,6 +685,10 @@ ArrowErrorCode ArrowSchemaSetTypeFixedSize(struct ArrowSchema* schema, return EINVAL; } + if (((size_t)n_chars) >= sizeof(buffer) || n_chars < 0) { + return ERANGE; + } + buffer[n_chars] = '\0'; NANOARROW_RETURN_NOT_OK(ArrowSchemaSetFormat(schema, buffer)); @@ -697,6 +721,10 @@ ArrowErrorCode ArrowSchemaSetTypeDecimal(struct ArrowSchema* schema, enum ArrowT return EINVAL; } + if (((size_t)n_chars) >= sizeof(buffer) || n_chars < 0) { + return ERANGE; + } + buffer[n_chars] = '\0'; return ArrowSchemaSetFormat(schema, buffer); } @@ -773,7 +801,7 @@ ArrowErrorCode ArrowSchemaSetTypeDateTime(struct ArrowSchema* schema, enum Arrow return EINVAL; } - if (((size_t)n_chars) >= sizeof(buffer)) { + if (((size_t)n_chars) >= sizeof(buffer) || n_chars < 0) { return ERANGE; } @@ -810,6 +838,12 @@ ArrowErrorCode ArrowSchemaSetTypeUnion(struct ArrowSchema* schema, enum ArrowTyp return EINVAL; } + // Ensure that an encoding error from snprintf() does not result + // in an out-of-bounds access. + if (n_chars < 0) { + return ERANGE; + } + if (n_children > 0) { n_chars = snprintf(format_cursor, format_out_size, "0"); format_cursor += n_chars; @@ -822,6 +856,12 @@ ArrowErrorCode ArrowSchemaSetTypeUnion(struct ArrowSchema* schema, enum ArrowTyp } } + // Ensure that an encoding error from snprintf() does not result + // in an out-of-bounds access. + if (n_chars < 0) { + return ERANGE; + } + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetFormat(schema, format_out)); NANOARROW_RETURN_NOT_OK(ArrowSchemaAllocateChildren(schema, n_children)); @@ -1688,6 +1728,12 @@ static int64_t ArrowSchemaTypeToStringInternal(struct ArrowSchemaView* schema_vi // among multiple sprintf calls. static inline void ArrowToStringLogChars(char** out, int64_t n_chars_last, int64_t* n_remaining, int64_t* n_chars) { + // In the unlikely snprintf() returning a negative value (encoding error), + // ensure the result won't cause an out-of-bounds access. + if (n_chars_last < 0) { + n_chars = 0; + } + *n_chars += n_chars_last; *n_remaining -= n_chars_last; @@ -1712,10 +1758,6 @@ int64_t ArrowSchemaToString(const struct ArrowSchema* schema, char* out, int64_t return snprintf(out, n, "[invalid: schema is released]"); } - if (out == NULL) { - return 0; - } - struct ArrowSchemaView schema_view; struct ArrowError error; @@ -1787,7 +1829,12 @@ int64_t ArrowSchemaToString(const struct ArrowSchema* schema, char* out, int64_t n_chars += snprintf(out, n, ">"); } - return n_chars; + // Ensure that we always return a positive result + if (n_chars > 0) { + return n_chars; + } else { + return 0; + } } ArrowErrorCode ArrowMetadataReaderInit(struct ArrowMetadataReader* reader, @@ -2414,19 +2461,16 @@ static ArrowErrorCode ArrowArrayFinalizeBuffers(struct ArrowArray* array) { struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*)array->private_data; - // The only buffer finalizing this currently does is make sure the data - // buffer for (Large)String|Binary is never NULL - switch (private_data->storage_type) { - case NANOARROW_TYPE_BINARY: - case NANOARROW_TYPE_STRING: - case NANOARROW_TYPE_LARGE_BINARY: - case NANOARROW_TYPE_LARGE_STRING: - if (ArrowArrayBuffer(array, 2)->data == NULL) { - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendUInt8(ArrowArrayBuffer(array, 2), 0)); - } - break; - default: - break; + for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { + if (private_data->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_VALIDITY || + private_data->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_NONE) { + continue; + } + + struct ArrowBuffer* buffer = ArrowArrayBuffer(array, i); + if (buffer->data == NULL) { + NANOARROW_RETURN_NOT_OK((ArrowBufferReserve(buffer, 1))); + } } for (int64_t i = 0; i < array->n_children; i++) { @@ -2462,7 +2506,8 @@ ArrowErrorCode ArrowArrayFinishBuilding(struct ArrowArray* array, struct ArrowError* error) { // Even if the data buffer is size zero, the pointer value needed to be non-null // in some implementations (at least one version of Arrow C++ at the time this - // was added). Only do this fix if we can assume CPU data access. + // was added and C# as later discovered). Only do this fix if we can assume + // CPU data access. if (validation_level >= NANOARROW_VALIDATION_LEVEL_DEFAULT) { NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowArrayFinalizeBuffers(array), error); } @@ -2910,6 +2955,10 @@ static int ArrowArrayViewValidateDefault(struct ArrowArrayView* array_view, (long)array_view->buffer_views[2].size_bytes); return EINVAL; } + } else if (array_view->buffer_views[2].size_bytes == -1) { + // If the data buffer size is unknown and there are no bytes in the offset buffer, + // set the data buffer size to 0. + array_view->buffer_views[2].size_bytes = 0; } break; @@ -2936,6 +2985,10 @@ static int ArrowArrayViewValidateDefault(struct ArrowArrayView* array_view, (long)array_view->buffer_views[2].size_bytes); return EINVAL; } + } else if (array_view->buffer_views[2].size_bytes == -1) { + // If the data buffer size is unknown and there are no bytes in the offset + // buffer, set the data buffer size to 0. + array_view->buffer_views[2].size_bytes = 0; } break; diff --git a/libtiledbsoma/src/soma/soma_array.cc b/libtiledbsoma/src/soma/soma_array.cc index 3996121187..94c93b34ca 100644 --- a/libtiledbsoma/src/soma/soma_array.cc +++ b/libtiledbsoma/src/soma/soma_array.cc @@ -473,13 +473,6 @@ uint64_t SOMAArray::_get_max_capacity(tiledb_datatype_t index_type) { ArraySchemaEvolution SOMAArray::_make_se() { ArraySchemaEvolution se(*ctx_->tiledb_ctx()); - if (timestamp_.has_value()) { - // ArraySchemaEvolution requires us to pair (t2, t2) even if our range - // is (t1, t2). - auto v = timestamp_.value(); - TimestampRange tr(v.second, v.second); - se.set_timestamp_range(tr); - } return se; } @@ -1226,30 +1219,15 @@ std::optional SOMAArray::_shape_slot_if_soma_joinid_dim() { } std::vector SOMAArray::_tiledb_domain() { - std::vector result; - auto dimensions = mq_->schema()->domain().dimensions(); - - for (const auto& dim : dimensions) { - // Callers inquiring about non-int64 shapes should not be here. - // - // In the SOMA data model: - // * SparseNDArray has dims which are all necessarily int64_t - // * DenseNDArray has dims which are all necessarily int64_t - // * DataFrame _default_ indexing is one dim named "soma_dim_0" of type - // int64_t, however: - // * Users can (and do) add other additional dims - // * The SOMA data model requires that soma_joinid be present in each - // DataFrame either as a dim or an attr -- and there are DataFrame - // objects for which soma_joinid is not a dim at all - // * These cases are all actively unit-tested within apis/python/tests - if (dim.type() != TILEDB_INT64) { - throw TileDBSOMAError("Found unexpected non-int64 dimension type."); - } - result.push_back( - dim.domain().second - dim.domain().first + 1); - } - - return result; + // There are two reasons for this: + // * Transitional, non-monolithic, phased, careful development for the + // new-shape feature + // * Even after the new-shape feature is fully released, there will be old + // arrays on disk that were created before this feature existed. + // So this is long-term code. + auto current_domain = _get_current_domain(); + return current_domain.is_empty() ? _tiledb_domain() : + _tiledb_current_domain(); } std::vector SOMAArray::_tiledb_current_domain() { diff --git a/libtiledbsoma/src/soma/soma_dense_ndarray.cc b/libtiledbsoma/src/soma/soma_dense_ndarray.cc index ebc72214df..63a16e31d4 100644 --- a/libtiledbsoma/src/soma/soma_dense_ndarray.cc +++ b/libtiledbsoma/src/soma/soma_dense_ndarray.cc @@ -73,7 +73,7 @@ void SOMADenseNDArray::create( attr->format = strdup(std::string(format).c_str()); attr->name = strdup("soma_data"); attr->n_children = 0; - attr->flags = 0; + attr->flags = 0; // or ARROW_FLAG_NULLABLE; attr->dictionary = nullptr; attr->release = &ArrowAdapter::release_schema; diff --git a/libtiledbsoma/src/soma/soma_sparse_ndarray.h b/libtiledbsoma/src/soma/soma_sparse_ndarray.h index 7017d359cf..d51d83c1f3 100644 --- a/libtiledbsoma/src/soma/soma_sparse_ndarray.h +++ b/libtiledbsoma/src/soma/soma_sparse_ndarray.h @@ -5,7 +5,7 @@ * * The MIT License * - * @copyright Copyright (c) 2023 TileDB, Inc. + * @copyright Copyright (c) 2023-2024 TileDB, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index cd226cd0a0..6a03aa5b2f 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -775,8 +775,9 @@ ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema( // nullptr) // // Fortunately, these are ASCII dims and we can range - // these accordingly. - ndrect.set_range(col_name, "\x01", "\x7f"); + // these accordingly. These are minimum and maximum + // values, avoiding the extremes 0x00 and 0xff. + ndrect.set_range(col_name, "\x01", "\xfe"); } else { const void* buff = index_column_array->children[i] ->buffers[1]; diff --git a/libtiledbsoma/test/unit_soma_collection.cc b/libtiledbsoma/test/unit_soma_collection.cc index c079e8acc3..1d69998720 100644 --- a/libtiledbsoma/test/unit_soma_collection.cc +++ b/libtiledbsoma/test/unit_soma_collection.cc @@ -49,8 +49,13 @@ TEST_CASE("SOMACollection: basic") { } TEST_CASE("SOMACollection: add SOMASparseNDArray") { - bool use_current_domains[] = {false, true}; - for (bool use_current_domain : use_current_domains) { + auto use_current_domain = GENERATE(false, true); + // TODO this could be formatted with fmt::format which is part of internal + // header spd/log/fmt/fmt.h and should not be used. In C++20, this can be + // replaced with std::format. + std::ostringstream section; + section << "- use_current_domain=" << use_current_domain; + SECTION(section.str()) { TimestampRange ts(0, 2); auto ctx = std::make_shared(); std::string base_uri = "mem://unit-test-add-sparse-ndarray"; diff --git a/libtiledbsoma/test/unit_soma_dataframe.cc b/libtiledbsoma/test/unit_soma_dataframe.cc index 77f36fae92..09ee2e350c 100644 --- a/libtiledbsoma/test/unit_soma_dataframe.cc +++ b/libtiledbsoma/test/unit_soma_dataframe.cc @@ -35,9 +35,14 @@ #define DIM_MAX 1000 TEST_CASE("SOMADataFrame: basic") { - bool use_current_domains[] = {false, true}; int64_t dim_max = 1000; - for (bool use_current_domain : use_current_domains) { + auto use_current_domain = GENERATE(false, true); + // TODO this could be formatted with fmt::format which is part of internal + // header spd/log/fmt/fmt.h and should not be used. In C++20, this can be + // replaced with std::format. + std::ostringstream section; + section << "- use_current_domain=" << use_current_domain; + SECTION(section.str()) { auto ctx = std::make_shared(); std::string uri = "mem://unit-test-dataframe-basic"; @@ -145,13 +150,16 @@ TEST_CASE("SOMADataFrame: platform_config") { section << "- filter=" << filter.first; SECTION(section.str()) { - bool use_current_domains[] = {false, true}; int64_t dim_max = 1000; - - for (bool use_current_domain : use_current_domains) { + auto use_current_domain = GENERATE(false, true); + // TODO this could be formatted with fmt::format which is part of + // internal header spd/log/fmt/fmt.h and should not be used. In C++20, + // this can be replaced with std::format. + std::ostringstream section2; + section2 << "- use_current_domain=" << use_current_domain; + SECTION(section2.str()) { auto ctx = std::make_shared(); std::string uri = "mem://unit-test-dataframe-platform-config"; - PlatformConfig platform_config; platform_config.dataframe_dim_zstd_level = 6; platform_config.offsets_filters = R"([)" + filter.first + R"(])"; @@ -203,10 +211,14 @@ TEST_CASE("SOMADataFrame: platform_config") { } TEST_CASE("SOMADataFrame: metadata") { - bool use_current_domains[] = {false, true}; int64_t dim_max = 1000; - - for (bool use_current_domain : use_current_domains) { + auto use_current_domain = GENERATE(false, true); + // TODO this could be formatted with fmt::format which is part of internal + // header spd/log/fmt/fmt.h and should not be used. In C++20, this can be + // replaced with std::format. + std::ostringstream section; + section << "- use_current_domain=" << use_current_domain; + SECTION(section.str()) { auto ctx = std::make_shared(); std::string uri = "mem://unit-test-collection"; diff --git a/libtiledbsoma/test/unit_soma_dense_ndarray.cc b/libtiledbsoma/test/unit_soma_dense_ndarray.cc index 46f6fb459c..5422975a9d 100644 --- a/libtiledbsoma/test/unit_soma_dense_ndarray.cc +++ b/libtiledbsoma/test/unit_soma_dense_ndarray.cc @@ -36,8 +36,13 @@ TEST_CASE("SOMADenseNDArray: basic") { int64_t dim_max = 1000; - bool use_current_domains[] = {false, true}; - for (bool use_current_domain : use_current_domains) { + auto use_current_domain = GENERATE(false, true); + // TODO this could be formatted with fmt::format which is part of internal + // header spd/log/fmt/fmt.h and should not be used. In C++20, this can be + // replaced with std::format. + std::ostringstream section; + section << "- use_current_domain=" << use_current_domain; + SECTION(section.str()) { auto ctx = std::make_shared(); std::string uri = "mem://unit-test-dense-ndarray-basic"; @@ -57,67 +62,77 @@ TEST_CASE("SOMADenseNDArray: basic") { ctx, PlatformConfig(), TimestampRange(0, 2))); - continue; - } + } else { + SOMADenseNDArray::create( + uri, + "l", + ArrowTable( + std::move(index_columns.first), + std::move(index_columns.second)), + ctx, + PlatformConfig(), + TimestampRange(0, 2)); - SOMADenseNDArray::create( - uri, - "l", - ArrowTable( - std::move(index_columns.first), - std::move(index_columns.second)), - ctx, - PlatformConfig(), - TimestampRange(0, 2)); + REQUIRE(SOMADenseNDArray::exists(uri, ctx)); + REQUIRE(!SOMADataFrame::exists(uri, ctx)); + REQUIRE(!SOMASparseNDArray::exists(uri, ctx)); - REQUIRE(SOMADenseNDArray::exists(uri, ctx)); - REQUIRE(!SOMADataFrame::exists(uri, ctx)); - REQUIRE(!SOMASparseNDArray::exists(uri, ctx)); - - auto soma_dense = SOMADenseNDArray::open(uri, OpenMode::read, ctx); - REQUIRE(soma_dense->uri() == uri); - REQUIRE(soma_dense->ctx() == ctx); - REQUIRE(soma_dense->type() == "SOMADenseNDArray"); - REQUIRE(soma_dense->is_sparse() == false); - REQUIRE(soma_dense->soma_data_type() == "l"); - auto schema = soma_dense->tiledb_schema(); - REQUIRE(schema->has_attribute("soma_data")); - REQUIRE(schema->array_type() == TILEDB_DENSE); - REQUIRE(schema->domain().has_dimension("soma_dim_0")); - REQUIRE(soma_dense->ndim() == 1); + auto soma_dense = SOMADenseNDArray::open(uri, OpenMode::read, ctx); + REQUIRE(soma_dense->uri() == uri); + REQUIRE(soma_dense->ctx() == ctx); + REQUIRE(soma_dense->type() == "SOMADenseNDArray"); + REQUIRE(soma_dense->is_sparse() == false); + REQUIRE(soma_dense->soma_data_type() == "l"); + auto schema = soma_dense->tiledb_schema(); + REQUIRE(schema->has_attribute("soma_data")); + REQUIRE(schema->array_type() == TILEDB_DENSE); + REQUIRE(schema->domain().has_dimension("soma_dim_0")); + REQUIRE(soma_dense->ndim() == 1); + + // Once we have support for current domain in dense arrays + // if (use_current_domain) { + // REQUIRE(soma_dense->shape() == std::vector{dim_max + + // 1}); + //} else { + // REQUIRE( + // soma_dense->maxshape() == std::vector{dim_max + + // 1}); + //} - if (use_current_domain) { - REQUIRE(soma_dense->shape() == std::vector{dim_max + 1}); - } else { REQUIRE( soma_dense->maxshape() == std::vector{dim_max + 1}); - } - soma_dense->close(); + soma_dense->close(); - std::vector d0{1, 10}; - std::vector a0(10, 1); + std::vector d0{1, 10}; + std::vector a0(10, 1); - soma_dense->open(OpenMode::write); - soma_dense->set_column_data("soma_data", a0.size(), a0.data()); - soma_dense->set_column_data("soma_dim_0", d0.size(), d0.data()); - soma_dense->write(); - soma_dense->close(); + soma_dense->open(OpenMode::write); + soma_dense->set_column_data("soma_data", a0.size(), a0.data()); + soma_dense->set_column_data("soma_dim_0", d0.size(), d0.data()); + soma_dense->write(); + soma_dense->close(); - soma_dense->open(OpenMode::read); - while (auto batch = soma_dense->read_next()) { - auto arrbuf = batch.value(); - auto a0span = arrbuf->at("soma_data")->data(); - REQUIRE(a0 == std::vector(a0span.begin(), a0span.end())); + soma_dense->open(OpenMode::read); + while (auto batch = soma_dense->read_next()) { + auto arrbuf = batch.value(); + auto a0span = arrbuf->at("soma_data")->data(); + REQUIRE(a0 == std::vector(a0span.begin(), a0span.end())); + } + soma_dense->close(); } - soma_dense->close(); } } TEST_CASE("SOMADenseNDArray: platform_config") { int64_t dim_max = 1000; - bool use_current_domains[] = {false, true}; - for (bool use_current_domain : use_current_domains) { + auto use_current_domain = GENERATE(false, true); + // TODO this could be formatted with fmt::format which is part of internal + // header spd/log/fmt/fmt.h and should not be used. In C++20, this can be + // replaced with std::format. + std::ostringstream section; + section << "- use_current_domain=" << use_current_domain; + SECTION(section.str()) { auto ctx = std::make_shared(); std::string uri = "mem://unit-test-dense-ndarray-platform-config"; @@ -138,35 +153,40 @@ TEST_CASE("SOMADenseNDArray: platform_config") { ctx, platform_config)); - continue; - } - - SOMADenseNDArray::create( - uri, - "l", - ArrowTable( - std::move(index_columns.first), - std::move(index_columns.second)), - ctx, - platform_config); + } else { + SOMADenseNDArray::create( + uri, + "l", + ArrowTable( + std::move(index_columns.first), + std::move(index_columns.second)), + ctx, + platform_config); - auto soma_dense = SOMADenseNDArray::open(uri, OpenMode::read, ctx); - auto dim_filter = soma_dense->tiledb_schema() - ->domain() - .dimension("soma_dim_0") - .filter_list() - .filter(0); - REQUIRE(dim_filter.filter_type() == TILEDB_FILTER_ZSTD); - REQUIRE(dim_filter.get_option(TILEDB_COMPRESSION_LEVEL) == 6); + auto soma_dense = SOMADenseNDArray::open(uri, OpenMode::read, ctx); + auto dim_filter = soma_dense->tiledb_schema() + ->domain() + .dimension("soma_dim_0") + .filter_list() + .filter(0); + REQUIRE(dim_filter.filter_type() == TILEDB_FILTER_ZSTD); + REQUIRE( + dim_filter.get_option(TILEDB_COMPRESSION_LEVEL) == 6); - soma_dense->close(); + soma_dense->close(); + } } } TEST_CASE("SOMADenseNDArray: metadata") { int64_t dim_max = 1000; - bool use_current_domains[] = {false, true}; - for (bool use_current_domain : use_current_domains) { + auto use_current_domain = GENERATE(false, true); + // TODO this could be formatted with fmt::format which is part of internal + // header spd/log/fmt/fmt.h and should not be used. In C++20, this can be + // replaced with std::format. + std::ostringstream section; + section << "- use_current_domain=" << use_current_domain; + SECTION(section.str()) { auto ctx = std::make_shared(); std::string uri = "mem://unit-test-dense-ndarray"; diff --git a/libtiledbsoma/test/unit_soma_sparse_ndarray.cc b/libtiledbsoma/test/unit_soma_sparse_ndarray.cc index 3b1b7271c3..f664a8e563 100644 --- a/libtiledbsoma/test/unit_soma_sparse_ndarray.cc +++ b/libtiledbsoma/test/unit_soma_sparse_ndarray.cc @@ -35,8 +35,13 @@ TEST_CASE("SOMASparseNDArray: basic") { int64_t dim_max = 1000; - bool use_current_domains[] = {false, true}; - for (bool use_current_domain : use_current_domains) { + auto use_current_domain = GENERATE(false, true); + // TODO this could be formatted with fmt::format which is part of internal + // header spd/log/fmt/fmt.h and should not be used. In C++20, this can be + // replaced with std::format. + std::ostringstream section; + section << "- use_current_domain=" << use_current_domain; + SECTION(section.str()) { auto ctx = std::make_shared(); std::string uri = "mem://unit-test-sparse-ndarray-basic"; @@ -71,10 +76,11 @@ TEST_CASE("SOMASparseNDArray: basic") { REQUIRE(soma_sparse->ndim() == 1); REQUIRE(soma_sparse->nnz() == 0); - auto expect = std::vector({dim_max + 1}); - REQUIRE(soma_sparse->shape() == expect); - if (!use_current_domain) { - REQUIRE(soma_sparse->maxshape() == expect); + if (use_current_domain) { + REQUIRE(soma_sparse->shape() == std::vector{dim_max + 1}); + } else { + REQUIRE( + soma_sparse->maxshape() == std::vector{dim_max + 1}); } soma_sparse->close(); @@ -110,8 +116,13 @@ TEST_CASE("SOMASparseNDArray: basic") { TEST_CASE("SOMASparseNDArray: platform_config") { int64_t dim_max = 1000; - bool use_current_domains[] = {false, true}; - for (bool use_current_domain : use_current_domains) { + auto use_current_domain = GENERATE(false, true); + // TODO this could be formatted with fmt::format which is part of internal + // header spd/log/fmt/fmt.h and should not be used. In C++20, this can be + // replaced with std::format. + std::ostringstream section; + section << "- use_current_domain=" << use_current_domain; + SECTION(section.str()) { auto ctx = std::make_shared(); std::string uri = "mem://unit-test-dataframe-platform-config"; @@ -144,8 +155,13 @@ TEST_CASE("SOMASparseNDArray: platform_config") { TEST_CASE("SOMASparseNDArray: metadata") { int64_t dim_max = 1000; - bool use_current_domains[] = {false, true}; - for (bool use_current_domain : use_current_domains) { + auto use_current_domain = GENERATE(false, true); + // TODO this could be formatted with fmt::format which is part of internal + // header spd/log/fmt/fmt.h and should not be used. In C++20, this can be + // replaced with std::format. + std::ostringstream section; + section << "- use_current_domain=" << use_current_domain; + SECTION(section.str()) { auto ctx = std::make_shared(); std::string uri = "mem://unit-test-sparse-ndarray"; diff --git a/scripts/bld b/scripts/bld index e083388cd3..5008e188dc 100755 --- a/scripts/bld +++ b/scripts/bld @@ -74,38 +74,16 @@ fi # build with custom tiledb if [ -n "${tiledb}" ]; then - printf "Build with TileDB: ${tiledb}\n" + echo "Build with TileDB: $tiledb" extra_opts+=" -DFORCE_BUILD_TILEDB=OFF" export TileDB_DIR="${tiledb}" - - # It's more elegant to use - # if [[ -v LD_LIBRARY_PATH ]]; then - # ... - # fi - # -- however, this is supported by bash not sh. And this script can - # be invoked by python via its shell-out which seems to use sh. - # It's simplest to just pause set -u. - - set +u - - if [ -z $"LD_LIBRARY_PATH" ]; then - export LD_LIBRARY_PATH="${tiledb}" - else - export LD_LIBRARY_PATH="${tiledb}:${LD_LIBRARY_PATH}" - fi - - if [ -z $"DYLD_LIBRARY_PATH" ]; then - export DYLD_LIBRARY_PATH="${tiledb}" - else - export DYLD_LIBRARY_PATH="${tiledb}:${DYLD_LIBRARY_PATH}" - fi - - set -u + export LD_LIBRARY_PATH="${tiledb}${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" + export DYLD_LIBRARY_PATH="${tiledb}${DYLD_LIBRARY_PATH:+:$DYLD_LIBRARY_PATH}" fi # run cmake # ------------------------------------------------------------------- -printf "Building ${build} build\n" +echo "Building ${build} build" # cd to the top level directory of the repo cd "$(dirname "$0")/.."