From ddea64c0ced17ffc58e71fd6007e176e6f31c925 Mon Sep 17 00:00:00 2001 From: Julia Dark <24235303+jp-dark@users.noreply.github.com> Date: Tue, 17 Dec 2024 12:18:47 -0500 Subject: [PATCH] [python/c++] Deprecate `config_options_from_schema` in favor of new function (#3437) * Create new `PlatformSchemaConfig` class with only TileDB schema properties. * Create new method `schema_config_options` that returns the `PlatformSchemaConfig`. * Deprecate `config_options_from_schema` in favor of `schema_config_options`. --- apis/python/src/tiledbsoma/_soma_array.py | 41 +++++- apis/python/src/tiledbsoma/_tdb_handles.py | 6 + .../tiledbsoma/io/spatial/_xarray_backend.py | 2 +- apis/python/src/tiledbsoma/pytiledbsoma.cc | 14 ++ apis/python/src/tiledbsoma/soma_array.cc | 1 + apis/python/tests/test_dataframe.py | 4 +- apis/python/tests/test_dense_nd_array.py | 2 +- apis/python/tests/test_platform_config.py | 4 +- apis/python/tests/test_sparse_nd_array.py | 10 +- libtiledbsoma/src/soma/soma_array.h | 11 ++ libtiledbsoma/src/utils/arrow_adapter.cc | 29 +++++ libtiledbsoma/src/utils/arrow_adapter.h | 120 +++++++++++++++++- libtiledbsoma/test/unit_soma_dataframe.cc | 2 +- 13 files changed, 232 insertions(+), 14 deletions(-) diff --git a/apis/python/src/tiledbsoma/_soma_array.py b/apis/python/src/tiledbsoma/_soma_array.py index 1e0ed63cc9..b23d86035f 100644 --- a/apis/python/src/tiledbsoma/_soma_array.py +++ b/apis/python/src/tiledbsoma/_soma_array.py @@ -3,6 +3,7 @@ # # Licensed under the MIT License. +import warnings from typing import Any, Tuple import pyarrow as pa @@ -33,9 +34,41 @@ def schema(self) -> pa.Schema: """ return self._handle.schema + def schema_config_options(self) -> clib.PlatformSchemaConfig: + """Returns metadata about the array schema that is not encompassed within + the Arrow Schema, in the form of a PlatformSchemaConfig. 
+ + Available attributes are: + * capacity: int + * allows_duplicates: bool + * tile_order: str + * cell_order: str + * offsets_filters: str + * name (of filter): str + * compression_level: str + * validity_filters: str + * attrs: str + * name (of attribute): str + * filters: str + * name (of filter): str + * compression_level: str + * dims: str + * name (of dimension): str + * filters: str + * name (of filter): str + * compression_level: str + * tile: int + + Lifecycle: + Experimental. + """ + return self._handle.schema_config_options() + def config_options_from_schema(self) -> clib.PlatformConfig: """Returns metadata about the array that is not encompassed within the - Arrow Schema, in the form of a PlatformConfig. + Arrow Schema, in the form of a PlatformConfig (deprecated). + + Use ``schema_config_options`` instead. Available attributes are: * dataframe_dim_zstd_level: int @@ -64,7 +97,13 @@ def config_options_from_schema(self) -> clib.PlatformConfig: * tile_order: str * cell_order: str * consolidate_and_vacuum: bool + + Lifecycle: + Deprecated. """ + warnings.warn( + "Deprecated. Use schema_config_options instead.", DeprecationWarning + ) return self._handle.config_options_from_schema() def non_empty_domain(self) -> Tuple[Tuple[Any, Any], ...]: diff --git a/apis/python/src/tiledbsoma/_tdb_handles.py b/apis/python/src/tiledbsoma/_tdb_handles.py index f7b00bf654..54d802d5dd 100644 --- a/apis/python/src/tiledbsoma/_tdb_handles.py +++ b/apis/python/src/tiledbsoma/_tdb_handles.py @@ -394,6 +394,12 @@ def _do_initial_reads(self, reader: RawHandle) -> None: def schema(self) -> pa.Schema: return self._handle.schema + def schema_config_options(self) -> clib.PlatformSchemaConfig: + """Returns a class containing the TileDB platform configuration options that + can be read from an array schema. 
+ """ + return self._handle.schema_config_options() + def config_options_from_schema(self) -> clib.PlatformConfig: return self._handle.config_options_from_schema() diff --git a/apis/python/src/tiledbsoma/io/spatial/_xarray_backend.py b/apis/python/src/tiledbsoma/io/spatial/_xarray_backend.py index c27e813980..99e52ad752 100644 --- a/apis/python/src/tiledbsoma/io/spatial/_xarray_backend.py +++ b/apis/python/src/tiledbsoma/io/spatial/_xarray_backend.py @@ -107,7 +107,7 @@ def dtype(self) -> np.typing.DTypeLike: def recommend_chunks(self) -> Tuple[int, ...]: """Returns recommended chunk sizes for chunking this array.""" - dim_info = json.loads(self._array.config_options_from_schema().dims) + dim_info = json.loads(self._array.schema_config_options().dims) return tuple( _str_to_int(dim_info[f"soma_dim_{index}"]["tile"]) for index in range(self.ndim) diff --git a/apis/python/src/tiledbsoma/pytiledbsoma.cc b/apis/python/src/tiledbsoma/pytiledbsoma.cc index 8cb9883abf..8d5514f7b4 100644 --- a/apis/python/src/tiledbsoma/pytiledbsoma.cc +++ b/apis/python/src/tiledbsoma/pytiledbsoma.cc @@ -143,6 +143,20 @@ PYBIND11_MODULE(pytiledbsoma, m) { .def_readwrite( "consolidate_and_vacuum", &PlatformConfig::consolidate_and_vacuum); + py::class_(m, "PlatformSchemaConfig") + .def(py::init<>()) + .def_readwrite("capacity", &PlatformSchemaConfig::capacity) + .def_readwrite( + "offsets_filters", &PlatformSchemaConfig::offsets_filters) + .def_readwrite( + "validity_filters", &PlatformSchemaConfig::validity_filters) + .def_readwrite("attrs", &PlatformSchemaConfig::attrs) + .def_readwrite("dims", &PlatformSchemaConfig::dims) + .def_readwrite( + "allows_duplicates", &PlatformSchemaConfig::allows_duplicates) + .def_readwrite("tile_order", &PlatformSchemaConfig::tile_order) + .def_readwrite("cell_order", &PlatformSchemaConfig::cell_order); + m.def("_update_dataframe_schema", &SOMADataFrame::update_dataframe_schema); load_soma_context(m); diff --git a/apis/python/src/tiledbsoma/soma_array.cc 
b/apis/python/src/tiledbsoma/soma_array.cc index c38960e548..c62f4aa28c 100644 --- a/apis/python/src/tiledbsoma/soma_array.cc +++ b/apis/python/src/tiledbsoma/soma_array.cc @@ -242,6 +242,7 @@ void load_soma_array(py::module& m) { return pa_schema_import( py::capsule(array.arrow_schema().get())); }) + .def("schema_config_options", &SOMAArray::schema_config_options) .def( "config_options_from_schema", &SOMAArray::config_options_from_schema) diff --git a/apis/python/tests/test_dataframe.py b/apis/python/tests/test_dataframe.py index eb654808ae..bec5c5fc36 100644 --- a/apis/python/tests/test_dataframe.py +++ b/apis/python/tests/test_dataframe.py @@ -139,7 +139,7 @@ def test_dataframe(tmp_path, arrow_schema): assert [e.as_py() for e in table["mybool"]] == pydict["mybool"] with soma.DataFrame.open(uri) as A: - cfg = A.config_options_from_schema() + cfg = A.schema_config_options() assert not cfg.allows_duplicates assert json.loads(cfg.dims)["myint"]["filters"] == [ {"COMPRESSION_LEVEL": 3, "name": "ZSTD"} @@ -1189,7 +1189,7 @@ def test_create_platform_config_overrides( ).close() with soma.DataFrame.open(tmp_path.as_posix()) as A: - cfg = A.config_options_from_schema() + cfg = A.schema_config_options() assert expected_schema_fields["validity_filters"] == json.loads( cfg.validity_filters ) diff --git a/apis/python/tests/test_dense_nd_array.py b/apis/python/tests/test_dense_nd_array.py index dbe7bf846c..17d378c53a 100644 --- a/apis/python/tests/test_dense_nd_array.py +++ b/apis/python/tests/test_dense_nd_array.py @@ -434,7 +434,7 @@ def test_tile_extents(tmp_path): ).close() with soma.DenseNDArray.open(tmp_path.as_posix()) as A: - dim_info = json.loads(A.config_options_from_schema().dims) + dim_info = json.loads(A.schema_config_options().dims) # With new shape (tiledbsoma 1.15), core current domain is (100,10000) # but core domain is huge, and therefore dim 0 does not get its extent # squashed down to 100. 
diff --git a/apis/python/tests/test_platform_config.py b/apis/python/tests/test_platform_config.py index 4653d6f32f..4dd7e97c4f 100644 --- a/apis/python/tests/test_platform_config.py +++ b/apis/python/tests/test_platform_config.py @@ -46,7 +46,7 @@ def test_platform_config(conftest_pbmc_small): x_arr_uri = str(Path(output_path) / "ms" / "RNA" / "X" / "data") with tiledbsoma.SparseNDArray.open(x_arr_uri) as x_arr: - cfg = x_arr.config_options_from_schema() + cfg = x_arr.schema_config_options() assert cfg.capacity == create_cfg["capacity"] assert cfg.cell_order == create_cfg["cell_order"] assert cfg.tile_order == create_cfg["tile_order"] @@ -70,7 +70,7 @@ def test_platform_config(conftest_pbmc_small): var_arr_uri = str(Path(output_path) / "ms" / "RNA" / "var") with tiledbsoma.DataFrame.open(var_arr_uri) as var_arr: - cfg = var_arr.config_options_from_schema() + cfg = var_arr.schema_config_options() assert json.loads(cfg.dims)["soma_joinid"]["filters"] == [ {"COMPRESSION_LEVEL": 1, "name": "ZSTD"} ] diff --git a/apis/python/tests/test_sparse_nd_array.py b/apis/python/tests/test_sparse_nd_array.py index 9480ee8598..9d3e799ffe 100644 --- a/apis/python/tests/test_sparse_nd_array.py +++ b/apis/python/tests/test_sparse_nd_array.py @@ -352,7 +352,7 @@ def test_sparse_nd_array_read_write_sparse_tensor( with soma.SparseNDArray.open(tmp_path.as_posix()) as A: assert A.is_sparse - assert not A.config_options_from_schema().allows_duplicates + assert not A.schema_config_options().allows_duplicates @pytest.mark.parametrize("shape", [(10,), (23, 4), (5, 3, 1), (8, 4, 2, 30)]) @@ -376,7 +376,7 @@ def test_sparse_nd_array_read_write_table( with soma.SparseNDArray.open(tmp_path.as_posix()) as A: assert A.is_sparse - assert not A.config_options_from_schema().allows_duplicates + assert not A.schema_config_options().allows_duplicates @pytest.mark.parametrize("dtype", [np.float32, np.float64, np.int32, np.int64]) @@ -404,7 +404,7 @@ def test_sparse_nd_array_read_as_pandas( with 
soma.SparseNDArray.open(tmp_path.as_posix()) as A: assert A.is_sparse - assert not A.config_options_from_schema().allows_duplicates + assert not A.schema_config_options().allows_duplicates @pytest.mark.parametrize("shape_is_nones", [True, False]) @@ -1114,7 +1114,7 @@ def test_tile_extents(tmp_path): ).close() with soma.SparseNDArray.open(tmp_path.as_posix()) as A: - dim_info = json.loads(A.config_options_from_schema().dims) + dim_info = json.loads(A.schema_config_options().dims) assert int(dim_info["soma_dim_0"]["tile"]) == 2048 assert int(dim_info["soma_dim_1"]["tile"]) == 2048 @@ -1157,7 +1157,7 @@ def test_create_platform_config_overrides( ).close() with soma.SparseNDArray.open(tmp_path.as_posix()) as A: - cfg = A.config_options_from_schema() + cfg = A.schema_config_options() assert expected_schema_fields["validity_filters"] == json.loads( cfg.validity_filters ) diff --git a/libtiledbsoma/src/soma/soma_array.h b/libtiledbsoma/src/soma/soma_array.h index 296200c873..4ef56bc567 100644 --- a/libtiledbsoma/src/soma/soma_array.h +++ b/libtiledbsoma/src/soma/soma_array.h @@ -650,6 +650,17 @@ class SOMAArray : public SOMAObject { ctx_->tiledb_ctx(), arr_); } + /** + * @brief Get members of the schema (capacity, allows_duplicates, + * tile_order, cell_order, offsets_filters, validity_filters, attr filters, + * and dim filters) in the form of a PlatformSchemaConfig. 
+ * + * @return PlatformSchemaConfig + */ + PlatformSchemaConfig schema_config_options() const { + return ArrowAdapter::platform_schema_config_from_tiledb(*schema_); + } + /** * @brief Get members of the schema (capacity, allows_duplicates, * tile_order, cell_order, offsets_filters, validity_filters, attr filters, diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index d601719abf..9cb92e2ede 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -207,6 +207,35 @@ PlatformConfig ArrowAdapter::platform_config_from_tiledb_schema( return platform_config; } +PlatformSchemaConfig ArrowAdapter::platform_schema_config_from_tiledb( + ArraySchema tiledb_schema) { + std::map layout_as_string{ + {TILEDB_ROW_MAJOR, "row-major"}, + {TILEDB_COL_MAJOR, "column-major"}, + {TILEDB_HILBERT, "hilbert"}, + {TILEDB_UNORDERED, "unordered"}, + }; + + PlatformSchemaConfig platform_config; + platform_config.capacity = tiledb_schema.capacity(); + platform_config.allows_duplicates = tiledb_schema.allows_dups(); + platform_config.tile_order = layout_as_string[tiledb_schema.tile_order()]; + platform_config.cell_order = layout_as_string[tiledb_schema.cell_order()]; + platform_config.offsets_filters = ArrowAdapter::_get_filter_list_json( + tiledb_schema.offsets_filter_list()) + .dump(); + platform_config.validity_filters = ArrowAdapter::_get_filter_list_json( + tiledb_schema.validity_filter_list()) + .dump(); + platform_config.attrs = ArrowAdapter::_get_attrs_filter_list_json( + tiledb_schema) + .dump(); + platform_config.dims = ArrowAdapter::_get_dims_list_json(tiledb_schema) + .dump(); + + return platform_config; +} + json ArrowAdapter::_get_attrs_filter_list_json( const ArraySchema& tiledb_schema) { json attrs_filter_list_as_json; diff --git a/libtiledbsoma/src/utils/arrow_adapter.h b/libtiledbsoma/src/utils/arrow_adapter.h index 81c4bc8755..7c3b14f9ca 100644 --- 
a/libtiledbsoma/src/utils/arrow_adapter.h +++ b/libtiledbsoma/src/utils/arrow_adapter.h @@ -62,7 +62,7 @@ struct ArrowBuffer { using ArrowTable = std::pair, std::unique_ptr>; -class PlatformConfig { +struct PlatformConfig { public: /* Set the ZstdFilter's level for DataFrame dims */ int32_t dataframe_dim_zstd_level = 3; @@ -190,6 +190,115 @@ class PlatformConfig { bool consolidate_and_vacuum = false; }; +/** TileDB specific configuration options that can be read back from a single + * TileDB ArraySchema. + */ +struct PlatformSchemaConfig { + public: + /* Set whether the TileDB Array allows duplicate values */ + bool allows_duplicates = false; + + /* Set the tile order as "row", "row-major", "col", or "col-major" */ + std::optional tile_order = std::nullopt; + + /* Set the cell order as "hilbert", "row", "row-major", "col", or + * "col-major" + */ + std::optional cell_order = std::nullopt; + + /* Set the tile capacity for sparse arrays */ + uint64_t capacity = 100000; + + /** + * Available filters with associated options are + * [ + * { + * "name": "GZIP", "COMPRESSION_LEVEL": (int32_t) + * }, + * { + * "name": "ZSTD", "COMPRESSION_LEVEL": (int32_t) + * }, + * { + * "name": "LZ4", "COMPRESSION_LEVEL": (int32_t) + * }, + * { + * "name": "BZIP2", "COMPRESSION_LEVEL": (int32_t) + * }, + * { + * "name": "RLE", "COMPRESSION_LEVEL": (int32_t) + * }, + * { + * "name": "DELTA", + * "COMPRESSION_LEVEL": (int32_t), + * "COMPRESSION_REINTERPRET_DATATYPE": (uint8_t) + * }, + * { + * "name": "DOUBLE_DELTA", + * "COMPRESSION_LEVEL": (int32_t), + * "COMPRESSION_REINTERPRET_DATATYPE": (uint8_t) + * }, + * { + * "name": "BIT_WIDTH_REDUCTION", + * "BIT_WIDTH_MAX_WINDOW": (uint32_t) + * }, + * { + * "name": "POSITIVE_DELTA", "POSITIVE_DELTA_MAX_WINDOW": + * (uint32_t), + * }, + * { + * "name": "DICTIONARY_ENCODING", "COMPRESSION_LEVEL": (int32_t) + * }, + * { + * "name": "SCALE_FLOAT", + * "SCALE_FLOAT_FACTOR": (double), + * "SCALE_FLOAT_OFFSET": (double), + * "SCALE_FLOAT_BYTEWIDTH": 
(uint64_t), + * }, + * { + * "name": "WEBP", + * "WEBP_INPUT_FORMAT": (uint8_t), + * "WEBP_QUALITY": (float), + * "WEBP_LOSSLESS": (uint8_t), + * }, + * "CHECKSUM_MD5", + * "CHECKSUM_SHA256", + * "XOR", + * "BITSHUFFLE", + * "BYTESHUFFLE", + * "NOOP" + * ] + * + */ + std::string + offsets_filters = R"(["DOUBLE_DELTA", "BIT_WIDTH_REDUCTION", "ZSTD"])"; + + /* Set the validity filters. */ + std::string validity_filters = ""; + + /* Set the filters for attributes. + * + * Example: + * { + * "attr_name": { + * "filters": ["XOR", {"name": "GZIP", "COMPRESSION_LEVEL": 3}] + * } + * } + * + */ + std::string attrs = ""; + + /* Set the filters and tiles for dimensions. + * + * Example: + * { + * "dim_name": {"filters": ["NoOpFilter"], "tile": 8} + * } + * + */ + + std::string dims = ""; +}; + /** * This is our application-specific wrapper around nanoarrow. * @@ -229,6 +338,15 @@ class ArrowAdapter { static std::unique_ptr arrow_schema_from_tiledb_array( std::shared_ptr ctx, std::shared_ptr tiledb_array); + /** + * @brief Get members of the TileDB Schema in the form of a + * PlatformSchemaConfig + * + * @return PlatformSchemaConfig + */ + static PlatformSchemaConfig platform_schema_config_from_tiledb( + ArraySchema tiledb_schema); + /** * @brief Get members of the TileDB Schema in the form of a PlatformConfig * diff --git a/libtiledbsoma/test/unit_soma_dataframe.cc b/libtiledbsoma/test/unit_soma_dataframe.cc index 3780f85549..600e2e190c 100644 --- a/libtiledbsoma/test/unit_soma_dataframe.cc +++ b/libtiledbsoma/test/unit_soma_dataframe.cc @@ -341,7 +341,7 @@ TEST_CASE_METHOD( .filter_type() == filter.second); } - auto config_options = sdf->config_options_from_schema(); + auto config_options = sdf->schema_config_options(); REQUIRE(config_options.capacity == 100000); REQUIRE(config_options.allows_duplicates == false); REQUIRE(config_options.tile_order == "row-major");