From e90d7db6f2f2e2d256948617630cfa559d36b0d3 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 18 Sep 2023 11:13:42 -0400 Subject: [PATCH] [python] Consolidation and vacuuming are now platform configuration options (#1690) (#1696) * Consolidation and vacuuming are now platform configuration options Commit and fragment_metadata consolidation and vacuuming can improve the opening and query performance of SOMA experiments. Vacuuming requires slight coordination though and should not happen by default. Instead a platform config allows the user to control these operations based. This will be expanded to defaults for top-level `io` packages where its more likely a user is doing a one-shot ingestion and will want automatic handling. A new platform config, `consolidate_and_vacuum` has been added which is a boolean to handle this behavior. * set is_mac for ci-minimal workflow Co-authored-by: Seth Shelnutt --- .github/workflows/python-ci-minimal.yml | 1 + apis/python/src/tiledbsoma/_dataframe.py | 9 +++--- apis/python/src/tiledbsoma/_dense_nd_array.py | 7 +++-- .../python/src/tiledbsoma/_sparse_nd_array.py | 19 +++++++----- apis/python/src/tiledbsoma/_tiledb_array.py | 29 ++++++++++++++++--- .../options/_tiledb_create_options.py | 3 ++ 6 files changed, 51 insertions(+), 17 deletions(-) diff --git a/.github/workflows/python-ci-minimal.yml b/.github/workflows/python-ci-minimal.yml index a1eb0a153f..ce9f3a0f4e 100644 --- a/.github/workflows/python-ci-minimal.yml +++ b/.github/workflows/python-ci-minimal.yml @@ -29,6 +29,7 @@ jobs: python_version: ${{ matrix.python-version }} cc: ${{ matrix.cc }} cxx: ${{ matrix.cxx }} + is_mac: ${{ contains(matrix.os, 'macos') }} report_codecov: ${{ matrix.python-version == '3.10' }} run_lint: ${{ matrix.python-version == '3.10' }} secrets: inherit diff --git a/apis/python/src/tiledbsoma/_dataframe.py b/apis/python/src/tiledbsoma/_dataframe.py index 3d77312763..cb118f9fd4 100644 --- a/apis/python/src/tiledbsoma/_dataframe.py +++ b/apis/python/src/tiledbsoma/_dataframe.py @@ -401,7 +401,6 @@ def write( """ _util.check_type("values", values, (pa.Table,)) - del platform_config # unused dim_cols_map: Dict[str, pd.DataFrame] = {} attr_cols_map: Dict[str, pd.DataFrame] = {} dim_names_set = self.index_column_names @@ -437,14 +436,17 @@ def write( dim_cols_list = [dim_cols_map[name] for name in self.index_column_names] dim_cols_tuple = tuple(dim_cols_list) self._handle.writer[dim_cols_tuple] = attr_cols_map - self._consolidate_and_vacuum_fragment_metadata() + tiledb_create_options = TileDBCreateOptions.from_platform_config( + platform_config + ) + if tiledb_create_options.consolidate_and_vacuum: + self._consolidate_and_vacuum() return self def _set_reader_coord( self, sr: clib.SOMAArray, dim_idx: int, dim: tiledb.Dim, coord: object ) -> bool: - if coord is None: return True # No constraint; select all in this dimension @@ -582,7 +584,6 @@ def _set_reader_coord_by_py_seq_or_np_array( def _set_reader_coord_by_numeric_slice( self, sr: clib.SOMAArray, dim_idx: int, dim: tiledb.Dim, coord: Slice[Any] ) -> bool: - try: lo_hi = _util.slice_to_numeric_range(coord, dim.domain) except _util.NonNumericDimensionError: diff --git a/apis/python/src/tiledbsoma/_dense_nd_array.py b/apis/python/src/tiledbsoma/_dense_nd_array.py index 542579d5c8..e538444930 100644 --- a/apis/python/src/tiledbsoma/_dense_nd_array.py +++ b/apis/python/src/tiledbsoma/_dense_nd_array.py @@ -172,9 +172,12 @@ def write( """ _util.check_type("values", values, (pa.Tensor,)) - del platform_config # Currently unused. self._handle.writer[coords] = values.to_numpy() - self._consolidate_and_vacuum_fragment_metadata() + tiledb_create_options = TileDBCreateOptions.from_platform_config( + platform_config + ) + if tiledb_create_options.consolidate_and_vacuum: + self._consolidate_and_vacuum() return self @classmethod diff --git a/apis/python/src/tiledbsoma/_sparse_nd_array.py b/apis/python/src/tiledbsoma/_sparse_nd_array.py index 97103d06f4..2fd86f5258 100644 --- a/apis/python/src/tiledbsoma/_sparse_nd_array.py +++ b/apis/python/src/tiledbsoma/_sparse_nd_array.py @@ -183,9 +183,11 @@ def write( Lifecycle: Experimental. """ - del platform_config # Currently unused. arr = self._handle.writer + tiledb_create_options = TileDBCreateOptions.from_platform_config( + platform_config + ) if isinstance(values, pa.SparseCOOTensor): # Write bulk data @@ -197,8 +199,9 @@ def write( bounding_box = self._compute_bounding_box_metadata(maxes) self._set_bounding_box_metadata(bounding_box) - # Consolidate non-bulk data - self._consolidate_and_vacuum_fragment_metadata() + if tiledb_create_options.consolidate_and_vacuum: + # Consolidate non-bulk data + self._consolidate_and_vacuum() return self if isinstance(values, (pa.SparseCSCMatrix, pa.SparseCSRMatrix)): @@ -216,8 +219,9 @@ def write( bounding_box = self._compute_bounding_box_metadata([nr - 1, nc - 1]) self._set_bounding_box_metadata(bounding_box) - # Consolidate non-bulk data - self._consolidate_and_vacuum_fragment_metadata() + if tiledb_create_options.consolidate_and_vacuum: + # Consolidate non-bulk data + self._consolidate_and_vacuum() return self if isinstance(values, pa.Table): @@ -241,8 +245,9 @@ def write( bounding_box = self._compute_bounding_box_metadata(maxes) self._set_bounding_box_metadata(bounding_box) - # Consolidate non-bulk data - self._consolidate_and_vacuum_fragment_metadata() + if tiledb_create_options.consolidate_and_vacuum: + # Consolidate non-bulk data + self._consolidate_and_vacuum() return self raise TypeError( diff --git a/apis/python/src/tiledbsoma/_tiledb_array.py b/apis/python/src/tiledbsoma/_tiledb_array.py index a5bdeed4a2..516563f4a8 100644 --- a/apis/python/src/tiledbsoma/_tiledb_array.py +++ b/apis/python/src/tiledbsoma/_tiledb_array.py @@ -6,7 +6,7 @@ import ctypes import os import sys -from typing import Any, Dict, Optional, Sequence, Tuple +from typing import Any, Dict, List, Optional, Sequence, Tuple import pyarrow as pa import tiledb @@ -194,7 +194,9 @@ def _create_internal( cls._set_create_metadata(handle) return handle - def _consolidate_and_vacuum_fragment_metadata(self) -> None: + def _consolidate_and_vacuum( + self, modes: List[str] = ["fragment_meta", "commits"] + ) -> None: """ This post-ingestion helper consolidates and vacuums fragment metadata and commit files -- this is quick to do, and positively impacts query performance. It does _not_ consolidate @@ -202,12 +204,31 @@ def _consolidate_and_vacuum_fragment_metadata(self) -> None: discretion. """ - for mode in ["fragment_meta", "commits"]: + for mode in modes: + self._consolidate(modes=[mode]) + self._vacuum(modes=[mode]) + def _consolidate(self, modes: List[str] = ["fragment_meta", "commits"]) -> None: + """ + This post-ingestion helper consolidates by default fragment metadata and commit files -- + this is quick to do, and positively impacts query performance. + """ + + for mode in modes: cfg = self._ctx.config() cfg["sm.consolidation.mode"] = mode - cfg["sm.vacuum.mode"] = mode ctx = tiledb.Ctx(cfg) tiledb.consolidate(self.uri, ctx=ctx) + + def _vacuum(self, modes: List[str] = ["fragment_meta", "commits"]) -> None: + """ + This post-ingestion helper vacuums by default fragment metadata and commit files. Vacuuming is not multi-process safe and requires coordination that nothing is currently reading the files that will be vacuumed. + """ + + for mode in modes: + cfg = self._ctx.config() + cfg["sm.vacuum.mode"] = mode + ctx = tiledb.Ctx(cfg) + tiledb.vacuum(self.uri, ctx=ctx) diff --git a/apis/python/src/tiledbsoma/options/_tiledb_create_options.py b/apis/python/src/tiledbsoma/options/_tiledb_create_options.py index a60c687197..74b69f9a33 100644 --- a/apis/python/src/tiledbsoma/options/_tiledb_create_options.py +++ b/apis/python/src/tiledbsoma/options/_tiledb_create_options.py @@ -143,6 +143,9 @@ class TileDBCreateOptions: attrs: Mapping[str, _ColumnConfig] = attrs_.field( factory=dict, converter=_normalize_columns ) + consolidate_and_vacuum: bool = attrs_.field( + validator=vld.instance_of(bool), default=False + ) @classmethod def from_platform_config(