def ensure_all_collection_dir_keys_contained(
    collection_dir1: Path,
    collection_dir2: Path,
) -> None:
    """Check that every entry of ``collection_dir1`` is in ``collection_dir2``.

    Both directories are read as a two-level layout, ``<dir>/<p1>/<p2>``,
    where the pair ``(p1, p2)`` identifies a single entry.

    Args:
        collection_dir1: Directory whose entries must all be present.
        collection_dir2: Directory that is searched for those entries.

    Raises:
        DataObjectNotFoundError: If some ``<p1>/<p2>`` under
            ``collection_dir1`` has no ``<p1>/<p2>`` counterpart under
            ``collection_dir2``.
    """
    for p1 in os.listdir(collection_dir1):
        # Only look at the prefix directories that are actually needed.
        # Listing all of collection_dir2 up front costs time proportional
        # to its total size and fails on unrelated stray files in its root.
        try:
            available = set(os.listdir(os.path.join(collection_dir2, p1)))
        except FileNotFoundError:
            # No such prefix directory: nothing under it can be present.
            available = set()
        for p2 in os.listdir(os.path.join(collection_dir1, p1)):
            if p2 not in available:
                raise DataObjectNotFoundError(
                    f"Data object not found: {DATA_OBJ_ID_PREFIX}{p1}{p2}",
                )