Skip to content
This repository has been archived by the owner on Feb 28, 2024. It is now read-only.

Commit

Permalink
feat(commit): ensure all data objects committed are present in the index
Browse files Browse the repository at this point in the history
  • Loading branch information
jonburdo committed Aug 17, 2022
1 parent 4420563 commit 49772c2
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 2 deletions.
11 changes: 10 additions & 1 deletion ldb/commit.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,12 @@
from pathlib import Path
from typing import Optional

from ldb.dataset import CommitInfo, Dataset, DatasetVersion
from ldb.dataset import (
CommitInfo,
Dataset,
DatasetVersion,
ensure_all_collection_dir_keys_contained,
)
from ldb.path import InstanceDir, WorkspacePath
from ldb.transform import save_transform_object
from ldb.utils import (
Expand Down Expand Up @@ -55,6 +60,10 @@ def commit(
):
print("Nothing to commit.")
return
ensure_all_collection_dir_keys_contained(
workspace_path / WorkspacePath.COLLECTION,
ldb_dir / InstanceDir.DATA_OBJECT_INFO,
)
collection_obj = collection_dir_to_object(
workspace_path / WorkspacePath.COLLECTION,
)
Expand Down
24 changes: 23 additions & 1 deletion ldb/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,11 @@
from funcy.objects import cached_property

from ldb.collections import LDBMappingCache
from ldb.exceptions import DatasetNotFoundError, LDBException
from ldb.exceptions import (
DataObjectNotFoundError,
DatasetNotFoundError,
LDBException,
)
from ldb.iter_utils import take
from ldb.op_type import OpType
from ldb.path import InstanceDir
Expand All @@ -38,6 +42,7 @@
)
from ldb.typing import JSONDecoded, JSONObject
from ldb.utils import (
DATA_OBJ_ID_PREFIX,
ROOT,
format_dataset_identifier,
format_datetime,
Expand Down Expand Up @@ -211,6 +216,23 @@ def get_collection_dir_items(
yield path.parent.name + path.name, annotation_hash_func(path)


def ensure_all_collection_dir_keys_contained(
    collection_dir1: Path,
    collection_dir2: Path,
) -> None:
    """
    Check that every two-level entry of one directory exists in another.

    Both directories are read as a two-level layout: each name directly
    under the root is a sub-directory, and each name inside that
    sub-directory forms, together with its parent's name, one key.

    Args:
        collection_dir1: Directory whose keys must all be present.
        collection_dir2: Directory that is expected to contain those keys.

    Raises:
        DataObjectNotFoundError: On the first key found under
            `collection_dir1` that has no counterpart under
            `collection_dir2`. The message identifies the key as
            `DATA_OBJ_ID_PREFIX` followed by the two path components
            concatenated.
    """
    # Gather every (first-level, second-level) name pair from the reference
    # directory up front, so each membership test below is a set lookup.
    available = set()
    for shard in os.listdir(collection_dir2):
        for entry in os.listdir(os.path.join(collection_dir2, shard)):
            available.add((shard, entry))

    for shard in os.listdir(collection_dir1):
        shard_dir = os.path.join(collection_dir1, shard)
        for entry in os.listdir(shard_dir):
            if (shard, entry) in available:
                continue
            raise DataObjectNotFoundError(
                f"Data object not found: {DATA_OBJ_ID_PREFIX}{shard}{entry}",
            )


def get_collection_size(
collection_dir: Union[str, Path],
) -> int:
Expand Down

0 comments on commit 49772c2

Please sign in to comment.