index: add dvc_data.index.view() and DataIndexView #195

Merged · 3 commits · Oct 7, 2022
6 changes: 3 additions & 3 deletions src/dvc_data/hashfile/tree.py
@@ -15,7 +15,7 @@
 from dvc_objects.db import ObjectDB

 from ..hashfile.hash_info import HashInfo
-from ..index import DataIndex, DataIndexKey
+from ..index import BaseDataIndex, DataIndexKey

 logger = logging.getLogger(__name__)

@@ -331,14 +331,14 @@ def merge(odb, ancestor_info, our_info, their_info, allowed=None):


 def tree_from_index(
-    index: "DataIndex",
+    index: "BaseDataIndex",
     prefix: "DataIndexKey",
 ) -> Tuple["Meta", "Tree"]:
     tree_meta = Meta(size=0, nfiles=0, isdir=True)
     assert tree_meta.size is not None and tree_meta.nfiles is not None
     tree = Tree()
     for key, entry in index.iteritems(prefix=prefix):
-        if key == prefix or entry.meta.isdir:
+        if key == prefix or entry.meta and entry.meta.isdir:
             continue
         assert entry.meta and entry.hash_info
         tree_key = key[len(prefix) :]
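Note on the `tree_from_index` change: the old `entry.meta.isdir` check raised `AttributeError` for entries whose `meta` is `None` (e.g. entries not yet loaded or hashed); the new check short-circuits instead. A minimal sketch of the pattern with stand-in classes, not the real `DataIndexEntry`:

from typing import Optional

class FakeMeta:
    isdir = False

class FakeEntry:
    # Stand-in for DataIndexEntry: its meta field is Optional there too.
    def __init__(self, meta: Optional[FakeMeta] = None):
        self.meta = meta

entry = FakeEntry(meta=None)

# Old form: entry.meta.isdir -> AttributeError when meta is None.
# New form short-circuits on the falsy meta:
if entry.meta and entry.meta.isdir:
    print("directory entry, skipped")
else:
    print("file entry (or meta not loaded yet), processed")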
4 changes: 4 additions & 0 deletions src/dvc_data/index/__init__.py
@@ -8,3 +8,7 @@
     write_db,
     write_json,
 )
+from .view import (  # noqa: F401, pylint: disable=unused-import
+    DataIndexView,
+    view,
+)
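The `view()` helper and `DataIndexView` are the point of this PR, but `src/dvc_data/index/view.py` itself is not shown in this diff. A hedged usage sketch, assuming `view(index, filter_fn)` takes a key predicate and returns a read-only `DataIndexView`:

from dvc_data.index import DataIndex, DataIndexEntry, view

index = DataIndex()
index[("data", "train.csv")] = DataIndexEntry()
index[("models", "model.pkl")] = DataIndexEntry()

# Assumed signature: view(index, filter_fn) -> DataIndexView.
filtered = view(index, lambda key: key[0] == "data")

for key, entry in filtered.iteritems():
    print(key)  # only keys under ("data",)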
6 changes: 3 additions & 3 deletions src/dvc_data/index/checkout.py
@@ -8,14 +8,14 @@
 if TYPE_CHECKING:
     from dvc_objects.fs.base import FileSystem

-    from .index import DataIndex
+    from .index import BaseDataIndex


 def checkout(
-    index: "DataIndex",
+    index: "BaseDataIndex",
     path: str,
     fs: "FileSystem",
-    old: Optional["DataIndex"] = None,
+    old: Optional["BaseDataIndex"] = None,
     delete=False,
 ) -> None:
     delete = []
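Widening `checkout()` from `DataIndex` to `BaseDataIndex` means a view can be materialized directly. A sketch under the same assumed `view()` signature; `localfs` is `dvc_objects`' local filesystem instance, and the index is assumed to be populated elsewhere:

from dvc_objects.fs import localfs

from dvc_data.index import DataIndex, view
from dvc_data.index.checkout import checkout

index = DataIndex()  # assume entries were loaded elsewhere
subset = view(index, lambda key: key[:1] == ("data",))

# Materialize only the ("data",) subtree into the workspace.
checkout(subset, "/tmp/workspace", localfs)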
4 changes: 2 additions & 2 deletions src/dvc_data/index/diff.py
@@ -3,7 +3,7 @@
 from attrs import define, field

 if TYPE_CHECKING:
-    from .index import DataIndex
+    from .index import BaseDataIndex

 from .index import DataIndexEntry

@@ -45,7 +45,7 @@ def __bool__(self):
         return self.typ != UNCHANGED


-def diff(old: Optional["DataIndex"], new: Optional["DataIndex"]):
+def diff(old: Optional["BaseDataIndex"], new: Optional["BaseDataIndex"]):
     old_keys = {key for key, _ in old.iteritems()} if old else set()
     new_keys = {key for key, _ in new.iteritems()} if new else set()
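Since `diff()` now accepts any `BaseDataIndex`, an index can be diffed against a filtered view of itself, e.g. to see what a view hides. A sketch reusing the assumed `view(index, filter_fn)` signature, and assuming the yielded `Change` objects expose `typ`, `old`, and `new`:

from dvc_data.index import DataIndex, DataIndexEntry, view
from dvc_data.index.diff import diff

index = DataIndex()
index[("data", "a.txt")] = DataIndexEntry()
index[("data", "b.txt")] = DataIndexEntry()

# A view that hides b.txt; diffing it against the full index
# should report b.txt as added on the "new" side.
old = view(index, lambda key: key != ("data", "b.txt"))

for change in diff(old, index):
    print(change.typ, change.old, change.new)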
150 changes: 97 additions & 53 deletions src/dvc_data/index/index.py
@@ -1,7 +1,18 @@
+from abc import ABC, abstractmethod
 from collections import defaultdict
-from collections.abc import MutableMapping
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Dict, Iterable, Optional, Tuple
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    Dict,
+    Iterable,
+    Iterator,
+    Mapping,
+    MutableMapping,
+    Optional,
+    Tuple,
+)

 from dvc_objects.errors import ObjectFormatError
 from pygtrie import ShortKeyError  # noqa: F401, pylint: disable=unused-import
@@ -77,7 +88,76 @@ def _try_load(
     return None


-class DataIndex(MutableMapping):
+class BaseDataIndex(ABC, Mapping[DataIndexKey, DataIndexEntry]):
+    @abstractmethod
+    def iteritems(
+        self, prefix: Optional[DataIndexKey] = None, shallow: bool = False
+    ) -> Iterator[Tuple[DataIndexKey, DataIndexEntry]]:
+        pass
+
+    @abstractmethod
+    def traverse(self, node_factory: Callable, **kwargs) -> Any:
+        pass
+
+    @abstractmethod
+    def has_node(self, key: DataIndexKey) -> bool:
+        pass
+
+    @abstractmethod
+    def longest_prefix(
+        self, key: DataIndexKey
+    ) -> Tuple[Optional[DataIndexKey], Optional[DataIndexEntry]]:
+        pass
+
+    def ls(self, root_key: DataIndexKey, detail=True):
+        if not detail:
+
+            def node_factory(_, key, children, *args):
+                if key == root_key:
+                    return children
+                else:
+                    return key
+
+        else:
+
+            def node_factory(_, key, children, *args):
+                if key == root_key:
+                    return children
+                else:
+                    return key, self.info(key)
+
+        return self.traverse(node_factory, prefix=root_key)
+
+    def info(self, key: DataIndexKey):
+        try:
+            entry = self[key]
+            assert entry.hash_info
+            isdir = entry.hash_info and entry.hash_info.isdir
+            return {
+                "type": "directory" if isdir else "file",
+                "size": entry.meta.size if entry.meta else 0,
+                "isexec": entry.meta.isexec if entry.meta else False,
+                "isdvc": True,
+                "isout": True,
+                "obj": entry.obj,
+                "entry": entry,
+                entry.hash_info.name: entry.hash_info.value,
+            }
+        except ShortKeyError:
+            return {
+                "type": "directory",
+                "size": 0,
+                "isexec": False,
+                "isdvc": bool(self.longest_prefix(key)),
+                "isout": False,
+                "obj": None,
+                "entry": None,
+            }
+        except KeyError as exc:
+            raise FileNotFoundError from exc
+
+
+class DataIndex(BaseDataIndex, MutableMapping[DataIndexKey, DataIndexEntry]):
     def __init__(self, *args, **kwargs):
         self._trie = Trie(*args, **kwargs)

@@ -138,20 +218,24 @@ def load(self, **kwargs):
         for key, entry in self.iteritems(shallow=True, **kwargs):
             self._load(key, entry)

-    def has_node(self, key):
+    def has_node(self, key: DataIndexKey) -> bool:
         return self._trie.has_node(key)

     def shortest_prefix(self, *args, **kwargs):
         return self._trie.shortest_prefix(*args, **kwargs)

-    def longest_prefix(self, *args, **kwargs):
-        return self._trie.longest_prefix(*args, **kwargs)
+    def longest_prefix(
+        self, key: DataIndexKey
+    ) -> Tuple[Optional[DataIndexKey], Optional[DataIndexEntry]]:
+        return self._trie.longest_prefix(key)

-    def traverse(self, *args, **kwargs):
+    def traverse(self, *args, **kwargs) -> Any:
         return self._trie.traverse(*args, **kwargs)

-    def iteritems(self, prefix=None, shallow=False):
-        kwargs = {"shallow": shallow}
+    def iteritems(
+        self, prefix: Optional[DataIndexKey] = None, shallow: bool = False
+    ) -> Iterator[Tuple[DataIndexKey, DataIndexEntry]]:
+        kwargs: Dict[str, Any] = {"shallow": shallow}
         if prefix:
             kwargs = {"prefix": prefix}
             item = self._trie.longest_prefix(prefix)
@@ -163,32 +247,8 @@ def iteritems(self, prefix=None, shallow=False):
             self._load(key, entry)
             yield key, entry

-    def info(self, key):
-        try:
-            entry = self[key]
-            isdir = entry.hash_info and entry.hash_info.isdir
-            return {
-                "type": "directory" if isdir else "file",
-                "size": entry.meta.size if entry.meta else 0,
-                "isexec": entry.meta.isexec if entry.meta else False,
-                "isdvc": True,
-                "isout": True,
-                "obj": entry.obj,
-                "entry": entry,
-                entry.hash_info.name: entry.hash_info.value,
-            }
-        except ShortKeyError:
-            return {
-                "type": "directory",
-                "size": 0,
-                "isexec": False,
-                "isdvc": bool(self.longest_prefix(key)),
-                "isout": False,
-                "obj": None,
-                "entry": None,
-            }
-        except KeyError as exc:
-            raise FileNotFoundError from exc
+    def iterkeys(self, *args, **kwargs):
+        return self._trie.iterkeys(*args, **kwargs)

     def _ensure_loaded(self, prefix):
         entry = self._trie.get(prefix)
@@ -202,25 +262,9 @@ def _ensure_loaded(self, prefix):
         if not entry.obj:
             raise TreeError

-    def ls(self, root_key, detail=True):
+    def ls(self, root_key: DataIndexKey, detail=True):
         self._ensure_loaded(root_key)
-        if not detail:
-
-            def node_factory(_, key, children, *args):
-                if key == root_key:
-                    return children
-                else:
-                    return key
-
-        else:
-
-            def node_factory(_, key, children, *args):
-                if key == root_key:
-                    return children
-                else:
-                    return key, self.info(key)
-
-        return self.traverse(node_factory, prefix=root_key)
+        return super().ls(root_key, detail=detail)


 def transfer(index, src, dst):
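`BaseDataIndex` fixes the minimal read-only surface the rest of the package now codes against: the `Mapping` protocol plus `iteritems()`, `traverse()`, `has_node()`, and `longest_prefix()`. A minimal sketch of a key-filtered implementation — a hypothetical stand-in, not the actual `DataIndexView` added by this PR — to illustrate how small that surface is:

from typing import Any, Callable, Iterator, Optional, Tuple

from dvc_data.index.index import (
    BaseDataIndex,
    DataIndex,
    DataIndexEntry,
    DataIndexKey,
)


class FilteredIndex(BaseDataIndex):
    """Hypothetical read-only, key-filtered wrapper around a DataIndex."""

    def __init__(
        self, index: DataIndex, filter_fn: Callable[[DataIndexKey], bool]
    ):
        self._index = index
        self._filter_fn = filter_fn

    def __getitem__(self, key: DataIndexKey) -> DataIndexEntry:
        if not self._filter_fn(key):
            raise KeyError(key)
        return self._index[key]

    def __iter__(self) -> Iterator[DataIndexKey]:
        return (key for key in self._index if self._filter_fn(key))

    def __len__(self) -> int:
        return sum(1 for _ in self)

    def iteritems(
        self, prefix: Optional[DataIndexKey] = None, shallow: bool = False
    ) -> Iterator[Tuple[DataIndexKey, DataIndexEntry]]:
        for key, entry in self._index.iteritems(prefix=prefix, shallow=shallow):
            if self._filter_fn(key):
                yield key, entry

    def traverse(self, node_factory: Callable, **kwargs) -> Any:
        # Naive delegation; a real view would also filter here.
        return self._index.traverse(node_factory, **kwargs)

    def has_node(self, key: DataIndexKey) -> bool:
        return self._filter_fn(key) and self._index.has_node(key)

    def longest_prefix(
        self, key: DataIndexKey
    ) -> Tuple[Optional[DataIndexKey], Optional[DataIndexEntry]]:
        return self._index.longest_prefix(key)


index = DataIndex()
index[("data", "a.txt")] = DataIndexEntry()
index[("other", "b.txt")] = DataIndexEntry()

subset = FilteredIndex(index, lambda key: key[0] == "data")
print(list(subset))  # [("data", "a.txt")]

Because `checkout()`, `diff()`, `save()`, and `tree_from_index()` now only require `BaseDataIndex`, such a wrapper is accepted anywhere a full index used to be.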
18 changes: 12 additions & 6 deletions src/dvc_data/index/save.py
@@ -4,20 +4,21 @@
 from ..hashfile.meta import Meta

 if TYPE_CHECKING:
-    from .index import DataIndex, DataIndexKey
+    from .index import BaseDataIndex, DataIndexKey


-def md5(index: "DataIndex") -> None:
+def md5(index: "BaseDataIndex") -> None:
     from ..hashfile.hash import fobj_md5

     for _, entry in index.iteritems():
-        if entry.meta.isdir:
+        assert entry.fs
+        if entry.meta and entry.meta.isdir:
             continue

         if entry.hash_info:
             continue

-        if entry.meta.version_id and entry.fs.version_aware:
+        if entry.meta and entry.meta.version_id and entry.fs.version_aware:
             # NOTE: if we have versioning available - there is no need to check
             # metadata as we can directly get correct file content using
             # version_id.
@@ -42,7 +43,9 @@
         )


-def _save_dir_entry(index: "DataIndex", key: "DataIndexKey", odb=None) -> None:
+def _save_dir_entry(
+    index: "BaseDataIndex", key: "DataIndexKey", odb=None
+) -> None:
     from ..hashfile.db import add_update_tree
     from ..hashfile.tree import tree_from_index

@@ -57,10 +60,11 @@ def _save_dir_entry(index: "DataIndex", key: "DataIndexKey", odb=None) -> None:
         setattr(entry.meta, tree.hash_info.name, tree.hash_info.value)


-def save(index: "DataIndex", odb=None) -> None:
+def save(index: "BaseDataIndex", odb=None) -> None:
     dir_entries: List["DataIndexKey"] = []

     for key, entry in index.iteritems():
+        assert entry.meta and entry.fs
         if entry.meta.isdir:
             dir_entries.append(key)
             continue
@@ -77,6 +81,8 @@

         if entry.hash_info:
             cache = odb or entry.cache
+            assert entry.hash_info.value
+            assert cache
             cache.add(
                 path,
                 entry.fs,
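The added `assert entry.meta and entry.fs` lines are less about runtime safety than type narrowing: with these functions now typed against `BaseDataIndex`, fields like `meta`, `fs`, and `hash_info` are `Optional`, and the asserts let mypy treat them as non-None afterwards. A minimal self-contained sketch of the pattern, with hypothetical stand-in classes:

from dataclasses import dataclass
from typing import Optional


@dataclass
class FakeMeta:
    isdir: bool = False


@dataclass
class FakeEntry:
    # Mirrors DataIndexEntry's all-optional fields.
    meta: Optional[FakeMeta] = None


def process(entry: FakeEntry) -> None:
    # Without this assert, mypy reports:
    #   Item "None" of "Optional[FakeMeta]" has no attribute "isdir"
    assert entry.meta
    if entry.meta.isdir:  # entry.meta is narrowed to FakeMeta here
        return


process(FakeEntry(meta=FakeMeta()))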