Skip to content

Commit

Permalink
erepo: make it better
Browse files Browse the repository at this point in the history
- code moving during git pull won't affect any erepos
- source tag move will work as expected
- source cache dir setting will never break havoc
- cache is reused between all clones from same source
  • Loading branch information
Suor committed Jan 27, 2020
1 parent c2def08 commit 4e27ded
Show file tree
Hide file tree
Showing 2 changed files with 77 additions and 41 deletions.
98 changes: 57 additions & 41 deletions dvc/external_repo.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,21 +41,28 @@ def external_repo(url, rev=None):
raise PathMissingError(exc.path, url)
finally:
repo.close()
_remove(path)


CLONES = {}
CACHE_DIRS = {}


def clean_repos():
# Outside code should not see cache while we are removing
repo_paths = list(REPOS_CACHE.values())
REPOS_CACHE.clear()
paths = list(CLONES.values()) + list(CACHE_DIRS.values())
CLONES.clear()
CACHE_DIRS.clear()

for path in repo_paths:
for path in paths:
_remove(path)


class ExternalRepo(Repo):
def __init__(self, root_dir, url):
super().__init__(root_dir)
self.url = url
self._set_cache_dir()
self._set_upstream()

def pull_to(self, path, to_info):
Expand Down Expand Up @@ -88,6 +95,15 @@ def _pull_cached(self, out, to_info):
if failed:
raise FileNotFoundError

@wrap_with(threading.Lock())
def _set_cache_dir(self):
try:
cache_dir = CACHE_DIRS[self.url]
except KeyError:
cache_dir = CACHE_DIRS[self.url] = tempfile.mkdtemp("dvc-cache")

self.cache.local.cache_dir = cache_dir

def _set_upstream(self):
# check if the URL is local and no default remote is present
# add default remote pointing to the original repo's cache location
Expand Down Expand Up @@ -142,43 +158,40 @@ def open_by_relpath(self, path, mode="r", encoding=None, **kwargs):
raise PathMissingError(path, self.url)


REPOS_CACHE = {}


@wrap_with(threading.Lock())
def _cached_clone(url, rev):
"""Clone an external git repo to a temporary directory.
Returns the path to a local temporary directory with the specified
revision checked out.
"""
if (url, rev) in REPOS_CACHE:
path = REPOS_CACHE[url, rev]
_git_pull(path, rev)
return path

new_path = tempfile.mkdtemp("dvc-erepo")

if url in REPOS_CACHE:
# Copy and an existing clean clone
# This one unlike shutil.copytree() works with an existing dir
copy_tree(REPOS_CACHE[url], new_path)
_git_pull(new_path, None)
# Get or create a clean clone
clone_path = CLONES.get(url)
if clone_path:
# Do not pull for known shas, branches and tags might move
if not _is_known_sha(clone_path, rev):
_git_pull(clone_path)
else:
# Create a new clone
_clone_repo(url, new_path)
clone_path = tempfile.mkdtemp("dvc-clone")
_git_clone(url, clone_path)
CLONES[url] = clone_path

# Save clean clone dir so that we will have access to a default branch
clean_clone_path = tempfile.mkdtemp("dvc-erepo")
copy_tree(new_path, clean_clone_path)
REPOS_CACHE[url] = clean_clone_path
# Copy to a new dir to keep the clone clean
repo_path = tempfile.mkdtemp("dvc-erepo")
copy_tree(clone_path, repo_path)

# Check out the specified revision
if rev is not None:
_git_checkout(new_path, rev)
_git_checkout(repo_path, rev)

return repo_path

REPOS_CACHE[url, rev] = new_path
return new_path

def _git_clone(url, path):
from dvc.scm.git import Git

git = Git.clone(url, path)
git.close()


def _git_checkout(repo_path, rev):
Expand All @@ -191,19 +204,29 @@ def _git_checkout(repo_path, rev):
git.close()


def _git_pull(repo_path, rev):
def _is_known_sha(repo_path, rev):
import git
from gitdb.exc import BadName

# Do not try to pull in a detached mode
if rev and git.Repo.re_hexsha_only.search(rev):
return
if not rev or not git.Repo.re_hexsha_shortened.search(rev):
return False

git = git.Repo(repo_path)
try:
msg = git.git.pull()
git.Repo(repo_path).commit(rev)
return True
except BadName:
return False


def _git_pull(repo_path):
import git

repo = git.Repo(repo_path)
try:
msg = repo.git.pull()
logger.debug("external repo: git pull: {}", msg)
finally:
git.close()
repo.close()


def _remove(path):
Expand All @@ -213,10 +236,3 @@ def _remove(path):
os_retry(remove)(path)
else:
remove(path)


def _clone_repo(url, path):
from dvc.scm.git import Git

git = Git.clone(url, path)
git.close()
20 changes: 20 additions & 0 deletions tests/func/test_external_repo.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from dvc.external_repo import external_repo
from dvc.scm.git import Git
from dvc.remote import RemoteLOCAL


def test_external_repo(erepo_dir):
Expand Down Expand Up @@ -36,3 +37,22 @@ def test_source_change(erepo_dir):
new_rev = repo.scm.get_rev()

assert old_rev != new_rev


def test_cache_reused(erepo_dir, mocker):
with erepo_dir.chdir():
erepo_dir.dvc_gen("file", "text", commit="add file")

download_spy = mocker.spy(RemoteLOCAL, "download")

# Use URL to prevent any fishy optimizations
url = "file://{}".format(erepo_dir)
with external_repo(url) as repo:
repo.fetch()
assert download_spy.mock.call_count == 1

# Should not download second time
erepo_dir.scm.branch("branch")
with external_repo(url, "branch") as repo:
repo.fetch()
assert download_spy.mock.call_count == 1

0 comments on commit 4e27ded

Please sign in to comment.