Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

erepo: make it better #3234

Merged
merged 1 commit into from
Jan 27, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 57 additions & 41 deletions dvc/external_repo.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,21 +41,28 @@ def external_repo(url, rev=None):
raise PathMissingError(exc.path, url)
finally:
repo.close()
_remove(path)


CLONES = {}
CACHE_DIRS = {}


def clean_repos():
# Outside code should not see cache while we are removing
repo_paths = list(REPOS_CACHE.values())
REPOS_CACHE.clear()
paths = list(CLONES.values()) + list(CACHE_DIRS.values())
CLONES.clear()
CACHE_DIRS.clear()

for path in repo_paths:
for path in paths:
_remove(path)


class ExternalRepo(Repo):
def __init__(self, root_dir, url):
super().__init__(root_dir)
self.url = url
self._set_cache_dir()
self._set_upstream()

def pull_to(self, path, to_info):
Expand Down Expand Up @@ -88,6 +95,15 @@ def _pull_cached(self, out, to_info):
if failed:
raise FileNotFoundError

@wrap_with(threading.Lock())
def _set_cache_dir(self):
try:
cache_dir = CACHE_DIRS[self.url]
except KeyError:
cache_dir = CACHE_DIRS[self.url] = tempfile.mkdtemp("dvc-cache")

self.cache.local.cache_dir = cache_dir

def _set_upstream(self):
# check if the URL is local and no default remote is present
# add default remote pointing to the original repo's cache location
Expand Down Expand Up @@ -142,43 +158,40 @@ def open_by_relpath(self, path, mode="r", encoding=None, **kwargs):
raise PathMissingError(path, self.url)


REPOS_CACHE = {}


@wrap_with(threading.Lock())
def _cached_clone(url, rev):
"""Clone an external git repo to a temporary directory.

Returns the path to a local temporary directory with the specified
revision checked out.
"""
if (url, rev) in REPOS_CACHE:
path = REPOS_CACHE[url, rev]
_git_pull(path, rev)
return path

new_path = tempfile.mkdtemp("dvc-erepo")

if url in REPOS_CACHE:
# Copy and an existing clean clone
# This one unlike shutil.copytree() works with an existing dir
copy_tree(REPOS_CACHE[url], new_path)
_git_pull(new_path, None)
# Get or create a clean clone
clone_path = CLONES.get(url)
if clone_path:
# Do not pull for known shas, branches and tags might move
if not _is_known_sha(clone_path, rev):
_git_pull(clone_path)
else:
# Create a new clone
_clone_repo(url, new_path)
clone_path = tempfile.mkdtemp("dvc-clone")
_git_clone(url, clone_path)
CLONES[url] = clone_path

# Save clean clone dir so that we will have access to a default branch
clean_clone_path = tempfile.mkdtemp("dvc-erepo")
copy_tree(new_path, clean_clone_path)
REPOS_CACHE[url] = clean_clone_path
# Copy to a new dir to keep the clone clean
repo_path = tempfile.mkdtemp("dvc-erepo")
copy_tree(clone_path, repo_path)

# Check out the specified revision
if rev is not None:
_git_checkout(new_path, rev)
_git_checkout(repo_path, rev)

return repo_path

REPOS_CACHE[url, rev] = new_path
return new_path

def _git_clone(url, path):
from dvc.scm.git import Git

git = Git.clone(url, path)
git.close()


def _git_checkout(repo_path, rev):
Expand All @@ -191,19 +204,29 @@ def _git_checkout(repo_path, rev):
git.close()


def _git_pull(repo_path, rev):
def _is_known_sha(repo_path, rev):
import git
from gitdb.exc import BadName

# Do not try to pull in a detached mode
if rev and git.Repo.re_hexsha_only.search(rev):
return
if not rev or not git.Repo.re_hexsha_shortened.search(rev):
return False

git = git.Repo(repo_path)
try:
msg = git.git.pull()
git.Repo(repo_path).commit(rev)
return True
except BadName:
return False


def _git_pull(repo_path):
import git

repo = git.Repo(repo_path)
try:
msg = repo.git.pull()
logger.debug("external repo: git pull: {}", msg)
finally:
git.close()
repo.close()


def _remove(path):
Expand All @@ -213,10 +236,3 @@ def _remove(path):
os_retry(remove)(path)
else:
remove(path)


def _clone_repo(url, path):
from dvc.scm.git import Git

git = Git.clone(url, path)
git.close()
20 changes: 20 additions & 0 deletions tests/func/test_external_repo.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from dvc.external_repo import external_repo
from dvc.scm.git import Git
from dvc.remote import RemoteLOCAL


def test_external_repo(erepo_dir):
Expand Down Expand Up @@ -36,3 +37,22 @@ def test_source_change(erepo_dir):
new_rev = repo.scm.get_rev()

assert old_rev != new_rev


def test_cache_reused(erepo_dir, mocker):
with erepo_dir.chdir():
erepo_dir.dvc_gen("file", "text", commit="add file")

download_spy = mocker.spy(RemoteLOCAL, "download")

# Use URL to prevent any fishy optimizations
url = "file://{}".format(erepo_dir)
with external_repo(url) as repo:
repo.fetch()
assert download_spy.mock.call_count == 1

# Should not download second time
erepo_dir.scm.branch("branch")
with external_repo(url, "branch") as repo:
repo.fetch()
assert download_spy.mock.call_count == 1