Skip to content

Commit

Permalink
git: don't use multithreading for computhing dir hash
Browse files Browse the repository at this point in the history
GitPython is not threadsafe, which was causing issues when we were
computing hash for a directory.

Fixes iterative#4079

Related to tree generalization iterative#4050
  • Loading branch information
efiop committed Jun 22, 2020
1 parent 8f190a9 commit e5d44e7
Show file tree
Hide file tree
Showing 5 changed files with 24 additions and 2 deletions.
4 changes: 4 additions & 0 deletions dvc/ignore.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,3 +244,7 @@ def stat(self, path):
if self.exists(path):
return self.tree.stat(path)
raise FileNotFoundError

@property
def hash_jobs(self):
return self.tree.hash_jobs
5 changes: 3 additions & 2 deletions dvc/remote/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,15 +323,16 @@ def save_info(self, path_info, tree=None, **kwargs):
self.PARAM_CHECKSUM: self.get_hash(path_info, tree=tree, **kwargs)
}

def _calculate_hashes(self, file_infos, tree):
@staticmethod
def _calculate_hashes(file_infos, tree):
file_infos = list(file_infos)
with Tqdm(
total=len(file_infos),
unit="md5",
desc="Computing file/dir hashes (only done once)",
) as pbar:
worker = pbar.wrap_fn(tree.get_file_hash)
with ThreadPoolExecutor(max_workers=self.hash_jobs) as executor:
with ThreadPoolExecutor(max_workers=tree.hash_jobs) as executor:
tasks = executor.map(worker, file_infos)
hashes = dict(zip(file_infos, tasks))
return hashes
Expand Down
4 changes: 4 additions & 0 deletions dvc/repo/tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -421,3 +421,7 @@ def copytree(self, top, dest):
src = root / fname
with self.open(src, mode="rb") as fobj:
copy_fobj_to_file(fobj, dest / fname)

@property
def hash_jobs(self):
return self.repo.tree.hash_jobs
6 changes: 6 additions & 0 deletions dvc/scm/git/tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,3 +183,9 @@ def to_ctime(git_time):
to_ctime(entry.ctime),
)
)

@property
def hash_jobs(self):
# NOTE: gitpython is not threadsafe. See
# https://github.com/iterative/dvc/issues/4079
return 1
7 changes: 7 additions & 0 deletions dvc/scm/tree.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
import os
import stat
from multiprocessing import cpu_count

from funcy import cached_property


class BaseTree:
Expand Down Expand Up @@ -83,6 +86,10 @@ def isexec(self, path):
def stat(path):
return os.stat(path)

@cached_property
def hash_jobs(self):
return max(1, min(4, cpu_count() // 2))


def is_working_tree(tree):
return isinstance(tree, WorkingTree) or isinstance(
Expand Down

0 comments on commit e5d44e7

Please sign in to comment.