Skip to content

Commit

Permalink
remote: short-circuit remote size estimation for large remotes (#3537)
Browse files Browse the repository at this point in the history
  • Loading branch information
pmrowla authored Mar 30, 2020
1 parent f6077d0 commit d276ba4
Show file tree
Hide file tree
Showing 2 changed files with 72 additions and 23 deletions.
85 changes: 63 additions & 22 deletions dvc/remote/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -845,30 +845,12 @@ def cache_exists(self, checksums, jobs=None, name=None):
if len(checksums) == 1 or not self.CAN_TRAVERSE:
return self._cache_object_exists(checksums, jobs, name)

# Fetch cache entries beginning with "00..." prefix for estimating the
# size of entire remote cache
checksums = frozenset(checksums)
prefix = "0" * self.TRAVERSE_PREFIX_LEN
total_prefixes = pow(16, self.TRAVERSE_PREFIX_LEN)
with Tqdm(
desc="Estimating size of "
+ ("cache in '{}'".format(name) if name else "remote cache"),
unit="file",
) as pbar:

def update(n=1):
pbar.update(n * total_prefixes)

paths = self.list_cache_paths(
prefix=prefix, progress_callback=update
)
remote_checksums = set(map(self.path_to_checksum, paths))

if remote_checksums:
remote_size = total_prefixes * len(remote_checksums)
else:
remote_size = total_prefixes
logger.debug("Estimated remote size: {} files".format(remote_size))
# Max remote size allowed for us to use traverse method
remote_size, remote_checksums = self._estimate_cache_size(
checksums, name=name
)

traverse_pages = remote_size / self.LIST_OBJECT_PAGE_SIZE
# For sufficiently large remotes, traverse must be weighted to account
Expand Down Expand Up @@ -909,6 +891,65 @@ def update(n=1):
checksums, remote_checksums, remote_size, jobs, name
)

def _cache_paths_with_max(
self, max_paths, prefix=None, progress_callback=None
):
count = 0
for path in self.list_cache_paths(prefix, progress_callback):
yield path
count += 1
if count > max_paths:
logger.debug(
"list_cache_paths() returned max '{}' paths, "
"skipping remaining results".format(max_paths)
)
return

def _max_estimation_size(self, checksums):
# Max remote size allowed for us to use traverse method
return max(
self.TRAVERSE_THRESHOLD_SIZE,
len(checksums)
/ self.TRAVERSE_WEIGHT_MULTIPLIER
* self.LIST_OBJECT_PAGE_SIZE,
)

def _estimate_cache_size(self, checksums, short_circuit=True, name=None):
"""Estimate remote cache size based on number of entries beginning with
"00..." prefix.
"""
prefix = "0" * self.TRAVERSE_PREFIX_LEN
total_prefixes = pow(16, self.TRAVERSE_PREFIX_LEN)
if short_circuit:
max_remote_size = self._max_estimation_size(checksums)
else:
max_remote_size = None

with Tqdm(
desc="Estimating size of "
+ ("cache in '{}'".format(name) if name else "remote cache"),
unit="file",
total=max_remote_size,
) as pbar:

def update(n=1):
pbar.update(n * total_prefixes)

if max_remote_size:
paths = self._cache_paths_with_max(
max_remote_size / total_prefixes, prefix, update
)
else:
paths = self.list_cache_paths(prefix, update)

remote_checksums = set(map(self.path_to_checksum, paths))
if remote_checksums:
remote_size = total_prefixes * len(remote_checksums)
else:
remote_size = total_prefixes
logger.debug("Estimated remote size: {} files".format(remote_size))
return remote_size, remote_checksums

def _cache_exists_traverse(
self, checksums, remote_checksums, remote_size, jobs=None, name=None
):
Expand Down
10 changes: 9 additions & 1 deletion tests/unit/remote/test_base.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from unittest import TestCase

import math
import mock

from dvc.path_info import PathInfo
Expand Down Expand Up @@ -76,8 +77,15 @@ def test_cache_exists(path_to_checksum, object_exists, traverse):
):
checksums = list(range(1000))
remote.cache_exists(checksums)
# verify that _cache_paths_with_max() short circuits
# before returning all 256 remote checksums
max_checksums = math.ceil(
remote._max_estimation_size(checksums)
/ pow(16, remote.TRAVERSE_PREFIX_LEN)
)
assert max_checksums < 256
object_exists.assert_called_with(
frozenset(range(256, 1000)), None, None
frozenset(range(max_checksums, 1000)), None, None
)
traverse.assert_not_called()

Expand Down

0 comments on commit d276ba4

Please sign in to comment.