Skip to content

Commit

Permalink
perf: delay cache creation
Browse files Browse the repository at this point in the history
The older code instantiates every configured non-local cache remote as soon
as a Repo is created, and instantiating a remote loads that remote's API
libs.
  • Loading branch information
Suor committed Sep 24, 2019
1 parent bd2b590 commit e2eb4ae
Showing 1 changed file with 44 additions and 45 deletions.
89 changes: 44 additions & 45 deletions dvc/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
from __future__ import unicode_literals

import os
from funcy import cached_property

from dvc.utils.compat import builtin_str
from dvc.config import Config


Expand All @@ -21,6 +23,42 @@ def set_dir(self, dname, level=None):
)


def _make_remote_property(name):
    """Build a cached property exposing the cache configured under `name`.

    The config file lets you declare a separate cache per remote type.
    External outputs rely on this, because they require an external cache
    location. Given a config file such as:

        ['remote "dvc-storage"']
        url = ssh://localhost/tmp
        ask_password = true

        [cache]
        ssh = dvc-storage

    the property created for `name == "ssh"` behaves like this:

        self.config == {'ssh': 'dvc-storage'}
        self.ssh  # a RemoteSSH instance

    The property evaluates to None when the cache section holds no (truthy)
    entry for `name`.
    """

    def getter(self):
        # Imported inside the getter so that dvc.remote is only loaded when
        # the property is actually read.
        from dvc.remote import Remote

        remote_name = self.config.get(name)
        return Remote(self.repo, name=remote_name) if remote_name else None

    # Name the function after the cache before wrapping it in
    # cached_property.
    getter.__name__ = builtin_str(name)
    return cached_property(getter)


class Cache(object):
"""Class that manages cache locations of a dvc repo.
Expand All @@ -35,7 +73,7 @@ def __init__(self, repo):

self.repo = repo

config = repo.config.config[Config.SECTION_CACHE]
self.config = config = repo.config.config[Config.SECTION_CACHE]
local = config.get(Config.SECTION_CACHE_LOCAL)

if local:
Expand All @@ -59,48 +97,9 @@ def __init__(self, repo):
}

self.local = Remote(repo, **settings)
self.s3 = self._get_remote(config, Config.SECTION_CACHE_S3)
self.gs = self._get_remote(config, Config.SECTION_CACHE_GS)
self.ssh = self._get_remote(config, Config.SECTION_CACHE_SSH)
self.hdfs = self._get_remote(config, Config.SECTION_CACHE_HDFS)
self.azure = self._get_remote(config, Config.SECTION_CACHE_AZURE)

def _get_remote(self, config, name):
    """Resolve an entry of the cache section into a Remote instance.

    The config file lets you declare a separate cache per remote type.
    External outputs rely on this, because they require an external cache
    location. Given a config file such as:

        ['remote "dvc-storage"']
        url = ssh://localhost/tmp
        ask_password = true

        [cache]
        ssh = dvc-storage

    the `ssh` entry of the cache section is turned into the matching
    Remote instance.

    Args:
        config (dict): The cache section of the config file.
        name (str): Key to look up inside that section.

    Returns:
        remote (dvc.Remote): Remote built for the name stored under `name`,
            or None when the section has no (truthy) value for it.

    Example:
        >>> _get_remote(config={'ssh': 'dvc-storage'}, name='ssh')
    """
    from dvc.remote import Remote

    remote_name = config.get(name)
    return Remote(self.repo, name=remote_name) if remote_name else None
s3 = _make_remote_property(Config.SECTION_CACHE_S3)
gs = _make_remote_property(Config.SECTION_CACHE_GS)
ssh = _make_remote_property(Config.SECTION_CACHE_SSH)
hdfs = _make_remote_property(Config.SECTION_CACHE_HDFS)
azure = _make_remote_property(Config.SECTION_CACHE_AZURE)

0 comments on commit e2eb4ae

Please sign in to comment.