Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

gdrive: fix multi-remote workflow, cont. cleanup #3686

Merged
merged 13 commits into from
Apr 29, 2020
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions dvc/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,7 +305,7 @@ def load_one(self, level):
conf = _load_config(self.files[level])
conf = self._load_paths(conf, self.files[level])

# Autovivify sections
# Auto-verify sections
for key in COMPILED_SCHEMA.schema:
conf.setdefault(key, {})

Expand Down Expand Up @@ -341,7 +341,12 @@ def _save_paths(conf, filename):

@staticmethod
def _map_dirs(conf, func):
dirs_schema = {"cache": {"dir": func}, "remote": {str: {"url": func}}}
dirs_schema = {
"cache": {"dir": func},
"remote": {
str: {"url": func, "gdrive_user_credentials_file": func}
},
}
return Schema(dirs_schema, extra=ALLOW_EXTRA)(conf)

def _load_config_to_level(self, level=None):
Expand Down
14 changes: 7 additions & 7 deletions dvc/data_cloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,12 @@ class DataCloud(object):
def __init__(self, repo):
self.repo = repo

def get_remote(self, remote=None, command="<command>"):
if not remote:
remote = self.repo.config["core"].get("remote")
def get_remote(self, name=None, command="<command>"):
if not name:
name = self.repo.config["core"].get("remote")

if remote:
return self._init_remote(remote)
if name:
return self._init_remote(name)

if bool(self.repo.config["remote"]):
error_msg = (
Expand All @@ -45,8 +45,8 @@ def get_remote(self, remote=None, command="<command>"):

raise NoRemoteError(error_msg)

def _init_remote(self, remote):
return Remote(self.repo, name=remote)
def _init_remote(self, name):
return Remote(self.repo, name=name)

def push(self, cache, jobs=None, remote=None, show_checksums=False):
"""Push data items in a cloud-agnostic way.
Expand Down
182 changes: 128 additions & 54 deletions dvc/remote/gdrive.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,21 +21,27 @@


class GDrivePathNotFound(DvcException):
def __init__(self, path_info):
super().__init__("Google Drive path '{}' not found.".format(path_info))
def __init__(self, path_info, hint):
hint = "" if hint is None else " {}".format(hint)
super().__init__(
"GDrive path '{}' not found.{}".format(path_info, hint)
shcheklein marked this conversation as resolved.
Show resolved Hide resolved
)


class GDriveAccessTokenRefreshError(DvcException):
def __init__(self):
super().__init__("Google Drive access token refreshment is failed.")
class GDriveAuthError(DvcException):
def __init__(self, cred_location):

if cred_location:
message = (
"GDrive remote auth failed with credentials in '{}'.\n"
"Backup first, remove of fix it, and run the command.\n"
"It should do auth again and refresh the credentials.\n\n"
shcheklein marked this conversation as resolved.
Show resolved Hide resolved
"Details:".format(cred_location)
)
else:
message = "Failed to authenticate GDrive remote"

class GDriveMissedCredentialKeyError(DvcException):
def __init__(self, path):
super().__init__(
"Google Drive user credentials file '{}' "
"misses value for key.".format(path)
)
super().__init__(message)


def _extract(exc, field):
Expand Down Expand Up @@ -115,8 +121,7 @@ def __init__(self, repo, config):

if not self.path_info.bucket:
raise DvcException(
"Empty Google Drive URL '{}'. Learn more at "
"{}.".format(
"Empty GDrive URL '{}'. Learn more at {}".format(
config["url"],
format_link("https://man.dvc.org/remote/add"),
)
Expand Down Expand Up @@ -155,51 +160,89 @@ def _validate_config(self):
or not self._service_account_p12_file_path
):
raise DvcException(
"To use service account please specify {}, {} and "
"{} in DVC config. Learn more at "
"{}.".format(
"gdrive_service_account_email",
"gdrive_service_account_p12_file_path",
"gdrive_service_account_user_email (optional)",
format_link("https://man.dvc.org/remote/modify"),
"To use service account set 'gdrive_service_account_email',\n"
"'gdrive_service_account_p12_file_path', and optionally "
"'gdrive_service_account_user_email'\nin DVC config. "
"Learn more at {}".format(
shcheklein marked this conversation as resolved.
Show resolved Hide resolved
format_link("https://man.dvc.org/remote/modify")
)
)

# Validate OAuth 2.0 Client ID configuration
if not self._use_service_account:
if bool(self._client_id) != bool(self._client_secret):
raise DvcException(
"Please specify Google Drive's client id and secret in "
"DVC config or omit both to use defaults. Learn more at "
"{}.".format(
"Please specify GDrive's client id and secret in "
"DVC config or omit both to use defaults.\n"
shcheklein marked this conversation as resolved.
Show resolved Hide resolved
"Learn more at {}".format(
format_link("https://man.dvc.org/remote/modify")
)
)

@cached_property
def _credentials_location(self):
if os.getenv(GDriveRemote.GDRIVE_CREDENTIALS_DATA):
return GDriveRemote.GDRIVE_CREDENTIALS_DATA
if os.path.exists(self._gdrive_user_credentials_path):
return self._gdrive_user_credentials_path
return None

# Detects discrepancy in DVC config and cached credentials file.
# Usually happens when a second remote is added and it is using
# the same credentials default file. Or when someones decides to change
# DVC config client id or secret but forgets to remove the cached
# credentials file.
@staticmethod
def _validate_credentials(auth, settings):
if not os.getenv(GDriveRemote.GDRIVE_CREDENTIALS_DATA):
if (
settings["client_config"]["client_id"]
!= auth.credentials.client_id
or settings["client_config"]["client_secret"]
!= auth.credentials.client_secret
):
logger.warning(
"Client secret and id from the DVC config do not match "
"actual ones used\nto access the remote storage. Do you "
"use multiple GDrive remotes and forgot to\nset "
"'gdrive_user_credentials_file' for one or more of them? "
"Learn more at\n{}.\n".format(
shcheklein marked this conversation as resolved.
Show resolved Hide resolved
format_link("https://man.dvc.org/remote/modify")
)
)

@wrap_prop(threading.RLock())
@cached_property
def _drive(self):
from pydrive2.auth import RefreshError
from pydrive2.auth import GoogleAuth
from pydrive2.drive import GoogleDrive

if os.getenv(GDriveRemote.GDRIVE_CREDENTIALS_DATA):
with open(
self._gdrive_user_credentials_path, "w"
) as credentials_file:
credentials_file.write(
with open(self._gdrive_user_credentials_path, "w") as cred_file:
cred_file.write(
os.getenv(GDriveRemote.GDRIVE_CREDENTIALS_DATA)
)

GoogleAuth.DEFAULT_SETTINGS["client_config_backend"] = "settings"
auth_settings = {
"client_config_backend": "settings",
"save_credentials": True,
"save_credentials_backend": "file",
"save_credentials_file": self._gdrive_user_credentials_path,
"get_refresh_token": True,
"oauth_scope": [
"https://www.googleapis.com/auth/drive",
"https://www.googleapis.com/auth/drive.appdata",
],
}

if self._use_service_account:
GoogleAuth.DEFAULT_SETTINGS["service_config"] = {
auth_settings["service_config"] = {
"client_service_email": self._service_account_email,
"client_user_email": self._service_account_user_email,
"client_pkcs12_file_path": self._service_account_p12_file_path,
}
else:
GoogleAuth.DEFAULT_SETTINGS["client_config"] = {
auth_settings["client_config"] = {
"client_id": self._client_id or self.DEFAULT_GDRIVE_CLIENT_ID,
"client_secret": self._client_secret
or self.DEFAULT_GDRIVE_CLIENT_SECRET,
Expand All @@ -208,34 +251,30 @@ def _drive(self):
"revoke_uri": "https://oauth2.googleapis.com/revoke",
"redirect_uri": "",
}
GoogleAuth.DEFAULT_SETTINGS["save_credentials"] = True
GoogleAuth.DEFAULT_SETTINGS["save_credentials_backend"] = "file"
GoogleAuth.DEFAULT_SETTINGS[
"save_credentials_file"
] = self._gdrive_user_credentials_path
GoogleAuth.DEFAULT_SETTINGS["get_refresh_token"] = True
GoogleAuth.DEFAULT_SETTINGS["oauth_scope"] = [
"https://www.googleapis.com/auth/drive",
"https://www.googleapis.com/auth/drive.appdata",
]

# Pass non existent settings path to force DEFAULT_SETTINGS loading
GoogleAuth.DEFAULT_SETTINGS.update(auth_settings)

# Pass non existent settings path to force DEFAULT_SETTINGS loadings
gauth = GoogleAuth(settings_file="")

try:
logger.debug(
"GDrive remote auth with config '{}'.".format(
GoogleAuth.DEFAULT_SETTINGS
)
)
if self._use_service_account:
gauth.ServiceAuth()
else:
gauth.CommandLineAuth()
except RefreshError as exc:
raise GDriveAccessTokenRefreshError from exc
except KeyError as exc:
raise GDriveMissedCredentialKeyError(
self._gdrive_user_credentials_path
) from exc
# Handle pydrive2.auth.AuthenticationError and other auth failures
GDriveRemote._validate_credentials(gauth, auth_settings)

# Handle AuthenticationError, RefreshError and other auth failures
# It's hard to come up with a narrow exception, since PyDrive throws
# a lot of different errors - broken credentials file, refresh token
# expired, flow failed, etc.
except Exception as exc:
raise DvcException("Google Drive authentication failed") from exc
raise GDriveAuthError(self._credentials_location) from exc
finally:
if os.getenv(GDriveRemote.GDRIVE_CREDENTIALS_DATA):
os.remove(self._gdrive_user_credentials_path)
Expand All @@ -248,7 +287,11 @@ def _ids_cache(self):
cache = {
"dirs": defaultdict(list),
"ids": {},
"root_id": self._get_item_id(self.path_info, use_cache=False),
"root_id": self._get_item_id(
self.path_info,
use_cache=False,
hint="Check remote directory exists and you can access it.",
shcheklein marked this conversation as resolved.
Show resolved Hide resolved
),
}

self._cache_path_id(self.path_info.path, cache["root_id"], cache)
Expand All @@ -272,18 +315,42 @@ def _list_params(self):
if self._bucket != "root" and self._bucket != "appDataFolder":
drive_id = self._gdrive_shared_drive_id(self._bucket)
if drive_id:
logger.debug(
"GDrive remote '{}' is using shared drive id '{}'.".format(
self.path_info, drive_id
)
)
params["driveId"] = drive_id
params["corpora"] = "drive"
return params

@_gdrive_retry
def _gdrive_shared_drive_id(self, item_id):
from pydrive2.files import ApiRequestError

param = {"id": item_id}
# it does not create a file on the remote
item = self._drive.CreateFile(param)
# ID of the shared drive the item resides in.
# Only populated for items in shared drives.
item.FetchMetadata("driveId")
try:
item.FetchMetadata("driveId")
except ApiRequestError as exc:
error_code = exc.error.get("code", 0)
if error_code == 404:
raise DvcException(
"'{}' for '{}':\n\n"
"1. Check that directory exists and you can access it.\n"
"2. Make sure that credentials in '{}'\n"
" are the right ones to use with this remote storage, "
"e.g. use 'gdrive_user_credentials_file' config\n"
" option if you use multiple GDrive remotes with "
"different email accounts.\n\nDetails".format(
shcheklein marked this conversation as resolved.
Show resolved Hide resolved
item_id, self.path_info, self._credentials_location
)
) from exc
raise

return item.get("driveId", None)

@_gdrive_retry
Expand Down Expand Up @@ -436,14 +503,15 @@ def _path_to_item_ids(self, path, create, use_cache):
[self._create_dir(min(parent_ids), title, path)] if create else []
)

def _get_item_id(self, path_info, create=False, use_cache=True):
def _get_item_id(self, path_info, create=False, use_cache=True, hint=None):
assert path_info.bucket == self._bucket

item_ids = self._path_to_item_ids(path_info.path, create, use_cache)
if item_ids:
return min(item_ids)

raise GDrivePathNotFound(path_info)
assert not create
raise GDrivePathNotFound(path_info, hint)

def exists(self, path_info):
try:
Expand Down Expand Up @@ -492,3 +560,9 @@ def list_cache_paths(self, prefix=None, progress_callback=None):
def remove(self, path_info):
item_id = self._get_item_id(path_info)
self._gdrive_delete_file(item_id)

def get_file_checksum(self, path_info):
raise NotImplementedError

def walk_files(self, path_info):
raise NotImplementedError
10 changes: 3 additions & 7 deletions tests/unit/remote/test_gdrive.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,7 @@
import pytest
import os

from dvc.remote.gdrive import (
GDriveRemote,
GDriveAccessTokenRefreshError,
GDriveMissedCredentialKeyError,
)
from dvc.remote.gdrive import GDriveRemote, GDriveAuthError

USER_CREDS_TOKEN_REFRESH_ERROR = '{"access_token": "", "client_id": "", "client_secret": "", "refresh_token": "", "token_expiry": "", "token_uri": "https://oauth2.googleapis.com/token", "user_agent": null, "revoke_uri": "https://oauth2.googleapis.com/revoke", "id_token": null, "id_token_jwt": null, "token_response": {"access_token": "", "expires_in": 3600, "scope": "https://www.googleapis.com/auth/drive.appdata https://www.googleapis.com/auth/drive", "token_type": "Bearer"}, "scopes": ["https://www.googleapis.com/auth/drive", "https://www.googleapis.com/auth/drive.appdata"], "token_info_uri": "https://oauth2.googleapis.com/tokeninfo", "invalid": true, "_class": "OAuth2Credentials", "_module": "oauth2client.client"}' # noqa: E501

Expand All @@ -28,13 +24,13 @@ def test_drive(self, dvc):
os.environ[
GDriveRemote.GDRIVE_CREDENTIALS_DATA
] = USER_CREDS_TOKEN_REFRESH_ERROR
with pytest.raises(GDriveAccessTokenRefreshError):
with pytest.raises(GDriveAuthError):
remote._drive

os.environ[GDriveRemote.GDRIVE_CREDENTIALS_DATA] = ""
remote = GDriveRemote(dvc, self.CONFIG)
os.environ[
GDriveRemote.GDRIVE_CREDENTIALS_DATA
] = USER_CREDS_MISSED_KEY_ERROR
with pytest.raises(GDriveMissedCredentialKeyError):
with pytest.raises(GDriveAuthError):
remote._drive