Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support adding directories in google cloud storage remote #2853

Merged
merged 19 commits into from
Dec 1, 2019
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 34 additions & 5 deletions dvc/remote/gs.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,16 +138,45 @@ def remove(self, path_info):

blob.delete()

def _list_paths(self, bucket, prefix):
for blob in self.gs.bucket(bucket).list_blobs(prefix=prefix):
def _list_paths(self, path_info, max_items=None):
for blob in self.gs.bucket(path_info.bucket).list_blobs(
prefix=path_info.path, max_results=max_items
):
yield blob.name

def list_cache_paths(self):
return self._list_paths(self.path_info.bucket, self.path_info.path)
return self._list_paths(self.path_info)

def walk_files(self, path_info):
for fname in self._list_paths(path_info / ""):
# skip nested empty directories
if fname.endswith("/"):
continue
yield path_info.replace(fname)

def makedirs(self, path_info):
self.gs.bucket(path_info.bucket).blob(
(path_info / "").path
).upload_from_string("")
skshetry marked this conversation as resolved.
Show resolved Hide resolved

def isdir(self, path_info):
dir_path = path_info / ""
return bool(list(self._list_paths(dir_path, max_items=1)))

def isfile(self, path_info):
if path_info.path.endswith("/"):
return False

blob = self.gs.bucket(path_info.bucket).blob(path_info.path)
return blob.exists()

def exists(self, path_info):
paths = set(self._list_paths(path_info.bucket, path_info.path))
return any(path_info.path == path for path in paths)
"""Check if the blob exists. If it does not exist,
it could be a part of a directory path.

eg: if `data/file.txt` exists, check for `data` should return True
"""
return self.isfile(path_info) or self.isdir(path_info / "")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
return self.isfile(path_info) or self.isdir(path_info / "")
return self.isfile(path_info) or self.isdir(path_info)

No need to make it twice.


def _upload(self, from_file, to_info, name=None, no_progress_bar=True):
bucket = self.gs.bucket(to_info.bucket)
Expand Down
50 changes: 1 addition & 49 deletions tests/func/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,60 +3,12 @@

import pytest

from .test_data_cloud import _should_test_aws
from .test_data_cloud import _should_test_azure
from .test_data_cloud import _should_test_gcp
from .test_data_cloud import _should_test_hdfs
from .test_data_cloud import _should_test_oss
from .test_data_cloud import _should_test_ssh
from .test_data_cloud import get_aws_url
from .test_data_cloud import get_azure_url
from .test_data_cloud import get_gcp_url
from .test_data_cloud import get_hdfs_url
from .test_data_cloud import get_local_url
from .test_data_cloud import get_oss_url
from .test_data_cloud import get_ssh_url
from dvc import api
from dvc.exceptions import FileMissingError
from dvc.main import main
from dvc.path_info import URLInfo
from dvc.remote.config import RemoteConfig


# NOTE: staticmethod is only needed in Python 2
class Local:
should_test = staticmethod(lambda: True)
get_url = staticmethod(get_local_url)


class S3:
should_test = staticmethod(_should_test_aws)
get_url = staticmethod(get_aws_url)


class GCP:
should_test = staticmethod(_should_test_gcp)
get_url = staticmethod(get_gcp_url)


class Azure:
should_test = staticmethod(_should_test_azure)
get_url = staticmethod(get_azure_url)


class OSS:
should_test = staticmethod(_should_test_oss)
get_url = staticmethod(get_oss_url)


class SSH:
should_test = staticmethod(_should_test_ssh)
get_url = staticmethod(get_ssh_url)


class HDFS:
should_test = staticmethod(_should_test_hdfs)
get_url = staticmethod(get_hdfs_url)
from tests.remotes import Azure, GCP, HDFS, Local, OSS, S3, SSH


remote_params = [S3, GCP, Azure, OSS, SSH, HDFS]
Expand Down
230 changes: 23 additions & 207 deletions tests/func/test_data_cloud.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,8 @@
import copy
import getpass
import logging
import os
import platform
import shutil
import uuid
from subprocess import CalledProcessError
from subprocess import check_output
from subprocess import Popen
from unittest import SkipTest

import pytest
Expand All @@ -29,216 +24,37 @@
from dvc.remote.base import STATUS_DELETED
from dvc.remote.base import STATUS_NEW
from dvc.remote.base import STATUS_OK
from dvc.utils import env2bool
from dvc.utils import file_md5
from dvc.utils.compat import str
from dvc.utils.stage import dump_stage_file
from dvc.utils.stage import load_stage_file
from tests.basic_env import TestDvc
from tests.utils import spy


TEST_REMOTE = "upstream"
TEST_SECTION = 'remote "{}"'.format(TEST_REMOTE)
TEST_CONFIG = {
Config.SECTION_CACHE: {},
Config.SECTION_CORE: {Config.SECTION_CORE_REMOTE: TEST_REMOTE},
TEST_SECTION: {Config.SECTION_REMOTE_URL: ""},
}

TEST_AWS_REPO_BUCKET = os.environ.get("DVC_TEST_AWS_REPO_BUCKET", "dvc-test")
TEST_GCP_REPO_BUCKET = os.environ.get("DVC_TEST_GCP_REPO_BUCKET", "dvc-test")
TEST_OSS_REPO_BUCKET = "dvc-test"

TEST_GCP_CREDS_FILE = os.path.abspath(
os.environ.get(
"GOOGLE_APPLICATION_CREDENTIALS",
os.path.join("scripts", "ci", "gcp-creds.json"),
)
)
# Ensure that absolute path is used
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = TEST_GCP_CREDS_FILE

TEST_GDRIVE_CLIENT_ID = (
"719861249063-v4an78j9grdtuuuqg3lnm0sugna6v3lh.apps.googleusercontent.com"
from tests.remotes import (
_should_test_aws,
_should_test_azure,
_should_test_gcp,
_should_test_gdrive,
_should_test_hdfs,
_should_test_oss,
_should_test_ssh,
TEST_CONFIG,
TEST_SECTION,
TEST_GCP_CREDS_FILE,
TEST_GDRIVE_CLIENT_ID,
TEST_GDRIVE_CLIENT_SECRET,
TEST_REMOTE,
get_aws_url,
get_azure_url,
get_gcp_url,
get_gdrive_url,
get_hdfs_url,
get_local_url,
get_oss_url,
get_ssh_url,
get_ssh_url_mocked,
)
TEST_GDRIVE_CLIENT_SECRET = "2fy_HyzSwkxkGzEken7hThXb"


def _should_test_aws():
do_test = env2bool("DVC_TEST_AWS", undefined=None)
if do_test is not None:
return do_test

if os.getenv("AWS_ACCESS_KEY_ID") and os.getenv("AWS_SECRET_ACCESS_KEY"):
return True

return False


def _should_test_gdrive():
if os.getenv(RemoteGDrive.GDRIVE_USER_CREDENTIALS_DATA):
return True

return False


def _should_test_gcp():
do_test = env2bool("DVC_TEST_GCP", undefined=None)
if do_test is not None:
return do_test

if not os.path.exists(TEST_GCP_CREDS_FILE):
return False

try:
check_output(
[
"gcloud",
"auth",
"activate-service-account",
"--key-file",
TEST_GCP_CREDS_FILE,
]
)
except (CalledProcessError, OSError):
return False
return True


def _should_test_azure():
do_test = env2bool("DVC_TEST_AZURE", undefined=None)
if do_test is not None:
return do_test

return os.getenv("AZURE_STORAGE_CONTAINER_NAME") and os.getenv(
"AZURE_STORAGE_CONNECTION_STRING"
)


def _should_test_oss():
do_test = env2bool("DVC_TEST_OSS", undefined=None)
if do_test is not None:
return do_test

return (
os.getenv("OSS_ENDPOINT")
and os.getenv("OSS_ACCESS_KEY_ID")
and os.getenv("OSS_ACCESS_KEY_SECRET")
)


def _should_test_ssh():
do_test = env2bool("DVC_TEST_SSH", undefined=None)
if do_test is not None:
return do_test

# FIXME: enable on windows
if os.name == "nt":
return False

try:
check_output(["ssh", "-o", "BatchMode=yes", "127.0.0.1", "ls"])
except (CalledProcessError, IOError):
return False

return True


def _should_test_hdfs():
if platform.system() != "Linux":
return False

try:
check_output(
["hadoop", "version"], shell=True, executable=os.getenv("SHELL")
)
except (CalledProcessError, IOError):
return False

p = Popen(
"hadoop fs -ls hdfs://127.0.0.1/",
shell=True,
executable=os.getenv("SHELL"),
)
p.communicate()
if p.returncode != 0:
return False

return True


def get_local_storagepath():
return TestDvc.mkdtemp()


def get_local_url():
return get_local_storagepath()


def get_ssh_url():
return "ssh://{}@127.0.0.1:22{}".format(
getpass.getuser(), get_local_storagepath()
)


def get_ssh_url_mocked(user, port):
path = get_local_storagepath()
if os.name == "nt":
# NOTE: On Windows get_local_storagepath() will return an ntpath
# that looks something like `C:\some\path`, which is not compatible
# with SFTP paths [1], so we need to convert it to a proper posixpath.
# To do that, we should construct a posixpath that would be relative
# to the server's root. In our case our ssh server is running with
# `c:/` as a root, and our URL format requires absolute paths, so the
# resulting path would look like `/some/path`.
#
# [1]https://tools.ietf.org/html/draft-ietf-secsh-filexfer-13#section-6
drive, path = os.path.splitdrive(path)
assert drive.lower() == "c:"
path = path.replace("\\", "/")
url = "ssh://{}@127.0.0.1:{}{}".format(user, port, path)
return url


def get_hdfs_url():
return "hdfs://{}@127.0.0.1{}".format(
getpass.getuser(), get_local_storagepath()
)


def get_aws_storagepath():
return TEST_AWS_REPO_BUCKET + "/" + str(uuid.uuid4())


def get_aws_url():
return "s3://" + get_aws_storagepath()


def get_gdrive_url():
return "gdrive://root/" + str(uuid.uuid4())


def get_gcp_storagepath():
return TEST_GCP_REPO_BUCKET + "/" + str(uuid.uuid4())


def get_gcp_url():
return "gs://" + get_gcp_storagepath()


def get_azure_url():
container_name = os.getenv("AZURE_STORAGE_CONTAINER_NAME")
assert container_name is not None
return "azure://{}/{}".format(container_name, str(uuid.uuid4()))


def get_oss_storagepath():
return "{}/{}".format(TEST_OSS_REPO_BUCKET, (uuid.uuid4()))


def get_oss_url():
return "oss://{}".format(get_oss_storagepath())


class TestDataCloud(TestDvc):
Expand Down
3 changes: 1 addition & 2 deletions tests/func/test_remote.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,13 @@
import configobj
from mock import patch

from .test_data_cloud import get_local_url
from dvc.config import Config
from dvc.main import main
from dvc.path_info import PathInfo
from dvc.remote import RemoteLOCAL
from dvc.remote.base import RemoteBASE
from tests.basic_env import TestDvc
from tests.func.test_data_cloud import get_local_storagepath
from tests.remotes import get_local_url, get_local_storagepath


class TestRemote(TestDvc):
Expand Down
Loading