Skip to content

Commit

Permalink
tree,remote: add support for WebDAV (iterative#4256)
Browse files Browse the repository at this point in the history
* tree,remote: add support for webdav

Webdav support is based on https://pypi.org/project/webdavclient3/ and
supports basic download/upload operation, directory creation as well as
existence, file hash and isdir query. Copy, move and remove are also
implemented, though probably not used yet.

WebdavURLInfo is taken from https://github.com/shizacat/dvc/tree/remote-webdav

Fixes iterative#1153

* tree,remote: add further webdavclient3 options

Webdav token auth, certificate and key path and connection timeout are
configurable. Webdav username might be specified or extracted from URL.

Refs iterative#1153

* tree,remote: validate webdav client configuration

Refs iterative#1153

* tree,remote: WebDAV is written with capitalized 'DAV'...

Refs iterative#1153

* tree,remote: terminate WebDAV makedirs at self.path_info.path

This enables the WebDAV api location (e.g. '/public.php/webdav') to be
part of the remote 'url' configuration instead of being specified
separately via the 'root' option. The 'root' option may then be used to
specify real directories at the WebDAV storage, although using it to
set the api location is still possible.

Refs iterative#1153

* tree,remote: use >=3.14.5 for WebDAV dependency webdavclient3

Context: iterative#4256 (comment)

Refs iterative#1153

* tree,remote: get rid of WebDAV 'root' option and add connection check

The WebDAV 'root' option was rather confusing and should be handled by
the initial 'path_info' from the config 'url' option.

Context: iterative#4256 (comment)

While stripping the path/root from the hostname the port got lost, which
is fixed now by simply using the URLInfo 'replace' method as suggested.

Context: iterative#4256 (comment)

The WebDAV client connection is tested by probing the existence of the
root (self.path_info.path).

Refs iterative#1153

* tree,remote: implement walk_files for WebDAV

Context: iterative#4256 (comment)

Refs: iterative#1153

* tree,remote: let WebDAV client list query file info in walk_files

Context: iterative#4256 (comment)

Refs iterative#1153

* tree,remote: add some unit tests for WebDAVTree

Refs iterative#1153

* tree,remote: use ConfigError for WebDAVTree and move Error to webdav.py

Context: iterative#4256 (comment)

Refs iterative#1153

* tree,remote: remove/change some (unnecessary) comments

Context: iterative#4256 (comment)

* tree,remote: uploading to WebDAV only create directories if not exist

* tree,remote: add BaseTree parameter use_dvcignore to WebDAVTree exists

Refs iterative/iterative#1153

* tree,remote: remove WebDAVTree copy method as proposed

Context: iterative#4256 (comment)

Refs iterative#1153

* tree,remote: add progress bar to WebDAV _download and _upload method

Context: iterative#4256 (comment)

Refs iterative#1153

Co-authored-by: Christoph Berganski <[email protected]>
  • Loading branch information
iksnagreb and Christoph Berganski authored Jul 29, 2020
1 parent c8d2f4f commit 2f48bb4
Show file tree
Hide file tree
Showing 8 changed files with 337 additions and 1 deletion.
12 changes: 12 additions & 0 deletions dvc/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,16 @@ class RelPath(str):
"password": str,
"ask_password": Bool,
}
# Option schema for WebDAV remotes; merged with REMOTE_COMMON into the
# "webdav" and "webdavs" entries of SCHEMA.
WEBDAV_COMMON = {
# User name for authentication
"user": str,
# Password for authentication
"password": str,
# Boolean flag: ask for the password when none is configured
"ask_password": Bool,
# Token used for WebDAV token auth
"token": str,
# Path to the certificate
"cert_path": str,
# Path to the private key
"key_path": str,
# Connection timeout; the configured value is passed through Coerce(int)
"timeout": Coerce(int),
}

SCHEMA = {
"core": {
"remote": Lower,
Expand Down Expand Up @@ -199,6 +209,8 @@ class RelPath(str):
},
"http": {**HTTP_COMMON, **REMOTE_COMMON},
"https": {**HTTP_COMMON, **REMOTE_COMMON},
"webdav": {**WEBDAV_COMMON, **REMOTE_COMMON},
"webdavs": {**WEBDAV_COMMON, **REMOTE_COMMON},
"remote": {str: object}, # Any of the above options are valid
}
)
Expand Down
14 changes: 14 additions & 0 deletions dvc/path_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -315,3 +315,17 @@ def __eq__(self, other):
and self._path == other._path
and self._extra_parts == other._extra_parts
)


# See https://github.com/shizacat/dvc/blob/remote-webdav/dvc/path_info.py
class WebDAVURLInfo(HTTPURLInfo):
    """URL info for WebDAV remotes whose ``url`` uses an HTTP(S) scheme."""

    @cached_property
    def url(self):
        """Return the URL with ``webdav`` in the scheme replaced by ``http``.

        The optional params, query and fragment components are only
        emitted, together with their delimiter, when they are non-empty.
        """
        http_scheme = self.scheme.replace("webdav", "http")
        netloc = self.netloc
        spath = self._spath
        params = (";" + self.params) if self.params else ""
        query = ("?" + self.query) if self.query else ""
        fragment = ("#" + self.fragment) if self.fragment else ""
        return f"{http_scheme}://{netloc}{spath}{params}{query}{fragment}"
2 changes: 2 additions & 0 deletions dvc/scheme.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,5 @@ class Schemes:
GDRIVE = "gdrive"
LOCAL = "local"
OSS = "oss"
WEBDAV = "webdav"
WEBDAVS = "webdavs"
4 changes: 4 additions & 0 deletions dvc/tree/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
from .oss import OSSTree
from .s3 import S3Tree
from .ssh import SSHTree
from .webdav import WebDAVTree
from .webdavs import WebDAVSTree

TREES = [
AzureTree,
Expand All @@ -22,6 +24,8 @@
S3Tree,
SSHTree,
OSSTree,
WebDAVTree,
WebDAVSTree,
# NOTE: LocalTree is the default
]

Expand Down
253 changes: 253 additions & 0 deletions dvc/tree/webdav.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,253 @@
import logging
import os
import threading
from collections import deque

from funcy import cached_property, wrap_prop

from dvc.config import ConfigError
from dvc.exceptions import DvcException
from dvc.path_info import HTTPURLInfo, WebDAVURLInfo
from dvc.progress import Tqdm
from dvc.scheme import Schemes

from .base import BaseTree
from .http import ask_password

logger = logging.getLogger(__name__)


class WebDAVConnectionError(DvcException):
    """Error reporting that a WebDAV host could not be connected to."""

    def __init__(self, host):
        """Build the error message for the given ``host``."""
        message = f"Unable to connect to WebDAV {host}."
        super().__init__(message)


class WebDAVTree(BaseTree):  # pylint:disable=abstract-method
    """Tree for remotes reached via WebDAV, built on ``webdav3.client``.

    Supports existence / directory queries, ETag based file hashes,
    recursive file listing, directory creation, remove, move and
    buffered upload / download with a progress bar.
    """

    # Use webdav scheme
    scheme = Schemes.WEBDAV

    # URLInfo for Webdav ~ replaces webdav -> http
    PATH_CLS = WebDAVURLInfo

    # Traversable as walk_files is implemented
    CAN_TRAVERSE = True

    # Length of walk_files prefix
    TRAVERSE_PREFIX_LEN = 2

    # Implementation based on webdav3.client
    REQUIRES = {"webdavclient3": "webdav3.client"}

    # Chunk size for buffered upload/download with progress bar
    CHUNK_SIZE = 2 ** 16

    def __init__(self, repo, config):
        """Read the WebDAV options from ``config`` and parse the remote URL.

        Args:
            repo: Forwarded unchanged to ``BaseTree.__init__``.
            config: Mapping of remote options. The keys read here are
                ``user``, ``password``, ``ask_password``, ``token``,
                ``cert_path``, ``key_path``, ``timeout`` and ``url``.
        """
        super().__init__(repo, config)

        # Credentials: user/password (optionally asked for later) or token
        self.user = config.get("user", None)
        self.password = config.get("password", None)
        self.ask_password = config.get("ask_password", False)
        self.token = config.get("token", None)

        # Paths to certificate and private key
        self.cert_path = config.get("cert_path", None)
        self.key_path = config.get("key_path", None)

        # Connection timeout, 30 unless configured otherwise
        self.timeout = config.get("timeout", 30)

        # Remote URL; without it there is no path_info to work with
        self.url = config.get("url", None)

        if self.url:
            self.path_info = self.PATH_CLS(self.url)

            # If username not specified try to use from URL
            if self.user is None and self.path_info.user is not None:
                self.user = self.path_info.user

            # If username specified add to path_info
            if self.user is not None:
                self.path_info.user = self.user
        else:
            self.path_info = None

    # NOTE(review): wrapped with a threading.Lock through funcy's wrap_prop,
    # presumably to serialize the first-time construction - confirm.
    @wrap_prop(threading.Lock())
    @cached_property
    def _client(self):
        """Create, validate and return the ``webdav3`` client.

        Raises:
            ConfigError: If the client reports its options as invalid.
            WebDAVConnectionError: If the remote root path cannot be
                found through the client (root should always exist).
        """
        from webdav3.client import Client

        # Construct hostname from path_info by stripping path
        http_info = HTTPURLInfo(self.path_info.url)
        hostname = http_info.replace(path="").url

        # Only ask for a password if neither password nor token is set
        if self.ask_password and self.password is None and self.token is None:
            host, user = self.path_info.host, self.path_info.user
            self.password = ask_password(host, user)

        options = {
            "webdav_hostname": hostname,
            "webdav_login": self.user,
            "webdav_password": self.password,
            "webdav_token": self.token,
            "webdav_cert_path": self.cert_path,
            "webdav_key_path": self.key_path,
            "webdav_timeout": self.timeout,
            "webdav_chunk_size": self.CHUNK_SIZE,
        }

        client = Client(options)

        # Check whether client options are valid
        if not client.valid():
            raise ConfigError(
                f"Configuration for WebDAV {hostname} is invalid."
            )

        # Check whether connection is valid (root should always exist)
        if not client.check(self.path_info.path):
            raise WebDAVConnectionError(hostname)

        return client

    def exists(self, path_info, use_dvcignore=True):
        """Return the client's check result for ``path_info.path``.

        ``use_dvcignore`` is accepted but not used by this method.
        """
        return self._client.check(path_info.path)

    def get_file_hash(self, path_info):
        """Return the ETag of the remote file, without surrounding quotes.

        Raises:
            DvcException: If no ETag is available or the ETag is weak.
        """
        etag = self._client.info(path_info.path)["etag"]

        # Normalise a missing (None) ETag to "" first, so that the
        # DvcException below is raised rather than an AttributeError
        # from calling .strip() on None.
        etag = etag.strip('"') if etag else ""

        # From HTTPTree
        if not etag:
            raise DvcException(
                "could not find an ETag or "
                f"Content-MD5 header for '{path_info.url}'"
            )

        if etag.startswith("W/"):
            raise DvcException(
                "Weak ETags are not supported."
                f" (Etag: '{etag}', URL: '{path_info.url}')"
            )

        return etag

    def isdir(self, path_info):
        """Return the client's ``is_dir`` result for ``path_info.path``."""
        return self._client.is_dir(path_info.path)

    def walk_files(self, path_info, **kwargs):
        """Yield a path info for every non-directory entry below a path.

        Nothing is yielded when ``path_info`` does not exist. ``kwargs``
        are accepted but not used by this method.
        """
        if not self.exists(path_info):
            return

        # Directories still to be listed, starting with the requested one
        dirs = deque([path_info.path])

        while dirs:
            # get_info=True lets the listing itself tell files from dirs
            for entry in self._client.list(dirs.pop(), get_info=True):
                info = path_info.replace(path=entry["path"])

                if entry["isdir"]:
                    # Newly found directory, list it in a later iteration
                    dirs.append(info.path)
                else:
                    yield info

    def remove(self, path_info):
        """Remove the file/directory via the client's ``clean`` method."""
        self._client.clean(path_info.path)

    def makedirs(self, path_info):
        """Create ``path_info`` and any missing parent directories.

        The recursion stops at the tree's own ``self.path_info.path`` or
        at the first path that already exists.
        """
        # Terminate recursion
        if path_info.path == self.path_info.path or self.exists(path_info):
            return

        # Recursively descend towards the root, creating parents first
        self.makedirs(path_info.parent)

        # Construct directory at current recursion depth
        self._client.mkdir(path_info.path)

    def move(self, from_info, to_info, mode=None):
        """Move a remote file/directory using the client's ``move``.

        ``mode`` is accepted but not used by this method.
        """
        self._client.move(from_info.path, to_info.path)

    def _download(self, from_info, to_file, name=None, no_progress_bar=False):
        """Download ``from_info`` into the local file ``to_file``.

        The progress bar is described by ``name`` (or the remote URL when
        ``name`` is None); the remote file size is only queried when the
        progress bar is enabled.
        """
        # Progress from HTTPTree
        with open(to_file, "wb") as fd:
            with Tqdm.wrapattr(
                fd,
                "write",
                total=None if no_progress_bar else self._file_size(from_info),
                leave=False,
                desc=from_info.url if name is None else name,
                disable=no_progress_bar,
            ) as fd_wrapped:
                # Download from WebDAV via buffer
                self._client.download_from(
                    buff=fd_wrapped, remote_path=from_info.path
                )

    def _upload(self, from_file, to_info, name=None, no_progress_bar=False):
        """Upload the local file ``from_file`` to ``to_info``.

        Missing parent directories are created first. The file is read
        in pieces of ``CHUNK_SIZE`` through a progress-tracking wrapper.
        """
        # First try to create parent directories
        self.makedirs(to_info.parent)

        # Progress from HTTPTree
        def chunks():
            with open(from_file, "rb") as fd:
                with Tqdm.wrapattr(
                    fd,
                    "read",
                    total=None
                    if no_progress_bar
                    else os.path.getsize(from_file),
                    leave=False,
                    desc=to_info.url if name is None else name,
                    disable=no_progress_bar,
                ) as fd_wrapped:
                    while True:
                        chunk = fd_wrapped.read(self.CHUNK_SIZE)
                        if not chunk:
                            break
                        yield chunk

        # Upload to WebDAV via buffer
        self._client.upload_to(buff=chunks(), remote_path=to_info.path)

    def _file_size(self, path_info):
        """Return the remote file size from the client info as an int."""
        return int(self._client.info(path_info.path)["size"])
7 changes: 7 additions & 0 deletions dvc/tree/webdavs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from dvc.scheme import Schemes

from .webdav import WebDAVTree


class WebDAVSTree(WebDAVTree):  # pylint:disable=abstract-method
    """``WebDAVTree`` variant whose scheme is ``Schemes.WEBDAVS``."""

    scheme = Schemes.WEBDAVS
4 changes: 3 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,12 +93,13 @@ def run(self):
oss = ["oss2==2.6.1"]
ssh = ["paramiko>=2.5.0"]
hdfs = ["pyarrow>=0.17.0"]
webdav = ["webdavclient3>=3.14.5"]
# gssapi should not be included in all_remotes, because it doesn't have wheels
# for linux and mac, so it will fail to compile if user doesn't have all the
# requirements, including kerberos itself. Once all the wheels are available,
# we can start shipping it by default.
ssh_gssapi = ["paramiko[gssapi]>=2.5.0"]
all_remotes = gs + s3 + azure + ssh + oss + gdrive + hdfs
all_remotes = gs + s3 + azure + ssh + oss + gdrive + hdfs + webdav

# Extra dependecies to run tests
tests_requirements = [
Expand Down Expand Up @@ -162,6 +163,7 @@ def run(self):
"ssh": ssh,
"ssh_gssapi": ssh_gssapi,
"hdfs": hdfs,
"webdav": webdav,
"tests": tests_requirements,
},
keywords="data-science data-version-control machine-learning git"
Expand Down
42 changes: 42 additions & 0 deletions tests/unit/remote/test_webdav.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
from dvc.tree.webdav import WebDAVTree

# Test configuration
# Remote URL without a user name, using the "webdavs" scheme
url = "webdavs://example.com/public.php/webdav"
# User name, passed via config in some tests and embedded into ``userurl``
user = "username"
# Same remote URL with the user name placed in front of the host
userurl = f"webdavs://{user}@example.com/public.php/webdav"
# Password passed via config
password = "password"


# Test minimum required configuration (url)
def test_init(dvc):
    """A config holding only ``url`` yields a matching ``path_info``."""
    webdav_tree = WebDAVTree(dvc, {"url": url})

    assert webdav_tree.path_info == url


# Test username from configuration
def test_user(dvc):
    """A configured ``user`` is stored on the tree and its ``path_info``."""
    webdav_tree = WebDAVTree(dvc, {"url": url, "user": user})

    assert webdav_tree.user == user
    assert webdav_tree.path_info.user == user


# Test username extraction from url
def test_userurl(dvc):
    """A user name embedded in the URL is picked up as the tree's user."""
    webdav_tree = WebDAVTree(dvc, {"url": userurl})

    assert webdav_tree.path_info == userurl
    assert webdav_tree.user == user
    assert webdav_tree.path_info.user == user


# test password from config
def test_password(dvc):
    """A configured ``password`` is stored on the tree."""
    settings = {"url": url, "user": user, "password": password}
    webdav_tree = WebDAVTree(dvc, settings)

    assert webdav_tree.password == password

0 comments on commit 2f48bb4

Please sign in to comment.