From e039c06b32f2e9e2b365e64e491bfce1039baaea Mon Sep 17 00:00:00 2001
From: Ruslan Kuprieiev
Date: Thu, 9 Jul 2020 02:28:53 +0300
Subject: [PATCH] tests: hdfs: use hadoop/docker fixtures

Fixed #4054
---
 dvc/remote/hdfs.py           |   2 +-
 scripts/ci/before_install.sh |   4 --
 scripts/ci/core-site.xml     |   6 --
 scripts/ci/hdfs-site.xml     |   6 --
 scripts/ci/install_hadoop.sh |  48 --------------
 scripts/ci/remove_hadoop.sh  |   7 ---
 setup.py                     |   2 +
 tests/docker-compose.yml     |   8 +++
 tests/remotes/__init__.py    |  31 ++++++++-
 tests/remotes/hdfs.py        | 118 ++++++++++++++++++++++-------------
 10 files changed, 117 insertions(+), 115 deletions(-)
 delete mode 100644 scripts/ci/core-site.xml
 delete mode 100644 scripts/ci/hdfs-site.xml
 delete mode 100755 scripts/ci/install_hadoop.sh
 delete mode 100755 scripts/ci/remove_hadoop.sh

diff --git a/dvc/remote/hdfs.py b/dvc/remote/hdfs.py
index fc4c316452..05dd06e5f8 100644
--- a/dvc/remote/hdfs.py
+++ b/dvc/remote/hdfs.py
@@ -165,7 +165,7 @@ def get_file_hash(self, path_info):
         # NOTE: pyarrow doesn't support checksum, so we need to use hadoop
         regex = r".*\t.*\t(?P<checksum>.*)"
         stdout = self.hadoop_fs(
-            f"checksum {path_info.path}", user=path_info.user
+            f"checksum {path_info.url}", user=path_info.user
         )
         return self._group(regex, stdout, "checksum")
 
diff --git a/scripts/ci/before_install.sh b/scripts/ci/before_install.sh
index 62b799f166..69798d2f32 100644
--- a/scripts/ci/before_install.sh
+++ b/scripts/ci/before_install.sh
@@ -31,10 +31,6 @@ if [[ "$TRAVIS_BUILD_STAGE_NAME" == "test" ]]; then
     ssh 0.0.0.0 ls &>/dev/null
   fi
 
-  if [ "$TRAVIS_OS_NAME" == "linux" ]; then
-    bash "$scriptdir/install_hadoop.sh"
-  fi
-
   if [[ "$TRAVIS_OS_NAME" == "osx" && "$TRAVIS_PULL_REQUEST" == "false" ]]; then
     brew install openssl
     $scriptdir/retry.sh brew cask install google-cloud-sdk
diff --git a/scripts/ci/core-site.xml b/scripts/ci/core-site.xml
deleted file mode 100644
index 3879811c4c..0000000000
--- a/scripts/ci/core-site.xml
+++ /dev/null
@@ -1,6 +0,0 @@
-<configuration>
-  <property>
-    <name>fs.defaultFS</name>
-    <value>hdfs://localhost:8020</value>
-  </property>
-</configuration>
diff --git a/scripts/ci/hdfs-site.xml b/scripts/ci/hdfs-site.xml
deleted file mode 100644
index e80ac2021a..0000000000
--- a/scripts/ci/hdfs-site.xml
+++ /dev/null
@@ -1,6 +0,0 @@
-<configuration>
-  <property>
-    <name>dfs.replication</name>
-    <value>1</value>
-  </property>
-</configuration>
diff --git a/scripts/ci/install_hadoop.sh b/scripts/ci/install_hadoop.sh
deleted file mode 100755
index c44bd5e64d..0000000000
--- a/scripts/ci/install_hadoop.sh
+++ /dev/null
@@ -1,48 +0,0 @@
-#!/bin/bash
-
-# NOTE: based on http://sharafjaffri.blogspot.com/2015/04/installing-single-node-hadoop-26-using.html
-
-set -x
-set -e
-
-sudo add-apt-repository -y ppa:openjdk-r/ppa
-sudo apt-get update -y
-sudo apt-get install openjdk-7-jdk
-java -version
-
-pushd /usr/local
-sudo wget https://s3-us-east-2.amazonaws.com/dvc-public/travis-test/hadoop-2.6.5.tar.gz
-sudo tar xzf hadoop-2.6.5.tar.gz
-sudo mkdir hadoop
-sudo mv hadoop-2.6.5/* hadoop/
-popd
-
-echo "export HADOOP_HOME=/usr/local/hadoop" >>env.sh
-echo "export HADOOP_MAPRED_HOME=/usr/local/hadoop" >>env.sh
-echo "export HADOOP_COMMON_HOME=/usr/local/hadoop" >>env.sh
-echo "export HADOOP_HDFS_HOME=/usr/local/hadoop" >>env.sh
-echo "export YARN_HOME=/usr/local/hadoop" >>env.sh
-echo "export HADOOP_COMMON_LIB_NATIVE_DIR=/usr/local/hadoop/lib/native" >>env.sh
-
-echo "export JAVA_HOME=/usr/" >>env.sh
-echo "export PATH=\$PATH:/usr/local/hadoop/sbin:/usr/local/hadoop/bin:$JAVA_PATH/bin" >>env.sh
-
-source env.sh
-
-sudo bash -c 'echo "export JAVA_HOME=/usr/" >> /usr/local/hadoop/etc/hadoop/hadoop-env.sh'
-
-sudo cp scripts/ci/core-site.xml /usr/local/hadoop
-sudo cp scripts/ci/hdfs-site.xml /usr/local/hadoop
-
-hadoop fs -help
-pushd ~
-sudo mkdir -p /home/hadoop/hadoopinfra/hdfs/namenode/
-sudo chmod 777 /home/hadoop/hadoopinfra/hdfs/namenode/
-sudo chmod 777 -R /usr/local/hadoop
-hdfs namenode -format
-start-dfs.sh
-jps
-hadoop fs -ls /
-hadoop fs -ls hdfs://127.0.0.1/
-hadoop fs -mkdir hdfs://127.0.0.1/ololo
-popd
diff --git a/scripts/ci/remove_hadoop.sh b/scripts/ci/remove_hadoop.sh
deleted file mode 100755
index c8b56d5962..0000000000
--- a/scripts/ci/remove_hadoop.sh
+++ /dev/null
@@ -1,7 +0,0 @@
-#!/bin/bash
-
-stop-dfs.sh
-sudo rm -rf /usr/local/hadoop
-sudo rm -rf /usr/local/hadoop-2.6.5
-sudo rm -rf /usr/local/hadoop-2.6.5.tar.gz
-rm env.sh
diff --git a/setup.py b/setup.py
index b49f57d19c..f0646c7a2c 100644
--- a/setup.py
+++ b/setup.py
@@ -131,6 +131,8 @@ def run(self):
     "pylint",
     "pylint-pytest",
     "pylint-plugin-utils",
+    "wget",
+    "filelock",
 ]
 
 if (sys.version_info) >= (3, 6):
diff --git a/tests/docker-compose.yml b/tests/docker-compose.yml
index c06b60ce67..04d6774857 100644
--- a/tests/docker-compose.yml
+++ b/tests/docker-compose.yml
@@ -9,3 +9,11 @@ services:
     image: rkuprieiev/oss-emulator
     ports:
       - "8880"
+  hdfs:
+    image: rkuprieiev/docker-hdfs
+    ports:
+      - "8020"
+      # NOTE: making this port a dynamic one would require modifying
+      # `dfs.datanode.address` in `hdfs-site.xml` and probably something
+      # else, so we use the default one for now.
+      - "50010:50010"
diff --git a/tests/remotes/__init__.py b/tests/remotes/__init__.py
index 877b1f74dd..05e1a2ea3f 100644
--- a/tests/remotes/__init__.py
+++ b/tests/remotes/__init__.py
@@ -3,7 +3,7 @@
 import pytest
 
 from .azure import Azure, azure, azure_server  # noqa: F401
-from .hdfs import HDFS, hdfs  # noqa: F401
+from .hdfs import HDFS, hadoop, hdfs, hdfs_server  # noqa: F401
 from .http import HTTP, http, http_server  # noqa: F401
 from .local import Local, local_cloud, local_remote  # noqa: F401
 from .oss import OSS, TEST_OSS_REPO_BUCKET, oss, oss_server  # noqa: F401
@@ -45,6 +45,35 @@ def docker_compose():
         pytest.skip("no docker-compose installed")
 
 
+@pytest.fixture(scope="session")
+def docker_compose_project_name():
+    return "pytest-dvc-test"
+
+
+@pytest.fixture(scope="session")
+def docker_services(
+    docker_compose_file, docker_compose_project_name, tmp_path_factory
+):
+    # Overriding the `docker_services` fixture from the `pytest_docker`
+    # plugin so that docker images are only launched once.
+
+    from filelock import FileLock
+    from pytest_docker.plugin import DockerComposeExecutor, Services
+
+    executor = DockerComposeExecutor(
+        docker_compose_file, docker_compose_project_name,
+    )
+
+    # Make sure we don't accidentally launch docker-compose in parallel,
+    # as that might result in network conflicts. Inspired by:
+    # https://github.com/pytest-dev/pytest-xdist#making-session-scoped-fixtures-execute-only-once
+    lockfile = tmp_path_factory.getbasetemp().parent / "docker-compose.lock"
+    with FileLock(str(lockfile)):
+        executor.execute("up --build -d")
+
+    return Services(executor)
+
+
 @pytest.fixture
 def remote(tmp_dir, dvc, request):
     cloud = request.param
diff --git a/tests/remotes/hdfs.py b/tests/remotes/hdfs.py
index a411a2b143..f7502b7977 100644
--- a/tests/remotes/hdfs.py
+++ b/tests/remotes/hdfs.py
@@ -1,57 +1,22 @@
-import getpass
 import locale
 import os
 import platform
+import uuid
 from contextlib import contextmanager
-from subprocess import CalledProcessError, Popen, check_output
 
 import pytest
 
 from dvc.path_info import URLInfo
 
 from .base import Base
-from .local import Local
 
 
-class HDFS(Base, URLInfo):
-    @staticmethod
-    def should_test():
-        if platform.system() != "Linux":
-            return False
-
-        try:
-            # pylint: disable=unexpected-keyword-arg
-            # see: https://github.com/PyCQA/pylint/issues/3645
-            check_output(
-                ["hadoop", "version"],
-                shell=True,
-                executable=os.getenv("SHELL"),
-            )
-        except (CalledProcessError, OSError):
-            return False
-
-        p = Popen(
-            "hadoop fs -ls hdfs://127.0.0.1/",
-            shell=True,
-            executable=os.getenv("SHELL"),
-        )
-        p.communicate()
-        if p.returncode != 0:
-            return False
-
-        return True
-
-    @staticmethod
-    def get_url():
-        return "hdfs://{}@127.0.0.1{}".format(
-            getpass.getuser(), Local.get_storagepath()
-        )
-
+class HDFS(Base, URLInfo):  # pylint: disable=abstract-method
     @contextmanager
     def _hdfs(self):
         import pyarrow
 
-        conn = pyarrow.hdfs.connect()
+        conn = pyarrow.hdfs.connect(self.host, self.port)
         try:
             yield conn
         finally:
@@ -103,8 +68,77 @@ def read_text(self, encoding=None, errors=None):
         return self.read_bytes().decode(encoding)
 
 
+@pytest.fixture(scope="session")
+def hadoop():
+    import wget
+    import tarfile
+    from appdirs import user_cache_dir
+
+    if platform.system() != "Linux":
+        pytest.skip("only supported on Linux")
+
+    hadoop_name = "hadoop-2.7.2.tar.gz"
+    java_name = "openjdk-7u75-b13-linux-x64-18_dec_2014.tar.gz"
+
+    base_url = "https://s3-us-east-2.amazonaws.com/dvc-public/dvc-test/"
+    hadoop_url = base_url + hadoop_name
+    java_url = base_url + java_name
+
+    (cache_dir,) = (user_cache_dir("dvc-test", "iterative"),)
+    dname = os.path.join(cache_dir, "hdfs")
+
+    java_tar = os.path.join(dname, java_name)
+    hadoop_tar = os.path.join(dname, hadoop_name)
+
+    java_home = os.path.join(dname, "java-se-7u75-ri")
+    hadoop_home = os.path.join(dname, "hadoop-2.7.2")
+
+    def _get(url, tar, target):
+        if os.path.isdir(target):
+            return
+
+        if not os.path.exists(tar):
+            wget.download(url, out=tar)
+        tar = tarfile.open(tar)
+        tar.extractall(dname)
+        assert os.path.isdir(target)
+
+    os.makedirs(dname, exist_ok=True)
+    _get(hadoop_url, hadoop_tar, hadoop_home)
+    _get(java_url, java_tar, java_home)
+
+    os.environ["JAVA_HOME"] = java_home
+    os.environ["HADOOP_HOME"] = hadoop_home
+    os.environ["PATH"] += f":{hadoop_home}/bin:{hadoop_home}/sbin"
+
+
+@pytest.fixture(scope="session")
+def hdfs_server(hadoop, docker_compose, docker_services):
+    import pyarrow
+
+    port = docker_services.port_for("hdfs", 8020)
+
+    def _check():
+        try:
+            # NOTE: just connecting or even opening something is not enough;
+            # we need to make sure that we are able to write something.
+            conn = pyarrow.hdfs.connect("127.0.0.1", port)
+            try:
+                with conn.open(str(uuid.uuid4()), "wb") as fobj:
+                    fobj.write(b"test")
+            finally:
+                conn.close()
+            return True
+        except (pyarrow.ArrowException, OSError):
+            return False
+
+    docker_services.wait_until_responsive(timeout=30.0, pause=5, check=_check)
+
+    return port
+
+
 @pytest.fixture
-def hdfs():
-    if not HDFS.should_test():
-        pytest.skip("no hadoop running")
-    yield HDFS(HDFS.get_url())
+def hdfs(hdfs_server):
+    port = hdfs_server
+    url = f"hdfs://127.0.0.1:{port}/{uuid.uuid4()}"
+    yield HDFS(url)
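
A minimal usage sketch (not part of the patch above): a smoke test built on the `hdfs` fixture that this diff introduces. It sticks to the pyarrow calls already used in the diff (`pyarrow.hdfs.connect`, `conn.open`, `conn.close`) plus pyarrow's `mkdir`; the test name and the exact assertions are illustrative assumptions, not part of DVC's test suite.

    import pyarrow


    def test_hdfs_smoke(hdfs):
        # `hdfs` is an HDFS(URLInfo) pointing at hdfs://127.0.0.1:<port>/<uuid>,
        # where <uuid> is a directory that does not exist yet.
        conn = pyarrow.hdfs.connect(hdfs.host, hdfs.port)
        try:
            conn.mkdir(hdfs.path)
            with conn.open(hdfs.path + "/data", "wb") as fobj:
                fobj.write(b"data")
            with conn.open(hdfs.path + "/data", "rb") as fobj:
                assert fobj.read() == b"data"
        finally:
            conn.close()

Running such a test needs the same environment the fixtures assume: Linux, a working docker-compose, and the `hdfs` service from tests/docker-compose.yml.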