diff --git a/.github/PULL_REQUEST_TEMPLATE b/.github/PULL_REQUEST_TEMPLATE
index b0289f4f6..3d8952163 100644
--- a/.github/PULL_REQUEST_TEMPLATE
+++ b/.github/PULL_REQUEST_TEMPLATE
@@ -1,3 +1,9 @@
 ## What changes were proposed in this pull request?
 
 (Please fill in changes proposed in this fix)
+
+## Does this PR also require the following changes?
+
+- CI
+- Documentation
+- Example
diff --git a/.github/workflows/oap-mllib-ci.yml b/.github/workflows/oap-mllib-ci.yml
index f0c1ab3e5..1b28d0a79 100644
--- a/.github/workflows/oap-mllib-ci.yml
+++ b/.github/workflows/oap-mllib-ci.yml
@@ -20,7 +20,7 @@ jobs:
           ~/.m2/repository
           /opt/intel/oneapi
           ~/opt
-        key: ${{ runner.os }}_spark-3.1.1_hadoop-3.2.0_oneapi-2021.3.0
+        key: ${{ runner.os }}_spark-3.1.1_hadoop-3.2.0_oneapi-2021.4.0
         restore-keys: |
           ${{ runner.os }}-
     - name: Set up environments
diff --git a/.gitignore b/.gitignore
index 1d621bdd4..b69b6d7f3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,7 @@
 *.o
 *.log
-.vscode
 *.iml
+.vscode/
 target/
 .idea/
 .idea_modules/
diff --git a/.vscode/c_cpp_properties.json b/.vscode/c_cpp_properties.json
new file mode 100644
index 000000000..baa3db3b1
--- /dev/null
+++ b/.vscode/c_cpp_properties.json
@@ -0,0 +1,19 @@
+{
+    "configurations": [
+        {
+            "name": "Linux",
+            "includePath": [
+                "${workspaceFolder}/mllib-dal/src/main/native/**",
+                "${CCL_ROOT}/include/**",
+                "${DAALROOT}/include/**",
+                "${JAVA_HOME}/include/**"
+            ],
+            "defines": [],
+            "compilerPath": "${CMPLR_ROOT}/linux/bin/clang",
+            "cStandard": "c17",
+            "cppStandard": "c++14",
+            "intelliSenseMode": "clang-x64"
+        }
+    ],
+    "version": 4
+}
\ No newline at end of file
diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 000000000..2edd51bcb
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,37 @@
+{
+    "files.associations": {
+        "*.tcc": "cpp",
+        "cctype": "cpp",
+        "chrono": "cpp",
+        "cstdint": "cpp",
+        "ctime": "cpp",
+        "cwchar": "cpp",
+        "exception": "cpp",
+        "initializer_list": "cpp",
+        "iosfwd": "cpp",
+        "iostream": "cpp",
+        "istream": "cpp",
+        "limits": "cpp",
+        "ostream": "cpp",
+        "ratio": "cpp",
+        "string_view": "cpp",
+        "type_traits": "cpp",
+        "clocale": "cpp",
+        "streambuf": "cpp",
+        "algorithm": "cpp",
+        "cstdarg": "cpp",
+        "cstddef": "cpp",
+        "cstdio": "cpp",
+        "deque": "cpp",
+        "vector": "cpp",
+        "functional": "cpp",
+        "memory_resource": "cpp",
+        "string": "cpp",
+        "utility": "cpp",
+        "fstream": "cpp",
+        "iomanip": "cpp",
+        "new": "cpp",
+        "sstream": "cpp",
+        "*.template": "shellscript"
+    }
+}
\ No newline at end of file
diff --git a/README.md b/README.md
index 6088d0350..0667dc69c 100644
--- a/README.md
+++ b/README.md
@@ -10,11 +10,11 @@ OAP MLlib is an optimized package to accelerate machine learning algorithms in
 
 ## Compatibility
 
-OAP MLlib maintains the same API interfaces with Spark MLlib. That means the application built with Spark MLlib can be running directly with minimum configuration.
+OAP MLlib maintains the same API interfaces as Spark MLlib, so applications built with Spark MLlib can run directly with minimum configuration.
 
-Most of the algorithms can produce the same results that are identical with Spark MLlib. However due to the nature of distributed float point operations, there may be some small deviation from the original result, we will make sure the error is within acceptable range and the accuracy is on par with Spark MLlib.
+Most of the algorithms produce results identical to Spark MLlib's. However, due to the nature of distributed floating-point operations, there may be small deviations from the original results; we make sure the error stays within an acceptable range and that accuracy is on par with Spark MLlib.
 
-For those algorithms that are not accelerated by OAP MLlib, the original Spark MLlib one will be used.
+For those algorithms that are not accelerated by OAP MLlib, the original Spark MLlib implementation will be used.
 
 ## Online Documentation
 
@@ -55,7 +55,7 @@ Intel® oneAPI Toolkits components used by the project are already included into
 
 #### General Configuration
 
 ##### YARN Cluster Manager
-Users usually run Spark application on __YARN__ with __client__ mode. In that case, you only need to add the following configurations in `spark-defaults.conf` or in `spark-submit` command line before running.
+Users usually run Spark applications on __YARN__ in __client__ mode. In that case, you only need to add the following configurations to `spark-defaults.conf` or to the `spark-submit` command line before running.
 
 ```
 # absolute path of the jar for uploading
@@ -85,14 +85,14 @@ OAP MLlib expects 1 executor acts as 1 oneCCL rank for compute. As `spark.shuffl
 
 ### Sanity Check
 
 #### Setup `env.sh`
-```
+```bash
 $ cd conf
 $ cp env.sh.template env.sh
 ```
 Edit related variables in "`Minimum Settings`" of `env.sh`
 
 #### Upload example data files to HDFS
-```
+```bash
 $ cd examples
 $ hadoop fs -mkdir -p /user/$USER
 $ hadoop fs -copyFromLocal data
@@ -100,7 +100,7 @@ Edit related variables in "`Minimum Settings`" of `env.sh`
 ```
 
 #### Run K-means
-```
+```bash
 $ cd examples/kmeans
 $ ./build.sh
 $ ./run.sh
@@ -119,45 +119,27 @@ We use [Apache Maven](https://maven.apache.org/) to manage and build source code
 * JDK 8.0+
 * Apache Maven 3.6.2+
 * GNU GCC 4.8.5+
-* Intel® oneAPI Toolkits 2021.3.0 Components:
+* Intel® oneAPI Base Toolkit (>=2021.4.0) Components:
   - DPC++/C++ Compiler (dpcpp/clang++)
   - Data Analytics Library (oneDAL)
   - Threading Building Blocks (oneTBB)
-* [Open Source Intel® oneAPI Collective Communications Library (oneCCL)](https://github.com/oneapi-src/oneCCL)
-
-Intel® oneAPI Toolkits and its components can be downloaded and install from [here](https://software.intel.com/content/www/us/en/develop/tools/oneapi.html). Installation process for oneAPI using Package Managers (YUM (DNF), APT, and ZYPPER) is also available. Generally you only need to install oneAPI Base Toolkit for Linux with all or selected components mentioned above. Instead of using oneCCL included in Intel® oneAPI Toolkits, we prefer to build from open source oneCCL to resolve some bugs.
+  - Collective Communications Library (oneCCL)
 
-More details about oneAPI can be found [here](https://software.intel.com/content/www/us/en/develop/tools/oneapi.html).
+Generally you only need to install __Intel® oneAPI Base Toolkit for Linux__ with all or selected components mentioned above. Intel® oneAPI Base Toolkit can be downloaded and installed from [here](https://software.intel.com/content/www/us/en/develop/tools/oneapi.html). Installation process for oneAPI using Package Managers (YUM (DNF), APT, and ZYPPER) is also available. More details about oneAPI can be found [here](https://software.intel.com/content/www/us/en/develop/tools/oneapi.html).
 
-Scala and Java dependency descriptions are already included in Maven POM file.
+Scala and Java dependency descriptions are already included in the Maven POM file.
+
+***Note:*** You can refer to [this script](dev/install-build-deps-centos.sh) to install correct dependencies: DPC++/C++, oneDAL, oneTBB, oneCCL.
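+
+For example, on Ubuntu the required oneAPI components can be installed with APT (this mirrors [dev/install-build-deps-ubuntu.sh](dev/install-build-deps-ubuntu.sh)):
+```bash
+$ sudo apt-get install -y intel-oneapi-dpcpp-cpp-2021.4.0 intel-oneapi-dal-devel-2021.4.0 \
+      intel-oneapi-tbb-devel-2021.4.0 intel-oneapi-ccl-devel-2021.4.0 intel-oneapi-mpi-devel-2021.4.0
+```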
 
 ### Build
 
-#### Building oneCCL
-
-To clone and build from open source oneCCL, run the following commands:
-```
- $ git clone https://github.com/oneapi-src/oneCCL
- $ cd oneCCL
- $ git checkout 2021.2.1
- $ mkdir build && cd build
- $ cmake ..
- $ make -j install
-```
-
-The generated files will be placed in `/your/oneCCL_source_code/build/_install`
-
-#### Building OAP MLlib
-
 To clone and checkout source code, run the following commands:
-```
- $ git clone https://github.com/oap-project/oap-mllib.git
+```bash
+ $ git clone https://github.com/oap-project/oap-mllib.git
 ```
 
 __Optional__ to checkout specific release branch:
-```
- $ cd oap-mllib && git checkout ${version}
+```bash
+ $ cd oap-mllib && git checkout ${version}
 ```
 
 We rely on environment variables to find required toolchains and libraries. Please make sure the following environment variables are set for building:
@@ -171,25 +153,22 @@ CCL_ROOT | Path to oneCCL home directory
 
 We suggest you to source `setvars.sh` script into current shell to setup building environments as following:
 
-```
+```bash
 $ source /opt/intel/oneapi/setvars.sh
- $ source /your/oneCCL_source_code/build/_install/env/setvars.sh
 ```
 
-__Be noticed we are using our own built oneCCL instead, we should source oneCCL's `setvars.sh` to overwrite oneAPI one.__
-
 You can also refer to [this CI script](dev/ci-test.sh) to setup the building environments.
 
-If you prefer to buid your own open source [oneDAL](https://github.com/oneapi-src/oneDAL), [oneTBB](https://github.com/oneapi-src/oneTBB) versions rather than use the ones included in oneAPI TookKits, you can refer to the related build instructions and manually source `setvars.sh` accordingly.
+If you prefer to build your own open source [oneDAL](https://github.com/oneapi-src/oneDAL), [oneTBB](https://github.com/oneapi-src/oneTBB), [oneCCL](https://github.com/oneapi-src/oneCCL) versions rather than use the ones included in oneAPI Base Toolkit, you can refer to the related build instructions and manually source `setvars.sh` accordingly.
 
-To build, run the following commands:
-```
+To build, run the following commands:
+```bash
 $ cd mllib-dal
 $ ./build.sh
 ```
 
 If no parameter is given, the Spark version __3.1.1__ will be activated by default. You can also specify a different Spark version with option `-p spark-x.x.x`.
 For example:
-```
+```bash
 $ ./build.sh -p spark-3.0.0
 ```
 
@@ -206,6 +185,7 @@ pca | PCA example for Scala
 als | ALS example for Scala
 naive-bayes | Naive Bayes example for Scala
 linear-regression | Linear Regression example for Scala
+correlation | Correlation example for Scala
 
 ### Python Examples
 
@@ -217,12 +197,11 @@ als-pyspark | ALS example for PySpark
 
 ## List of Accelerated Algorithms
 
-Algorithm | Category | Maturity
-------------------|----------|-------------
-K-Means | CPU | Stable
-K-Means | GPU | Experimental
-PCA | CPU | Stable
-PCA | GPU | Experimental
-ALS | CPU | Stable
-Naive Bayes | CPU | Experimental
-Linear Regression | CPU | Experimental
+Algorithm | CPU | GPU | Maturity
+------------------|-----|-----|---------
+K-Means | X | X | Stable
+PCA | X | X | Stable
+ALS | X | | Experimental
+Naive Bayes | X | | Stable
+Linear Regression | X | | Stable
+Correlation | X | X | Experimental
diff --git a/RELEASE b/RELEASE
new file mode 100644
index 000000000..a72a32503
--- /dev/null
+++ b/RELEASE
@@ -0,0 +1 @@
+OAP_MLLIB_VERSION=1.2.0
\ No newline at end of file
diff --git a/conf/env.sh.template b/conf/env.sh.template
index 7bdb97f22..168f9d133 100644
--- a/conf/env.sh.template
+++ b/conf/env.sh.template
@@ -2,8 +2,6 @@
 
 # ============== Minimum Settings ============= #
 
-# Set OAP MLlib version (e.g. 1.1.0)
-OAP_MLLIB_VERSION=x.x.x
 # Set Spark master
 SPARK_MASTER=yarn
 # Set Hadoop home path
@@ -17,6 +15,9 @@ export OAP_MLLIB_ROOT=/path/to/oap-mllib/home
 
 # ============================================= #
 
+# Import RELEASE envs
+source $OAP_MLLIB_ROOT/RELEASE
+
 # Set HADOOP_CONF_DIR for Spark
 export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
 
@@ -42,7 +43,7 @@ SPARK_TOTAL_CORES=$((SPARK_NUM_EXECUTORS * SPARK_EXECUTOR_CORES))
 SPARK_DEFAULT_PARALLELISM=$((SPARK_TOTAL_CORES * 2))
 
 # Checks
-for dir in $SPARK_HOME $HADOOP_HOME $OAP_MLLIB_JAR
+for dir in $SPARK_HOME $HADOOP_HOME $OAP_MLLIB_JAR
 do
 if [[ ! -e $dir ]]; then
     echo $dir does not exist!
diff --git a/dev/build-maven-local-repo.sh b/dev/build-maven-local-repo.sh
new file mode 100755
index 000000000..44a94a794
--- /dev/null
+++ b/dev/build-maven-local-repo.sh
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+
+if [[ -z $DAALROOT ]]; then
+    echo DAALROOT not defined!
+    exit 1
+fi
+
+echo "Building Maven Repo for oneDAL ..."
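+
+# deploy:deploy-file below publishes $DAALROOT/lib/onedal.jar into the local
+# file-based repo ./maven-repository; a project can then consume it via the
+# pom.xml snippets shown at the end of this script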
+
+mkdir maven-repository
+mvn deploy:deploy-file -Dfile=$DAALROOT/lib/onedal.jar -DgroupId=com.intel.onedal -DartifactId=onedal -Dversion=2021.4.0 -Dpackaging=jar -Durl=file:./maven-repository -DrepositoryId=maven-repository -DupdateReleaseInfo=true
+
+echo "DONE"
+
+find ./maven-repository
+
+# Add the following into pom.xml:
+
+# <repositories>
+#     <repository>
+#         <id>maven-repository</id>
+#         <url>file:///${project.basedir}/maven-repository</url>
+#     </repository>
+# </repositories>
+
+# <dependency>
+#     <groupId>com.intel.dal</groupId>
+#     <artifactId>dal</artifactId>
+#     <version>2021.4.0</version>
+# </dependency>
\ No newline at end of file
diff --git a/dev/ci-test.sh b/dev/ci-test.sh
index ce079fe7d..59c64eb7d 100755
--- a/dev/ci-test.sh
+++ b/dev/ci-test.sh
@@ -1,11 +1,22 @@
 #!/usr/bin/env bash
 
+# exit when any command fails
+set -e
+
+# keep track of the last executed command
+trap 'last_command=$current_command; current_command=$BASH_COMMAND' DEBUG
+# echo an error message before exiting
+trap 'echo "\"${last_command}\" command failed with exit code $?."' EXIT
+
 # Setup building envs
 source /opt/intel/oneapi/setvars.sh
-source /tmp/oneCCL/build/_install/env/setvars.sh
 
-SupportedSparkVersions=("spark-3.0.0" "spark-3.0.1" "spark-3.0.2" "spark-3.1.1")
+# Prepare lib resources
+cd $GITHUB_WORKSPACE/mllib-dal
+../dev/prepare-build-deps.sh
 
+# Test for all versions
+SupportedSparkVersions=("spark-3.0.0" "spark-3.0.1" "spark-3.0.2" "spark-3.1.1")
 for SparkVer in ${SupportedSparkVersions[*]}; do
     echo
     echo "========================================"
     echo
@@ -13,6 +24,7 @@ for SparkVer in ${SupportedSparkVersions[*]}; do
     echo "========================================"
     echo
     cd $GITHUB_WORKSPACE/mllib-dal
+    ./build.sh -q
     ./test.sh -q -p $SparkVer
 done
 
diff --git a/dev/install-build-deps-centos.sh b/dev/install-build-deps-centos.sh
index 275222be8..877992228 100755
--- a/dev/install-build-deps-centos.sh
+++ b/dev/install-build-deps-centos.sh
@@ -15,17 +15,7 @@ EOF
   sudo mv /tmp/oneAPI.repo /etc/yum.repos.d
   # sudo yum groupinstall -y "Development Tools"
   # sudo yum install -y cmake
-  sudo yum install -y intel-oneapi-dpcpp-cpp-2021.3.0 intel-oneapi-dal-devel-2021.3.0 intel-oneapi-tbb-devel-2021.3.0
+  sudo yum install -y intel-oneapi-dpcpp-cpp-2021.4.0 intel-oneapi-dal-devel-2021.4.0 intel-oneapi-tbb-devel-2021.4.0 intel-oneapi-ccl-devel-2021.4.0 intel-oneapi-mpi-devel-2021.4.0
 else
   echo "oneAPI components already installed!"
 fi
-
-echo "Building oneCCL ..."
-cd /tmp
-rm -rf oneCCL
-git clone https://github.com/oneapi-src/oneCCL
-cd oneCCL
-git checkout 2021.2.1
-mkdir build && cd build
-cmake ..
-make -j 2 install
diff --git a/dev/install-build-deps-ubuntu.sh b/dev/install-build-deps-ubuntu.sh
index a6379dae9..027956b74 100755
--- a/dev/install-build-deps-ubuntu.sh
+++ b/dev/install-build-deps-ubuntu.sh
@@ -9,17 +9,7 @@ if [ ! -d /opt/intel/oneapi ]; then
   echo "deb https://apt.repos.intel.com/oneapi all main" | sudo tee /etc/apt/sources.list.d/oneAPI.list
   sudo apt-get update
   # sudo apt-get install -y build-essential cmake
-  sudo apt-get install -y intel-oneapi-dpcpp-cpp-2021.3.0 intel-oneapi-dal-devel-2021.3.0 intel-oneapi-tbb-devel-2021.3.0
+  sudo apt-get install -y intel-oneapi-dpcpp-cpp-2021.4.0 intel-oneapi-dal-devel-2021.4.0 intel-oneapi-tbb-devel-2021.4.0 intel-oneapi-ccl-devel-2021.4.0 intel-oneapi-mpi-devel-2021.4.0
 else
   echo "oneAPI components already installed!"
 fi
-
-echo "Building oneCCL ..."
-cd /tmp
-rm -rf oneCCL
-git clone https://github.com/oneapi-src/oneCCL
-cd oneCCL
-git checkout 2021.2.1
-mkdir build && cd build
-cmake ..
-make -j 2 install
diff --git a/dev/prepare-build-deps-gpu.sh b/dev/prepare-build-deps-gpu.sh
new file mode 100755
index 000000000..e6762e1c7
--- /dev/null
+++ b/dev/prepare-build-deps-gpu.sh
@@ -0,0 +1,71 @@
+#!/usr/bin/env bash
+
+if [ -z ${ONEAPI_ROOT} ]; then
+    echo Please source Intel oneAPI Toolkit environments!
+    exit 1
+fi
+
+if [[ -z $DAALROOT ]]; then
+    echo DAALROOT not defined!
+    exit 1
+fi
+
+if [[ -z $TBBROOT ]]; then
+    echo TBBROOT not defined!
+    exit 1
+fi
+
+if [[ -z $I_MPI_ROOT ]]; then
+    echo I_MPI_ROOT not defined!
+    exit 1
+fi
+
+if [[ -z $CCL_ROOT ]]; then
+    echo CCL_ROOT not defined!
+    exit 1
+fi
+
+# Use patchelf to change SONAME for libfabric
+if [[ -z $(which patchelf) ]]; then
+    echo Please install \"patchelf\"!
+    exit 1
+fi
+
+if [[ $(basename $(pwd)) != "mllib-dal" ]]; then
+    echo Please execute the script from \"mllib-dal\" directory!
+    exit 1
+fi
+
+TARGET_DIR=./src/main/resources/lib
+
+rm -f $TARGET_DIR/*.so*
+
+cp $CCL_ROOT/lib/cpu_icc/libccl.so.1.0 $TARGET_DIR/libccl.so.1
+
+cp $I_MPI_ROOT/libfabric/lib/libfabric.so.1 $TARGET_DIR/libfabric.so.1
+cp $I_MPI_ROOT/libfabric/lib/prov/libsockets-fi.so $TARGET_DIR
+
+# Workaround dlopen (libfabric.so) in oneCCL
+cp $I_MPI_ROOT/libfabric/lib/libfabric.so.1 $TARGET_DIR/libfabric.so
+patchelf --set-soname libfabric.so $TARGET_DIR/libfabric.so
+
+cp $I_MPI_ROOT/lib/release_mt/libmpi.so.12.0.0 $TARGET_DIR/libmpi.so.12
+
+cp $DAALROOT/lib/intel64/libJavaAPI.so.1.1 $TARGET_DIR/libJavaAPI.so
+
+cp $TBBROOT/lib/intel64/gcc4.8/libtbb.so.12.4 $TARGET_DIR/libtbb.so.12
+cp $TBBROOT/lib/intel64/gcc4.8/libtbbmalloc.so.2.4 $TARGET_DIR/libtbbmalloc.so.2
+
+# SYCL libs
+cp $CMPLR_ROOT/linux/compiler/lib/intel64_lin/libintlc.so.5 $TARGET_DIR
+cp $CMPLR_ROOT/linux/compiler/lib/intel64_lin/libsvml.so $TARGET_DIR
+
+# Workaround lib loading for JNI as libirng.so doesn't have soname
+cp $CMPLR_ROOT/linux/compiler/lib/intel64_lin/libirng.so $TARGET_DIR
+patchelf --set-soname libirng.so $TARGET_DIR/libirng.so
+
+cp $CMPLR_ROOT/linux/compiler/lib/intel64_lin/libimf.so $TARGET_DIR
+cp $CMPLR_ROOT/linux/lib/libOpenCL.so.1 $TARGET_DIR
+cp $CMPLR_ROOT/linux/lib/libsycl.so.5 $TARGET_DIR
+
+echo oneAPI Toolkit version: $(basename $CCL_ROOT) > $TARGET_DIR/VERSION
diff --git a/dev/prepare-build-deps.sh b/dev/prepare-build-deps.sh
new file mode 100755
index 000000000..6b74dfed2
--- /dev/null
+++ b/dev/prepare-build-deps.sh
@@ -0,0 +1,59 @@
+#!/usr/bin/env bash
+
+if [ -z ${ONEAPI_ROOT} ]; then
+    echo Please source Intel oneAPI Toolkit environments!
+    exit 1
+fi
+
+if [[ -z $DAALROOT ]]; then
+    echo DAALROOT not defined!
+    exit 1
+fi
+
+if [[ -z $TBBROOT ]]; then
+    echo TBBROOT not defined!
+    exit 1
+fi
+
+if [[ -z $I_MPI_ROOT ]]; then
+    echo I_MPI_ROOT not defined!
+    exit 1
+fi
+
+if [[ -z $CCL_ROOT ]]; then
+    echo CCL_ROOT not defined!
+    exit 1
+fi
+
+# Use patchelf to change SONAME for libfabric
+if [[ -z $(which patchelf) ]]; then
+    echo Please install \"patchelf\"!
+    exit 1
+fi
+
+if [[ $(basename $(pwd)) != "mllib-dal" ]]; then
+    echo Please execute the script from \"mllib-dal\" directory!
+    exit 1
+fi
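+
+# Everything staged under src/main/resources/lib is bundled into the OAP MLlib
+# jar and extracted at runtime by LibLoader (src/main/java/org/apache/spark/ml/util/LibLoader.java)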
+
+TARGET_DIR=./src/main/resources/lib
+
+rm -f $TARGET_DIR/*.so*
+
+cp $CCL_ROOT/lib/cpu_icc/libccl.so.1.0 $TARGET_DIR/libccl.so.1
+
+cp $I_MPI_ROOT/libfabric/lib/libfabric.so.1 $TARGET_DIR/libfabric.so.1
+cp $I_MPI_ROOT/libfabric/lib/prov/libsockets-fi.so $TARGET_DIR
+
+# Workaround dlopen (libfabric.so) in oneCCL
+cp $I_MPI_ROOT/libfabric/lib/libfabric.so.1 $TARGET_DIR/libfabric.so
+patchelf --set-soname libfabric.so $TARGET_DIR/libfabric.so
+
+cp $I_MPI_ROOT/lib/release_mt/libmpi.so.12.0.0 $TARGET_DIR/libmpi.so.12
+
+cp $DAALROOT/lib/intel64/libJavaAPI.so.1.1 $TARGET_DIR/libJavaAPI.so
+
+cp $TBBROOT/lib/intel64/gcc4.8/libtbb.so.12.4 $TARGET_DIR/libtbb.so.12
+cp $TBBROOT/lib/intel64/gcc4.8/libtbbmalloc.so.2.4 $TARGET_DIR/libtbbmalloc.so.2
+
+echo oneAPI Toolkit version: $(basename $CCL_ROOT) > $TARGET_DIR/VERSION
diff --git a/dev/setup-all.sh b/dev/setup-all.sh
index 66510e85e..7c08ce0e4 100755
--- a/dev/setup-all.sh
+++ b/dev/setup-all.sh
@@ -1,5 +1,13 @@
 #!/usr/bin/env bash
 
+# exit when any command fails
+set -e
+
+# keep track of the last executed command
+trap 'last_command=$current_command; current_command=$BASH_COMMAND' DEBUG
+# echo an error message before exiting
+trap 'echo "\"${last_command}\" command failed with exit code $?."' EXIT
+
 # Install dependencies for building
 $GITHUB_WORKSPACE/dev/install-build-deps-ubuntu.sh
 
diff --git a/dev/test-cluster/ci-test-cluster.sh b/dev/test-cluster/ci-test-cluster.sh
index 7a4600267..d86d89aef 100755
--- a/dev/test-cluster/ci-test-cluster.sh
+++ b/dev/test-cluster/ci-test-cluster.sh
@@ -1,5 +1,13 @@
 #!/usr/bin/env bash
 
+# exit when any command fails
+set -e
+
+# keep track of the last executed command
+trap 'last_command=$current_command; current_command=$BASH_COMMAND' DEBUG
+# echo an error message before exiting
+trap 'echo "\"${last_command}\" command failed with exit code $?."' EXIT
+
 # Setup Spark envs
 source $GITHUB_WORKSPACE/dev/test-cluster/setup-spark-envs.sh
 
@@ -8,12 +16,14 @@ cp $GITHUB_WORKSPACE/dev/test-cluster/env.sh $GITHUB_WORKSPACE/conf
 
 cd $GITHUB_WORKSPACE/examples
 
+HOST_NAME=$(hostname -f)
+export HDFS_ROOT=hdfs://$HOST_NAME:8020
+
 # Copy examples data to HDFS
-hadoop fs -mkdir -p /user/$USER
-hadoop fs -copyFromLocal data
-hadoop fs -ls data
+hadoop fs -copyFromLocal data /
+hadoop fs -find /
 
 # Build and run all examples
-./build-all.sh
+./build-all-scala.sh
 ./run-all-scala.sh
 ./run-all-pyspark.sh
diff --git a/dev/test-cluster/env.sh b/dev/test-cluster/env.sh
index 225db0b7b..0a92a1a10 100644
--- a/dev/test-cluster/env.sh
+++ b/dev/test-cluster/env.sh
@@ -2,8 +2,6 @@
 
 # ============== Minimum Settings ============= #
 
-# Set OAP MLlib version (e.g. 1.1.0)
-OAP_MLLIB_VERSION=1.1.0
 # Set Spark master
 SPARK_MASTER=yarn
 # Set Hadoop home path
@@ -11,33 +9,43 @@ export HADOOP_HOME=$HADOOP_HOME
 # Set Spark home path
 export SPARK_HOME=$SPARK_HOME
 # Set HDFS Root, should be hdfs://xxx or file://xxx
-export HDFS_ROOT=hdfs://localhost:8020
+
+HOST_NAME=$(hostname -f)
+export HDFS_ROOT=hdfs://$HOST_NAME:8020
 # Set OAP MLlib source code root directory
 export OAP_MLLIB_ROOT=$GITHUB_WORKSPACE
 
 # ============================================= #
 
+# Import RELEASE envs
+source $OAP_MLLIB_ROOT/RELEASE
+
 # Set HADOOP_CONF_DIR for Spark
 export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
 
 # Set JAR name & path
 OAP_MLLIB_JAR_NAME=oap-mllib-$OAP_MLLIB_VERSION.jar
 OAP_MLLIB_JAR=$OAP_MLLIB_ROOT/mllib-dal/target/$OAP_MLLIB_JAR_NAME
-# Set Spark driver & executor classpaths,
-# absolute path for driver, relative path for executor
+# Set Spark driver & executor classpaths
+# YARN mode: use absolute path for driver, relative path for executors
+# Standalone mode: use absolute path for both driver and executors
 SPARK_DRIVER_CLASSPATH=$OAP_MLLIB_JAR
-SPARK_EXECUTOR_CLASSPATH=./$OAP_MLLIB_JAR_NAME
+if [[ $SPARK_MASTER == yarn ]]; then
+    SPARK_EXECUTOR_CLASSPATH=./$OAP_MLLIB_JAR_NAME
+else
+    SPARK_EXECUTOR_CLASSPATH=$OAP_MLLIB_JAR
+fi
 
 # Set Spark resources, can be overwritten in example
 SPARK_DRIVER_MEMORY=1G
 SPARK_NUM_EXECUTORS=2
 SPARK_EXECUTOR_CORES=1
 SPARK_EXECUTOR_MEMORY=1G
-SPARK_DEFAULT_PARALLELISM=$(expr $SPARK_NUM_EXECUTORS '*' $SPARK_EXECUTOR_CORES '*' 2)
+SPARK_TOTAL_CORES=$((SPARK_NUM_EXECUTORS * SPARK_EXECUTOR_CORES))
+SPARK_DEFAULT_PARALLELISM=$((SPARK_TOTAL_CORES * 2))
 
 # Checks
-
-for dir in $SPARK_HOME $HADOOP_HOME $OAP_MLLIB_JAR
+for dir in $SPARK_HOME $HADOOP_HOME $OAP_MLLIB_JAR
 do
 if [[ ! -e $dir ]]; then
     echo $dir does not exist!
diff --git a/dev/test-cluster/log4j.properties b/dev/test-cluster/log4j.properties
new file mode 100644
index 000000000..ff29121c2
--- /dev/null
+++ b/dev/test-cluster/log4j.properties
@@ -0,0 +1,42 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Set everything to be logged to the console
+log4j.rootCategory=WARN, console
+log4j.appender.console=org.apache.log4j.ConsoleAppender
+log4j.appender.console.target=System.err
+log4j.appender.console.layout=org.apache.log4j.PatternLayout
+log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
+
+# Set the default spark-shell log level to WARN. When running the spark-shell, the
+# log level for this class is used to overwrite the root logger's log level, so that
+# the user can have different defaults for the shell and regular Spark apps.
+log4j.logger.org.apache.spark.repl.Main=WARN
+
+# Settings to quiet third party logs that are too verbose
+log4j.logger.org.sparkproject.jetty=WARN
+log4j.logger.org.sparkproject.jetty.util.component.AbstractLifeCycle=ERROR
+log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO
+log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO
+log4j.logger.org.apache.parquet=ERROR
+log4j.logger.parquet=ERROR
+
+# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support
+log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL
+log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR
+
+log4j.logger.org.apache.spark.ml.util.LibLoader=DEBUG
diff --git a/dev/test-cluster/setup-cluster.sh b/dev/test-cluster/setup-cluster.sh
index 633d848e9..a5b48490e 100755
--- a/dev/test-cluster/setup-cluster.sh
+++ b/dev/test-cluster/setup-cluster.sh
@@ -1,15 +1,21 @@
 #!/usr/bin/env bash
 
-WORK_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+# exit when any command fails
+set -e
 
-cd $WORK_DIR
+# keep track of the last executed command
+trap 'last_command=$current_command; current_command=$BASH_COMMAND' DEBUG
+# echo an error message before exiting
+trap 'echo "\"${last_command}\" command failed with exit code $?."' EXIT
+
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 
 echo JAVA_HOME is $JAVA_HOME
 
-HADOOP_VERSION=3.2.0
-SPARK_VERSION=3.1.1
-SPARK_HADOOP_VERSION=hadoop3.2
+# setup envs
+source $SCRIPT_DIR/setup-spark-envs.sh
 
+# download spark & hadoop bins
 [ -d ~/opt ] || mkdir ~/opt
 cd ~/opt
 [ -f spark-$SPARK_VERSION-bin-$SPARK_HADOOP_VERSION.tgz ] || wget --no-verbose https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-$SPARK_HADOOP_VERSION.tgz
@@ -17,7 +23,7 @@ cd ~/opt
 [ -f hadoop-$HADOOP_VERSION.tar.gz ] || wget --no-verbose https://archive.apache.org/dist/hadoop/core/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz
 [ -d hadoop-$HADOOP_VERSION ] || tar -xzf hadoop-$HADOOP_VERSION.tar.gz
 
-cd $WORK_DIR
+cd $SCRIPT_DIR
 
 HOST_IP=$(hostname -f)
 
@@ -28,13 +34,14 @@ cp ./core-site.xml ~/opt/hadoop-$HADOOP_VERSION/etc/hadoop/
 cp ./hdfs-site.xml ~/opt/hadoop-$HADOOP_VERSION/etc/hadoop/
 cp ./yarn-site.xml ~/opt/hadoop-$HADOOP_VERSION/etc/hadoop/
 cp ./hadoop-env.sh ~/opt/hadoop-$HADOOP_VERSION/etc/hadoop/
+cp ./log4j.properties ~/opt/spark-$SPARK_VERSION-bin-$SPARK_HADOOP_VERSION/conf
 cp ./spark-defaults.conf ~/opt/spark-$SPARK_VERSION-bin-$SPARK_HADOOP_VERSION/conf
 
-source ./setup-spark-envs.sh
-
 echo $HOST_IP > $HADOOP_HOME/etc/hadoop/slaves
 echo $HOST_IP > $SPARK_HOME/conf/slaves
 
+ls -l $SPARK_HOME/conf
+
 # create directories
 mkdir -p /tmp/run/hdfs/namenode
 mkdir -p /tmp/run/hdfs/datanode
diff --git a/dev/test-cluster/setup-spark-envs.sh b/dev/test-cluster/setup-spark-envs.sh
index 6e4e06423..5e988c3a9 100755
--- a/dev/test-cluster/setup-spark-envs.sh
+++ b/dev/test-cluster/setup-spark-envs.sh
@@ -1,5 +1,7 @@
 #!/usr/bin/env bash
 
+set -x
+
 HADOOP_VERSION=3.2.0
 SPARK_VERSION=3.1.1
 SPARK_HADOOP_VERSION=hadoop3.2
@@ -12,4 +14,6 @@ export SPARK_HOME=~/opt/spark-$SPARK_VERSION-bin-$SPARK_HADOOP_VERSION
 export PYTHONPATH=$SPARK_HOME/python:$PYTHONPATH
 export PYSPARK_PYTHON=python3
 
-export PATH=$HADOOP_HOME/bin:$SPARK_HOME/bin:$PATH
\ No newline at end of file
+export PATH=$HADOOP_HOME/bin:$SPARK_HOME/bin:$PATH
+
+set +x
\ No newline at end of file
diff --git a/examples/build-all.sh b/examples/build-all-scala.sh
similarity index 100%
rename from examples/build-all.sh
rename to examples/build-all-scala.sh
diff --git a/examples/correlation/build.sh b/examples/correlation/build.sh
old mode 100644
new mode 100755
diff --git a/examples/correlation/run.sh b/examples/correlation/run.sh
old mode 100644
new mode 100755
diff --git a/mllib-dal/build-cpu-gpu.sh b/mllib-dal/build-cpu-gpu.sh
index 27b1777d9..4317471e1 100755
--- a/mllib-dal/build-cpu-gpu.sh
+++ b/mllib-dal/build-cpu-gpu.sh
@@ -26,6 +26,19 @@ if [[ -z $CCL_ROOT ]]; then
     exit 1
 fi
 
+# Check lib dependencies for building
+RESOURCE_PATH=src/main/resources/lib
+LIBS=(libccl.so.1 libfabric.so libfabric.so.1 libJavaAPI.so libmpi.so.12 \
+    libsockets-fi.so libtbbmalloc.so.2 libtbb.so.12 libintlc.so.5 libsvml.so libirng.so libimf.so \
+    libOpenCL.so.1 libsycl.so.5)
+for lib in ${LIBS[@]}
+do
+    if [[ ! -f ./$RESOURCE_PATH/$lib ]]; then
+        echo $RESOURCE_PATH/$lib does not exist, please run ../dev/prepare-build-deps-gpu.sh!
+        exit 1
+    fi
+done
+
 versionArray=(
 spark-3.0.0 \
 spark-3.0.1 \
@@ -45,7 +58,7 @@ print_usage() {
     do
         echo "    $version"
     done
-    echo
+    echo
 }
 
 while getopts "hqp:" opt
diff --git a/mllib-dal/build.sh b/mllib-dal/build.sh
index 7ae84e01f..96393f1ca 100755
--- a/mllib-dal/build.sh
+++ b/mllib-dal/build.sh
@@ -26,6 +26,23 @@ if [[ -z $CCL_ROOT ]]; then
     exit 1
 fi
 
+# Check lib dependencies for building
+RESOURCE_PATH=src/main/resources/lib
+LIBS=(libccl.so.1 libfabric.so libfabric.so.1 libJavaAPI.so libmpi.so.12 \
+    libsockets-fi.so libtbbmalloc.so.2 libtbb.so.12)
+for lib in ${LIBS[@]}
+do
+    if [[ ! -f ./$RESOURCE_PATH/$lib ]]; then
+        echo $RESOURCE_PATH/$lib does not exist, please run ../dev/prepare-build-deps.sh!
+        exit 1
+    fi
+done
+
+if [[ -f ./$RESOURCE_PATH/libsycl.so.5 ]]; then
+    echo GPU libs found! Please re-run ../dev/prepare-build-deps.sh!
+    exit 1
+fi
+
 versionArray=(
 spark-3.0.0 \
 spark-3.0.1 \
@@ -45,7 +62,7 @@ print_usage() {
     do
         echo "    $version"
     done
-    echo
+    echo
 }
 
 while getopts "hqp:" opt
diff --git a/mllib-dal/pom.xml b/mllib-dal/pom.xml
index a3f81ae3a..d2b74863d 100644
--- a/mllib-dal/pom.xml
+++ b/mllib-dal/pom.xml
@@ -1,6 +1,5 @@
+    xmlns="http://maven.apache.org/POM/4.0.0" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
     <modelVersion>4.0.0</modelVersion>
     <groupId>com.intel.oap</groupId>
@@ -29,63 +28,84 @@
         src/assembly/assembly.xml
-
+    <repositories>
+        <repository>
+            <id>gcs-maven-central-mirror</id>
+            <name>GCS Maven Central mirror</name>
+            <url>https://maven-central.storage-download.googleapis.com/maven2/</url>
+            <releases>
+                <enabled>true</enabled>
+            </releases>
+            <snapshots>
+                <enabled>false</enabled>
+            </snapshots>
+        </repository>
+        <repository>
+            <id>central</id>
+            <name>Maven Repository</name>
+            <url>https://repo.maven.apache.org/maven2</url>
+            <releases>
+                <enabled>true</enabled>
+            </releases>
+            <snapshots>
+                <enabled>false</enabled>
+            </snapshots>
+        </repository>
+    </repositories>
+
     <dependencies>
         <dependency>
             <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>2.12.10</version>
         </dependency>
-
         <dependency>
             <groupId>com.github.scopt</groupId>
             <artifactId>scopt_2.12</artifactId>
             <version>3.7.0</version>
         </dependency>
-
         <dependency>
             <groupId>org.apache.spark</groupId>
             <artifactId>spark-core_2.12</artifactId>
             <version>${spark.version}</version>
             <scope>provided</scope>
         </dependency>
-
         <dependency>
             <groupId>org.apache.spark</groupId>
             <artifactId>spark-sql_2.12</artifactId>
             <version>${spark.version}</version>
             <scope>provided</scope>
         </dependency>
-
         <dependency>
             <groupId>org.apache.spark</groupId>
             <artifactId>spark-mllib_2.12</artifactId>
             <version>${spark.version}</version>
             <scope>provided</scope>
         </dependency>
-
-        <dependency>
-            <groupId>com.intel.onedal</groupId>
-            <artifactId>onedal</artifactId>
-            <version>${oneapi.version}</version>
-            <scope>system</scope>
-            <systemPath>${env.DAALROOT}/lib/onedal.jar</systemPath>
-        </dependency>
+        <dependency>
+            <groupId>com.intel.dal</groupId>
+            <artifactId>dal</artifactId>
+            <version>2021.4.0.83</version>
+        </dependency>
-
         <dependency>
             <groupId>junit</groupId>
             <artifactId>junit</artifactId>
             <version>4.12</version>
             <scope>test</scope>
         </dependency>
-
        <dependency>
             <groupId>org.scalatest</groupId>
             <artifactId>scalatest_${scala.binary.version}</artifactId>
             <version>${scalatest.version}</version>
             <scope>test</scope>
         </dependency>
-
         <dependency>
             <groupId>org.apache.spark</groupId>
             <artifactId>spark-mllib_2.12</artifactId>
             <version>${spark.version}</version>
             <type>test-jar</type>
             <scope>test</scope>
         </dependency>
-
         <dependency>
             <groupId>org.apache.spark</groupId>
             <artifactId>spark-mllib-local_${scala.binary.version}</artifactId>
             <version>${spark.version}</version>
             <type>test-jar</type>
             <scope>test</scope>
             <exclusions>
                 <exclusion>
                     <groupId>org.jpmml</groupId>
                     <artifactId>pmml-model</artifactId>
                 </exclusion>
             </exclusions>
         </dependency>
-
         <dependency>
             <groupId>org.apache.spark</groupId>
             <artifactId>spark-sql_2.12</artifactId>
             <version>${spark.version}</version>
             <type>test-jar</type>
             <scope>test</scope>
         </dependency>
-
         <dependency>
             <groupId>org.apache.spark</groupId>
             <artifactId>spark-core_2.12</artifactId>
             <version>${spark.version}</version>
             <type>test-jar</type>
             <scope>test</scope>
         </dependency>
-
         <dependency>
             <groupId>org.apache.spark</groupId>
             <artifactId>spark-catalyst_2.12</artifactId>
             <version>${spark.version}</version>
             <type>test-jar</type>
             <scope>test</scope>
         </dependency>
-
         <dependency>
             <groupId>org.apache.spark</groupId>
             <artifactId>spark-tags_2.12</artifactId>
             <version>${spark.version}</version>
             <type>test-jar</type>
             <scope>test</scope>
         </dependency>
-
     </dependencies>
-
     <profiles>
         <profile>
             <id>cpu-gpu</id>
             <properties>
                 <platform.profile>CPU_GPU_PROFILE</platform.profile>
-                <assembly.descriptor>src/assembly/assembly-cpu-gpu.xml</assembly.descriptor>
             </properties>
         </profile>
-
         <profile>
             <id>spark-3.0.0</id>
             <properties>
                 <scalatest.version>3.0.8</scalatest.version>
             </properties>
         </profile>
-
         <profile>
             <id>spark-3.0.1</id>
             <properties>
                 <scalatest.version>3.0.8</scalatest.version>
             </properties>
         </profile>
-
         <profile>
             <id>spark-3.0.2</id>
             <properties>
                 <scalatest.version>3.0.8</scalatest.version>
             </properties>
         </profile>
-
         <profile>
             <id>spark-3.1.1</id>
@@ -202,58 +207,58 @@
-            <plugin>
-                <groupId>org.codehaus.mojo</groupId>
-                <artifactId>build-helper-maven-plugin</artifactId>
-                <version>3.2.0</version>
-                <executions>
+        <plugin>
+            <groupId>org.codehaus.mojo</groupId>
+            <artifactId>build-helper-maven-plugin</artifactId>
+            <version>3.2.0</version>
+            <executions>
                 <execution>
-                    <id>add-source</id>
-                    <phase>generate-sources</phase>
-                    <goals>
-                        <goal>add-source</goal>
-                    </goals>
-                    <configuration>
-                        <sources>
-                            <source>src/spark-${spark.version}/main/java</source>
-                            <source>src/spark-${spark.version}/main/scala</source>
-                        </sources>
-                    </configuration>
+                <id>add-source</id>
+                <phase>generate-sources</phase>
+                <goals>
+                    <goal>add-source</goal>
+                </goals>
+                <configuration>
+                    <sources>
+                        <source>src/spark-${spark.version}/main/java</source>
+                        <source>src/spark-${spark.version}/main/scala</source>
+                    </sources>
+                </configuration>
                 </execution>
                 <execution>
-                    <id>add-test-source</id>
-                    <phase>generate-sources</phase>
-                    <goals>
-                        <goal>add-test-source</goal>
-                    </goals>
-                    <configuration>
-                        <sources>
-                            <source>src/spark-${spark.version}/test/scala</source>
-                        </sources>
-                    </configuration>
+                <id>add-test-source</id>
+                <phase>generate-sources</phase>
+                <goals>
+                    <goal>add-test-source</goal>
+                </goals>
+                <configuration>
+                    <sources>
+                        <source>src/spark-${spark.version}/test/scala</source>
+                    </sources>
+                </configuration>
                 </execution>
-            </executions>
-        </plugin>
+            </executions>
+        </plugin>
         <plugin>
             <groupId>net.alchim31.maven</groupId>
             <artifactId>scala-maven-plugin</artifactId>
             <version>4.4.0</version>
             <executions>
                 <execution>
-                    <id>scala-compile-first</id>
-                    <phase>process-resources</phase>
-                    <goals>
-                        <goal>add-source</goal>
-                        <goal>compile</goal>
-                    </goals>
+                <id>scala-compile-first</id>
+                <phase>process-resources</phase>
+                <goals>
+                    <goal>add-source</goal>
+                    <goal>compile</goal>
+                </goals>
                 </execution>
                 <execution>
-                    <id>scala-test-compile</id>
-                    <phase>process-test-resources</phase>
-                    <goals>
-                        <goal>testCompile</goal>
-                    </goals>
+                <id>scala-test-compile</id>
+                <phase>process-test-resources</phase>
+                <goals>
+                    <goal>testCompile</goal>
+                </goals>
                 </execution>
             </executions>
             <configuration>
                 ${scala.version}
@@ -329,7 +334,6 @@
             true
-
         <plugin>
             <groupId>org.scalatest</groupId>
             <artifactId>scalatest-maven-plugin</artifactId>
@@ -348,18 +352,16 @@
-
         <plugin>
             <artifactId>maven-antrun-plugin</artifactId>
             <version>1.8</version>
-
             <execution>
-                <phase>process-classes</phase>
+                <phase>process-resources</phase>
                 Building native code
-
+
@@ -369,89 +371,10 @@
-
         <plugin>
             <artifactId>maven-resources-plugin</artifactId>
             <version>3.0.2</version>
-            <configuration>
-                <outputDirectory>${project.build.testOutputDirectory}/lib</outputDirectory>
-                <resources>
-                    <resource>
-                        <directory>${env.CCL_ROOT}/lib</directory>
-                        <includes>
-                            <include>${ccl.lib}</include>
-                            <include>${ccl.mpi.lib}</include>
-                            <include>${ccl.fabric.lib}</include>
-                        </includes>
-                    </resource>
-                    <resource>
-                        <directory>${env.CCL_ROOT}/lib/prov</directory>
-                        <includes>
-                            <include>libsockets-fi.so</include>
-                        </includes>
-                    </resource>
-                    <resource>
-                        <directory>${env.TBBROOT}/lib/intel64/gcc4.8</directory>
-                        <includes>
-                            <include>${tbb.lib}</include>
-                            <include>${tbb.malloc.lib}</include>
-                        </includes>
-                    </resource>
-                    <resource>
-                        <directory>${env.DAALROOT}/lib/intel64</directory>
-                        <includes>
-                            <include>${dal.java.lib}</include>
-                        </includes>
-                    </resource>
-                    <resource>
-                        <directory>${project.build.directory}</directory>
-                        <includes>
-                            <include>libMLlibDAL.so</include>
-                        </includes>
-                    </resource>
-                </resources>
-            </configuration>
         </plugin>
-
-        <plugin>
-            <groupId>com.coderplus.maven.plugins</groupId>
-            <artifactId>copy-rename-maven-plugin</artifactId>
-            <version>1.0</version>
-            <executions>
-                <execution>
-                    <id>rename-file</id>
-                    <phase>process-test-resources</phase>
-                    <goals>
-                        <goal>rename</goal>
-                    </goals>
-                    <configuration>
-                        <fileSets>
-                            <fileSet>
-                                <sourceFile>${project.build.testOutputDirectory}/lib/${tbb.lib}</sourceFile>
-                                <destinationFile>${project.build.testOutputDirectory}/lib/libtbb.so.12</destinationFile>
-                            </fileSet>
-                            <fileSet>
-                                <sourceFile>${project.build.testOutputDirectory}/lib/${tbb.malloc.lib}</sourceFile>
-                                <destinationFile>${project.build.testOutputDirectory}/lib/libtbbmalloc.so.2</destinationFile>
-                            </fileSet>
-                            <fileSet>
-                                <sourceFile>${project.build.testOutputDirectory}/lib/${ccl.mpi.lib}</sourceFile>
-                                <destinationFile>${project.build.testOutputDirectory}/lib/libmpi.so.12</destinationFile>
-                            </fileSet>
-                            <fileSet>
-                                <sourceFile>${project.build.testOutputDirectory}/lib/${dal.java.lib}</sourceFile>
-                                <destinationFile>${project.build.testOutputDirectory}/lib/libJavaAPI.so</destinationFile>
-                            </fileSet>
-                        </fileSets>
-                    </configuration>
-                </execution>
-            </executions>
-        </plugin>
-
         <plugin>
             <artifactId>maven-assembly-plugin</artifactId>
             <version>3.0.0</version>
@@ -472,8 +395,6 @@
-
-
diff --git a/mllib-dal/src/assembly/assembly.xml b/mllib-dal/src/assembly/assembly.xml
index 1d6abe146..e0d177b95 100644
--- a/mllib-dal/src/assembly/assembly.xml
+++ b/mllib-dal/src/assembly/assembly.xml
@@ -13,12 +13,6 @@
             <useProjectArtifact>true</useProjectArtifact>
             <scope>runtime</scope>
         </dependencySet>
-        <dependencySet>
-            <outputDirectory>/</outputDirectory>
-            <useProjectArtifact>true</useProjectArtifact>
-            <scope>system</scope>
-        </dependencySet>
-
@@ -28,51 +22,8 @@
                 <include>README*</include>
                 <include>LICENSE*</include>
                 <include>NOTICE*</include>
+                <include>RELEASE*</include>
             </includes>
         </fileSet>
-        <fileSet>
-            <directory>${project.build.directory}</directory>
-            <outputDirectory>lib</outputDirectory>
-            <includes>
-                <include>*.so</include>
-            </includes>
-        </fileSet>
     </fileSets>
-    <files>
-        <file>
-            <source>${env.TBBROOT}/lib/intel64/gcc4.8/${tbb.lib}</source>
-            <outputDirectory>lib</outputDirectory>
-            <destName>libtbb.so.12</destName>
-        </file>
-        <file>
-            <source>${env.TBBROOT}/lib/intel64/gcc4.8/${tbb.malloc.lib}</source>
-            <outputDirectory>lib</outputDirectory>
-            <destName>libtbbmalloc.so.2</destName>
-        </file>
-        <file>
-            <source>${env.DAALROOT}/lib/intel64/${dal.java.lib}</source>
-            <outputDirectory>lib</outputDirectory>
-            <destName>libJavaAPI.so</destName>
-        </file>
-        <file>
-            <source>${env.CCL_ROOT}/lib/${ccl.fabric.lib}</source>
-            <outputDirectory>lib</outputDirectory>
-        </file>
-        <file>
-            <source>${env.CCL_ROOT}/lib/${ccl.mpi.lib}</source>
-            <outputDirectory>lib</outputDirectory>
-            <destName>libmpi.so.12</destName>
-        </file>
-        <file>
-            <source>${env.CCL_ROOT}/lib/libccl.so</source>
-            <outputDirectory>lib</outputDirectory>
-        </file>
-        <file>
-            <source>${env.CCL_ROOT}/lib/prov/libsockets-fi.so</source>
-            <outputDirectory>lib</outputDirectory>
-        </file>
-    </files>
diff --git a/mllib-dal/src/main/java/org/apache/spark/ml/util/LibLoader.java b/mllib-dal/src/main/java/org/apache/spark/ml/util/LibLoader.java
index 7741e29ce..52a898efd 100644
--- a/mllib-dal/src/main/java/org/apache/spark/ml/util/LibLoader.java
+++ b/mllib-dal/src/main/java/org/apache/spark/ml/util/LibLoader.java
@@ -28,7 +28,7 @@ public final class LibLoader {
     // Make sure loading libraries from different temp directory for each process
     private static final String subDir = "MLlibDAL_" + UUID.randomUUID();
 
-    private static final Logger log = LoggerFactory.getLogger("LibLoader");
+    private static final Logger log = LoggerFactory.getLogger(LibLoader.class);
 
     private static boolean isLoaded = false;
 
@@ -65,11 +65,15 @@ public static synchronized void loadLibraries() throws IOException {
     private static synchronized void loadLibCCL() throws IOException {
         // Load libfabric from system first, if failed load from jar
         if (!loadFromSystem("libfabric.so.1")) {
+            // Fix dlopen(libfabric.so) error:
+            // $ cp libfabric.so.1 libfabric.so
+            // $ patchelf --set-soname libfabric.so libfabric.so
+            loadFromJar(subDir, "libfabric.so");
             loadFromJar(subDir, "libfabric.so.1");
             loadFromJar(subDir, "libsockets-fi.so");
         }
         loadFromJar(subDir, "libmpi.so.12");
-        loadFromJar(subDir, "libccl.so");
+        loadFromJar(subDir, "libccl.so.1");
     }
 
     /**
@@ -140,8 +144,7 @@ private static void loadFromJar(String path, String name) throws IOException {
         }
 
         try (OutputStream streamOut = new FileOutputStream(fileOut)) {
-            log.debug("Writing resource to temp file.");
-
+            // Writing resource to temp file
             byte[] buffer = new byte[32768];
             while (true) {
                 int read = streamIn.read(buffer);
@@ -158,8 +161,8 @@ private static void loadFromJar(String path, String name) throws IOException {
             streamIn.close();
         }
 
-        System.load(fileOut.toString());
-        log.debug("DONE: Loading library as resource.");
+        System.load(fileOut.toString());
+        log.debug("DONE: Loading library " + fileOut.toString() + " as resource.");
     }
 
     /**
diff --git a/mllib-dal/src/main/native/CorrelationDALImpl.cpp b/mllib-dal/src/main/native/CorrelationDALImpl.cpp
index 347f5afda..f2efb70ea 100644
--- a/mllib-dal/src/main/native/CorrelationDALImpl.cpp
+++ b/mllib-dal/src/main/native/CorrelationDALImpl.cpp
@@ -150,8 +150,6 @@ Java_org_apache_spark_ml_stat_CorrelationDALImpl_cCorrelationTrainDAL(
 
     ccl::communicator &comm = getComm();
     size_t rankId = comm.rank();
-    std::cout << " rankId : " << rankId << " ! "
-              << std::endl;
 
     const size_t nBlocks = executor_num;
 
diff --git a/mllib-dal/src/main/native/Makefile b/mllib-dal/src/main/native/Makefile
index 4f18a363b..cdc79a071 100644
--- a/mllib-dal/src/main/native/Makefile
+++ b/mllib-dal/src/main/native/Makefile
@@ -33,20 +33,18 @@ else
 exit 1
 endif
 
-# The following paths setting works for self-built libs from source code
-# https://github.com/oneapi-src/oneCCL. If oneCCL package in oneAPI Toolkit is used,
-# Should change paths to $(CCL_ROOT)/{include,lib}/cpu_icc instead
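+# oneCCL from the oneAPI Base Toolkit ships its CPU headers and libs under
+# $(CCL_ROOT)/{include,lib}/cpu_icc, hence the cpu_icc paths below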
 INCS := -I $(JAVA_HOME)/include \
 	-I $(JAVA_HOME)/include/linux \
-	-I $(CCL_ROOT)/include \
+	-I $(CCL_ROOT)/include/cpu_icc \
 	-I $(DAALROOT)/include \
 	-I ./javah \
 	-I ./
 
 # Use static link if possible, TBB is only available as dynamic libs
-LIBS_COMMON := -L$(CCL_ROOT)/lib -lccl \
+LIBS_COMMON := -L$(CCL_ROOT)/lib/cpu_icc -lccl \
+	-L$(CMPLR_ROOT)/linux/compiler/lib/intel64_lin -l:libirc.a \
 	-L$(DAALROOT)/lib/intel64 -l:libonedal_core.a -l:libonedal_thread.a \
-	-L$(TBBROOT)/lib/lib/intel64/gcc4.8 -ltbb -ltbbmalloc
+	-L$(TBBROOT)/lib/intel64/gcc4.8 -ltbb -ltbbmalloc
 
 ifeq ($(PLATFORM_PROFILE),CPU_ONLY_PROFILE)
 	LIBS := $(LIBS_COMMON)
@@ -80,7 +78,7 @@ ifeq ($(PLATFORM_PROFILE),CPU_GPU_PROFILE)
 endif
 
 # Output Binary
-OUTPUT = ../../../target/libMLlibDAL.so
+OUTPUT = ../../../src/main/resources/lib/libMLlibDAL.so
 
 all: $(OUTPUT)
 
diff --git a/mllib-dal/src/main/native/build.sh b/mllib-dal/src/main/native/build.sh
index d271c5d97..cfa1ef844 100755
--- a/mllib-dal/src/main/native/build.sh
+++ b/mllib-dal/src/main/native/build.sh
@@ -14,5 +14,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
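+# Skip the native build when running unit tests: mllib-dal/test.sh exports
+# OAP_MLLIB_TESTING=true and the tests rely on prebuilt libs under src/main/resources/lib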
+if [[ $OAP_MLLIB_TESTING == "true" ]]; then
+    exit 0
+fi
+
 make clean
 make -j
diff --git a/mllib-dal/src/main/resources/lib/.gitignore b/mllib-dal/src/main/resources/lib/.gitignore
new file mode 100644
index 000000000..86d0cb272
--- /dev/null
+++ b/mllib-dal/src/main/resources/lib/.gitignore
@@ -0,0 +1,4 @@
+# Ignore everything in this directory
+*
+# Except this file
+!.gitignore
\ No newline at end of file
diff --git a/mllib-dal/src/main/resources/log4j.properties b/mllib-dal/src/main/resources/log4j.properties
new file mode 100644
index 000000000..a33c21109
--- /dev/null
+++ b/mllib-dal/src/main/resources/log4j.properties
@@ -0,0 +1 @@
+log4j.logger.org.apache.spark.ml.util.LibLoader=DEBUG
diff --git a/mllib-dal/src/main/scala/org/apache/spark/ml/util/OneCCL.scala b/mllib-dal/src/main/scala/org/apache/spark/ml/util/OneCCL.scala
index 7fccae192..643ed8f54 100644
--- a/mllib-dal/src/main/scala/org/apache/spark/ml/util/OneCCL.scala
+++ b/mllib-dal/src/main/scala/org/apache/spark/ml/util/OneCCL.scala
@@ -27,8 +27,10 @@ object OneCCL extends Logging {
   // Run on Executor
   def setExecutorEnv(): Unit = {
     setEnv("CCL_ATL_TRANSPORT", "ofi")
+    // Set CCL_ROOT to work around a CCL_ROOT env read bug, should be removed when upstream fixes this
+    setEnv("CCL_ROOT", "/opt/intel/oneapi/ccl/latest")
     // Uncomment this if you want to debug oneCCL
-    // setEnv("CCL_LOG_LEVEL", "2")
+    // setEnv("CCL_LOG_LEVEL", "debug")
   }
 
   def init(executor_num: Int, rank: Int, ip_port: String): Unit = {
diff --git a/mllib-dal/src/test/resources/log4j.properties b/mllib-dal/src/test/resources/log4j.properties
new file mode 100644
index 000000000..ff29121c2
--- /dev/null
+++ b/mllib-dal/src/test/resources/log4j.properties
@@ -0,0 +1,42 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Set everything to be logged to the console
+log4j.rootCategory=WARN, console
+log4j.appender.console=org.apache.log4j.ConsoleAppender
+log4j.appender.console.target=System.err
+log4j.appender.console.layout=org.apache.log4j.PatternLayout
+log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
+
+# Set the default spark-shell log level to WARN. When running the spark-shell, the
+# log level for this class is used to overwrite the root logger's log level, so that
+# the user can have different defaults for the shell and regular Spark apps.
+log4j.logger.org.apache.spark.repl.Main=WARN
+
+# Settings to quiet third party logs that are too verbose
+log4j.logger.org.sparkproject.jetty=WARN
+log4j.logger.org.sparkproject.jetty.util.component.AbstractLifeCycle=ERROR
+log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO
+log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO
+log4j.logger.org.apache.parquet=ERROR
+log4j.logger.parquet=ERROR
+
+# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support
+log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL
+log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR
+
+log4j.logger.org.apache.spark.ml.util.LibLoader=DEBUG
diff --git a/mllib-dal/test.sh b/mllib-dal/test.sh
index b4c1cde36..13235cb26 100755
--- a/mllib-dal/test.sh
+++ b/mllib-dal/test.sh
@@ -1,31 +1,39 @@
 #!/usr/bin/env bash
 
-# Check envs for building
-if [[ -z $JAVA_HOME ]]; then
-    echo JAVA_HOME not defined!
-    exit 1
+if [[ -n $DAALROOT ]]; then
+    echo
+    echo ====================================================================================
+    echo WARNING: DAALROOT detected. It is recommended to test without oneAPI environment!
+    echo ====================================================================================
+    echo
 fi
 
-if [[ -z $(which mvn) ]]; then
-    echo Maven not found!
-    exit 1
+# Unset FI_PROVIDER_PATH if present otherwise may hang
+if [[ -n $FI_PROVIDER_PATH ]]; then
+    echo ====================================================================================
+    echo WARNING: FI_PROVIDER_PATH detected. Will unset FI_PROVIDER_PATH before proceeding!
+    unset FI_PROVIDER_PATH
+    echo ====================================================================================
 fi
 
-if [[ -z $DAALROOT ]]; then
-    echo DAALROOT not defined!
-    exit 1
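+# NOTE: keep this jar version in sync with OAP_MLLIB_VERSION in the top-level RELEASE file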
+if [[ ! -f target/oap-mllib-1.2.0.jar ]]; then
+    echo Please run ./build.sh first to do a complete build before testing!
+    exit 1
 fi
 
-if [[ -z $TBBROOT ]]; then
-    echo TBBROOT not defined!
+# Check envs for building
+if [[ -z $JAVA_HOME ]]; then
+    echo JAVA_HOME not defined!
     exit 1
 fi
 
-if [[ -z $CCL_ROOT ]]; then
-    echo CCL_ROOT not defined!
+if [[ -z $(which mvn) ]]; then
+    echo Maven not found!
     exit 1
 fi
 
+export OAP_MLLIB_TESTING=true
+
 versionArray=(
 spark-3.0.0 \
 spark-3.0.1 \
@@ -84,11 +92,7 @@ export PLATFORM_PROFILE=CPU_ONLY_PROFILE
 
 echo === Testing Environments ===
 echo JAVA_HOME=$JAVA_HOME
-echo DAALROOT=$DAALROOT
-echo TBBROOT=$TBBROOT
-echo CCL_ROOT=$CCL_ROOT
 echo Maven Version: $(mvn -v | head -n 1 | cut -f3 -d" ")
-echo Clang Version: $(clang -dumpversion)
 echo Spark Version: $SPARK_VER
 echo Platform Profile: $PLATFORM_PROFILE
 echo ============================
@@ -109,10 +113,10 @@ if [[ -z $SUITE ]]; then
     echo
     echo Testing ALL suites...
     echo
-    mvn $MVN_NO_TRANSFER_PROGRESS -P$SPARK_VER -Dtest=none clean test
+    mvn $MVN_NO_TRANSFER_PROGRESS -P$SPARK_VER -Dtest=none test
 else
     echo
     echo Testing org.apache.spark.ml.$SUITE ...
     echo
-    mvn $MVN_NO_TRANSFER_PROGRESS -P$SPARK_VER -Dtest=none -DwildcardSuites=org.apache.spark.ml.$SUITE clean test
+    mvn $MVN_NO_TRANSFER_PROGRESS -P$SPARK_VER -Dtest=none -DwildcardSuites=org.apache.spark.ml.$SUITE test
 fi