From 6d074817f1039f723379eed393a6ded38b6da134 Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Fri, 29 Oct 2021 22:10:42 +0800 Subject: [PATCH 01/19] Move libs to resource and use oneCCL in oneAPI ToolKit (need to patch soname of libfabric.so.1) --- mllib-dal/.gitignore | 1 + mllib-dal/pom.xml | 79 +------------------ mllib-dal/src/assembly/assembly.xml | 57 +------------ .../org/apache/spark/ml/util/LibLoader.java | 15 ++-- mllib-dal/src/main/native/Makefile | 12 ++- mllib-dal/src/main/resources/lib/README.md | 9 +++ mllib-dal/src/main/resources/log4j.properties | 1 + mllib-dal/src/test/resources/log4j.properties | 25 ++++++ 8 files changed, 52 insertions(+), 147 deletions(-) create mode 100644 mllib-dal/.gitignore create mode 100644 mllib-dal/src/main/resources/lib/README.md create mode 100644 mllib-dal/src/main/resources/log4j.properties create mode 100644 mllib-dal/src/test/resources/log4j.properties diff --git a/mllib-dal/.gitignore b/mllib-dal/.gitignore new file mode 100644 index 000000000..d50d2b06f --- /dev/null +++ b/mllib-dal/.gitignore @@ -0,0 +1 @@ +src/main/resources/lib/ diff --git a/mllib-dal/pom.xml b/mllib-dal/pom.xml index a3f81ae3a..9ac307b68 100644 --- a/mllib-dal/pom.xml +++ b/mllib-dal/pom.xml @@ -354,7 +354,7 @@ 1.8 - process-classes + process-resources Building native code @@ -373,83 +373,6 @@ maven-resources-plugin 3.0.2 - - ${project.build.testOutputDirectory}/lib - - - ${env.CCL_ROOT}/lib - - ${ccl.lib} - ${ccl.mpi.lib} - ${ccl.fabric.lib} - - - - ${env.CCL_ROOT}/lib/prov - - libsockets-fi.so - - - - ${env.TBBROOT}/lib/intel64/gcc4.8 - - ${tbb.lib} - ${tbb.malloc.lib} - - - - ${env.DAALROOT}/lib/intel64 - - ${dal.java.lib} - - - - ${project.build.directory} - - libMLlibDAL.so - - - - - - - - com.coderplus.maven.plugins - copy-rename-maven-plugin - 1.0 - - - rename-file - process-test-resources - - rename - - - - - ${project.build.testOutputDirectory}/lib/${tbb.lib} - ${project.build.testOutputDirectory}/lib/libtbb.so.12 - - - - ${project.build.testOutputDirectory}/lib/${tbb.malloc.lib} - ${project.build.testOutputDirectory}/lib/libtbbmalloc.so.2 - - - - ${project.build.testOutputDirectory}/lib/${ccl.mpi.lib} - ${project.build.testOutputDirectory}/lib/libmpi.so.12 - - - - ${project.build.testOutputDirectory}/lib/${dal.java.lib} - ${project.build.testOutputDirectory}/lib/libJavaAPI.so - - - - - - diff --git a/mllib-dal/src/assembly/assembly.xml b/mllib-dal/src/assembly/assembly.xml index 1d6abe146..932acc8a1 100644 --- a/mllib-dal/src/assembly/assembly.xml +++ b/mllib-dal/src/assembly/assembly.xml @@ -19,60 +19,5 @@ true system - - - - ${project.basedir} - / - - README* - LICENSE* - NOTICE* - - - - ${project.build.directory} - lib - - *.so - - - - - - - ${env.TBBROOT}/lib/intel64/gcc4.8/${tbb.lib} - lib - libtbb.so.12 - - - ${env.TBBROOT}/lib/intel64/gcc4.8/${tbb.malloc.lib} - lib - libtbbmalloc.so.2 - - - - ${env.DAALROOT}/lib/intel64/${dal.java.lib} - lib - libJavaAPI.so - - - - ${env.CCL_ROOT}/lib/${ccl.fabric.lib} - lib - - - ${env.CCL_ROOT}/lib/${ccl.mpi.lib} - lib - libmpi.so.12 - - - ${env.CCL_ROOT}/lib/libccl.so - lib - - - ${env.CCL_ROOT}/lib/prov/libsockets-fi.so - lib - - + diff --git a/mllib-dal/src/main/java/org/apache/spark/ml/util/LibLoader.java b/mllib-dal/src/main/java/org/apache/spark/ml/util/LibLoader.java index 7741e29ce..52a898efd 100644 --- a/mllib-dal/src/main/java/org/apache/spark/ml/util/LibLoader.java +++ b/mllib-dal/src/main/java/org/apache/spark/ml/util/LibLoader.java @@ -28,7 +28,7 @@ public final class LibLoader { // Make sure 
loading libraries from different temp directory for each process private static final String subDir = "MLlibDAL_" + UUID.randomUUID(); - private static final Logger log = LoggerFactory.getLogger("LibLoader"); + private static final Logger log = LoggerFactory.getLogger(LibLoader.class); private static boolean isLoaded = false; @@ -65,11 +65,15 @@ public static synchronized void loadLibraries() throws IOException { private static synchronized void loadLibCCL() throws IOException { // Load libfabric from system first, if failed load from jar if (!loadFromSystem("libfabric.so.1")) { + // Fix dlopen(libfabric.so) error: + // $ cp libfabric.so.1 libfabric.so + // $ patchelf --set-soname libfabric.so libfabric.so + loadFromJar(subDir, "libfabric.so"); loadFromJar(subDir, "libfabric.so.1"); loadFromJar(subDir, "libsockets-fi.so"); } loadFromJar(subDir, "libmpi.so.12"); - loadFromJar(subDir, "libccl.so"); + loadFromJar(subDir, "libccl.so.1"); } /** @@ -140,8 +144,7 @@ private static void loadFromJar(String path, String name) throws IOException { } try (OutputStream streamOut = new FileOutputStream(fileOut)) { - log.debug("Writing resource to temp file."); - + // Writing resource to temp file byte[] buffer = new byte[32768]; while (true) { int read = streamIn.read(buffer); @@ -158,8 +161,8 @@ private static void loadFromJar(String path, String name) throws IOException { streamIn.close(); } - System.load(fileOut.toString()); - log.debug("DONE: Loading library as resource."); + System.load(fileOut.toString()); + log.debug("DONE: Loading library " + fileOut.toString() +" as resource."); } /** diff --git a/mllib-dal/src/main/native/Makefile b/mllib-dal/src/main/native/Makefile index 94ab3a9b1..1c0370d78 100644 --- a/mllib-dal/src/main/native/Makefile +++ b/mllib-dal/src/main/native/Makefile @@ -33,20 +33,18 @@ else exit 1 endif -# The following paths setting works for self-built libs from source code -# https://github.com/oneapi-src/oneCCL. 
If oneCCL package in oneAPI Toolkit is used, -# Should change paths to $(CCL_ROOT)/{include,lib}/cpu_icc instead INCS := -I $(JAVA_HOME)/include \ -I $(JAVA_HOME)/include/linux \ - -I $(CCL_ROOT)/include \ + -I $(CCL_ROOT)/include/cpu_icc \ -I $(DAALROOT)/include \ -I ./javah \ -I ./ # Use static link if possible, TBB is only available as dynamic libs -LIBS_COMMON := -L$(CCL_ROOT)/lib -lccl \ +LIBS_COMMON := -L$(CCL_ROOT)/lib/cpu_icc -lccl \ + -L$(CMPLR_ROOT)/linux/compiler/lib/intel64_lin -l:libirc.a \ -L$(DAALROOT)/lib/intel64 -l:libonedal_core.a -l:libonedal_thread.a \ - -L$(TBBROOT)/lib/lib/intel64/gcc4.8 -ltbb -ltbbmalloc + -L$(TBBROOT)/lib/intel64/gcc4.8 -ltbb -ltbbmalloc ifeq ($(PLATFORM_PROFILE),CPU_ONLY_PROFILE) LIBS := $(LIBS_COMMON) @@ -78,7 +76,7 @@ ifeq ($(PLATFORM_PROFILE),CPU_GPU_PROFILE) endif # Output Binary -OUTPUT = ../../../target/libMLlibDAL.so +OUTPUT = ../../../src/main/resources/lib/libMLlibDAL.so all: $(OUTPUT) diff --git a/mllib-dal/src/main/resources/lib/README.md b/mllib-dal/src/main/resources/lib/README.md new file mode 100644 index 000000000..dbabe3fbd --- /dev/null +++ b/mllib-dal/src/main/resources/lib/README.md @@ -0,0 +1,9 @@ +libccl.so.1 +libfabric.so +libfabric.so.1 +libJavaAPI.so +libMLlibDAL.so +libmpi.so.12 +libsockets-fi.so +libtbbmalloc.so.2 +libtbb.so.12 diff --git a/mllib-dal/src/main/resources/log4j.properties b/mllib-dal/src/main/resources/log4j.properties new file mode 100644 index 000000000..a33c21109 --- /dev/null +++ b/mllib-dal/src/main/resources/log4j.properties @@ -0,0 +1 @@ +log4j.logger.org.apache.spark.ml.util.LibLoader=DEBUG diff --git a/mllib-dal/src/test/resources/log4j.properties b/mllib-dal/src/test/resources/log4j.properties new file mode 100644 index 000000000..0e2b28070 --- /dev/null +++ b/mllib-dal/src/test/resources/log4j.properties @@ -0,0 +1,25 @@ +# Set everything to be logged to the console +log4j.rootCategory=WARN, console +log4j.appender.console=org.apache.log4j.ConsoleAppender +log4j.appender.console.target=System.err +log4j.appender.console.layout=org.apache.log4j.PatternLayout +log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n + +# Set the default spark-shell log level to WARN. When running the spark-shell, the +# log level for this class is used to overwrite the root logger's log level, so that +# the user can have different defaults for the shell and regular Spark apps. 
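A note on the Makefile change above: the oneCCL packaged in the oneAPI Base Toolkit keeps its CPU-only build under `cpu_icc` subdirectories instead of directly under `CCL_ROOT`, hence the new `-I`/`-L` paths. A quick layout check, assuming the oneAPI environment is already sourced:

```bash
ls "$CCL_ROOT/include/cpu_icc"            # headers now picked up via -I $(CCL_ROOT)/include/cpu_icc
ls "$CCL_ROOT"/lib/cpu_icc/libccl.so*     # library now linked via -L $(CCL_ROOT)/lib/cpu_icc -lccl
```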
+log4j.logger.org.apache.spark.repl.Main=WARN + +# Settings to quiet third party logs that are too verbose +log4j.logger.org.sparkproject.jetty=WARN +log4j.logger.org.sparkproject.jetty.util.component.AbstractLifeCycle=ERROR +log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO +log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO +log4j.logger.org.apache.parquet=ERROR +log4j.logger.parquet=ERROR + +# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support +log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL +log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR + +log4j.logger.org.apache.spark.ml.util.LibLoader=DEBUG From 2db8fc7cb97def3ec4fcb1025eb12b0f744cb190 Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Sat, 30 Oct 2021 21:40:28 +0800 Subject: [PATCH 02/19] Add prepare build resources, workaround CCL_ROOT parsing bug for 2021.4 --- dev/prepare-build-deps.sh | 54 +++++++++++++++++++ mllib-dal/src/main/native/build.sh | 4 ++ .../main/resources/{lib => lib1}/README.md | 0 .../org/apache/spark/ml/util/OneCCL.scala | 4 +- mllib-dal/src/test/resources/log4j.properties | 25 --------- mllib-dal/test.sh | 22 +++----- 6 files changed, 68 insertions(+), 41 deletions(-) create mode 100755 dev/prepare-build-deps.sh rename mllib-dal/src/main/resources/{lib => lib1}/README.md (100%) delete mode 100644 mllib-dal/src/test/resources/log4j.properties diff --git a/dev/prepare-build-deps.sh b/dev/prepare-build-deps.sh new file mode 100755 index 000000000..06088ceff --- /dev/null +++ b/dev/prepare-build-deps.sh @@ -0,0 +1,54 @@ +#!/usr/bin/env bash + +if [ -z ${ONEAPI_ROOT} ]; then + echo Please source Intel oneAPI Toolkit environments! + exit 1 +fi + +if [[ -z $DAALROOT ]]; then + echo DAALROOT not defined! + exit 1 +fi + +if [[ -z $TBBROOT ]]; then + echo TBBROOT not defined! + exit 1 +fi + +if [[ -z $I_MPI_ROOT ]]; then + echo I_MPI_ROOT not defined! + exit 1 +fi + +if [[ -z $CCL_ROOT ]]; then + echo CCL_ROOT not defined! + exit 1 +fi + +# Use patchelf to change SONAME for libfabric +if [[ -z $(which patchelf) ]]; then + echo Please install \"patchelf\"! + exit 1 +fi + +if [[ $(basename $(pwd)) != "mllib-dal" ]]; then + echo Please execute the script from \"mllib-dal\" directory! + exit 1 +fi + +TARGET_DIR=./src/main/resources/lib + +cp $CCL_ROOT/lib/cpu_icc/libccl.so.1.0 $TARGET_DIR/libccl.so.1 + +cp $I_MPI_ROOT/libfabric/lib/libfabric.so.1 $TARGET_DIR/libfabric.so.1 +cp $I_MPI_ROOT/libfabric/lib/prov/libsockets-fi.so $TARGET_DIR + +cp $I_MPI_ROOT/libfabric/lib/libfabric.so.1 $TARGET_DIR/libfabric.so +patchelf --set-soname libfabric.so $TARGET_DIR/libfabric.so + +cp $I_MPI_ROOT/lib/release_mt/libmpi.so.12.0.0 $TARGET_DIR/libmpi.so.12 + +cp $DAALROOT/lib/intel64/libJavaAPI.so.1.1 $TARGET_DIR/libJavaAPI.so + +cp $TBBROOT/lib/intel64/gcc4.8/libtbb.so.12.4 $TARGET_DIR/libtbb.so.12 +cp $TBBROOT/lib/intel64/gcc4.8/libtbbmalloc.so.2.4 $TARGET_DIR/libtbbmalloc.so.2 diff --git a/mllib-dal/src/main/native/build.sh b/mllib-dal/src/main/native/build.sh index d271c5d97..cfa1ef844 100755 --- a/mllib-dal/src/main/native/build.sh +++ b/mllib-dal/src/main/native/build.sh @@ -14,5 +14,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
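The copy script above works around oneCCL's `dlopen("libfabric.so")` by shipping a second copy of `libfabric.so.1` whose SONAME is rewritten with patchelf, matching the fix noted in `LibLoader`. A minimal check that the rewrite took effect, assuming binutils' `readelf` is available:

```bash
readelf -d src/main/resources/lib/libfabric.so | grep SONAME
# expected:  0x000000000000000e (SONAME)  Library soname: [libfabric.so]
```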
+if [[ $OAP_MLLIB_TESTING == "true" ]]; then + exit 0 +fi + make clean make -j diff --git a/mllib-dal/src/main/resources/lib/README.md b/mllib-dal/src/main/resources/lib1/README.md similarity index 100% rename from mllib-dal/src/main/resources/lib/README.md rename to mllib-dal/src/main/resources/lib1/README.md diff --git a/mllib-dal/src/main/scala/org/apache/spark/ml/util/OneCCL.scala b/mllib-dal/src/main/scala/org/apache/spark/ml/util/OneCCL.scala index 7fccae192..643ed8f54 100644 --- a/mllib-dal/src/main/scala/org/apache/spark/ml/util/OneCCL.scala +++ b/mllib-dal/src/main/scala/org/apache/spark/ml/util/OneCCL.scala @@ -27,8 +27,10 @@ object OneCCL extends Logging { // Run on Executor def setExecutorEnv(): Unit = { setEnv("CCL_ATL_TRANSPORT", "ofi") + // Set CCL_ROOT to workaround CCL_ROOT env read bug, should remove when upstream fix this + setEnv("CCL_ROOT", "/opt/intel/oneapi/ccl/latest") // Uncomment this if you whant to debug oneCCL - // setEnv("CCL_LOG_LEVEL", "2") + // setEnv("CCL_LOG_LEVEL", "debug") } def init(executor_num: Int, rank: Int, ip_port: String): Unit = { diff --git a/mllib-dal/src/test/resources/log4j.properties b/mllib-dal/src/test/resources/log4j.properties deleted file mode 100644 index 0e2b28070..000000000 --- a/mllib-dal/src/test/resources/log4j.properties +++ /dev/null @@ -1,25 +0,0 @@ -# Set everything to be logged to the console -log4j.rootCategory=WARN, console -log4j.appender.console=org.apache.log4j.ConsoleAppender -log4j.appender.console.target=System.err -log4j.appender.console.layout=org.apache.log4j.PatternLayout -log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n - -# Set the default spark-shell log level to WARN. When running the spark-shell, the -# log level for this class is used to overwrite the root logger's log level, so that -# the user can have different defaults for the shell and regular Spark apps. -log4j.logger.org.apache.spark.repl.Main=WARN - -# Settings to quiet third party logs that are too verbose -log4j.logger.org.sparkproject.jetty=WARN -log4j.logger.org.sparkproject.jetty.util.component.AbstractLifeCycle=ERROR -log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO -log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO -log4j.logger.org.apache.parquet=ERROR -log4j.logger.parquet=ERROR - -# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support -log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL -log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR - -log4j.logger.org.apache.spark.ml.util.LibLoader=DEBUG diff --git a/mllib-dal/test.sh b/mllib-dal/test.sh index b9bfd215d..c2c41fc90 100755 --- a/mllib-dal/test.sh +++ b/mllib-dal/test.sh @@ -1,5 +1,9 @@ #!/usr/bin/env bash +if ! [[ -f target/oap-mllib-1.2.0.jar ]]; then + echo Please run ./build.sh first to do a complete build before testing! +fi + # Check envs for building if [[ -z $JAVA_HOME ]]; then echo JAVA_HOME not defined! @@ -16,15 +20,7 @@ if [[ -z $DAALROOT ]]; then exit 1 fi -if [[ -z $TBBROOT ]]; then - echo TBBROOT not defined! - exit 1 -fi - -if [[ -z $CCL_ROOT ]]; then - echo CCL_ROOT not defined! 
- exit 1 -fi +export OAP_MLLIB_TESTING=true versionArray=( spark-3.0.0 \ @@ -83,11 +79,7 @@ export PLATFORM_PROFILE=CPU_ONLY_PROFILE echo === Testing Environments === echo JAVA_HOME=$JAVA_HOME -echo DAALROOT=$DAALROOT -echo TBBROOT=$TBBROOT -echo CCL_ROOT=$CCL_ROOT echo Maven Version: $(mvn -v | head -n 1 | cut -f3 -d" ") -echo Clang Version: $(clang -dumpversion) echo Spark Version: $SPARK_VER echo Platform Profile: $PLATFORM_PROFILE echo ============================ @@ -108,10 +100,10 @@ if [[ -z $SUITE ]]; then echo echo Testing ALL suites... echo - mvn $MVN_NO_TRANSFER_PROGRESS -P$SPARK_VER -Dtest=none clean test + mvn $MVN_NO_TRANSFER_PROGRESS -P$SPARK_VER -Dtest=none test else echo echo Testing org.apache.spark.ml.$SUITE ... echo - mvn $MVN_NO_TRANSFER_PROGRESS -P$SPARK_VER -Dtest=none -DwildcardSuites=org.apache.spark.ml.$SUITE clean test + mvn $MVN_NO_TRANSFER_PROGRESS -P$SPARK_VER -Dtest=none -DwildcardSuites=org.apache.spark.ml.$SUITE test fi From 63f7d657900bcfb3cd5db304db4cefb595df1c4d Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Sat, 30 Oct 2021 21:41:21 +0800 Subject: [PATCH 03/19] nit --- mllib-dal/src/main/resources/lib1/README.md | 9 --------- 1 file changed, 9 deletions(-) delete mode 100644 mllib-dal/src/main/resources/lib1/README.md diff --git a/mllib-dal/src/main/resources/lib1/README.md b/mllib-dal/src/main/resources/lib1/README.md deleted file mode 100644 index dbabe3fbd..000000000 --- a/mllib-dal/src/main/resources/lib1/README.md +++ /dev/null @@ -1,9 +0,0 @@ -libccl.so.1 -libfabric.so -libfabric.so.1 -libJavaAPI.so -libMLlibDAL.so -libmpi.so.12 -libsockets-fi.so -libtbbmalloc.so.2 -libtbb.so.12 From a76bc7bb51a888aabe8cfe24ce24f3032614e895 Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Sat, 30 Oct 2021 21:57:09 +0800 Subject: [PATCH 04/19] Add output version for prepare-build-deps.sh --- dev/prepare-build-deps.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dev/prepare-build-deps.sh b/dev/prepare-build-deps.sh index 06088ceff..414c9d5ec 100755 --- a/dev/prepare-build-deps.sh +++ b/dev/prepare-build-deps.sh @@ -52,3 +52,6 @@ cp $DAALROOT/lib/intel64/libJavaAPI.so.1.1 $TARGET_DIR/libJavaAPI.so cp $TBBROOT/lib/intel64/gcc4.8/libtbb.so.12.4 $TARGET_DIR/libtbb.so.12 cp $TBBROOT/lib/intel64/gcc4.8/libtbbmalloc.so.2.4 $TARGET_DIR/libtbbmalloc.so.2 + +echo oneAPI Toolkit version: $(basename $CCL_ROOT) > $TARGET_DIR/VERSION + From d75bc3cfa0a0c3119b2e3d7ee9ab09aa532192ea Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Mon, 1 Nov 2021 19:37:52 +0800 Subject: [PATCH 05/19] Add dev/build-maven-local-repo.sh --- dev/build-maven-local-repo.sh | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100755 dev/build-maven-local-repo.sh diff --git a/dev/build-maven-local-repo.sh b/dev/build-maven-local-repo.sh new file mode 100755 index 000000000..44a94a794 --- /dev/null +++ b/dev/build-maven-local-repo.sh @@ -0,0 +1,30 @@ +#!/usr/bin/env bash + +if [[ -z $DAALROOT ]]; then + echo DAALROOT not defined! + exit 1 +fi + +echo "Building Maven Repo for oneDAL ..." 
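Stepping back, the first two patches split building from testing: `build.sh` stages the oneAPI libraries into `src/main/resources/lib` and produces the jar, while `test.sh` exports `OAP_MLLIB_TESTING=true` so the antrun-invoked native `build.sh` exits immediately and `mvn test` reuses the binaries from the previous build. A sketch of the intended sequence (jar name as checked by `test.sh`):

```bash
cd mllib-dal
../dev/prepare-build-deps.sh   # stage oneCCL/MPI/oneDAL/oneTBB .so files (rerun after a toolkit update)
./build.sh                     # full build, bundles the libs into target/oap-mllib-1.2.0.jar
./test.sh                      # sets OAP_MLLIB_TESTING=true, so the native rebuild is skipped
```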
+ +mkdir maven-repository +mvn deploy:deploy-file -Dfile=$DAALROOT/lib/onedal.jar -DgroupId=com.intel.onedal -Dversion=2021.4.0 -Dpackaging=jar -Durl=file:./maven-repository -DrepositoryId=maven-repository -DupdateReleaseInfo=true + +echo "DONE" + +find ./maven-repository + +# Add the following into pom.xml: + +# +# +# maven-repository +# file:///${project.basedir}/maven-repository +# +# + +# +# com.intel.dal +# dal +# 2021.4.0 +# \ No newline at end of file From 9c66fdbae70cfa5ffcc09ee82123e7d13969512d Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Mon, 1 Nov 2021 22:21:15 +0800 Subject: [PATCH 06/19] Add dal 2021.4 deps from central instead of local, clean assembly.xml, update build & test scripts --- dev/prepare-build-deps-gpu.sh | 71 ++++++++++++++ dev/prepare-build-deps.sh | 4 +- mllib-dal/build-cpu-gpu.sh | 13 +++ mllib-dal/build.sh | 17 ++++ mllib-dal/pom.xml | 17 ++-- mllib-dal/src/assembly/assembly-cpu-gpu.xml | 103 -------------------- mllib-dal/src/assembly/assembly.xml | 19 ++-- mllib-dal/test.sh | 16 +-- 8 files changed, 132 insertions(+), 128 deletions(-) create mode 100755 dev/prepare-build-deps-gpu.sh delete mode 100644 mllib-dal/src/assembly/assembly-cpu-gpu.xml diff --git a/dev/prepare-build-deps-gpu.sh b/dev/prepare-build-deps-gpu.sh new file mode 100755 index 000000000..e6762e1c7 --- /dev/null +++ b/dev/prepare-build-deps-gpu.sh @@ -0,0 +1,71 @@ +#!/usr/bin/env bash + +if [ -z ${ONEAPI_ROOT} ]; then + echo Please source Intel oneAPI Toolkit environments! + exit 1 +fi + +if [[ -z $DAALROOT ]]; then + echo DAALROOT not defined! + exit 1 +fi + +if [[ -z $TBBROOT ]]; then + echo TBBROOT not defined! + exit 1 +fi + +if [[ -z $I_MPI_ROOT ]]; then + echo I_MPI_ROOT not defined! + exit 1 +fi + +if [[ -z $CCL_ROOT ]]; then + echo CCL_ROOT not defined! + exit 1 +fi + +# Use patchelf to change SONAME for libfabric +if [[ -z $(which patchelf) ]]; then + echo Please install \"patchelf\"! + exit 1 +fi + +if [[ $(basename $(pwd)) != "mllib-dal" ]]; then + echo Please execute the script from \"mllib-dal\" directory! 
+ exit 1 +fi + +TARGET_DIR=./src/main/resources/lib + +rm -f $TARGET_DIR/*.so* + +cp $CCL_ROOT/lib/cpu_icc/libccl.so.1.0 $TARGET_DIR/libccl.so.1 + +cp $I_MPI_ROOT/libfabric/lib/libfabric.so.1 $TARGET_DIR/libfabric.so.1 +cp $I_MPI_ROOT/libfabric/lib/prov/libsockets-fi.so $TARGET_DIR + +# Workaround dlopen (libfabric.so) in oneCCL +cp $I_MPI_ROOT/libfabric/lib/libfabric.so.1 $TARGET_DIR/libfabric.so +patchelf --set-soname libfabric.so $TARGET_DIR/libfabric.so + +cp $I_MPI_ROOT/lib/release_mt/libmpi.so.12.0.0 $TARGET_DIR/libmpi.so.12 + +cp $DAALROOT/lib/intel64/libJavaAPI.so.1.1 $TARGET_DIR/libJavaAPI.so + +cp $TBBROOT/lib/intel64/gcc4.8/libtbb.so.12.4 $TARGET_DIR/libtbb.so.12 +cp $TBBROOT/lib/intel64/gcc4.8/libtbbmalloc.so.2.4 $TARGET_DIR/libtbbmalloc.so.2 + +# SYCL libs +cp $CMPLR_ROOT/linux/compiler/lib/intel64_lin/libintlc.so.5 $TARGET_DIR +cp $CMPLR_ROOT/linux/compiler/lib/intel64_lin/libsvml.so $TARGET_DIR + +# Workaround lib loading for JNI as libirng.so doesn't have soname +cp $CMPLR_ROOT/linux/compiler/lib/intel64_lin/libirng.so $TARGET_DIR +patchelf --set-soname libirng.so $TARGET_DIR/libirng.so + +cp $CMPLR_ROOT/linux/compiler/lib/intel64_lin/libimf.so $TARGET_DIR +cp $CMPLR_ROOT/linux/lib/libOpenCL.so.1 $TARGET_DIR +cp $CMPLR_ROOT/linux/lib/libsycl.so.5 $TARGET_DIR + +echo oneAPI Toolkit version: $(basename $CCL_ROOT) > $TARGET_DIR/VERSION diff --git a/dev/prepare-build-deps.sh b/dev/prepare-build-deps.sh index 414c9d5ec..6b74dfed2 100755 --- a/dev/prepare-build-deps.sh +++ b/dev/prepare-build-deps.sh @@ -38,11 +38,14 @@ fi TARGET_DIR=./src/main/resources/lib +rm -f $TARGET_DIR/*.so* + cp $CCL_ROOT/lib/cpu_icc/libccl.so.1.0 $TARGET_DIR/libccl.so.1 cp $I_MPI_ROOT/libfabric/lib/libfabric.so.1 $TARGET_DIR/libfabric.so.1 cp $I_MPI_ROOT/libfabric/lib/prov/libsockets-fi.so $TARGET_DIR +# Workaround dlopen (libfabric.so) in oneCCL cp $I_MPI_ROOT/libfabric/lib/libfabric.so.1 $TARGET_DIR/libfabric.so patchelf --set-soname libfabric.so $TARGET_DIR/libfabric.so @@ -54,4 +57,3 @@ cp $TBBROOT/lib/intel64/gcc4.8/libtbb.so.12.4 $TARGET_DIR/libtbb.so.12 cp $TBBROOT/lib/intel64/gcc4.8/libtbbmalloc.so.2.4 $TARGET_DIR/libtbbmalloc.so.2 echo oneAPI Toolkit version: $(basename $CCL_ROOT) > $TARGET_DIR/VERSION - diff --git a/mllib-dal/build-cpu-gpu.sh b/mllib-dal/build-cpu-gpu.sh index 27b1777d9..c88c05ca3 100755 --- a/mllib-dal/build-cpu-gpu.sh +++ b/mllib-dal/build-cpu-gpu.sh @@ -26,6 +26,19 @@ if [[ -z $CCL_ROOT ]]; then exit 1 fi +# Check lib dependencies for building +RESOURCE_PATH=src/main/resources/lib +LIBS=(libccl.so.1 libfabric.so libfabric.so.1 libJavaAPI.so libmpi.so.12 \ + libsockets-fi.so libtbbmalloc.so.2 libtbb.so.12 libintlc.so.5 libsvml.so libirng.so libimf.so \ + libOpenCL.so.1 libsycl.so.5) +for lib in ${LIBS[@]} +do + if [[ ! -f ./$RESOURCE_PATH/$lib ]]; then + echo $RESOURCE_PATH/$lib does not exsit, please run ../dev/prepare-builds-deps-gpu.sh! + exit 1 +fi +done + versionArray=( spark-3.0.0 \ spark-3.0.1 \ diff --git a/mllib-dal/build.sh b/mllib-dal/build.sh index 7ae84e01f..25546bb77 100755 --- a/mllib-dal/build.sh +++ b/mllib-dal/build.sh @@ -26,6 +26,23 @@ if [[ -z $CCL_ROOT ]]; then exit 1 fi +# Check lib dependencies for building +RESOURCE_PATH=src/main/resources/lib +LIBS=(libccl.so.1 libfabric.so libfabric.so.1 libJavaAPI.so libmpi.so.12 \ + libsockets-fi.so libtbbmalloc.so.2 libtbb.so.12) +for lib in ${LIBS[@]} +do + if [[ ! -f ./$RESOURCE_PATH/$lib ]]; then + echo $RESOURCE_PATH/$lib does not exsit, please run ../dev/prepare-builds-deps.sh! 
+ exit 1 +fi +done + +if [[ -f ./$RESOURCE_PATH/libsycl.so.5 ]]; then + echo GPU libs found! Please re-run ../dev/prepare-builds-deps.sh! + exit 1 +fi + versionArray=( spark-3.0.0 \ spark-3.0.1 \ diff --git a/mllib-dal/pom.xml b/mllib-dal/pom.xml index 9ac307b68..5d26acc18 100644 --- a/mllib-dal/pom.xml +++ b/mllib-dal/pom.xml @@ -62,15 +62,13 @@ spark-mllib_2.12 ${spark.version} provided - + - com.intel.onedal - onedal - ${oneapi.version} - system - ${env.DAALROOT}/lib/onedal.jar - + com.intel.dal + dal + 2021.4.0.83 + junit @@ -158,10 +156,7 @@ env.PLATFORM_PROFILE CPU_GPU_PROFILE - - - src/assembly/assembly-cpu-gpu.xml - + diff --git a/mllib-dal/src/assembly/assembly-cpu-gpu.xml b/mllib-dal/src/assembly/assembly-cpu-gpu.xml deleted file mode 100644 index d49fd1adf..000000000 --- a/mllib-dal/src/assembly/assembly-cpu-gpu.xml +++ /dev/null @@ -1,103 +0,0 @@ - - jar-with-dependencies - - jar - - false - - - / - true - true - runtime - - - - / - true - system - - - - - ${project.basedir} - / - - README* - LICENSE* - NOTICE* - - - - ${project.build.directory} - lib - - *.so - - - - - - - ${env.TBBROOT}/lib/intel64/gcc4.8/${tbb.lib} - lib - libtbb.so.2 - - - ${env.TBBROOT}/lib/intel64/gcc4.8/${tbb.malloc.lib} - lib - libtbbmalloc.so.2 - - - - ${env.DAALROOT}/lib/intel64/${dal.java.lib} - lib - libJavaAPI.so - - - - ${env.CCL_ROOT}/lib/${ccl.fabric.lib} - lib - - - ${env.CCL_ROOT}/lib/${ccl.mpi.lib} - lib - libmpi.so.12 - - - ${env.CCL_ROOT}/lib/libccl.so - lib - - - ${env.CCL_ROOT}/lib/prov/libsockets-fi.so - lib - - - - ${env.CMPLR_ROOT}/linux/compiler/lib/intel64_lin/libintlc.so.5 - lib - - - ${env.CMPLR_ROOT}/linux/compiler/lib/intel64_lin/libsvml.so - lib - - - ${env.CMPLR_ROOT}/linux/compiler/lib/intel64_lin/libirng.so - lib - - - ${env.CMPLR_ROOT}/linux/compiler/lib/intel64_lin/libimf.so - lib - - - ${env.CMPLR_ROOT}/linux/lib/${opencl.lib} - lib - - - ${env.CMPLR_ROOT}/linux/lib/${sycl.lib} - lib - - - diff --git a/mllib-dal/src/assembly/assembly.xml b/mllib-dal/src/assembly/assembly.xml index 932acc8a1..983c8482c 100644 --- a/mllib-dal/src/assembly/assembly.xml +++ b/mllib-dal/src/assembly/assembly.xml @@ -12,12 +12,17 @@ true true runtime - - - + + + + + ${project.basedir} / - true - system - - + + README* + LICENSE* + NOTICE* + + + diff --git a/mllib-dal/test.sh b/mllib-dal/test.sh index c2c41fc90..1d740cd50 100755 --- a/mllib-dal/test.sh +++ b/mllib-dal/test.sh @@ -1,7 +1,16 @@ #!/usr/bin/env bash -if ! [[ -f target/oap-mllib-1.2.0.jar ]]; then +if [[ -n $DAALROOT ]]; then + echo + echo ==================================================================================== + echo WARNING: DAALROOT detected. It is recommended to test without oneAPI environment! + echo ==================================================================================== + echo +fi + +if [[ ! -f target/oap-mllib-1.2.0.jar ]]; then echo Please run ./build.sh first to do a complete build before testing! + exit 1 fi # Check envs for building @@ -15,11 +24,6 @@ if [[ -z $(which mvn) ]]; then exit 1 fi -if [[ -z $DAALROOT ]]; then - echo DAALROOT not defined! 
- exit 1 -fi - export OAP_MLLIB_TESTING=true versionArray=( From 1889f8d0ae5f29695aac534cf0c6ab28732cd03b Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Wed, 3 Nov 2021 10:38:53 +0800 Subject: [PATCH 07/19] update scripts --- .github/PULL_REQUEST_TEMPLATE | 6 ++ .github/workflows/oap-mllib-ci.yml | 2 +- dev/install-build-deps-centos.sh | 12 +-- dev/install-build-deps-ubuntu.sh | 12 +-- examples/correlation/run.sh | 0 mllib-dal/pom.xml | 157 +++++++++++++++-------------- 6 files changed, 89 insertions(+), 100 deletions(-) mode change 100644 => 100755 examples/correlation/run.sh diff --git a/.github/PULL_REQUEST_TEMPLATE b/.github/PULL_REQUEST_TEMPLATE index b0289f4f6..3d8952163 100644 --- a/.github/PULL_REQUEST_TEMPLATE +++ b/.github/PULL_REQUEST_TEMPLATE @@ -1,3 +1,9 @@ ## What changes were proposed in this pull request? (Please fill in changes proposed in this fix) + +## Does this PR also require the following changes? + +- CI +- Documentation +- Example diff --git a/.github/workflows/oap-mllib-ci.yml b/.github/workflows/oap-mllib-ci.yml index f0c1ab3e5..1b28d0a79 100644 --- a/.github/workflows/oap-mllib-ci.yml +++ b/.github/workflows/oap-mllib-ci.yml @@ -20,7 +20,7 @@ jobs: ~/.m2/repository /opt/intel/oneapi ~/opt - key: ${{ runner.os }}_spark-3.1.1_hadoop-3.2.0_oneapi-2021.3.0 + key: ${{ runner.os }}_spark-3.1.1_hadoop-3.2.0_oneapi-2021.4.0 restore-keys: | ${{ runner.os }}- - name: Set up environments diff --git a/dev/install-build-deps-centos.sh b/dev/install-build-deps-centos.sh index 275222be8..877992228 100755 --- a/dev/install-build-deps-centos.sh +++ b/dev/install-build-deps-centos.sh @@ -15,17 +15,7 @@ EOF sudo mv /tmp/oneAPI.repo /etc/yum.repos.d # sudo yum groupinstall -y "Development Tools" # sudo yum install -y cmake - sudo yum install -y intel-oneapi-dpcpp-cpp-2021.3.0 intel-oneapi-dal-devel-2021.3.0 intel-oneapi-tbb-devel-2021.3.0 + sudo yum install -y intel-oneapi-dpcpp-cpp-2021.4.0 intel-oneapi-dal-devel-2021.4.0 intel-oneapi-tbb-devel-2021.4.0 intel-oneapi-ccl-devel-2021.4.0 intel-oneapi-mpi-devel-2021.4.0 else echo "oneAPI components already installed!" fi - -echo "Building oneCCL ..." -cd /tmp -rm -rf oneCCL -git clone https://github.com/oneapi-src/oneCCL -cd oneCCL -git checkout 2021.2.1 -mkdir build && cd build -cmake .. -make -j 2 install diff --git a/dev/install-build-deps-ubuntu.sh b/dev/install-build-deps-ubuntu.sh index a6379dae9..027956b74 100755 --- a/dev/install-build-deps-ubuntu.sh +++ b/dev/install-build-deps-ubuntu.sh @@ -9,17 +9,7 @@ if [ ! -d /opt/intel/oneapi ]; then echo "deb https://apt.repos.intel.com/oneapi all main" | sudo tee /etc/apt/sources.list.d/oneAPI.list sudo apt-get update # sudo apt-get install -y build-essential cmake - sudo apt-get install -y intel-oneapi-dpcpp-cpp-2021.3.0 intel-oneapi-dal-devel-2021.3.0 intel-oneapi-tbb-devel-2021.3.0 + sudo apt-get install -y intel-oneapi-dpcpp-cpp-2021.4.0 intel-oneapi-dal-devel-2021.4.0 intel-oneapi-tbb-devel-2021.4.0 intel-oneapi-ccl-devel-2021.4.0 intel-oneapi-mpi-devel-2021.4.0 else echo "oneAPI components already installed!" fi - -echo "Building oneCCL ..." -cd /tmp -rm -rf oneCCL -git clone https://github.com/oneapi-src/oneCCL -cd oneCCL -git checkout 2021.2.1 -mkdir build && cd build -cmake .. 
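With the oneCCL source build removed, every native dependency now comes from the pinned 2021.4.0 oneAPI packages installed above. A minimal post-install sanity check, assuming the default `/opt/intel/oneapi` prefix these scripts rely on:

```bash
source /opt/intel/oneapi/setvars.sh
# dev/prepare-build-deps.sh (patch 2) requires all four of these to be defined:
for v in DAALROOT TBBROOT I_MPI_ROOT CCL_ROOT; do
    printenv "$v" >/dev/null || { echo "$v not defined!"; exit 1; }
done
```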
-make -j 2 install diff --git a/examples/correlation/run.sh b/examples/correlation/run.sh old mode 100644 new mode 100755 diff --git a/mllib-dal/pom.xml b/mllib-dal/pom.xml index 5d26acc18..d2b74863d 100644 --- a/mllib-dal/pom.xml +++ b/mllib-dal/pom.xml @@ -1,6 +1,5 @@ + xmlns="http://maven.apache.org/POM/4.0.0" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> 4.0.0 com.intel.oap @@ -29,61 +28,84 @@ src/assembly/assembly.xml - + + + gcs-maven-central-mirror + + GCS Maven Central mirror + https://maven-central.storage-download.googleapis.com/maven2/ + + true + + + false + + + + + central + Maven Repository + https://repo.maven.apache.org/maven2 + + true + + + false + + + + org.scala-lang scala-library 2.12.10 - com.github.scopt scopt_2.12 3.7.0 - org.apache.spark spark-core_2.12 ${spark.version} provided - org.apache.spark spark-sql_2.12 ${spark.version} provided - org.apache.spark spark-mllib_2.12 ${spark.version} provided - - + com.intel.dal dal 2021.4.0.83 - - + junit junit 4.12 test - org.scalatest scalatest_${scala.binary.version} ${scalatest.version} test - org.apache.spark spark-mllib_2.12 @@ -91,7 +113,6 @@ test-jar test - org.apache.spark spark-mllib-local_${scala.binary.version} @@ -99,7 +120,6 @@ test-jar test - org.jpmml pmml-model @@ -112,7 +132,6 @@ - org.apache.spark spark-sql_2.12 @@ -120,7 +139,6 @@ test-jar test - org.apache.spark spark-core_2.12 @@ -128,7 +146,6 @@ test-jar test - org.apache.spark spark-catalyst_2.12 @@ -136,7 +153,6 @@ test-jar test - org.apache.spark spark-tags_2.12 @@ -144,11 +160,9 @@ test-jar test - - cpu-gpu @@ -156,9 +170,8 @@ env.PLATFORM_PROFILE CPU_GPU_PROFILE - + - spark-3.0.0 @@ -166,7 +179,6 @@ 3.0.8 - spark-3.0.1 @@ -174,7 +186,6 @@ 3.0.8 - spark-3.0.2 @@ -182,7 +193,6 @@ 3.0.8 - spark-3.1.1 @@ -197,58 +207,58 @@ - - org.codehaus.mojo - build-helper-maven-plugin - 3.2.0 - + + org.codehaus.mojo + build-helper-maven-plugin + 3.2.0 + - add-source - generate-sources - - add-source - - - - src/spark-${spark.version}/main/java - src/spark-${spark.version}/main/scala - - + add-source + generate-sources + + add-source + + + + src/spark-${spark.version}/main/java + src/spark-${spark.version}/main/scala + + - add-test-source - generate-sources - - add-test-source - - - - src/spark-${spark.version}/test/scala - - + add-test-source + generate-sources + + add-test-source + + + + src/spark-${spark.version}/test/scala + + - - + + net.alchim31.maven scala-maven-plugin 4.4.0 - - scala-compile-first - process-resources - - add-source - compile - - - - scala-test-compile - process-test-resources - - testCompile - - + + scala-compile-first + process-resources + + add-source + compile + + + + scala-test-compile + process-test-resources + + testCompile + + ${scala.version} @@ -324,7 +334,6 @@ true - org.scalatest scalatest-maven-plugin @@ -343,7 +352,6 @@ - maven-antrun-plugin 1.8 @@ -353,8 +361,7 @@ Building native code - + @@ -364,12 +371,10 @@ - maven-resources-plugin 3.0.2 - maven-assembly-plugin 3.0.0 @@ -390,8 +395,6 @@ - - From 0d50a8fda57cdbc6114d29866f463e19753cdf5d Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Wed, 3 Nov 2021 10:51:16 +0800 Subject: [PATCH 08/19] update ci --- dev/ci-test.sh | 7 ++++++- mllib-dal/build.sh | 6 +++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/dev/ci-test.sh b/dev/ci-test.sh index ce079fe7d..60de8e091 100755 --- a/dev/ci-test.sh +++ b/dev/ci-test.sh @@ -4,8 +4,12 @@ source /opt/intel/oneapi/setvars.sh source 
/tmp/oneCCL/build/_install/env/setvars.sh -SupportedSparkVersions=("spark-3.0.0" "spark-3.0.1" "spark-3.0.2" "spark-3.1.1") +# Prepare lib resources +cd $GITHUB_WORKSPACE/mllib-dal +../dev/prepare-builds-deps.sh +# Test for all versions +SupportedSparkVersions=("spark-3.0.0" "spark-3.0.1" "spark-3.0.2" "spark-3.1.1") for SparkVer in ${SupportedSparkVersions[*]}; do echo echo "========================================" @@ -13,6 +17,7 @@ for SparkVer in ${SupportedSparkVersions[*]}; do echo "========================================" echo cd $GITHUB_WORKSPACE/mllib-dal + ./build.sh -q ./test.sh -q -p $SparkVer done diff --git a/mllib-dal/build.sh b/mllib-dal/build.sh index 25546bb77..069389fef 100755 --- a/mllib-dal/build.sh +++ b/mllib-dal/build.sh @@ -31,8 +31,8 @@ RESOURCE_PATH=src/main/resources/lib LIBS=(libccl.so.1 libfabric.so libfabric.so.1 libJavaAPI.so libmpi.so.12 \ libsockets-fi.so libtbbmalloc.so.2 libtbb.so.12) for lib in ${LIBS[@]} -do - if [[ ! -f ./$RESOURCE_PATH/$lib ]]; then +do + if [[ ! -f ./$RESOURCE_PATH/$lib ]]; then echo $RESOURCE_PATH/$lib does not exsit, please run ../dev/prepare-builds-deps.sh! exit 1 fi @@ -62,7 +62,7 @@ print_usage() { do echo " $version" done - echo + echo } while getopts "hqp:" opt From addbe197ca610d9bc8c09282778c16d62430dfc1 Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Wed, 3 Nov 2021 11:46:47 +0800 Subject: [PATCH 09/19] Add vscode settings Add RELEASE and revise env.sh for test-cluster Add exe mode to build.sh --- .gitignore | 2 +- .vscode/c_cpp_properties.json | 19 +++++++++++++++ .vscode/settings.json | 37 +++++++++++++++++++++++++++++ RELEASE | 1 + dev/test-cluster/env.sh | 8 ++++--- examples/correlation/build.sh | 0 mllib-dal/src/assembly/assembly.xml | 7 +++--- 7 files changed, 67 insertions(+), 7 deletions(-) create mode 100644 .vscode/c_cpp_properties.json create mode 100644 .vscode/settings.json create mode 100644 RELEASE mode change 100644 => 100755 examples/correlation/build.sh diff --git a/.gitignore b/.gitignore index 1d621bdd4..b69b6d7f3 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,7 @@ *.o *.log -.vscode *.iml +.vscode/ target/ .idea/ .idea_modules/ diff --git a/.vscode/c_cpp_properties.json b/.vscode/c_cpp_properties.json new file mode 100644 index 000000000..baa3db3b1 --- /dev/null +++ b/.vscode/c_cpp_properties.json @@ -0,0 +1,19 @@ +{ + "configurations": [ + { + "name": "Linux", + "includePath": [ + "${workspaceFolder}/mllib-dal/src/main/native/**", + "${CCL_ROOT}/include/**", + "${DAALROOT}/include/**", + "${JAVA_HOME}/include/**" + ], + "defines": [], + "compilerPath": "${CMPLR_ROOT}/linux/bin/clang", + "cStandard": "c17", + "cppStandard": "c++14", + "intelliSenseMode": "clang-x64" + } + ], + "version": 4 +} \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 000000000..2edd51bcb --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,37 @@ +{ + "files.associations": { + "*.tcc": "cpp", + "cctype": "cpp", + "chrono": "cpp", + "cstdint": "cpp", + "ctime": "cpp", + "cwchar": "cpp", + "exception": "cpp", + "initializer_list": "cpp", + "iosfwd": "cpp", + "iostream": "cpp", + "istream": "cpp", + "limits": "cpp", + "ostream": "cpp", + "ratio": "cpp", + "string_view": "cpp", + "type_traits": "cpp", + "clocale": "cpp", + "streambuf": "cpp", + "algorithm": "cpp", + "cstdarg": "cpp", + "cstddef": "cpp", + "cstdio": "cpp", + "deque": "cpp", + "vector": "cpp", + "functional": "cpp", + "memory_resource": "cpp", + "string": "cpp", + "utility": "cpp", + "fstream": 
"cpp", + "iomanip": "cpp", + "new": "cpp", + "sstream": "cpp", + "*.template": "shellscript" + } +} \ No newline at end of file diff --git a/RELEASE b/RELEASE new file mode 100644 index 000000000..a72a32503 --- /dev/null +++ b/RELEASE @@ -0,0 +1 @@ +OAP_MLLIB_VERSION=1.2.0 \ No newline at end of file diff --git a/dev/test-cluster/env.sh b/dev/test-cluster/env.sh index 225db0b7b..c36a21397 100644 --- a/dev/test-cluster/env.sh +++ b/dev/test-cluster/env.sh @@ -2,8 +2,10 @@ # ============== Minimum Settings ============= # +# Import RELEASE envs +source $GITHUB_WORKSPACE/RELEASE # Set OAP MLlib version (e.g. 1.1.0) -OAP_MLLIB_VERSION=1.1.0 +OAP_MLLIB_VERSION=${OAP_MLLB_VERSION} # Set Spark master SPARK_MASTER=yarn # Set Hadoop home path @@ -23,7 +25,7 @@ export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop # Set JAR name & path OAP_MLLIB_JAR_NAME=oap-mllib-$OAP_MLLIB_VERSION.jar OAP_MLLIB_JAR=$OAP_MLLIB_ROOT/mllib-dal/target/$OAP_MLLIB_JAR_NAME -# Set Spark driver & executor classpaths, +# Set Spark driver & executor classpaths, # absolute path for driver, relative path for executor SPARK_DRIVER_CLASSPATH=$OAP_MLLIB_JAR SPARK_EXECUTOR_CLASSPATH=./$OAP_MLLIB_JAR_NAME @@ -37,7 +39,7 @@ SPARK_DEFAULT_PARALLELISM=$(expr $SPARK_NUM_EXECUTORS '*' $SPARK_EXECUTOR_CORES # Checks -for dir in $SPARK_HOME $HADOOP_HOME $OAP_MLLIB_JAR +for dir in $SPARK_HOME $HADOOP_HOME $OAP_MLLIB_JAR do if [[ ! -e $dir ]]; then echo $dir does not exist! diff --git a/examples/correlation/build.sh b/examples/correlation/build.sh old mode 100644 new mode 100755 diff --git a/mllib-dal/src/assembly/assembly.xml b/mllib-dal/src/assembly/assembly.xml index 983c8482c..e0d177b95 100644 --- a/mllib-dal/src/assembly/assembly.xml +++ b/mllib-dal/src/assembly/assembly.xml @@ -12,7 +12,7 @@ true true runtime - + @@ -22,7 +22,8 @@ README* LICENSE* NOTICE* + RELEASE* - - + + From 484829299a90d873af21bb31b6a62d785eaf35cb Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Wed, 3 Nov 2021 11:57:56 +0800 Subject: [PATCH 10/19] nit --- dev/ci-test.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dev/ci-test.sh b/dev/ci-test.sh index 60de8e091..b5ade50fb 100755 --- a/dev/ci-test.sh +++ b/dev/ci-test.sh @@ -2,11 +2,10 @@ # Setup building envs source /opt/intel/oneapi/setvars.sh -source /tmp/oneCCL/build/_install/env/setvars.sh # Prepare lib resources cd $GITHUB_WORKSPACE/mllib-dal -../dev/prepare-builds-deps.sh +../dev/prepare-build-deps.sh # Test for all versions SupportedSparkVersions=("spark-3.0.0" "spark-3.0.1" "spark-3.0.2" "spark-3.1.1") From a9fed84c7cccee4d68a2545fa1c04d2c9a5f4f48 Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Wed, 3 Nov 2021 12:09:12 +0800 Subject: [PATCH 11/19] add lib as empty dir --- mllib-dal/.gitignore | 1 - mllib-dal/src/main/resources/lib/.gitignore | 4 ++++ 2 files changed, 4 insertions(+), 1 deletion(-) delete mode 100644 mllib-dal/.gitignore create mode 100644 mllib-dal/src/main/resources/lib/.gitignore diff --git a/mllib-dal/.gitignore b/mllib-dal/.gitignore deleted file mode 100644 index d50d2b06f..000000000 --- a/mllib-dal/.gitignore +++ /dev/null @@ -1 +0,0 @@ -src/main/resources/lib/ diff --git a/mllib-dal/src/main/resources/lib/.gitignore b/mllib-dal/src/main/resources/lib/.gitignore new file mode 100644 index 000000000..86d0cb272 --- /dev/null +++ b/mllib-dal/src/main/resources/lib/.gitignore @@ -0,0 +1,4 @@ +# Ignore everything in this directory +* +# Except this file +!.gitignore \ No newline at end of file From a2135bbd3e143a7422b4714cf626d99e0bd6d13c Mon Sep 17 00:00:00 
2001 From: "Wu, Xiaochang" Date: Wed, 3 Nov 2021 17:53:15 +0800 Subject: [PATCH 12/19] set log4j to WARN, update doc and code comments --- README.md | 79 +++++++------------ mllib-dal/build-cpu-gpu.sh | 8 +- mllib-dal/build.sh | 4 +- .../src/main/native/CorrelationDALImpl.cpp | 2 - mllib-dal/src/test/resources/log4j.properties | 42 ++++++++++ 5 files changed, 77 insertions(+), 58 deletions(-) create mode 100644 mllib-dal/src/test/resources/log4j.properties diff --git a/README.md b/README.md index 6088d0350..0667dc69c 100644 --- a/README.md +++ b/README.md @@ -10,11 +10,11 @@ OAP MLlib is an optimized package to accelerate machine learning algorithms in ## Compatibility -OAP MLlib maintains the same API interfaces with Spark MLlib. That means the application built with Spark MLlib can be running directly with minimum configuration. +OAP MLlib maintains the same API interfaces with Spark MLlib. That means the application built with Spark MLlib can be running directly with minimum configuration. -Most of the algorithms can produce the same results that are identical with Spark MLlib. However due to the nature of distributed float point operations, there may be some small deviation from the original result, we will make sure the error is within acceptable range and the accuracy is on par with Spark MLlib. +Most of the algorithms can produce the same results that are identical with Spark MLlib. However due to the nature of distributed float point operations, there may be some small deviation from the original result, we will make sure the error is within acceptable range and the accuracy is on par with Spark MLlib. -For those algorithms that are not accelerated by OAP MLlib, the original Spark MLlib one will be used. +For those algorithms that are not accelerated by OAP MLlib, the original Spark MLlib one will be used. ## Online Documentation @@ -55,7 +55,7 @@ Intel® oneAPI Toolkits components used by the project are already included into #### General Configuration ##### YARN Cluster Manager -Users usually run Spark application on __YARN__ with __client__ mode. In that case, you only need to add the following configurations in `spark-defaults.conf` or in `spark-submit` command line before running. +Users usually run Spark application on __YARN__ with __client__ mode. In that case, you only need to add the following configurations in `spark-defaults.conf` or in `spark-submit` command line before running. ``` # absolute path of the jar for uploading @@ -85,14 +85,14 @@ OAP MLlib expects 1 executor acts as 1 oneCCL rank for compute. 
As `spark.shuffl ### Sanity Check #### Setup `env.sh` -``` +```bash $ cd conf $ cp env.sh.template env.sh ``` Edit related variables in "`Minimun Settings`" of `env.sh` #### Upload example data files to HDFS -``` +```bash $ cd examples $ hadoop fs -mkdir -p /user/$USER $ hadoop fs -copyFromLocal data @@ -100,7 +100,7 @@ Edit related variables in "`Minimun Settings`" of `env.sh` ``` #### Run K-means -``` +```bash $ cd examples/kmeans $ ./build.sh $ ./run.sh @@ -119,45 +119,27 @@ We use [Apache Maven](https://maven.apache.org/) to manage and build source code * JDK 8.0+ * Apache Maven 3.6.2+ * GNU GCC 4.8.5+ -* Intel® oneAPI Toolkits 2021.3.0 Components: +* Intel® oneAPI Base Toolkit (>=2021.4.0) Components : - DPC++/C++ Compiler (dpcpp/clang++) - Data Analytics Library (oneDAL) - Threading Building Blocks (oneTBB) -* [Open Source Intel® oneAPI Collective Communications Library (oneCCL)](https://github.com/oneapi-src/oneCCL) - -Intel® oneAPI Toolkits and its components can be downloaded and install from [here](https://software.intel.com/content/www/us/en/develop/tools/oneapi.html). Installation process for oneAPI using Package Managers (YUM (DNF), APT, and ZYPPER) is also available. Generally you only need to install oneAPI Base Toolkit for Linux with all or selected components mentioned above. Instead of using oneCCL included in Intel® oneAPI Toolkits, we prefer to build from open source oneCCL to resolve some bugs. + - Collective Communications Library (oneCCL)] -More details about oneAPI can be found [here](https://software.intel.com/content/www/us/en/develop/tools/oneapi.html). +Generally you only need to install __Intel® oneAPI Base Toolkit for Linux__ with all or selected components mentioned above. Intel® oneAPI Base Toolkit can be downloaded and installed from [here](https://software.intel.com/content/www/us/en/develop/tools/oneapi.html). Installation process for oneAPI using Package Managers (YUM (DNF), APT, and ZYPPER) is also available. More details about oneAPI can be found [here](https://software.intel.com/content/www/us/en/develop/tools/oneapi.html). -Scala and Java dependency descriptions are already included in Maven POM file. +Scala and Java dependency descriptions are already included in Maven POM file. ***Note:*** You can refer to [this script](dev/install-build-deps-centos.sh) to install correct dependencies: DPC++/C++, oneDAL, oneTBB, oneCCL. ### Build -#### Building oneCCL - -To clone and build from open source oneCCL, run the following commands: -``` - $ git clone https://github.com/oneapi-src/oneCCL - $ cd oneCCL - $ git checkout 2021.2.1 - $ mkdir build && cd build - $ cmake .. - $ make -j install -``` - -The generated files will be placed in `/your/oneCCL_source_code/build/_install` - -#### Building OAP MLlib - To clone and checkout source code, run the following commands: -``` - $ git clone https://github.com/oap-project/oap-mllib.git +```bash + $ git clone https://github.com/oap-project/oap-mllib.git ``` __Optional__ to checkout specific release branch: -``` - $ cd oap-mllib && git checkout ${version} +```bash + $ cd oap-mllib && git checkout ${version} ``` We rely on environment variables to find required toolchains and libraries. 
Please make sure the following environment variables are set for building: @@ -171,25 +153,22 @@ CCL_ROOT | Path to oneCCL home directory We suggest you to source `setvars.sh` script into current shell to setup building environments as following: -``` +```bash $ source /opt/intel/oneapi/setvars.sh - $ source /your/oneCCL_source_code/build/_install/env/setvars.sh ``` -__Be noticed we are using our own built oneCCL instead, we should source oneCCL's `setvars.sh` to overwrite oneAPI one.__ - You can also refer to [this CI script](dev/ci-test.sh) to setup the building environments. -If you prefer to buid your own open source [oneDAL](https://github.com/oneapi-src/oneDAL), [oneTBB](https://github.com/oneapi-src/oneTBB) versions rather than use the ones included in oneAPI TookKits, you can refer to the related build instructions and manually source `setvars.sh` accordingly. +If you prefer to buid your own open source [oneDAL](https://github.com/oneapi-src/oneDAL), [oneTBB](https://github.com/oneapi-src/oneTBB), [oneCCL](https://github.com/oneapi-src/oneCCL) versions rather than use the ones included in oneAPI Base Toolkit, you can refer to the related build instructions and manually source `setvars.sh` accordingly. -To build, run the following commands: -``` +To build, run the following commands: +```bash $ cd mllib-dal $ ./build.sh ``` If no parameter is given, the Spark version __3.1.1__ will be activated by default. You can also specify a different Spark version with option `-p spark-x.x.x`. For example: -``` +```bash $ ./build.sh -p spark-3.0.0 ``` @@ -206,6 +185,7 @@ pca | PCA example for Scala als | ALS example for Scala naive-bayes | Naive Bayes example for Scala linear-regression | Linear Regression example for Scala +correlation | Correlation example for Scala ### Python Examples @@ -217,12 +197,11 @@ als-pyspark | ALS example for PySpark ## List of Accelerated Algorithms -Algorithm | Category | Maturity -------------------|----------|------------- -K-Means | CPU | Stable -K-Means | GPU | Experimental -PCA | CPU | Stable -PCA | GPU | Experimental -ALS | CPU | Stable -Naive Bayes | CPU | Experimental -Linear Regression | CPU | Experimental +Algorithm | CPU | GPU | Maturity +------------------|-----|-----|--------- +K-Means | X | X | Stable +PCA | X | X | Stable +ALS | X | | Experimental +Naive Bayes | X | | Stable +Linear Regression | X | | Stable +Correlation | X | X | Experimental diff --git a/mllib-dal/build-cpu-gpu.sh b/mllib-dal/build-cpu-gpu.sh index c88c05ca3..4317471e1 100755 --- a/mllib-dal/build-cpu-gpu.sh +++ b/mllib-dal/build-cpu-gpu.sh @@ -32,9 +32,9 @@ LIBS=(libccl.so.1 libfabric.so libfabric.so.1 libJavaAPI.so libmpi.so.12 \ libsockets-fi.so libtbbmalloc.so.2 libtbb.so.12 libintlc.so.5 libsvml.so libirng.so libimf.so \ libOpenCL.so.1 libsycl.so.5) for lib in ${LIBS[@]} -do - if [[ ! -f ./$RESOURCE_PATH/$lib ]]; then - echo $RESOURCE_PATH/$lib does not exsit, please run ../dev/prepare-builds-deps-gpu.sh! +do + if [[ ! -f ./$RESOURCE_PATH/$lib ]]; then + echo $RESOURCE_PATH/$lib does not exsit, please run ../dev/prepare-build-deps-gpu.sh! exit 1 fi done @@ -58,7 +58,7 @@ print_usage() { do echo " $version" done - echo + echo } while getopts "hqp:" opt diff --git a/mllib-dal/build.sh b/mllib-dal/build.sh index 069389fef..96393f1ca 100755 --- a/mllib-dal/build.sh +++ b/mllib-dal/build.sh @@ -33,13 +33,13 @@ LIBS=(libccl.so.1 libfabric.so libfabric.so.1 libJavaAPI.so libmpi.so.12 \ for lib in ${LIBS[@]} do if [[ ! 
-f ./$RESOURCE_PATH/$lib ]]; then - echo $RESOURCE_PATH/$lib does not exsit, please run ../dev/prepare-builds-deps.sh! + echo $RESOURCE_PATH/$lib does not exsit, please run ../dev/prepare-build-deps.sh! exit 1 fi done if [[ -f ./$RESOURCE_PATH/libsycl.so.5 ]]; then - echo GPU libs found! Please re-run ../dev/prepare-builds-deps.sh! + echo GPU libs found! Please re-run ../dev/prepare-build-deps.sh! exit 1 fi diff --git a/mllib-dal/src/main/native/CorrelationDALImpl.cpp b/mllib-dal/src/main/native/CorrelationDALImpl.cpp index 347f5afda..f2efb70ea 100644 --- a/mllib-dal/src/main/native/CorrelationDALImpl.cpp +++ b/mllib-dal/src/main/native/CorrelationDALImpl.cpp @@ -150,8 +150,6 @@ Java_org_apache_spark_ml_stat_CorrelationDALImpl_cCorrelationTrainDAL( ccl::communicator &comm = getComm(); size_t rankId = comm.rank(); - std::cout << " rankId : " << rankId << " ! " - << std::endl; const size_t nBlocks = executor_num; diff --git a/mllib-dal/src/test/resources/log4j.properties b/mllib-dal/src/test/resources/log4j.properties new file mode 100644 index 000000000..ff29121c2 --- /dev/null +++ b/mllib-dal/src/test/resources/log4j.properties @@ -0,0 +1,42 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Set everything to be logged to the console +log4j.rootCategory=WARN, console +log4j.appender.console=org.apache.log4j.ConsoleAppender +log4j.appender.console.target=System.err +log4j.appender.console.layout=org.apache.log4j.PatternLayout +log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n + +# Set the default spark-shell log level to WARN. When running the spark-shell, the +# log level for this class is used to overwrite the root logger's log level, so that +# the user can have different defaults for the shell and regular Spark apps. 
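Because the native libraries now travel inside the jar instead of beside it, a quick way to confirm a finished build bundled everything `LibLoader` will extract (jar name per `RELEASE`; the `lib/` prefix mirrors `src/main/resources/lib`):

```bash
jar tf target/oap-mllib-1.2.0.jar | grep '^lib/'
# expect the list from src/main/resources/lib/README.md:
# lib/libccl.so.1, lib/libfabric.so, lib/libJavaAPI.so, lib/libMLlibDAL.so, ...
```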
+log4j.logger.org.apache.spark.repl.Main=WARN + +# Settings to quiet third party logs that are too verbose +log4j.logger.org.sparkproject.jetty=WARN +log4j.logger.org.sparkproject.jetty.util.component.AbstractLifeCycle=ERROR +log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO +log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO +log4j.logger.org.apache.parquet=ERROR +log4j.logger.parquet=ERROR + +# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support +log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL +log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR + +log4j.logger.org.apache.spark.ml.util.LibLoader=DEBUG From c81b68b557d2e49c61b013f300fdbaa8d2f13eca Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Wed, 3 Nov 2021 18:37:39 +0800 Subject: [PATCH 13/19] nit --- dev/test-cluster/env.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/test-cluster/env.sh b/dev/test-cluster/env.sh index c36a21397..f16255e54 100644 --- a/dev/test-cluster/env.sh +++ b/dev/test-cluster/env.sh @@ -5,7 +5,7 @@ # Import RELEASE envs source $GITHUB_WORKSPACE/RELEASE # Set OAP MLlib version (e.g. 1.1.0) -OAP_MLLIB_VERSION=${OAP_MLLB_VERSION} +OAP_MLLIB_VERSION=${OAP_MLLIB_VERSION} # Set Spark master SPARK_MASTER=yarn # Set Hadoop home path From acff02969680703d481b9e4540d673dd4685737d Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Wed, 3 Nov 2021 21:56:58 +0800 Subject: [PATCH 14/19] update env.sh & template --- conf/env.sh.template | 7 ++++--- dev/test-cluster/env.sh | 22 +++++++++++++--------- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/conf/env.sh.template b/conf/env.sh.template index 7bdb97f22..168f9d133 100644 --- a/conf/env.sh.template +++ b/conf/env.sh.template @@ -2,8 +2,6 @@ # ============== Minimum Settings ============= # -# Set OAP MLlib version (e.g. 1.1.0) -OAP_MLLIB_VERSION=x.x.x # Set Spark master SPARK_MASTER=yarn # Set Hadoop home path @@ -17,6 +15,9 @@ export OAP_MLLIB_ROOT=/path/to/oap-mllib/home # ============================================= # +# Import RELEASE envs +source $OAP_MLLIB_ROOT/RELEASE + # Set HADOOP_CONF_DIR for Spark export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop @@ -42,7 +43,7 @@ SPARK_TOTAL_CORES=$((SPARK_NUM_EXECUTORS * SPARK_EXECUTOR_CORES)) SPARK_DEFAULT_PARALLELISM=$((SPARK_TOTAL_CORES * 2)) # Checks -for dir in $SPARK_HOME $HADOOP_HOME $OAP_MLLIB_JAR +for dir in $SPARK_HOME $HADOOP_HOME $OAP_MLLIB_JAR do if [[ ! -e $dir ]]; then echo $dir does not exist! diff --git a/dev/test-cluster/env.sh b/dev/test-cluster/env.sh index f16255e54..b2d65bcae 100644 --- a/dev/test-cluster/env.sh +++ b/dev/test-cluster/env.sh @@ -2,10 +2,6 @@ # ============== Minimum Settings ============= # -# Import RELEASE envs -source $GITHUB_WORKSPACE/RELEASE -# Set OAP MLlib version (e.g. 
1.1.0) -OAP_MLLIB_VERSION=${OAP_MLLIB_VERSION} # Set Spark master SPARK_MASTER=yarn # Set Hadoop home path @@ -19,26 +15,34 @@ export OAP_MLLIB_ROOT=$GITHUB_WORKSPACE # ============================================= # +# Import RELEASE envs +source $OAP_MLLIB_ROOT/RELEASE + # Set HADOOP_CONF_DIR for Spark export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop # Set JAR name & path OAP_MLLIB_JAR_NAME=oap-mllib-$OAP_MLLIB_VERSION.jar OAP_MLLIB_JAR=$OAP_MLLIB_ROOT/mllib-dal/target/$OAP_MLLIB_JAR_NAME -# Set Spark driver & executor classpaths, -# absolute path for driver, relative path for executor +# Set Spark driver & executor classpaths +# YARN mode: use absolute path for driver, relative path for executors +# Standalone mode: use absolute path for both driver and executors SPARK_DRIVER_CLASSPATH=$OAP_MLLIB_JAR -SPARK_EXECUTOR_CLASSPATH=./$OAP_MLLIB_JAR_NAME +if [[ $SPARK_MASTER == yarn ]]; then + SPARK_EXECUTOR_CLASSPATH=./$OAP_MLLIB_JAR_NAME +else + SPARK_EXECUTOR_CLASSPATH=$OAP_MLLIB_JAR +fi # Set Spark resources, can be overwritten in example SPARK_DRIVER_MEMORY=1G SPARK_NUM_EXECUTORS=2 SPARK_EXECUTOR_CORES=1 SPARK_EXECUTOR_MEMORY=1G -SPARK_DEFAULT_PARALLELISM=$(expr $SPARK_NUM_EXECUTORS '*' $SPARK_EXECUTOR_CORES '*' 2) +SPARK_TOTAL_CORES=$((SPARK_NUM_EXECUTORS * SPARK_EXECUTOR_CORES)) +SPARK_DEFAULT_PARALLELISM=$((SPARK_TOTAL_CORES * 2)) # Checks - for dir in $SPARK_HOME $HADOOP_HOME $OAP_MLLIB_JAR do if [[ ! -e $dir ]]; then From 07af2708f15017d3050d3d458def7bfb7ddb04aa Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Thu, 4 Nov 2021 10:58:01 +0800 Subject: [PATCH 15/19] update HOST_NAME --- dev/test-cluster/env.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dev/test-cluster/env.sh b/dev/test-cluster/env.sh index b2d65bcae..0a92a1a10 100644 --- a/dev/test-cluster/env.sh +++ b/dev/test-cluster/env.sh @@ -9,7 +9,9 @@ export HADOOP_HOME=$HADOOP_HOME # Set Spark home path export SPARK_HOME=$SPARK_HOME # Set HDFS Root, should be hdfs://xxx or file://xxx -export HDFS_ROOT=hdfs://localhost:8020 + +HOST_NAME=$(hostname -f) +export HDFS_ROOT=hdfs://$HOST_NAME:8020 # Set OAP MLlib source code root directory export OAP_MLLIB_ROOT=$GITHUB_WORKSPACE From 8dc446531caeb26fbd0e4829f121dcf9a04f99fe Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Thu, 4 Nov 2021 11:32:08 +0800 Subject: [PATCH 16/19] add trap to capture script error --- dev/ci-test.sh | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/dev/ci-test.sh b/dev/ci-test.sh index b5ade50fb..59c64eb7d 100755 --- a/dev/ci-test.sh +++ b/dev/ci-test.sh @@ -1,5 +1,13 @@ #!/usr/bin/env bash +# exit when any command fails +set -e + +# keep track of the last executed command +trap 'last_command=$current_command; current_command=$BASH_COMMAND' DEBUG +# echo an error message before exiting +trap 'echo "\"${last_command}\" command filed with exit code $?."' EXIT + # Setup building envs source /opt/intel/oneapi/setvars.sh From a0d4c4a242ff0b64a16d5ec77c35c96c6fda0482 Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Thu, 4 Nov 2021 12:12:40 +0800 Subject: [PATCH 17/19] Prepare hdfs data --- dev/setup-all.sh | 8 ++++++ dev/test-cluster/ci-test-cluster.sh | 16 ++++++++--- dev/test-cluster/log4j.properties | 42 +++++++++++++++++++++++++++++ dev/test-cluster/setup-cluster.sh | 9 +++++++ 4 files changed, 72 insertions(+), 3 deletions(-) create mode 100644 dev/test-cluster/log4j.properties diff --git a/dev/setup-all.sh b/dev/setup-all.sh index 66510e85e..7c08ce0e4 100755 --- a/dev/setup-all.sh +++ b/dev/setup-all.sh @@ -1,5 
From a0d4c4a242ff0b64a16d5ec77c35c96c6fda0482 Mon Sep 17 00:00:00 2001
From: "Wu, Xiaochang"
Date: Thu, 4 Nov 2021 12:12:40 +0800
Subject: [PATCH 17/19] Prepare hdfs data

---
 dev/setup-all.sh                    |  8 ++++++
 dev/test-cluster/ci-test-cluster.sh | 16 ++++++++---
 dev/test-cluster/log4j.properties   | 42 +++++++++++++++++++++++++++++
 dev/test-cluster/setup-cluster.sh   |  9 +++++++
 4 files changed, 72 insertions(+), 3 deletions(-)
 create mode 100644 dev/test-cluster/log4j.properties

diff --git a/dev/setup-all.sh b/dev/setup-all.sh
index 66510e85e..7c08ce0e4 100755
--- a/dev/setup-all.sh
+++ b/dev/setup-all.sh
@@ -1,5 +1,13 @@
 #!/usr/bin/env bash
 
+# exit when any command fails
+set -e
+
+# keep track of the last executed command
+trap 'last_command=$current_command; current_command=$BASH_COMMAND' DEBUG
+# echo an error message before exiting
+trap 'echo "\"${last_command}\" command failed with exit code $?."' EXIT
+
 # Install dependencies for building
 $GITHUB_WORKSPACE/dev/install-build-deps-ubuntu.sh
 
diff --git a/dev/test-cluster/ci-test-cluster.sh b/dev/test-cluster/ci-test-cluster.sh
index 7a4600267..55ae3211d 100755
--- a/dev/test-cluster/ci-test-cluster.sh
+++ b/dev/test-cluster/ci-test-cluster.sh
@@ -1,5 +1,13 @@
 #!/usr/bin/env bash
 
+# exit when any command fails
+set -e
+
+# keep track of the last executed command
+trap 'last_command=$current_command; current_command=$BASH_COMMAND' DEBUG
+# echo an error message before exiting
+trap 'echo "\"${last_command}\" command failed with exit code $?."' EXIT
+
 # Setup Spark envs
 source $GITHUB_WORKSPACE/dev/test-cluster/setup-spark-envs.sh
 
@@ -8,10 +16,12 @@ cp $GITHUB_WORKSPACE/dev/test-cluster/env.sh $GITHUB_WORKSPACE/conf
 
 cd $GITHUB_WORKSPACE/examples
 
+HOST_NAME=$(hostname -f)
+export HDFS_ROOT=hdfs://$HOST_NAME:8020
+
 # Copy examples data to HDFS
-hadoop fs -mkdir -p /user/$USER
-hadoop fs -copyFromLocal data
-hadoop fs -ls data
+hadoop fs -copyFromLocal data /
+hadoop fs -find /
 
 # Build and run all examples
 ./build-all.sh
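One note on the data staging above: hadoop fs -copyFromLocal can fail when the destination already exists, which set -e then turns into a hard stop on re-runs against a warm cluster. An illustrative idempotent variant, using only standard hadoop fs commands (not part of the patch):

    # upload the examples data only if /data is not already in HDFS;
    # 'hadoop fs -test -d' exits 0 when the directory exists
    if ! hadoop fs -test -d /data; then
        hadoop fs -copyFromLocal data /
    fi
    hadoop fs -ls /data

Alternatively, recent Hadoop releases accept -f on copyFromLocal to overwrite the destination outright.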
diff --git a/dev/test-cluster/log4j.properties b/dev/test-cluster/log4j.properties
new file mode 100644
index 000000000..ff29121c2
--- /dev/null
+++ b/dev/test-cluster/log4j.properties
@@ -0,0 +1,42 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Set everything to be logged to the console
+log4j.rootCategory=WARN, console
+log4j.appender.console=org.apache.log4j.ConsoleAppender
+log4j.appender.console.target=System.err
+log4j.appender.console.layout=org.apache.log4j.PatternLayout
+log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
+
+# Set the default spark-shell log level to WARN. When running the spark-shell, the
+# log level for this class is used to overwrite the root logger's log level, so that
+# the user can have different defaults for the shell and regular Spark apps.
+log4j.logger.org.apache.spark.repl.Main=WARN
+
+# Settings to quiet third party logs that are too verbose
+log4j.logger.org.sparkproject.jetty=WARN
+log4j.logger.org.sparkproject.jetty.util.component.AbstractLifeCycle=ERROR
+log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO
+log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO
+log4j.logger.org.apache.parquet=ERROR
+log4j.logger.parquet=ERROR
+
+# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support
+log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL
+log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR
+
+log4j.logger.org.apache.spark.ml.util.LibLoader=DEBUG
diff --git a/dev/test-cluster/setup-cluster.sh b/dev/test-cluster/setup-cluster.sh
index 633d848e9..77274cbd0 100755
--- a/dev/test-cluster/setup-cluster.sh
+++ b/dev/test-cluster/setup-cluster.sh
@@ -1,5 +1,13 @@
 #!/usr/bin/env bash
 
+# exit when any command fails
+set -e
+
+# keep track of the last executed command
+trap 'last_command=$current_command; current_command=$BASH_COMMAND' DEBUG
+# echo an error message before exiting
+trap 'echo "\"${last_command}\" command failed with exit code $?."' EXIT
+
 WORK_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 
 cd $WORK_DIR
@@ -28,6 +36,7 @@ cp ./core-site.xml ~/opt/hadoop-$HADOOP_VERSION/etc/hadoop/
 cp ./hdfs-site.xml ~/opt/hadoop-$HADOOP_VERSION/etc/hadoop/
 cp ./yarn-site.xml ~/opt/hadoop-$HADOOP_VERSION/etc/hadoop/
 cp ./hadoop-env.sh ~/opt/hadoop-$HADOOP_VERSION/etc/hadoop/
+cp ./log4j.properties ~/opt/spark-$SPARK_VERSION-bin-$SPARK_HADOOP_VERSION/conf
 cp ./spark-defaults.conf ~/opt/spark-$SPARK_VERSION-bin-$SPARK_HADOOP_VERSION/conf
 
 source ./setup-spark-envs.sh
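The new log4j.properties raises most loggers to WARN while keeping org.apache.spark.ml.util.LibLoader at DEBUG, so CI logs still show which native libraries get extracted and loaded. setup-cluster.sh installs the file into $SPARK_HOME/conf; when editing conf/ is not an option, the same file can be supplied per job. A hedged sketch using standard Spark 3.1 / log4j 1.x options — the class and JAR names below are placeholders, not part of the patch:

    # ship the properties file and point driver and executors at it
    spark-submit \
      --files dev/test-cluster/log4j.properties \
      --conf "spark.driver.extraJavaOptions=-Dlog4j.configuration=file:dev/test-cluster/log4j.properties" \
      --conf "spark.executor.extraJavaOptions=-Dlog4j.configuration=file:log4j.properties" \
      --class <example-main-class> <example-jar>

The executor side uses the bare file name because --files places the file in each executor's working directory.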
From 33153dad7f5821d3b8be7c037acc038f911604c9 Mon Sep 17 00:00:00 2001
From: "Wu, Xiaochang"
Date: Thu, 4 Nov 2021 15:28:31 +0800
Subject: [PATCH 18/19] update scripts

---
 dev/test-cluster/ci-test-cluster.sh           |  2 +-
 dev/test-cluster/setup-cluster.sh             | 12 +++++-------
 dev/test-cluster/setup-spark-envs.sh          |  6 +++++-
 examples/{build-all.sh => build-all-scala.sh} |  0
 4 files changed, 11 insertions(+), 9 deletions(-)
 rename examples/{build-all.sh => build-all-scala.sh} (100%)

diff --git a/dev/test-cluster/ci-test-cluster.sh b/dev/test-cluster/ci-test-cluster.sh
index 55ae3211d..d86d89aef 100755
--- a/dev/test-cluster/ci-test-cluster.sh
+++ b/dev/test-cluster/ci-test-cluster.sh
@@ -24,6 +24,6 @@ hadoop fs -copyFromLocal data /
 hadoop fs -find /
 
 # Build and run all examples
-./build-all.sh
+./build-all-scala.sh
 ./run-all-scala.sh
 ./run-all-pyspark.sh
diff --git a/dev/test-cluster/setup-cluster.sh b/dev/test-cluster/setup-cluster.sh
index 77274cbd0..dcde776da 100755
--- a/dev/test-cluster/setup-cluster.sh
+++ b/dev/test-cluster/setup-cluster.sh
@@ -10,14 +10,12 @@ trap 'echo "\"${last_command}\" command failed with exit code $?."' EXIT
 
 WORK_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 
-cd $WORK_DIR
-
 echo JAVA_HOME is $JAVA_HOME
 
-HADOOP_VERSION=3.2.0
-SPARK_VERSION=3.1.1
-SPARK_HADOOP_VERSION=hadoop3.2
+# setup envs
+source ./setup-spark-envs.sh
 
+# download spark & hadoop bins
 [ -d ~/opt ] || mkdir ~/opt
 cd ~/opt
 [ -f spark-$SPARK_VERSION-bin-$SPARK_HADOOP_VERSION.tgz ] || wget --no-verbose https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-$SPARK_HADOOP_VERSION.tgz
@@ -39,11 +37,11 @@ cp ./hadoop-env.sh ~/opt/hadoop-$HADOOP_VERSION/etc/hadoop/
 cp ./log4j.properties ~/opt/spark-$SPARK_VERSION-bin-$SPARK_HADOOP_VERSION/conf
 cp ./spark-defaults.conf ~/opt/spark-$SPARK_VERSION-bin-$SPARK_HADOOP_VERSION/conf
 
-source ./setup-spark-envs.sh
-
 echo $HOST_IP > $HADOOP_HOME/etc/hadoop/slaves
 echo $HOST_IP > $SPARK_HOME/conf/slaves
 
+ls -l $SPARK_HOME/conf
+
 # create directories
 mkdir -p /tmp/run/hdfs/namenode
 mkdir -p /tmp/run/hdfs/datanode
diff --git a/dev/test-cluster/setup-spark-envs.sh b/dev/test-cluster/setup-spark-envs.sh
index 6e4e06423..5e988c3a9 100755
--- a/dev/test-cluster/setup-spark-envs.sh
+++ b/dev/test-cluster/setup-spark-envs.sh
@@ -1,5 +1,7 @@
 #!/usr/bin/env bash
 
+set -x
+
 HADOOP_VERSION=3.2.0
 SPARK_VERSION=3.1.1
 SPARK_HADOOP_VERSION=hadoop3.2
@@ -12,4 +14,6 @@ export SPARK_HOME=~/opt/spark-$SPARK_VERSION-bin-$SPARK_HADOOP_VERSION
 export PYTHONPATH=$SPARK_HOME/python:$PYTHONPATH
 export PYSPARK_PYTHON=python3
 
-export PATH=$HADOOP_HOME/bin:$SPARK_HOME/bin:$PATH
\ No newline at end of file
+export PATH=$HADOOP_HOME/bin:$SPARK_HOME/bin:$PATH
+
+set +x
\ No newline at end of file
diff --git a/examples/build-all.sh b/examples/build-all-scala.sh
similarity index 100%
rename from examples/build-all.sh
rename to examples/build-all-scala.sh

From 0a66f1ac59e10e8bab6a0b8a0a9f7bf87dbb0327 Mon Sep 17 00:00:00 2001
From: "Wu, Xiaochang"
Date: Thu, 4 Nov 2021 20:11:30 +0800
Subject: [PATCH 19/19] nit

---
 dev/test-cluster/setup-cluster.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/dev/test-cluster/setup-cluster.sh b/dev/test-cluster/setup-cluster.sh
index dcde776da..a5b48490e 100755
--- a/dev/test-cluster/setup-cluster.sh
+++ b/dev/test-cluster/setup-cluster.sh
@@ -8,12 +8,12 @@ trap 'last_command=$current_command; current_command=$BASH_COMMAND' DEBUG
 # echo an error message before exiting
 trap 'echo "\"${last_command}\" command failed with exit code $?."' EXIT
 
-WORK_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 
 echo JAVA_HOME is $JAVA_HOME
 
 # setup envs
-source ./setup-spark-envs.sh
+source $SCRIPT_DIR/setup-spark-envs.sh
 
 # download spark & hadoop bins
 [ -d ~/opt ] || mkdir ~/opt
 cd ~/opt
@@ -23,7 +23,7 @@
 [ -f hadoop-$HADOOP_VERSION.tar.gz ] || wget --no-verbose https://archive.apache.org/dist/hadoop/core/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz
 [ -d hadoop-$HADOOP_VERSION ] || tar -xzf hadoop-$HADOOP_VERSION.tar.gz
 
-cd $WORK_DIR
+cd $SCRIPT_DIR
 
 HOST_IP=$(hostname -f)