From b2ceb7e98555448fa25e94e0713c2164becb4880 Mon Sep 17 00:00:00 2001
From: "Wu, Xiaochang"
Date: Sun, 7 Feb 2021 10:47:55 +0800
Subject: [PATCH] Add test-cluster

---
 dev/test-cluster/config-ssh.sh               |  6 ++
 dev/test-cluster/core-site.xml               | 24 +++++
 dev/test-cluster/envs.sh                     | 22 +++++
 dev/test-cluster/hadoop-env.sh               | 99 +++++++++++++++++++
 dev/test-cluster/hdfs-site.xml               | 32 ++++++
 dev/test-cluster/setup-cluster.sh            | 42 ++++++++
 dev/test-cluster/setup-python3-env.sh        | 12 +++
 dev/test-cluster/spark-defaults.conf         | 34 +++++++
 dev/test-cluster/workloads/kmeans-pyspark.py | 70 +++++++++++++
 .../workloads/run-kmeans-pyspark.sh          | 48 +++++++++
 dev/test-cluster/yarn-site.xml               | 67 +++++++++++++
 mllib-dal/test-cluster.sh                    |  5 +
 12 files changed, 461 insertions(+)
 create mode 100755 dev/test-cluster/config-ssh.sh
 create mode 100644 dev/test-cluster/core-site.xml
 create mode 100644 dev/test-cluster/envs.sh
 create mode 100755 dev/test-cluster/hadoop-env.sh
 create mode 100644 dev/test-cluster/hdfs-site.xml
 create mode 100755 dev/test-cluster/setup-cluster.sh
 create mode 100755 dev/test-cluster/setup-python3-env.sh
 create mode 100644 dev/test-cluster/spark-defaults.conf
 create mode 100644 dev/test-cluster/workloads/kmeans-pyspark.py
 create mode 100755 dev/test-cluster/workloads/run-kmeans-pyspark.sh
 create mode 100644 dev/test-cluster/yarn-site.xml
 create mode 100755 mllib-dal/test-cluster.sh

diff --git a/dev/test-cluster/config-ssh.sh b/dev/test-cluster/config-ssh.sh
new file mode 100755
index 000000000..d093fa17a
--- /dev/null
+++ b/dev/test-cluster/config-ssh.sh
@@ -0,0 +1,6 @@
+#!/usr/bin/env bash
+
+ssh-keygen -q -N "" -t rsa -f ~/.ssh/id_rsa
+cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
+echo " StrictHostKeyChecking no " | sudo tee -a /etc/ssh/ssh_config
+sudo service ssh restart
diff --git a/dev/test-cluster/core-site.xml b/dev/test-cluster/core-site.xml
new file mode 100644
index 000000000..7016e477e
--- /dev/null
+++ b/dev/test-cluster/core-site.xml
@@ -0,0 +1,24 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+<!--
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+
+<!-- Put site-specific property overrides in this file. -->
+
+<configuration>
+    <property>
+        <name>fs.default.name</name>
+        <value>hdfs://localhost:8020</value>
+    </property>
+</configuration>
diff --git a/dev/test-cluster/envs.sh b/dev/test-cluster/envs.sh
new file mode 100644
index 000000000..71e8506e6
--- /dev/null
+++ b/dev/test-cluster/envs.sh
@@ -0,0 +1,22 @@
+# Set user Spark and Hadoop home directory
+export HADOOP_HOME=~/opt/hadoop-2.7.7
+export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
+export SPARK_HOME=~/opt/spark-3.0.0-bin-hadoop2.7
+
+export PYTHONPATH=$SPARK_HOME/python:$PYTHONPATH
+export PYSPARK_PYTHON=python3
+
+# Set user HDFS Root
+export HDFS_ROOT=hdfs://localhost:8020
+export OAP_MLLIB_DATA_ROOT=OAPMLlib/Data
+# Set user Intel MLlib Root directory
+export OAP_MLLIB_ROOT=${GITHUB_WORKSPACE}
+
+# Target jar built
+OAP_MLLIB_JAR_NAME=oap-mllib-1.1.0.jar
+OAP_MLLIB_JAR=$OAP_MLLIB_ROOT/mllib-dal/target/$OAP_MLLIB_JAR_NAME
+
+# Use absolute path
+SPARK_DRIVER_CLASSPATH=$OAP_MLLIB_JAR
+# Use relative path
+SPARK_EXECUTOR_CLASSPATH=./$OAP_MLLIB_JAR_NAME
diff --git a/dev/test-cluster/hadoop-env.sh b/dev/test-cluster/hadoop-env.sh
new file mode 100755
index 000000000..bee6c1f69
--- /dev/null
+++ b/dev/test-cluster/hadoop-env.sh
@@ -0,0 +1,99 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Set Hadoop-specific environment variables here.
+
+# The only required environment variable is JAVA_HOME.  All others are
+# optional.  When running a distributed configuration it is best to
+# set JAVA_HOME in this file, so that it is correctly defined on
+# remote nodes.
+
+# The java implementation to use.
+# export JAVA_HOME=${JAVA_HOME}
+export JAVA_HOME=/usr/local/lib/jvm/openjdk8
+
+# The jsvc implementation to use. Jsvc is required to run secure datanodes
+# that bind to privileged ports to provide authentication of data transfer
+# protocol.  Jsvc is not required if SASL is configured for authentication of
+# data transfer protocol using non-privileged ports.
+#export JSVC_HOME=${JSVC_HOME}
+
+export HADOOP_CONF_DIR=${HADOOP_CONF_DIR:-"/etc/hadoop"}
+
+# Extra Java CLASSPATH elements.  Automatically insert capacity-scheduler.
+for f in $HADOOP_HOME/contrib/capacity-scheduler/*.jar; do
+  if [ "$HADOOP_CLASSPATH" ]; then
+    export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:$f
+  else
+    export HADOOP_CLASSPATH=$f
+  fi
+done
+
+# The maximum amount of heap to use, in MB. Default is 1000.
+#export HADOOP_HEAPSIZE=
+#export HADOOP_NAMENODE_INIT_HEAPSIZE=""
+
+# Extra Java runtime options.  Empty by default.
+export HADOOP_OPTS="$HADOOP_OPTS -Djava.net.preferIPv4Stack=true"
+
+# Command specific options appended to HADOOP_OPTS when specified
+export HADOOP_NAMENODE_OPTS="-Dhadoop.security.logger=${HADOOP_SECURITY_LOGGER:-INFO,RFAS} -Dhdfs.audit.logger=${HDFS_AUDIT_LOGGER:-INFO,NullAppender} $HADOOP_NAMENODE_OPTS"
+export HADOOP_DATANODE_OPTS="-Dhadoop.security.logger=ERROR,RFAS $HADOOP_DATANODE_OPTS"
+
+export HADOOP_SECONDARYNAMENODE_OPTS="-Dhadoop.security.logger=${HADOOP_SECURITY_LOGGER:-INFO,RFAS} -Dhdfs.audit.logger=${HDFS_AUDIT_LOGGER:-INFO,NullAppender} $HADOOP_SECONDARYNAMENODE_OPTS"
+
+export HADOOP_NFS3_OPTS="$HADOOP_NFS3_OPTS"
+export HADOOP_PORTMAP_OPTS="-Xmx512m $HADOOP_PORTMAP_OPTS"
+
+# The following applies to multiple commands (fs, dfs, fsck, distcp etc)
+export HADOOP_CLIENT_OPTS="-Xmx512m $HADOOP_CLIENT_OPTS"
+#HADOOP_JAVA_PLATFORM_OPTS="-XX:-UsePerfData $HADOOP_JAVA_PLATFORM_OPTS"
+
+# On secure datanodes, user to run the datanode as after dropping privileges.
+# This **MUST** be uncommented to enable secure HDFS if using privileged ports
+# to provide authentication of data transfer protocol.  This **MUST NOT** be
+# defined if SASL is configured for authentication of data transfer protocol
+# using non-privileged ports.
+export HADOOP_SECURE_DN_USER=${HADOOP_SECURE_DN_USER}
+
+# Where log files are stored.  $HADOOP_HOME/logs by default.
+#export HADOOP_LOG_DIR=${HADOOP_LOG_DIR}/$USER
+
+# Where log files are stored in the secure data environment.
+export HADOOP_SECURE_DN_LOG_DIR=${HADOOP_LOG_DIR}/${HADOOP_HDFS_USER}
+
+###
+# HDFS Mover specific parameters
+###
+# Specify the JVM options to be used when starting the HDFS Mover.
+# These options will be appended to the options specified as HADOOP_OPTS
+# and therefore may override any similar flags set in HADOOP_OPTS
+#
+# export HADOOP_MOVER_OPTS=""
+
+###
+# Advanced Users Only!
+###
+
+# The directory where pid files are stored. /tmp by default.
+# NOTE: this should be set to a directory that can only be written to by
+#       the user that will run the hadoop daemons.  Otherwise there is the
+#       potential for a symlink attack.
+export HADOOP_PID_DIR=${HADOOP_PID_DIR}
+export HADOOP_SECURE_DN_PID_DIR=${HADOOP_PID_DIR}
+
+# A string representing this instance of hadoop. $USER by default.
+export HADOOP_IDENT_STRING=$USER
diff --git a/dev/test-cluster/hdfs-site.xml b/dev/test-cluster/hdfs-site.xml
new file mode 100644
index 000000000..40fcbb5d6
--- /dev/null
+++ b/dev/test-cluster/hdfs-site.xml
@@ -0,0 +1,32 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+<!--
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+
+<!-- Put site-specific property overrides in this file. -->
+
+<configuration>
+    <property>
+        <name>dfs.replication</name>
+        <value>1</value>
+    </property>
+    <property>
+        <name>dfs.namenode.name.dir</name>
+        <value>/tmp/run/hdfs/namenode</value>
+    </property>
+    <property>
+        <name>dfs.datanode.data.dir</name>
+        <value>/tmp/run/hdfs/datanode</value>
+    </property>
+</configuration>
diff --git a/dev/test-cluster/setup-cluster.sh b/dev/test-cluster/setup-cluster.sh
new file mode 100755
index 000000000..eea058f80
--- /dev/null
+++ b/dev/test-cluster/setup-cluster.sh
@@ -0,0 +1,42 @@
+#!/usr/bin/env bash
+
+WORK_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+
+cd $WORK_DIR
+
+echo JAVA_HOME is $JAVA_HOME
+
+mkdir ~/opt
+cd ~/opt
+wget https://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop2.7.tgz
+tar -xzf spark-3.0.0-bin-hadoop2.7.tgz
+wget https://archive.apache.org/dist/hadoop/core/hadoop-2.7.7/hadoop-2.7.7.tar.gz
+tar -xzf hadoop-2.7.7.tar.gz
+
+cd $WORK_DIR
+
+cp ./core-site.xml ~/opt/hadoop-2.7.7/etc/hadoop/
+cp ./hdfs-site.xml ~/opt/hadoop-2.7.7/etc/hadoop/
+cp ./yarn-site.xml ~/opt/hadoop-2.7.7/etc/hadoop/
+cp ./hadoop-env.sh ~/opt/hadoop-2.7.7/etc/hadoop/
+cp ./spark-defaults.conf ~/opt/spark-3.0.0-bin-hadoop2.7/conf
+
+# Create local directories for the HDFS namenode and datanode
+mkdir -p /tmp/run/hdfs/namenode
+mkdir -p /tmp/run/hdfs/datanode
+
+# Format HDFS
+~/opt/hadoop-2.7.7/bin/hdfs namenode -format
+
+export HADOOP_HOME=~/opt/hadoop-2.7.7
+export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
+export SPARK_HOME=~/opt/spark-3.0.0-bin-hadoop2.7
+
+export PATH=$HADOOP_HOME/bin:$SPARK_HOME/bin:$PATH
+
+# Start HDFS and YARN
+$HADOOP_HOME/sbin/start-dfs.sh
+$HADOOP_HOME/sbin/start-yarn.sh
+
+hadoop fs -ls /
+yarn node -list
diff --git a/dev/test-cluster/setup-python3-env.sh b/dev/test-cluster/setup-python3-env.sh
new file mode 100755
index 000000000..29208dc5e
--- /dev/null
+++ b/dev/test-cluster/setup-python3-env.sh
@@ -0,0 +1,12 @@
+#!/usr/bin/env bash
+
+sudo apt-get update
+sudo apt-get install python3-pip python3-setuptools python3-wheel
+
+pip3 install --user numpy
+
+echo python is in $(which python)
+python --version
+
+echo python3 is in $(which python3)
+python3 --version
diff --git a/dev/test-cluster/spark-defaults.conf b/dev/test-cluster/spark-defaults.conf
new file mode 100644
index 000000000..1c25bb2ec
--- /dev/null
+++ b/dev/test-cluster/spark-defaults.conf
@@ -0,0 +1,34 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Default system properties included when running spark-submit.
+# This is useful for setting default environmental settings.
+
+# Example:
+# spark.master                     spark://master:7077
+# spark.eventLog.enabled           true
+# spark.eventLog.dir               hdfs://namenode:8021/directory
+# spark.serializer                 org.apache.spark.serializer.KryoSerializer
+# spark.driver.memory              5g
+# spark.executor.extraJavaOptions  -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three"
+
+spark.master                       yarn
+spark.serializer                   org.apache.spark.serializer.KryoSerializer
+spark.driver.memory                3g
+spark.executor.instances           2
+spark.executor.cores               1
+spark.executor.memory              4g
diff --git a/dev/test-cluster/workloads/kmeans-pyspark.py b/dev/test-cluster/workloads/kmeans-pyspark.py
new file mode 100644
index 000000000..cf93e6034
--- /dev/null
+++ b/dev/test-cluster/workloads/kmeans-pyspark.py
@@ -0,0 +1,70 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""
+An example demonstrating k-means clustering.
+Run with:
+  bin/spark-submit examples/src/main/python/ml/kmeans_example.py
+
+This example requires NumPy (http://www.numpy.org/).
+"""
+from __future__ import print_function
+import sys
+
+# $example on$
+from pyspark.ml.clustering import KMeans
+from pyspark.ml.evaluation import ClusteringEvaluator
+# $example off$
+
+from pyspark.sql import SparkSession
+
+if __name__ == "__main__":
+    spark = SparkSession\
+        .builder\
+        .appName("KMeansExample")\
+        .getOrCreate()
+
+    if len(sys.argv) != 2:
+        print("Requires a data file path as input parameter")
+        sys.exit(1)
+
+    # $example on$
+    # Loads data.
+    dataset = spark.read.format("libsvm").load(sys.argv[1])
+
+    # Trains a k-means model.
+    kmeans = KMeans().setK(2).setSeed(1)
+    model = kmeans.fit(dataset)
+
+    # Make predictions
+    predictions = model.transform(dataset)
+
+    # Evaluate clustering by computing Silhouette score
+    evaluator = ClusteringEvaluator()
+
+    silhouette = evaluator.evaluate(predictions)
+    print("Silhouette with squared euclidean distance = " + str(silhouette))
+
+    # Shows the result.
+    centers = model.clusterCenters()
+    print("Cluster Centers: ")
+    for center in centers:
+        print(center)
+    # $example off$
+
+    spark.stop()
diff --git a/dev/test-cluster/workloads/run-kmeans-pyspark.sh b/dev/test-cluster/workloads/run-kmeans-pyspark.sh
new file mode 100755
index 000000000..e07f3f7b6
--- /dev/null
+++ b/dev/test-cluster/workloads/run-kmeans-pyspark.sh
@@ -0,0 +1,48 @@
+#!/usr/bin/env bash
+
+source ../envs.sh
+
+# The data file comes from the Spark examples (data/mllib/sample_kmeans_data.txt) and must be copied to HDFS
+$HADOOP_HOME/bin/hadoop fs -mkdir -p $OAP_MLLIB_DATA_ROOT
+$HADOOP_HOME/bin/hadoop fs -copyFromLocal $SPARK_HOME/data/mllib/sample_kmeans_data.txt $OAP_MLLIB_DATA_ROOT
+
+# Users should check that the requested resources are actually allocated by the cluster manager,
+# otherwise Intel MLlib will behave incorrectly
+SPARK_MASTER=yarn
+SPARK_DRIVER_MEMORY=1G
+SPARK_NUM_EXECUTORS=2
+SPARK_EXECUTOR_CORES=1
+SPARK_EXECUTOR_MEMORY=1G
+
+SPARK_DEFAULT_PARALLELISM=$(expr $SPARK_NUM_EXECUTORS '*' $SPARK_EXECUTOR_CORES '*' 2)
+
+# ======================================================= #
+
+# Check env
+if [[ -z $SPARK_HOME ]]; then
+    echo SPARK_HOME not defined!
+    exit 1
+fi
+
+if [[ -z $HADOOP_HOME ]]; then
+    echo HADOOP_HOME not defined!
+    exit 1
+fi
+
+APP_PY="$OAP_MLLIB_ROOT/dev/test-cluster/workloads/kmeans-pyspark.py"
+DATA_FILE=$OAP_MLLIB_DATA_ROOT/sample_kmeans_data.txt
+
+$SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \
+    --num-executors $SPARK_NUM_EXECUTORS \
+    --driver-memory $SPARK_DRIVER_MEMORY \
+    --executor-cores $SPARK_EXECUTOR_CORES \
+    --executor-memory $SPARK_EXECUTOR_MEMORY \
+    --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \
+    --conf "spark.default.parallelism=$SPARK_DEFAULT_PARALLELISM" \
+    --conf "spark.sql.shuffle.partitions=$SPARK_DEFAULT_PARALLELISM" \
+    --conf "spark.driver.extraClassPath=$SPARK_DRIVER_CLASSPATH" \
+    --conf "spark.executor.extraClassPath=$SPARK_EXECUTOR_CLASSPATH" \
+    --conf "spark.shuffle.reduceLocality.enabled=false" \
+    --conf "spark.network.timeout=1200s" \
+    --conf "spark.task.maxFailures=1" \
+    --jars $OAP_MLLIB_JAR \
+    $APP_PY $DATA_FILE
diff --git a/dev/test-cluster/yarn-site.xml b/dev/test-cluster/yarn-site.xml
new file mode 100644
index 000000000..ff74d23a7
--- /dev/null
+++ b/dev/test-cluster/yarn-site.xml
@@ -0,0 +1,67 @@
+<?xml version="1.0"?>
+<!--
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<configuration>
+    <property>
+        <name>yarn.nodemanager.aux-services</name>
+        <value>mapreduce_shuffle</value>
+    </property>
+    <property>
+        <name>yarn.nodemanager.aux-services.mapreduce_shuffle.class</name>
+        <value>org.apache.hadoop.mapred.ShuffleHandler</value>
+    </property>
+    <property>
+        <name>yarn.resourcemanager.hostname</name>
+        <value>localhost</value>
+    </property>
+    <property>
+        <name>yarn.resourcemanager.address</name>
+        <value>localhost:8032</value>
+    </property>
+
+    <property>
+        <name>yarn.nodemanager.resource.memory-mb</name>
+        <value>7168</value>
+    </property>
+    <property>
+        <name>yarn.nodemanager.resource.cpu-vcores</name>
+        <value>2</value>
+    </property>
+    <property>
+        <name>yarn.nodemanager.vmem-check-enabled</name>
+        <value>false</value>
+    </property>
+    <property>
+        <name>yarn.nodemanager.vmem-pmem-ratio</name>
+        <value>2</value>
+    </property>
+    <property>
+        <name>yarn.scheduler.minimum-allocation-mb</name>
+        <value>1024</value>
+    </property>
+    <property>
+        <name>yarn.scheduler.maximum-allocation-mb</name>
+        <value>7168</value>
+    </property>
+    <property>
+        <name>yarn.scheduler.minimum-allocation-vcores</name>
+        <value>1</value>
+    </property>
+    <property>
+        <name>yarn.scheduler.maximum-allocation-vcores</name>
+        <value>2</value>
+    </property>
+</configuration>
diff --git a/mllib-dal/test-cluster.sh b/mllib-dal/test-cluster.sh
new file mode 100755
index 000000000..4f5a6132a
--- /dev/null
+++ b/mllib-dal/test-cluster.sh
@@ -0,0 +1,5 @@
+#!/usr/bin/env bash
+
+cd ../dev/test-cluster/workloads
+
+./run-kmeans-pyspark.sh