diff --git a/README.md b/README.md
index dd23ad8c..079487b3 100755
--- a/README.md
+++ b/README.md
@@ -50,3 +50,103 @@ The hadoop quickstart guides dont mention this but you need to set the hdfs conf
+```
+brew install homebrew/versions/protobuf250
+```
+
+**cmake and openssl**
+
+```
+brew install cmake
+brew install openssl
+```
+
+**Install gnu sed**
+
+```
+brew install gsed
+```
+
+You will need to **compile the native Hadoop libraries** and install them into the Hadoop directory, because this package relies on **lz4** support that is only available from the native build.
+
+**Install hadoop**
+
+```
+brew install hadoop
+```
+
+**Build Hadoop native libraries**
+
+```
+git clone https://github.com/apache/hadoop
+cd hadoop
+git checkout branch-2.7.1
+mvn package -Pdist,native -DskipTests -Dtar -Dmaven.javadoc.skip=true
+# Add to your /etc/profile:
+echo 'export HADOOP_HOME="/usr/local/Cellar/hadoop/2.7.2/libexec"' | sudo tee -a /etc/profile
+source /etc/profile
+# Update the hadoop-pipes build-main.xml (under hadoop-tools/hadoop-pipes/src/), e.g. with gsed,
+# so that cmake is invoked with the OpenSSL location:
+#   -DJVM_ARCH_DATA_MODEL=64 -DOPENSSL_ROOT_DIR=/usr/local/Cellar/openssl/1.0.2h_1 -DOPENSSL_LIBRARIES=/usr/local/Cellar/openssl/1.0.2h_1/lib
+```
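+
+After the build you need to install the native libraries into the Hadoop directory. One way to do this (a sketch only; the `hadoop-dist` output path comes from the branch-2.7.1 build above and the destination directory is an assumption) is:
+
+```
+# Copy the freshly built native libraries into the brew-installed Hadoop
+mkdir -p "$HADOOP_HOME/lib/native"
+cp hadoop-dist/target/hadoop-2.7.1/lib/native/* "$HADOOP_HOME/lib/native/"
+# Check that the native codecs, including lz4, are now detected
+hadoop checknative -a
+```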
+
+**Set up env variables**
+
+You need to disable SIP (System Integrity Protection) in order to change **DYLD_LIBRARY_PATH**. Consult online resources for more details on how to do this.
+
+```
+sudo csrutil disable
+```
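+
+Note that `csrutil disable` has to be run from the Recovery OS; you can confirm the current SIP state at any time with:
+
+```
+csrutil status
+```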
+
+```
+echo 'export DYLD_LIBRARY_PATH="/usr/local/Cellar/hadoop/2.7.2/libexec/share/hadoop/common/lib:$DYLD_LIBRARY_PATH"' | sudo tee -a /etc/profile
+```
+
+Then you can re-enable SIP; consult online resources for details.
+
+```
+sudo csrutil enable
+```
+
+**Updates to spark-env.sh**
+
+```
+echo 'export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
+export SPARK_EXECUTOR_CORES=1 # Number of cores for the executors (Default: 1)
+export SPARK_MASTER_OPTS="-Dspark.deploy.defaultCores=1"
+export SPARK_WORKER_MEMORY=6G
+export SPARK_DRIVER_MEMORY=4G
+export DYLD_LIBRARY_PATH=$DYLD_LIBRARY_PATH:/usr/local/Cellar/hadoop/2.7.2/libexec/lib:/usr/local/Cellar/hadoop/2.7.2/libexec/share/hadoop/common:/usr/local/Cellar/hadoop/2.7.2/libexec/share/hadoop/common/lib
+' >> $SPARK_HOME/conf/spark-env.sh
+```
+
+**Updates to spark-defaults.conf**
+
+```
+cp $SPARK_HOME/conf/spark-defaults.conf.template $SPARK_HOME/conf/spark-defaults.conf
+echo '
+spark.executor.memory 6g
+' >> $SPARK_HOME/conf/spark-defaults.conf
+```
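+
+The test below assumes a standalone Spark master at `spark://sparkbook:7077`. If you are not already running one, a minimal way to bring up a local master and worker (assuming the host name `sparkbook` resolves to this machine) is:
+
+```
+# Start a standalone master (web UI on port 8080) and attach one worker to it
+$SPARK_HOME/sbin/start-master.sh
+$SPARK_HOME/sbin/start-slave.sh spark://sparkbook:7077
+```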
+
+**Testing on Mac**
+
+I was unable to run the full 8-million-row MNIST dataset on the Mac. For smoke testing on a Mac, a 200K-row file can be created with the following script:
+
+```
+scripts/append_mnist_200k.ssc
+```
+
+**Here is how the testing was performed on Mac**
+
+```
+spark-shell --executor-memory 6g --total-executor-cores 1 --master spark://sparkbook:7077 --jars /git/BIDMach_Spark/BIDMatHDFS.jar,/git/BIDMach/lib/BIDMat.jar,/git/BIDMach/BIDMach.jar --driver-java-options "-Dbidmach.path=/git/BIDMach -Dbidmach.merged.hdfs.path=hdfs://sparkbook:8020/bidmach/BIDMach_MNIST/partsmerged.fmat.lz4 -Dhdfs.path=hdfs://sparkbook:8020/bidmach -Dspark.executors=1"
+```
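+
+The scripts read and write HDFS paths derived from the `-Dhdfs.path` and `-Dbidmach.merged.hdfs.path` properties above; you can inspect (or pre-create) those locations from the shell, for example:
+
+```
+hdfs dfs -mkdir -p /bidmach/BIDMach_MNIST/parts
+hdfs dfs -ls /bidmach/BIDMach_MNIST
+```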
+
+Inside the spark-shell:
+
+```
+:load /git/BIDMach_Spark/scripts/load_mnist.ssc
+:load /git/BIDMach_Spark/scripts/append_mnist_200k.ssc
+:load /git/BIDMach_Spark/scripts/KMeansLearner.ssc
+```
\ No newline at end of file
diff --git a/scripts/KMeansLearner.ssc b/scripts/KMeansLearner.ssc
index 78985a4f..93b0a1f8 100644
--- a/scripts/KMeansLearner.ssc
+++ b/scripts/KMeansLearner.ssc
@@ -6,11 +6,12 @@
 import BIDMach.RunOnSpark._
 import BIDMach.Learner
 import BIDMach.models.KMeans
 import org.apache.spark.HashPartitioner
+import java.net.InetAddress
-// Specify IP address of master here
-val MASTER_DNS = "ip-10-22-32-153.us-west-2.compute.internal"
-val num_executors = 16
-val rdd_data = sc.sequenceFile("hdfs://%s:9000/BIDMach_MNIST/merged.fmat.lz4".format(MASTER_DNS), classOf[SerText], classOf[BIDMat.MatIO]).partitionBy(new HashPartitioner(num_executors)).persist()
+val mnistPath=System.getProperty("bidmach.merged.hdfs.path")
+val numExecutors = System.getProperty("spark.executors").toInt
+println(s"mnistPath=$mnistPath")
+val rddData = sc.sequenceFile(mnistPath, classOf[SerText], classOf[BIDMat.MatIO]).partitionBy(new HashPartitioner(numExecutors)).persist()
 val (learner,opts) = KMeans.learner()
 opts.batchSize = 10000
 opts.npasses = 10
@@ -22,4 +23,4 @@ def time[R](block: => R): R = {
   println("Elapsed time: " + (t1 - t0)/math.pow(10, 9)+ "s")
   result
 }
-val result = time {runOnSpark(sc,learner, rdd_data, num_executors)}
+val result = time {runOnSpark(sc,learner, rddData, numExecutors)}
diff --git a/scripts/append_mnist.ssc b/scripts/append_mnist.ssc
index 1bee0f8b..50849b92 100644
--- a/scripts/append_mnist.ssc
+++ b/scripts/append_mnist.ssc
@@ -2,11 +2,14 @@
 import BIDMat.MatIO
 import BIDMat.HDFSIO
 import BIDMat.SerText
-val myname = java.net.InetAddress.getLocalHost.getHostAddress
-val prefix = "hdfs://" + myname + ":9000/BIDMach_MNIST/"
-val fnames = (0 to 80).map(i => (prefix + "parts/alls%02d.fmat.lz4" format i)).toList
+val bmDir=System.getProperty("bidmach.path")
+val hdfsServer = System.getProperty("hdfs.path")
+val hdfsPrefix = s"$hdfsServer/BIDMach_MNIST/parts"
+println(s"Loading mnist data from $hdfsPrefix ..")
+
+val fnames = (0 to 80).map(i => (f"$hdfsPrefix/alls$i%02.0f.fmat.lz4")).toList
 val hdfsio = new HDFSIO
-hdfsio.appendFiles(fnames, prefix + "merged.fmat.lz4", 2);
+hdfsio.appendFiles(fnames, hdfsPrefix + "merged.fmat.lz4", 2);
diff --git a/scripts/append_mnist_200k.ssc b/scripts/append_mnist_200k.ssc
new file mode 100644
index 00000000..24a7736b
--- /dev/null
+++ b/scripts/append_mnist_200k.ssc
@@ -0,0 +1,15 @@
+import BIDMat.MatIO
+import BIDMat.HDFSIO
+import BIDMat.SerText
+
+val bmDir=System.getProperty("bidmach.path")
+val hdfsServer = System.getProperty("hdfs.path")
+val hdfsPrefix = s"$hdfsServer/BIDMach_MNIST/parts"
+println(s"Loading mnist data from $hdfsPrefix ..")
+
+val fnames = (0 to 1).map(i => (f"$hdfsPrefix/alls$i%02.0f.fmat.lz4")).toList
+
+val hdfsio = new HDFSIO
+hdfsio.appendFiles(fnames, hdfsPrefix + "merged.fmat.lz4", 2);
+
+
diff --git a/scripts/load_mnist.ssc b/scripts/load_mnist.ssc
index 26cfef68..0207b333 100644
--- a/scripts/load_mnist.ssc
+++ b/scripts/load_mnist.ssc
@@ -1,13 +1,17 @@
 import BIDMat.MatIO
+import BIDMat.HMat._
 import BIDMat.SerText
-val myname = java.net.InetAddress.getLocalHost.getHostAddress
-val prefix = "hdfs://" + myname + ":9000/BIDMach_MNIST/"
+val bmDir=System.getProperty("bidmach.path")
+val hdfsServer = System.getProperty("hdfs.path")
+val hdfsPrefix = s"$hdfsServer/BIDMach_MNIST/"
+val mnistPath = s"${bmDir}/data/MNIST8M/parts/"
+println(s"Loading mnist data from $mnistPath and saving to hdfs at $hdfsPrefix ..")
 for (i <- 0 to 80) {
-  val fname = "alls%02d.fmat.lz4" format i;
-  val a =loadFMat("/opt/BIDMach/data/MNIST8M/parts/" + fname);
-  saveFMat(prefix + "parts/" + fname, a, 2);
-  print(".");
+  val fname = "alls%02d.fmat.lz4" format i
+  val a =loadFMat(s"$mnistPath/$fname")
+  saveFMat(s"${hdfsPrefix}/parts/$fname", a, 2)
+  print(".")
 }
-println();
+println()