Removed hard-coded paths and provided instructions on running on Mac #8

Open. Wants to merge 1 commit into base: master
100 changes: 100 additions & 0 deletions README.md
@@ -50,3 +50,103 @@ The hadoop quickstart guides don't mention this but you need to set the hdfs conf
</pre>


<h3>System properties to control the testing scripts</h3>

The **.ssc** files under the **scripts/** directory are used to test the installation. The following system properties (passed as -D JVM options) configure them for your local environment; a defensive way to read them inside a script is sketched after the list:

- **bidmach.path**: path to the BIDMach installation, e.g. -Dbidmach.path=/git/BIDMach
- **hdfs.path**: base HDFS path where the data files are saved, e.g. -Dhdfs.path=hdfs://sparkbook:8020/bidmach
- **bidmach.merged.hdfs.path**: path to the final merged/combined lz4 output, e.g. -Dbidmach.merged.hdfs.path=hdfs://sparkbook:8020/bidmach/BIDMach_MNIST/partsmerged.fmat.lz4
- **spark.executors**: number of executors to use, e.g. -Dspark.executors=1
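
Each script reads these with `System.getProperty`, which returns `null` when a property is missing. A minimal defensive sketch (the `requiredProp` helper is illustrative, not part of these scripts):

<pre>
// Resolve a required -D property, or fail with a clear message instead
// of a NullPointerException later in the script.
def requiredProp(name: String): String =
  Option(System.getProperty(name)).getOrElse(
    sys.error(s"Missing system property -D$name"))

val bidmachPath  = requiredProp("bidmach.path")
val numExecutors = requiredProp("spark.executors").toInt
</pre>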

<h3>Additional steps and settings on Mac</h3>

**Protobuf**:

<pre>brew install homebrew/versions/protobuf250</pre>
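
The Hadoop 2.7 native build expects protobuf 2.5.0 exactly, so check that the right `protoc` is on your PATH:

<pre>protoc --version   # should print: libprotoc 2.5.0</pre>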

**cmake and openssl**

<pre>
brew install cmake
brew install openssl
</pre>

**Install GNU sed**
<pre>brew install gnu-sed</pre>

You will need to **compile the native Hadoop libraries** and install them into the Hadoop directory: this package relies on **lz4** compression, which is only available from the native libs.

**Install hadoop**
<pre>
brew install hadoop
</pre>

**Build Hadoop native libraries**
<pre>
git clone https://github.com/apache/hadoop
cd hadoop
git checkout branch-2.7.2
# Point HADOOP_HOME at the brew-installed Hadoop (append to /etc/profile,
# don't overwrite it):
echo 'export HADOOP_HOME="/usr/local/Cellar/hadoop/2.7.2/libexec"' | sudo tee -a /etc/profile
source /etc/profile
# hadoop-pipes needs OpenSSL: before building, update the cmake arg line in
# hadoop-tools/hadoop-pipes/pom.xml (this is what GNU sed was installed for)
# so that it passes:
#   -DJVM_ARCH_DATA_MODEL=64 -DOPENSSL_ROOT_DIR=/usr/local/Cellar/openssl/1.0.2h_1 -DOPENSSL_LIBRARIES=/usr/local/Cellar/openssl/1.0.2h_1/lib
mvn package -Pdist,native -DskipTests -Dtar -Dmaven.javadoc.skip=true
</pre>
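
To confirm Hadoop now sees the native libraries (lz4 in particular), run Hadoop's built-in check; `lz4` should report `true`:

<pre>hadoop checknative -a</pre>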

**Set up env variables**

You need to disable SIP (System Integrity Protection) to change **DYLD_LIBRARY_PATH**. Note that `csrutil` must be run from Recovery mode; consult online resources for details.
<pre>sudo csrutil disable</pre>
<pre>echo 'export DYLD_LIBRARY_PATH="/usr/local/Cellar/hadoop/2.7.2/libexec/share/hadoop/common/lib:$DYLD_LIBRARY_PATH"' | sudo tee -a /etc/profile
</pre>
Then you can re-enable SIP:
<pre>sudo csrutil enable</pre>
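
As a quick sanity check that the profile change took effect:

<pre>
source /etc/profile
echo $DYLD_LIBRARY_PATH
</pre>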


**Updates to spark-env.sh**
<pre>
echo 'export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
export SPARK_EXECUTOR_CORES=1  # Number of cores for the executors (Default: 1)
export SPARK_MASTER_OPTS="-Dspark.deploy.defaultCores=1"
export SPARK_WORKER_MEMORY=6G
export SPARK_DRIVER_MEMORY=4G
export DYLD_LIBRARY_PATH=$DYLD_LIBRARY_PATH:/usr/local/Cellar/hadoop/2.7.2/libexec/lib:/usr/local/Cellar/hadoop/2.7.2/libexec/share/hadoop/common:/usr/local/Cellar/hadoop/2.7.2/libexec/share/hadoop/common/lib
' >> $SPARK_HOME/conf/spark-env.sh
</pre>

**Updates to spark-defaults.conf**
<pre>
cp $SPARK_HOME/conf/spark-defaults.conf.template $SPARK_HOME/conf/spark-defaults.conf
echo '
spark.executor.memory 6g
' >> $SPARK_HOME/conf/spark-defaults.conf
</pre>


**Testing on Mac**
I was unable to run the full 8-million-row MNIST dataset on a Mac. For smoke testing, a 200K-row file can be created using the following script:

<pre>scripts/append_mnist_200k.ssc</pre>


**Here is how testing was performed on Mac:**


<pre>
spark-shell --executor-memory 6g --total-executor-cores 1 --master spark://sparkbook:7077 --jars /git/BIDMach_Spark/BIDMatHDFS.jar,/git/BIDMach/lib/BIDMat.jar,/git/BIDMach/BIDMach.jar --driver-java-options "-Dbidmach.path=/git/BIDMach -Dbidmach.merged.hdfs.path=hdfs://sparkbook:8020/bidmach/BIDMach_MNIST/partsmerged.fmat.lz4 -Dhdfs.path=hdfs://sparkbook:8020/bidmach -Dspark.executors=1"
</pre>

Inside the spark-shell:

<pre>
:load /git/BIDMach_Spark/scripts/load_mnist.ssc
:load /git/BIDMach_Spark/scripts/append_mnist_200k.ssc
:load /git/BIDMach_Spark/scripts/KMeansLearner.ssc
</pre>
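
Before `:load`-ing the scripts, a quick check (illustrative, not part of the scripts) that the `-D` properties reached the driver JVM; any `null` means the corresponding flag was not passed:

<pre>
Seq("bidmach.path", "hdfs.path", "bidmach.merged.hdfs.path", "spark.executors").
  foreach(p => println(p + " = " + System.getProperty(p)))
</pre>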
11 changes: 6 additions & 5 deletions scripts/KMeansLearner.ssc
<pre>
@@ -6,11 +6,12 @@ import BIDMach.RunOnSpark._
 import BIDMach.Learner
 import BIDMach.models.KMeans
 import org.apache.spark.HashPartitioner
+import java.net.InetAddress
 
-// Specify IP address of master here
-val MASTER_DNS = "ip-10-22-32-153.us-west-2.compute.internal"
-val num_executors = 16
-val rdd_data = sc.sequenceFile("hdfs://%s:9000/BIDMach_MNIST/merged.fmat.lz4".format(MASTER_DNS), classOf[SerText], classOf[BIDMat.MatIO]).partitionBy(new HashPartitioner(num_executors)).persist()
+val mnistPath=System.getProperty("bidmach.merged.hdfs.path")
+val numExecutors = System.getProperty("spark.executors").toInt
+println(s"mnistPath=$mnistPath")
+val rddData = sc.sequenceFile(mnistPath, classOf[SerText], classOf[BIDMat.MatIO]).partitionBy(new HashPartitioner(numExecutors)).persist()
 val (learner,opts) = KMeans.learner()
 opts.batchSize = 10000
 opts.npasses = 10
@@ -22,4 +23,4 @@ def time[R](block: => R): R = {
   println("Elapsed time: " + (t1 - t0)/math.pow(10, 9)+ "s")
   result
 }
-val result = time {runOnSpark(sc,learner, rdd_data, num_executors)}
+val result = time {runOnSpark(sc,learner, rddData, numExecutors)}
</pre>
11 changes: 7 additions & 4 deletions scripts/append_mnist.ssc
<pre>
@@ -2,11 +2,14 @@ import BIDMat.MatIO
 import BIDMat.HDFSIO
 import BIDMat.SerText
 
-val myname = java.net.InetAddress.getLocalHost.getHostAddress
-val prefix = "hdfs://" + myname + ":9000/BIDMach_MNIST/"
-val fnames = (0 to 80).map(i => (prefix + "parts/alls%02d.fmat.lz4" format i)).toList
+val bmDir=System.getProperty("bidmach.path")
+val hdfsServer = System.getProperty("hdfs.path")
+val hdfsPrefix = s"$hdfsServer/BIDMach_MNIST/parts"
+println(s"Loading mnist data from $hdfsPrefix ..")
+
+val fnames = (0 to 80).map(i => (f"$hdfsPrefix/alls$i%02.0f.fmat.lz4")).toList
 
 val hdfsio = new HDFSIO
-hdfsio.appendFiles(fnames, prefix + "merged.fmat.lz4", 2);
+hdfsio.appendFiles(fnames, hdfsPrefix + "merged.fmat.lz4", 2);
</pre>
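One note on the filename formatting used above: the `f` interpolator zero-pads the part index (the `Int` is widened to `Double` for `%f`), so the generated names match the old `%02d` behaviour. An illustrative REPL check:

<pre>
(0 to 2).map(i => f"alls$i%02.0f.fmat.lz4")
// -> Vector(alls00.fmat.lz4, alls01.fmat.lz4, alls02.fmat.lz4)
</pre>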
15 changes: 15 additions & 0 deletions scripts/append_mnist_200k.ssc
<pre>
@@ -0,0 +1,15 @@
+import BIDMat.MatIO
+import BIDMat.HDFSIO
+import BIDMat.SerText
+
+val bmDir=System.getProperty("bidmach.path")
+val hdfsServer = System.getProperty("hdfs.path")
+val hdfsPrefix = s"$hdfsServer/BIDMach_MNIST/parts"
+println(s"Loading mnist data from $hdfsPrefix ..")
+
+val fnames = (0 to 1).map(i => (f"$hdfsPrefix/alls$i%02.0f.fmat.lz4")).toList
+
+val hdfsio = new HDFSIO
+hdfsio.appendFiles(fnames, hdfsPrefix + "merged.fmat.lz4", 2);
</pre>
18 changes: 11 additions & 7 deletions scripts/load_mnist.ssc
<pre>
@@ -1,13 +1,17 @@
 import BIDMat.MatIO
 import BIDMat.HMat._
 import BIDMat.SerText
 
-val myname = java.net.InetAddress.getLocalHost.getHostAddress
-val prefix = "hdfs://" + myname + ":9000/BIDMach_MNIST/"
+val bmDir=System.getProperty("bidmach.path")
+val hdfsServer = System.getProperty("hdfs.path")
+val hdfsPrefix = s"$hdfsServer/BIDMach_MNIST/"
+val mnistPath = s"${bmDir}/data/MNIST8M/parts/"
+println(s"Loading mnist data from $mnistPath and saving to hdfs at $hdfsPrefix ..")
+
 for (i <- 0 to 80) {
-  val fname = "alls%02d.fmat.lz4" format i;
-  val a =loadFMat("/opt/BIDMach/data/MNIST8M/parts/" + fname);
-  saveFMat(prefix + "parts/" + fname, a, 2);
-  print(".");
+  val fname = "alls%02d.fmat.lz4" format i
+  val a =loadFMat(s"$mnistPath/$fname")
+  saveFMat(s"${hdfsPrefix}/parts/$fname", a, 2)
+  print(".")
 }
-println();
+println()
</pre>