add python notebook support (intel-analytics#3)
* add python notebook support

* put post process calculation into executors
SeaOfOcean authored May 18, 2017
1 parent 5ad8c44 commit d26dc50
Showing 11 changed files with 709 additions and 55 deletions.
50 changes: 50 additions & 0 deletions notebook/README.md
@@ -0,0 +1,50 @@
# Demo Setup Guide

## Install Dependency Packages

See https://github.com/intel-analytics/BigDL/wiki/Python-Support for installing the required Python packages.

## Download BigDL Jars

Download the BigDL nightly build jars from https://github.com/intel-analytics/BigDL/wiki/Downloads

The default Spark version is 1.5.1.


## Start Jupyter Server

* Create `start_notebook.sh`, copy and paste the contents below, and edit `SPARK_HOME`, `Analytics_HOME`, and `BigDL_HOME` accordingly. Adjust the other parameters as needed.
```bash
#!/bin/bash

# set up paths
SPARK_HOME=/Users/bigdl/spark-1.6.0-bin-hadoop2.6/
Analytics_HOME=/Users/bigdl/analytics-zoo
BigDL_HOME=/Users/bigdl/dist-spark-1.5.1-scala-2.10.5-linux64-0.2.0-20170510.012057-18-dist
#use local mode or cluster mode
#MASTER=spark://xxxx:7077
MASTER="local[4]"

PYTHON_API_ZIP_PATH=${BigDL_HOME}/lib/bigdl-0.2.0-SNAPSHOT-python-api.zip
BigDL_JAR_PATH=${Analytics_HOME}/pipeline/target/pipeline-0.1-SNAPSHOT-jar-with-dependencies.jar

export PYTHONPATH=${PYTHON_API_ZIP_PATH}:$PYTHONPATH
export IPYTHON_OPTS="notebook --notebook-dir=./ --ip=* --no-browser --NotebookApp.token=''"

${SPARK_HOME}/bin/pyspark \
--master ${MASTER} \
--properties-file ${BigDL_HOME}/conf/spark-bigdl.conf \
--driver-cores 1 \
--driver-memory 10g \
--total-executor-cores 3 \
--executor-cores 1 \
--executor-memory 20g \
--conf spark.akka.frameSize=64 \
--py-files ${PYTHON_API_ZIP_PATH} \
--jars ${BigDL_JAR_PATH} \
--conf spark.driver.extraClassPath=${BigDL_JAR_PATH} \
--conf spark.executor.extraClassPath=pipeline-0.1-SNAPSHOT-jar-with-dependencies.jar
```
* Put `start_notebook.sh` and `start_tensorboard.sh` in your home directory and run them with bash.
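
Once the server is running, a quick notebook cell can confirm that the BigDL Python bindings and the Spark context are wired up. A minimal sketch, assuming the zip from `PYTHON_API_ZIP_PATH` is on `PYTHONPATH` as configured above:

```python
# Run in a notebook cell; `sc` is created by pyspark at startup.
from util.common import init_engine

init_engine()     # initialize the BigDL engine
print(sc.master)  # should match MASTER from start_notebook.sh
```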


417 changes: 417 additions & 0 deletions notebook/example/SSD.ipynb


18 changes: 18 additions & 0 deletions pipeline/assembly/python-zip.xml
@@ -0,0 +1,18 @@
<assembly xmlns="http://maven.apache.org/ASSEMBLY/2.0.0"
          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
          xsi:schemaLocation="http://maven.apache.org/ASSEMBLY/2.0.0 http://maven.apache.org/xsd/assembly-2.0.0.xsd">
  <id>python-api</id>
  <formats>
    <format>zip</format>
  </formats>
  <includeBaseDirectory>false</includeBaseDirectory>
  <fileSets>
    <fileSet>
      <includes>
        <include>**/*.py</include>
      </includes>
      <outputDirectory>/..</outputDirectory>
      <directory>src/main/python/</directory>
    </fileSet>
  </fileSets>
</assembly>
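
The assembly collects every `.py` file under `src/main/python` and places it at the zip root (the `/..` output directory strips the base folder), so packages such as `ssd` are importable directly from the archive. A hedged sanity check of the layout; the artifact path below is an assumption derived from the `finalName` and assembly id configured in `pipeline/pom.xml` below, not something stated in this commit:

```python
# Assumed artifact location: pipeline/target/pipeline-0.1-SNAPSHOT-python-api.zip
import zipfile

with zipfile.ZipFile("pipeline/target/pipeline-0.1-SNAPSHOT-python-api.zip") as z:
    for name in z.namelist():
        print(name)  # expect ssd/__init__.py and ssd/ssd.py at the zip root
```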
15 changes: 15 additions & 0 deletions pipeline/pom.xml
@@ -189,6 +189,21 @@
      </configuration>

      <executions>
        <execution>
          <id>python</id>
          <inherited>false</inherited>
          <configuration>
            <finalName>pipeline-${project.version}</finalName>
            <descriptors>
              <descriptor>assembly/python-zip.xml</descriptor>
            </descriptors>
          </configuration>
          <phase>package</phase>
          <goals>
            <goal>single</goal>
          </goals>
        </execution>
        <execution>
          <phase>package</phase>
          <goals>
4 changes: 4 additions & 0 deletions pipeline/src/main/python/ssd/__init__.py
@@ -0,0 +1,4 @@
from util.common import JavaCreator

JavaCreator.set_creator_class(
    "com.intel.analytics.bigdl.python.api.SSDPythonBigDL")  # noqa
41 changes: 41 additions & 0 deletions pipeline/src/main/python/ssd/ssd.py
@@ -0,0 +1,41 @@
from nn.layer import *
from util.common import callBigDlFunc


class Test(Model):
    """
    >>> test = Test("myworld")
    creating: createTest
    >>> print(test.value)
    hello myworld
    >>> linear = Linear(1, 2)
    creating: createLinear
    """
    def __init__(self, message, bigdl_type="float"):
        super(Test, self).__init__(None, bigdl_type, message)


def _test():
    import sys
    print(sys.path)
    import doctest
    from pyspark import SparkContext
    from util.common import init_engine
    from util.common import create_spark_conf
    from util.common import JavaCreator
    import ssd
    globs = ssd.__dict__.copy()
    sc = SparkContext(master="local[4]", appName="test layer",
                      conf=create_spark_conf())
    globs['sc'] = sc
    JavaCreator.set_creator_class("com.intel.analytics.bigdl.python.api.SSDPythonBigDL")  # noqa
    init_engine()
    (failure_count, test_count) = doctest.testmod(globs=globs,
                                                  optionflags=doctest.ELLIPSIS)
    if failure_count:
        exit(-1)


def predict(resolution, batch_size, n_partition, folder, _sc, _model, n_classes):
    return callBigDlFunc("float", "ssdPredict", resolution, batch_size, n_partition,
                         folder, _sc, _model, n_classes)


if __name__ == "__main__":
    _test()
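
A hedged usage sketch of the new `predict` helper from a notebook; the model path and image folder are placeholders, and `Model.load` is assumed to be the stock BigDL Python loader rather than anything added in this commit:

```python
# Placeholder paths; point these at a trained SSD model and an image folder.
from nn.layer import Model
from ssd.ssd import predict

model = Model.load("/path/to/ssd.model")  # hypothetical model file
results = predict(resolution=300, batch_size=4, n_partition=4,
                  folder="/path/to/images", _sc=sc, _model=model,
                  n_classes=21)
# results: one entry per image, each [scores_per_class, bboxes_per_class]
```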
pipeline/src/main/scala/com/intel/analytics/zoo/pipeline/common/BboxUtil.scala
@@ -17,9 +17,55 @@
package com.intel.analytics.zoo.pipeline.common

import com.intel.analytics.bigdl.tensor.Tensor
import com.intel.analytics.zoo.pipeline.common.dataset.roiimage.Target
import org.apache.log4j.Logger

object BboxUtil {

  val logger = Logger.getLogger(getClass)

  def decodeOutput(output: Tensor[Float], nclass: Int): Array[Target] = {
    val num = output.valueAt(1).toInt
    val result = output.narrow(1, 2, output.nElement() - 1).view(num, 6)

    val indices = getClassIndices(result)
    val decoded = new Array[Target](nclass)
    val iter = indices.iterator
    while (iter.hasNext) {
      val item = iter.next()
      val sub = result.narrow(1, item._2._1, item._2._2)
      decoded(item._1) = Target(sub.select(2, 2),
        sub.narrow(2, 3, 4))
    }
    decoded
  }

  private def getClassIndices(result: Tensor[Float]): Map[Int, (Int, Int)] = {
    var indices = Map[Int, (Int, Int)]()
    if (result.nElement() == 0) return indices
    var prev = -1f
    var i = 1
    var start = 1
    if (result.size(1) == 1) {
      indices += (result.valueAt(i, 1).toInt -> (1, 1))
      return indices
    }
    while (i <= result.size(1)) {
      if (prev != result.valueAt(i, 1)) {
        if (prev >= 0) {
          indices += (prev.toInt -> (start, i - start))
        }
        start = i
      }
      prev = result.valueAt(i, 1)
      if (i == result.size(1)) {
        indices += (prev.toInt -> (start, i - start + 1))
      }
      i += 1
    }
    indices
  }

  // inplace scale
  def scaleBBox(classBboxes: Tensor[Float], height: Float, width: Float): Unit = {
    if (classBboxes.nElement() == 0) return
@@ -30,8 +76,6 @@ object BboxUtil {
  }


-  val logger = Logger.getLogger(getClass)
-
  private def decodeSingleBbox(i: Int, priorBox: Tensor[Float], priorVariance: Tensor[Float],
    isClipBoxes: Boolean, bbox: Tensor[Float], varianceEncodedInTarget: Boolean,
    decodedBoxes: Tensor[Float]): Unit = {
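
For orientation, `decodeOutput` consumes the flat layout that the reworked `DetectionOutput` (below) now emits: the first element is the detection count, followed by groups of six values `(class, score, xmin, ymin, xmax, ymax)`. A small NumPy sketch of the same decoding, with illustrative numbers only:

```python
# Illustrative decode of the flat detection layout read by decodeOutput.
import numpy as np

flat = np.array([2.0,                              # two detections kept
                 1, 0.90, 0.10, 0.20, 0.50, 0.60,  # class 1
                 7, 0.80, 0.30, 0.30, 0.70, 0.90]) # class 7
num = int(flat[0])
dets = flat[1:1 + num * 6].reshape(num, 6)
for cls, score, xmin, ymin, xmax, ymax in dets:
    print(int(cls), score, (xmin, ymin, xmax, ymax))
```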
pipeline/src/main/scala/com/intel/analytics/zoo/pipeline/common/nn/DetectionOutput.scala
@@ -17,15 +17,15 @@
package com.intel.analytics.zoo.pipeline.common.nn

import com.intel.analytics.bigdl.nn.abstractnn.AbstractModule
-import com.intel.analytics.bigdl.tensor.Tensor
-import com.intel.analytics.bigdl.utils.Table
import com.intel.analytics.zoo.pipeline.common.BboxUtil
import com.intel.analytics.zoo.pipeline.common.dataset.roiimage.Target
import com.intel.analytics.zoo.pipeline.common.nn.DetectionOutput._
import com.intel.analytics.zoo.pipeline.ssd.PostProcessParam
+import com.intel.analytics.bigdl.tensor.Tensor
+import com.intel.analytics.bigdl.utils.{T, Table}
import org.apache.log4j.Logger

-class DetectionOutput(param: PostProcessParam) extends AbstractModule[Table, Table, Float] {
+@SerialVersionUID(5253792953255433914L)
+class DetectionOutput(param: PostProcessParam) extends AbstractModule[Table, Tensor[Float], Float] {
  @transient private var nms: Nms = _

  private def filterBboxes(decodedBboxes: Array[Tensor[Float]],
@@ -137,11 +137,7 @@ class DetectionOutput(param: PostProcessParam) extends AbstractModule[Table, Tab
    }
  }

-  override def updateOutput(input: Table): Table = {
-    if (isTraining()) {
-      output = input
-      return output
-    }
+  override def updateOutput(input: Table): Tensor[Float] = {
    if (nms == null) nms = new Nms()
    val loc = input[Tensor[Float]](1)
    val conf = input[Tensor[Float]](2)
@@ -163,60 +159,55 @@ class DetectionOutput(param: PostProcessParam) extends AbstractModule[Table, Tab
    val allDecodedBboxes = BboxUtil.decodeBboxesAll(allLocPreds, priorBoxes, priorVariances,
      numLocClasses, param.bgLabel, false, param.varianceEncodedInTarget, param.shareLocation,
      allLocPreds)
-    var numKept = 0
+    val numKepts = new Array[Int](batch)
+    var maxDetection = 0

    i = 0
    while (i < batch) {
-      numKept += filterBboxes(allDecodedBboxes(i), allConfScores(i),
+      val num = filterBboxes(allDecodedBboxes(i), allConfScores(i),
        allIndices(i), allIndicesNum(i))
+      numKepts(i) = num
+      maxDetection = Math.max(maxDetection, num)
      i += 1
    }
-    val results = new Array[Array[Target]](batch)
-    if (numKept > 0) {
+    // the first element is the number of detection numbers
+    output = Tensor[Float](batch, 1 + maxDetection * 6)
+    if (numKepts.sum > 0) {
      i = 0
      while (i < batch) {
+        val outi = output(i + 1)
        var c = 0
-        val result = new Array[Target](param.nClasses)
-        while (c < param.nClasses) {
+        outi.setValue(1, numKepts(i))
+        var offset = 2
+        while (c < allIndices(i).length) {
          val indices = allIndices(i)(c)
          if (indices != null) {
            val indicesNum = allIndicesNum(i)(c)
            val locLabel = if (param.shareLocation) allDecodedBboxes(i).length - 1 else c
            val bboxes = allDecodedBboxes(i)(locLabel)
            var j = 0
-            val classBboxes = Tensor[Float](indicesNum, 4)
-            val classScores = Tensor[Float](indicesNum)
            while (j < indicesNum) {
              val idx = indices(j)
-              classScores.setValue(j + 1, allConfScores(i)(c).valueAt(idx))
-              classBboxes.setValue(j + 1, 1, bboxes.valueAt(idx, 1))
-              classBboxes.setValue(j + 1, 2, bboxes.valueAt(idx, 2))
-              classBboxes.setValue(j + 1, 3, bboxes.valueAt(idx, 3))
-              classBboxes.setValue(j + 1, 4, bboxes.valueAt(idx, 4))
+              outi.setValue(offset, c)
+              outi.setValue(offset + 1, allConfScores(i)(c).valueAt(idx))
+              outi.setValue(offset + 2, bboxes.valueAt(idx, 1))
+              outi.setValue(offset + 3, bboxes.valueAt(idx, 2))
+              outi.setValue(offset + 4, bboxes.valueAt(idx, 3))
+              outi.setValue(offset + 5, bboxes.valueAt(idx, 4))
+              offset += 6
              j += 1
            }
-            // Clip the normalized bbox first.
-            BboxUtil.clipBoxes(classBboxes)
-            result(c) = Target(classScores, classBboxes)
          }
          c += 1
        }
-        results(i) = result
        i += 1
      }
    }
-    if (output == null) {
-      output = T()
-      output.insert(results)
-    } else {
-      output(1) = results
-    }
    output
  }

-  override def updateGradInput(input: Table, gradOutput: Table): Table = {
-    gradInput = gradOutput
-    gradInput
+  override def updateGradInput(input: Table, gradOutput: Tensor[Float]): Table = {
+    null
  }
}

pipeline/src/main/scala/com/intel/analytics/bigdl/python/api/SSDPythonBigDL.scala
@@ -0,0 +1,64 @@
/*
 * Copyright 2016 The BigDL Authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.intel.analytics.bigdl.python.api

import java.lang.{Boolean => JBoolean}
import java.util.{ArrayList => JArrayList, HashMap => JHashMap, List => JList, Map => JMap}

import com.intel.analytics.bigdl.dataset.{Identity => DIdentity, Sample => JSample}
import com.intel.analytics.bigdl.nn.abstractnn.{AbstractModule, Activity}
import com.intel.analytics.bigdl.numeric._
import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric
import com.intel.analytics.zoo.pipeline.ssd._
import org.apache.spark.api.java.JavaSparkContext

import scala.collection.JavaConverters._
import scala.language.existentials
import scala.reflect.ClassTag

object SSDPythonBigDL {

  def ofFloat(): PythonBigDL[Float] = new SSDPythonBigDL[Float]()

  def ofDouble(): PythonBigDL[Double] = new SSDPythonBigDL[Double]()

}


class SSDPythonBigDL[T: ClassTag](implicit ev: TensorNumeric[T]) extends PythonBigDL[T] {

  def ssdPredict(resolution: Int, batchSize: Int, nPartition: Int,
    folder: String, sc: JavaSparkContext,
    model: AbstractModule[Activity, Activity, Float], nClasses: Int = 21)
  : JList[JList[JList[JList[Float]]]] = {
    val predictor = new Predictor(model,
      PreProcessParam(batchSize, resolution, (123f, 117f, 104f), false), nClasses)
    val data = IOUtils.loadLocalFolder(nPartition, folder, sc)
    val results = predictor.predict(data).collect()
    val pathArr = data.map(x => x.path).collect()
    results.zip(pathArr).map(res => {
      val bboxes = res._1.map(r => if (r != null) r.bboxes.storage().array().toList.asJava
        else null).toList.asJava
      val scores = res._1.map(r => if (r != null) r.classes.storage().array().toList.asJava
        else null).toList.asJava
      List(scores, bboxes).asJava
    }).toList.asJava
  }
}
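
`ssdPredict` returns one entry per image, each a two-element list `[scores_per_class, bboxes_per_class]`; a class with no kept detections is `null` on the Scala side (`None` in Python). A hedged sketch of walking that structure, reusing the `results` from the notebook example earlier:

```python
# Walk the nested lists returned by ssdPredict.
for img_idx, (scores_by_class, bboxes_by_class) in enumerate(results):
    for cls, scores in enumerate(scores_by_class):
        if scores is None:
            continue  # no detections kept for this class
        # bboxes_by_class[cls] is flattened storage: 4 floats per box
        print("image %d, class %d: %d detections" % (img_idx, cls, len(scores)))
```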


