forked from apache/spark
Commit
Adding dropout implementation from ann-dropout branch
Showing 2 changed files with 187 additions and 5 deletions.
mllib/src/main/scala/org/apache/spark/mllib/ann/Dropout.scala: 184 additions, 0 deletions
@@ -0,0 +1,184 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.spark.mllib.ann

import scala.collection.BitSet

import breeze.linalg.{DenseMatrix => BDM}

import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.util.random.XORShiftRandom

/**
 * Topology of a feed-forward neural network with dropout.
 *
 * @param layers layers of the network
 * @param inputDropoutProb probability of dropping an input unit during training
 * @param layerDropoutProb probability of dropping a hidden unit during training
 */
class DropoutTopology(val layers: Array[Layer], val inputDropoutProb: Double,
    val layerDropoutProb: Double) extends Topology {
  override def getInstance(weights: Vector): TopologyModel =
    DropoutModel(this, weights)

  override def getInstance(seed: Long): TopologyModel = DropoutModel(this, seed)
}
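
// For reference, the dropout literature (e.g. Srivastava et al., 2014)
// typically drops around 20% of input units and 50% of hidden units;
// this class leaves both probabilities entirely to the caller.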

/**
 * Model of a feed-forward neural network with dropout.
 * Implements forward propagation and gradient computation, and can return
 * the weights in vector format.
 */
class DropoutModel(val layerModels: Array[LayerModel],
    val topology: DropoutTopology) extends TopologyModel {
  private val rand = new XORShiftRandom(System.nanoTime())

  override def forward(data: BDM[Double]): Array[BDM[Double]] = {
    val outputs = new Array[BDM[Double]](layerModels.length)
    val lastIndex = layerModels.lastIndexWhere(lm => lm.size > 0)
    for (i <- 0 until layerModels.length) {
      outputs(i) = layerModels(i).eval(if (i == 0) data else outputs(i - 1))
      // scale by the keep probability on all weighted layers except the last one
      if (topology.layerDropoutProb > 0 && layerModels(i).size > 0 && i < lastIndex) {
        outputs(i) :*= 1.0 - topology.layerDropoutProb
      }
    }
    outputs
  }
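
  // Note on the scaling above: during training a unit survives with
  // probability (1 - layerDropoutProb), so its expected activation is
  // (1 - layerDropoutProb) * a. Multiplying by the keep probability at
  // prediction time keeps the expected input to the next layer consistent
  // with training. (This is the classic weight-scaling rule; the alternative,
  // inverted dropout, divides by the keep probability during training
  // instead and leaves inference untouched.)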

  override def computeGradient(data: BDM[Double], target: BDM[Double], cumGradient: Vector,
      realBatchSize: Int): Double = {
    // prepare the dropout masks
    var inputMask: BitSet = null
    val layerMasks = new Array[BitSet](layerModels.length)
    if (topology.inputDropoutProb > 0) {
      inputMask = makeMask(data, topology.inputDropoutProb)
      // NB: zeroes the dropped entries of the input matrix in place
      applyMask(data, inputMask)
    }
    // forward pass with masks
    val lastIndex = layerModels.lastIndexWhere(lm => lm.size > 0)
    val outputs = new Array[BDM[Double]](layerModels.length)
    for (i <- 0 until layerModels.length) {
      outputs(i) = layerModels(i).eval(if (i == 0) data else outputs(i - 1))
      if (i < lastIndex && topology.layerDropoutProb > 0) {
        // functional (weightless) layers reuse the mask of the layer below
        layerMasks(i) = if (layerModels(i).size > 0) {
          makeMask(outputs(i), topology.layerDropoutProb)
        } else {
          if (i == 0) inputMask else layerMasks(i - 1)
        }
        applyMask(outputs(i), layerMasks(i))
      }
    }
    // the error term depends on the type of the output layer
    val (newE, newError) = layerModels.last match {
      case flm: FunctionalLayerModel => flm.error(outputs.last, target)
      case _ =>
        throw new UnsupportedOperationException("Non-functional layer not supported at the top")
    }
    // back-propagate the deltas, applying the same masks
    val L = layerModels.length - 1
    val deltas = new Array[BDM[Double]](layerModels.length)
    deltas(L) = new BDM[Double](0, 0)
    deltas(L - 1) = newE
    for (i <- (L - 2) to (0, -1)) {
      deltas(i) = layerModels(i + 1).prevDelta(deltas(i + 1), outputs(i + 1))
      applyMask(deltas(i), layerMasks(i))
    }
    // compute the per-layer gradients
    val grads = new Array[Array[Double]](layerModels.length)
    for (i <- 0 until layerModels.length) {
      val input = if (i == 0) data else outputs(i - 1)
      grads(i) = layerModels(i).grad(deltas(i), input)
    }
    // update cumGradient
    val cumGradientArray = cumGradient.toArray
    var offset = 0
    // TODO: extract roll
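    // ("Roll" = flatten the per-layer gradient arrays into one vector,
    // layer by layer: [layer0 grads, layer1 grads, ...]. weights() below
    // produces the same layout, so the offsets line up.)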
    for (i <- 0 until grads.length) {
      val gradArray = grads(i)
      var k = 0
      while (k < gradArray.length) {
        cumGradientArray(offset + k) += gradArray(k)
        k += 1
      }
      offset += gradArray.length
    }
    newError
  }

  private def makeMask(data: BDM[Double], dropoutProb: Double): BitSet = {
    // pre-size the bit set: construct it containing the largest index, then
    // clear that bit (the mask indexes the underlying column-major array,
    // so data.size = rows * cols entries are needed)
    val mask = scala.collection.mutable.BitSet(data.size)
    mask(data.size) = false
    // keep each entry independently with probability (1 - dropoutProb)
    var k = 0
    while (k < data.size) {
      if (rand.nextDouble() > dropoutProb) {
        mask(k) = true
      }
      k += 1
    }
    mask
  }

  private def applyMask(data: BDM[Double], mask: BitSet): Unit = {
    if (mask == null) return
    var k = 0
    while (k < data.size) {
      if (!mask(k)) {
        data.data(k) = 0.0
      }
      k += 1
    }
  }

  override def weights(): Vector = {
    // TODO: extract roll
    var size = 0
    for (i <- 0 until layerModels.length) {
      size += layerModels(i).size
    }
    val array = new Array[Double](size)
    var offset = 0
    for (i <- 0 until layerModels.length) {
      val layerWeights = layerModels(i).weights().toArray
      System.arraycopy(layerWeights, 0, array, offset, layerWeights.length)
      offset += layerWeights.length
    }
    Vectors.dense(array)
  }

  override def predict(data: Vector): Vector = {
    // inference-time pass: forward() rescales activations instead of dropping
    val result = forward(data.toBreeze.toDenseVector.toDenseMatrix.t)
    Vectors.dense(result.last.toArray)
  }
}

// TODO: make a factory of models (unify with object FeedForwardModel)
object DropoutModel {
  def apply(topology: DropoutTopology, weights: Vector): DropoutModel = {
    val layers = topology.layers
    val layerModels = new Array[LayerModel](layers.length)
    var offset = 0
    for (i <- 0 until layers.length) {
      layerModels(i) = layers(i).getInstance(weights, offset)
      offset += layerModels(i).size
    }
    new DropoutModel(layerModels, topology)
  }

  def apply(topology: DropoutTopology, seed: Long = 11L): DropoutModel = {
    val layers = topology.layers
    val layerModels = new Array[LayerModel](layers.length)
    for (i <- 0 until layers.length) {
      layerModels(i) = layers(i).getInstance(seed)
    }
    new DropoutModel(layerModels, topology)
  }
}
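
For reviewers, a rough sketch of how the new classes might be exercised. The layer construction is an assumption (AffineLayer, FunctionalLayer and SigmoidFunction are taken to be the primitives of the surrounding ann package and are not part of this diff); only DropoutTopology, getInstance and predict are the API added here.

import org.apache.spark.mllib.ann._
import org.apache.spark.mllib.linalg.Vectors

// A 4-8-2 network; the top layer must be functional, because
// computeGradient pattern-matches FunctionalLayerModel there.
// AffineLayer/FunctionalLayer/SigmoidFunction are assumed, not from this diff.
val layers: Array[Layer] = Array(
  new AffineLayer(4, 8),
  new FunctionalLayer(new SigmoidFunction()),
  new AffineLayer(8, 2),
  new FunctionalLayer(new SigmoidFunction()))

// Drop 20% of inputs and 50% of hidden activations during training.
val topology = new DropoutTopology(layers, inputDropoutProb = 0.2, layerDropoutProb = 0.5)

// Random initialization via the seed-based factory added in this commit.
val model = topology.getInstance(11L)

// Inference: forward() rescales activations instead of dropping units.
println(model.predict(Vectors.dense(0.1, 0.2, 0.3, 0.4)))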