From dab28d5bff21daeb632376d359abaec5a587b94b Mon Sep 17 00:00:00 2001
From: qiuxin2012
Date: Tue, 10 Jul 2018 16:01:18 +0800
Subject: [PATCH 01/11] ncfperf

---
 .../example/recommendation/NcfPerf.scala      |  97 +++++
 .../example/recommendation/NeuralCFV2.scala   | 218 +++++++++++
 .../analytics/bigdl/optim/NCFOptimizer.scala  | 364 ++++++++++++++++++
 3 files changed, 679 insertions(+)
 create mode 100644 spark/dl/src/main/scala/com/intel/analytics/bigdl/example/recommendation/NcfPerf.scala
 create mode 100644 spark/dl/src/main/scala/com/intel/analytics/bigdl/example/recommendation/NeuralCFV2.scala
 create mode 100644 spark/dl/src/main/scala/com/intel/analytics/bigdl/optim/NCFOptimizer.scala

diff --git a/spark/dl/src/main/scala/com/intel/analytics/bigdl/example/recommendation/NcfPerf.scala b/spark/dl/src/main/scala/com/intel/analytics/bigdl/example/recommendation/NcfPerf.scala
new file mode 100644
index 00000000000..71602eb116b
--- /dev/null
+++ b/spark/dl/src/main/scala/com/intel/analytics/bigdl/example/recommendation/NcfPerf.scala
@@ -0,0 +1,97 @@
+/*
+ * Copyright 2018 Analytics Zoo Authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.intel.analytics.bigdl.example.recommendation
+
+import com.intel.analytics.bigdl.nn.BCECriterion
+import com.intel.analytics.bigdl.tensor.Tensor
+import com.intel.analytics.bigdl.utils.Engine
+
+import scala.util.Random
+
+object NcfPerf {
+  def main(args: Array[String]): Unit = {
+    val iteration = args(0).toInt
+    val batchSize = args(1).toInt
+    val core = args(2).toInt
+    System.setProperty("bigdl.localMode", "true")
+    Engine.init(1, core, false)
+    val userCount = 138493
+    val itemCount = 26744
+
+    val model = NeuralCFV2[Float](userCount, itemCount, 1, 64, 128, hiddenLayers = Array(128, 64))
+      .buildModel()
+    val criterion = BCECriterion[Float]()
+
+    val input = Tensor[Float](batchSize * core, 2)
+    input.select(2, 1).apply1(_ => Random.nextInt(userCount) + 1)
+    input.select(2, 2).apply1(_ => Random.nextInt(itemCount) + 1)
+    val target = Tensor[Float](batchSize * core, 1).apply1(_ => Random.nextInt(2))
+
+    var modelForwardTime = new Array[Long](core)
+    var modelBackwardTime = new Array[Long](core)
+    var criterionForwardTime = new Array[Long](core)
+    var criterionBackwardTime = new Array[Long](core)
+    val (w, g) = model.getParameters()
+    val models = (0 until core).map{i =>
+      val newmodel = model.cloneModule()
+      newmodel.getParameters()._1.set(w)
+      newmodel
+    }
+
+    (0 until iteration).foreach { i =>
+      input.select(2, 1).apply1(_ => Random.nextInt(userCount) + 1)
+      input.select(2, 2).apply1(_ => Random.nextInt(itemCount) + 1)
+      target.apply1(_ => Random.nextInt(2))
+
+      Engine.default.invokeAndWait((0 until core).map { tid =>
+        () =>
+          val currentInput = input.narrow(1, tid * batchSize + 1, batchSize)
+          val currentTarget = target.narrow(1, tid * batchSize + 1, batchSize)
+          val currentModel = models(tid)
+
+
+          var start = System.nanoTime()
+
+          val output = currentModel.forward(currentInput)
+          modelForwardTime(tid) += System.nanoTime() - start
+
+          start = System.nanoTime()
+          val loss = criterion.forward(output, currentTarget)
+          criterionForwardTime(tid) += System.nanoTime() - start
+
+          start = System.nanoTime()
+          val gradOutput = criterion.backward(output, currentTarget)
+          criterionBackwardTime(tid) += System.nanoTime() - start
+
+          start = System.nanoTime()
+          val gradInput = currentModel.backward(currentInput, gradOutput)
+          modelBackwardTime(tid) += System.nanoTime() - start
+
+      })
+    }
+
+    println(s"${modelForwardTime.sum / 1e6 / iteration}ms")
+    println(s"${criterionForwardTime.sum / 1e6 / iteration}ms")
+    println(s"${criterionBackwardTime.sum / 1e6 / iteration}ms")
+    println(s"${modelBackwardTime.sum / 1e6 / iteration}ms")
+
+
+
+
+  }
+
+}
diff --git a/spark/dl/src/main/scala/com/intel/analytics/bigdl/example/recommendation/NeuralCFV2.scala b/spark/dl/src/main/scala/com/intel/analytics/bigdl/example/recommendation/NeuralCFV2.scala
new file mode 100644
index 00000000000..4344face725
--- /dev/null
+++ b/spark/dl/src/main/scala/com/intel/analytics/bigdl/example/recommendation/NeuralCFV2.scala
@@ -0,0 +1,218 @@
+/*
+ * Copyright 2018 Analytics Zoo Authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.intel.analytics.bigdl.example.recommendation
+
+import com.intel.analytics.bigdl.Module
+import com.intel.analytics.bigdl.tensor.Tensor
+import com.intel.analytics.bigdl.nn._
+import com.intel.analytics.bigdl.nn.abstractnn.{AbstractModule, Activity}
+import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric
+import com.intel.analytics.bigdl.utils.Table
+import org.apache.spark.sql.catalyst.plans.physical.IdentityBroadcastMode
+
+import scala.reflect.ClassTag
+
+/**
+ * The model is for neural collaborative filtering.
+ *
+ * @param numClasses The number of classes. Positive integer.
+ * @param userCount The number of users. Positive integer.
+ * @param itemCount The number of items. Positive integer.
+ * @param userEmbed Units of user embedding. Positive integer.
+ * @param itemEmbed Units of item embedding. Positive integer.
+ * @param hiddenLayers Units of the hidden layers of the MLP part. Array of positive integers.
+ * @param includeMF Include Matrix Factorization or not. Boolean.
+ * @param mfEmbed Units of matrix factorization embedding. Positive integer.
+ * @tparam T Numeric type of parameter (e.g. weight, bias). Only Float/Double are supported now.
+ */ + +class NeuralCFV2[T: ClassTag] private(val userCount: Int, + val itemCount: Int, + val numClasses: Int, + val userEmbed: Int = 20, + val itemEmbed: Int = 20, + val hiddenLayers: Array[Int] = Array(40, 20, 10), + val includeMF: Boolean = true, + val mfEmbed: Int = 20 + )(implicit ev: TensorNumeric[T]) extends Container[Tensor[T], Tensor[T], T] { + + override def updateOutput(input: Tensor[T]): Tensor[T] = { + output = ncfModel.forward(input).toTensor[T] + output + } + + override def updateGradInput(input: Tensor[T], gradOutput: Tensor[T]): Tensor[T] = { + gradInput = ncfModel.updateGradInput(input, gradOutput).toTensor[T] + gradInput + } + + override def accGradParameters(input: Tensor[T], gradOutput: Tensor[T]): Unit = { + ncfModel.accGradParameters(input, gradOutput) + } + + override def backward(input: Tensor[T], gradOutput: Tensor[T]): Tensor[T] = { + gradInput = ncfModel.backward(input, gradOutput).toTensor[T] + gradInput + } + + override def parameters(): (Array[Tensor[T]], Array[Tensor[T]]) = { + (embeddingModel.parameters()._1 ++ ncfLayers.parameters()._1, + embeddingModel.parameters()._2 ++ ncfLayers.parameters()._2) + } + +// var embeddingModel: ConcatTable[T] = _ +// var ncfLayers: Sequential[T] = _ +// var ncfModel: Sequential[T] = _ +// +// def buildModel(): this.type = { +// embeddingModel = ConcatTable[Tensor[T], T]() +// val mlpEmbedding = Sequential[T]() +// val mlpUserTable = LookupTable[T](userCount, userEmbed) +// val mlpItemTable = LookupTable[T](itemCount, itemEmbed) +// mlpUserTable.setWeightsBias(Array(Tensor[T](userCount, userEmbed).randn(0, 0.1))) +// mlpItemTable.setWeightsBias(Array(Tensor[T](itemCount, itemEmbed).randn(0, 0.1))) +// mlpEmbedding.add(ConcatTable[Tensor[T], T]() +// .add(Sequential[T]().add(Select(2, 1)).add(mlpUserTable)) +// .add(Sequential[T]().add(Select(2, 2)).add(mlpItemTable))) +// .add(JoinTable(2, 2)) +// embeddingModel.add(mlpEmbedding) +// +// if (includeMF) { +// val mfUserTable = LookupTable[T](userCount, mfEmbed) +// val mfItemTable = LookupTable[T](itemCount, mfEmbed) +// mfUserTable.setWeightsBias(Array(Tensor[T](userCount, mfEmbed).randn(0, 0.1))) +// mfItemTable.setWeightsBias(Array(Tensor[T](itemCount, mfEmbed).randn(0, 0.1))) +// val mfEmbedding = Sequential[T]() +// mfEmbedding.add(ConcatTable[Tensor[T], T]() +// .add(Sequential[T]().add(Select(2, 1)).add(mfUserTable)) +// .add(Sequential[T]().add(Select(2, 2)).add(mfItemTable))) +// .add(CMulTable()) +// embeddingModel.add(mfEmbedding) +// } +// +// val mlpLinears = Sequential[T]() +// val linear1 = Linear[T](itemEmbed + userEmbed, hiddenLayers(0)) +// mlpLinears.add(linear1).add(ReLU()) +// for (i <- 1 to hiddenLayers.length - 1) { +// mlpLinears.add(Linear(hiddenLayers(i - 1), hiddenLayers(i))).add(ReLU()) +// } +// +// +// ncfLayers = Sequential[T]() +// if (includeMF) { +// ncfLayers.add(ParallelTable[T]() +// .add(mlpLinears) +// .add(Identity[T]())) +// .add(JoinTable(2, 2)) +// .add(Linear(mfEmbed + hiddenLayers.last, numClasses)) +// } else { +// ncfLayers.add(Linear(hiddenLayers.last, numClasses)) +// } +// ncfLayers.add(Sigmoid()) +// +// val ncfModel = Sequential[T]() +// +// ncfModel.add(embeddingModel).add(ncfLayers) +// +// this +// } + + var embeddingModel: Graph[T] = _ + var ncfLayers: Graph[T] = _ + var ncfModel: Sequential[T] = _ + + def buildModel(): this.type = { +// embeddingModel = ConcatTable[Tensor[T], T]() + val input = Identity().inputs() + val userId = Select(2, 1).setName("userId").inputs(input) + val itemId = Select(2, 
2).setName("itemId").inputs(input) + val mlpUserTable = LookupTable[T](userCount, userEmbed) + .setName("mlpUserEmbedding") + .setWeightsBias(Array(Tensor[T](userCount, userEmbed).randn(0, 0.1))) + .inputs(userId) + val mlpItemTable = LookupTable[T](itemCount, itemEmbed) + .setName("mlpItemEmbedding") + .setWeightsBias(Array(Tensor[T](itemCount, itemEmbed).randn(0, 0.1))) + .inputs(itemId) +// val mlpEmbedding = JoinTable(2, 2).inputs(mlpUserTable, mlpItemTable) +// embeddingModel.add(mlpEmbedding) + + val mfUserTable = LookupTable[T](userCount, mfEmbed) + .setName("mfUserEmbedding") + .setWeightsBias(Array(Tensor[T](userCount, mfEmbed).randn(0, 0.1))) + .inputs(userId) + val mfItemTable = LookupTable[T](itemCount, mfEmbed) + .setName("mfItemEmbedding") + .setWeightsBias(Array(Tensor[T](itemCount, mfEmbed).randn(0, 0.1))) + .inputs(itemId) + embeddingModel = + Graph(input, Array(mlpUserTable, mlpItemTable, mfUserTable, mfItemTable)) + + val mlpUser = Identity().inputs() + val mlpItem = Identity().inputs() + val mfUser = Identity().inputs() + val mfItem = Identity().inputs() + + val mlpMerge = JoinTable(2, 2).inputs(mlpUser, mlpItem) + val mfMerge = CMulTable().inputs(mfUser, mfItem) + + var linear = Linear[T](itemEmbed + userEmbed, hiddenLayers(0)).inputs(mlpMerge) + var relu = ReLU[T]().inputs(linear) + for (i <- 1 to hiddenLayers.length - 1) { + linear = Linear(hiddenLayers(i - 1), hiddenLayers(i)).inputs(relu) + relu = ReLU().inputs(linear) + } + + val merge = JoinTable(2, 2).inputs(mfMerge, relu) + val finalLinear = Linear(mfEmbed + hiddenLayers.last, numClasses).inputs(merge) + val sigmoid = Sigmoid().inputs(finalLinear) + + ncfLayers = Graph(Array(mlpUser, mlpItem, mfUser, mfItem), sigmoid) + + ncfModel = Sequential[T]() + + ncfModel.add(embeddingModel).add(ncfLayers) + + this + } +} + +object NeuralCFV2 { + + def apply[@specialized(Float, Double) T: ClassTag] + (userCount: Int, + itemCount: Int, + numClasses: Int, + userEmbed: Int, + itemEmbed: Int, + hiddenLayers: Array[Int], + includeMF: Boolean = true, + mfEmbed: Int = 20 + )(implicit ev: TensorNumeric[T]): NeuralCFV2[T] = { + new NeuralCFV2[T]( + userCount, itemCount, numClasses, userEmbed, itemEmbed, hiddenLayers, includeMF, mfEmbed) + .buildModel() + } + + +// def loadModel[T: ClassTag](path: String, +// weightPath: String = null)(implicit ev: TensorNumeric[T]): +// NeuralCF[T] = { +// Model.load(path, weightPath).asInstanceOf[NeuralCF[T]] +// } +} + diff --git a/spark/dl/src/main/scala/com/intel/analytics/bigdl/optim/NCFOptimizer.scala b/spark/dl/src/main/scala/com/intel/analytics/bigdl/optim/NCFOptimizer.scala new file mode 100644 index 00000000000..402d8ce8cde --- /dev/null +++ b/spark/dl/src/main/scala/com/intel/analytics/bigdl/optim/NCFOptimizer.scala @@ -0,0 +1,364 @@ +/* + * Copyright 2016 The BigDL Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.intel.analytics.bigdl.optim + +import com.intel.analytics.bigdl.dataset.{LocalDataSet, MiniBatch} +import com.intel.analytics.bigdl._ +import com.intel.analytics.bigdl.example.recommendation.NeuralCFV2 +import com.intel.analytics.bigdl.models.utils.ModelBroadcast +import com.intel.analytics.bigdl.nn.{ConcatTable, Graph, Utils} +import com.intel.analytics.bigdl.nn.abstractnn.{AbstractModule, Activity} +import com.intel.analytics.bigdl.tensor.Tensor +import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric +import com.intel.analytics.bigdl.utils._ +import org.apache.log4j.Logger + +import scala.reflect.ClassTag + +/** + * Optimize a model on a single machine + * + * @param model model to be optimized + * @param dataset data set + * @param criterion criterion to be used + */ +//class NCFOptimizer[T: ClassTag] ( +// model: Module[T], +// dataset: LocalDataSet[MiniBatch[T]], +// criterion: Criterion[T] +//)(implicit ev: TensorNumeric[T]) +// extends Optimizer[T, MiniBatch[T]]( +// model, dataset, criterion) { +// +// import NCFOptimizer._ +// import Optimizer._ +// +// private val coreNumber = Engine.coreNumber() +// +// private val subModelNumber = Engine.getEngineType match { +// case MklBlas => coreNumber +// case _ => throw new IllegalArgumentException +// } +// +// val ncfModel = model.asInstanceOf[NeuralCFV2[T]] +// +// // TODO: sharing failed +// private val workingEmbeddingModels = initModel(ncfModel.embeddingModel, +// subModelNumber, true) +// private val workingLinears = initModel(ncfModel.ncfLayers, +// subModelNumber, false) +// +// workingEmbeddingModels(0).parameters()._2.apply(0).setValue(1, 1, ev.fromType(0.01f)) +// workingEmbeddingModels(0).parameters()._1.apply(0).setValue(1, 1, ev.fromType(1.01f)) +// +// workingEmbeddingModels(0).getParameters()._2.setValue(1, ev.fromType(0.1f)) +// workingEmbeddingModels(0).getParameters()._1.setValue(1, ev.fromType(1.1f)) +// private val (embeddingWeight, embeddingGrad) = ncfModel.embeddingModel.getParameters() +// private val (linearsWeight, linearsGrad) = ncfModel.ncfLayers.getParameters() +// workingEmbeddingModels(0).getParameters()._2.setValue(1, ev.fromType(0.2f)) +// workingEmbeddingModels(0).getParameters()._1.setValue(1, ev.fromType(1.2f)) +// +// private val workingEmbeddingModelWAndG = workingEmbeddingModels.map(_.getParameters()) +// private val workingLinearModelWAndG = workingLinears.map(_.getParameters()) +// +// private val linearGradLength = linearsGrad.nElement() +// private val linearSyncGradTaskSize = linearGradLength / subModelNumber +// private val linearSyncGradExtraTask = linearGradLength % subModelNumber +// private val linearSyncGradParallelNum = +// if (linearSyncGradTaskSize == 0) linearSyncGradExtraTask else subModelNumber +// +// private val embeddingGradLength = embeddingGrad.nElement() +// private val embeddingSyncGradTaskSize = embeddingGradLength / subModelNumber +// private val embeddingSyncGradExtraTask = embeddingGradLength % subModelNumber +// private val embeddingSyncGradParallelNum = +// if (embeddingSyncGradTaskSize == 0) linearSyncGradExtraTask else subModelNumber +// +// private val workingCriterion = +// (1 to subModelNumber).map(_ => criterion.cloneCriterion()).toArray +// +// override def optimize(): Module[T] = { +// var wallClockTime = 0L +// var count = 0 +// optimMethod.clearHistory() +// optimMethod.loadFromTable(state) +// state("epoch") = state.get[Int]("epoch").getOrElse(1) +// state("neval") = state.get[Int]("neval").getOrElse(1) +// 
state("isLayerwiseScaled") = Utils.isLayerwiseScaled(model) +// val optimMethod2: OptimMethod[T] = optimMethod.clone() +// dataset.shuffle() +// val numSamples = dataset.data(train = false).map(_.size()).reduce(_ + _) +// var iter = dataset.data(train = true) +// logger.info("model thread pool size is " + Engine.model.getPoolSize) +// while (!endWhen(state)) { +// val start = System.nanoTime() +// println("start") +// +// val tasks = Engine.default.invoke((0 until embeddingSyncGradParallelNum).map(tid => +// () => { +// val offset = tid * embeddingSyncGradTaskSize + math.min(tid, embeddingSyncGradExtraTask) +// val length = embeddingSyncGradTaskSize + +// (if (tid < embeddingSyncGradExtraTask) 1 else 0) +// embeddingGrad.narrow(1, offset + 1, length).zero() +// })) +// +// // Fetch data and prepare tensors +// val batch = iter.next() +// var b = 0 +// val stackSize = batch.size() / subModelNumber +// val extraSize = batch.size() % subModelNumber +// val parallelism = if (stackSize == 0) extraSize else subModelNumber +// val miniBatchBuffer = new Array[MiniBatch[T]](parallelism) +// while (b < parallelism) { +// val offset = b * stackSize + math.min(b, extraSize) + 1 +// val length = stackSize + (if (b < extraSize) 1 else 0) +// miniBatchBuffer(b) = batch.slice(offset, length) +// b += 1 +// } +// val dataFetchTime = System.nanoTime() +// println("dataFetch") +// val lossSum = Engine.default.invokeAndWait( +// (0 until parallelism).map(i => +// () => { +// val localEmbedding = workingEmbeddingModels(i) +// val localLinears = workingLinears(i) +//// localEmbedding.zeroGradParameters() +// localEmbedding.training() +// localLinears.training() +// localLinears.zeroGradParameters() +// val localCriterion = workingCriterion(i) +// val input = miniBatchBuffer(i).getInput() +// val target = miniBatchBuffer(i).getTarget() +// +// val embeddingOutput = localEmbedding.forward(input) +// val output = localLinears.forward(embeddingOutput) +// val _loss = ev.toType[Double](localCriterion.forward(output, target)) +// val errors = localCriterion.backward(output, target) +// localEmbedding.updateGradInput(input, +// localLinears.backward(localEmbedding.output, errors)) +// _loss +// }) +// ).sum +// +// val loss = lossSum / parallelism +// +// val computingTime = System.nanoTime() +// println("computingTime") +// +// Engine.default.sync(tasks) +//// Engine.default.invokeAndWait( +//// (0 until embeddingSyncGradParallelNum).map(tid => +//// () => { +//// val offset = tid * embeddingSyncGradTaskSize + math.min(tid, embeddingSyncGradExtraTask) +//// val length = embeddingSyncGradTaskSize + +//// (if (tid < embeddingSyncGradExtraTask) 1 else 0) +//// embeddingGrad.narrow(1, offset + 1, length).zero() +//// }) +//// ) +// +// val zeroGradTime = System.nanoTime() +// println("zeroGrad") +// +// (0 until parallelism).foreach { i => +// val input = miniBatchBuffer(i).getInput() +// val localEmbedding = workingEmbeddingModels(i).asInstanceOf[Graph[T]] +// val input1 = localEmbedding("userId").get.output.asInstanceOf[Tensor[T]] +// val input2 = localEmbedding("itemId").get.output.asInstanceOf[Tensor[T]] +// val mlpUserEmbedding = localEmbedding("mlpUserEmbedding").get +// .asInstanceOf[AbstractModule[Activity, Activity, T]] +// val mlpItemEmbedding = localEmbedding("mlpItemEmbedding").get +// .asInstanceOf[AbstractModule[Activity, Activity, T]] +// val mfUserEmbedding = localEmbedding("mfUserEmbedding").get +// .asInstanceOf[AbstractModule[Activity, Activity, T]] +// val mfItemEmbedding = 
localEmbedding("mfItemEmbedding").get +// .asInstanceOf[AbstractModule[Activity, Activity, T]] +// val localLinears = workingLinears(i) +// val a = Seq( +// (mlpUserEmbedding, input1, localLinears.gradInput.toTable[Tensor[T]](1)), +// (mlpItemEmbedding, input2, localLinears.gradInput.toTable[Tensor[T]](2)), +// (mfUserEmbedding, input1, localLinears.gradInput.toTable[Tensor[T]](3)), +// (mfItemEmbedding, input2, localLinears.gradInput.toTable[Tensor[T]](4))) +// Engine.default.invokeAndWait(a.map(v => () => { +// v._1.accGradParameters(v._2, v._3) +// })) +// } +// +// val computingTime2 = System.nanoTime() +// println("computing2") +// +// +// // copy multi-model gradient to the buffer +// Engine.default.invokeAndWait( +// (0 until linearSyncGradParallelNum).map(tid => +// () => { +// val offset = tid * linearSyncGradTaskSize + math.min(tid, linearSyncGradExtraTask) +// val length = linearSyncGradTaskSize + (if (tid < linearSyncGradExtraTask) 1 else 0) +// var i = 0 +// while (i < parallelism) { +// if (i == 0) { +// linearsGrad.narrow(1, offset + 1, length) +// .copy(workingLinearModelWAndG(i)._2.narrow(1, offset + 1, length)) +// } else { +// linearsGrad.narrow(1, offset + 1, length) +// .add(workingLinearModelWAndG(i)._2.narrow(1, offset + 1, length)) +// } +// i += 1 +// } +// }) +// ) +// +// val aggTime = System.nanoTime() +// println("agg") +// +// optimMethod.state.update("epoch", state.get("epoch")) +// optimMethod.state.update("neval", state.get("neval")) +// optimMethod.optimize(_ => (ev.fromType(loss), linearsGrad), linearsWeight) +// +// optimMethod2.state.update("epoch", state.get("epoch")) +// optimMethod2.state.update("neval", state.get("neval")) +// optimMethod2.optimize(_ => (ev.fromType(loss), embeddingGrad), embeddingWeight) +// +// val updateWeightTime = System.nanoTime() +// println("update weight") +// val end = System.nanoTime() +// wallClockTime += end - start +// count += batch.size() +// val head = header(state[Int]("epoch"), count, numSamples, state[Int]("neval"), wallClockTime) +// logger.info(s"$head " + +// s"loss is $loss, iteration time is ${(end - start) / 1e9}s " + +// s"train time ${(end - dataFetchTime) / 1e9}s. " + +// s"Throughput is ${batch.size().toDouble / (end - start) * 1e9} record / second. 
" + +// optimMethod.getHyperParameter() +// ) +// logger.debug( s"data fetch time is ${(dataFetchTime - start) / 1e9}s \n" + +// s"model computing time is ${(computingTime - dataFetchTime) / 1e9}s \n" + +// s"zero grad time is ${(zeroGradTime - computingTime) / 1e9}s \n" + +// s"acc embedding time is ${(computingTime2 - zeroGradTime) / 1e9}s \n" + +// s"aggregate linear is ${(aggTime - computingTime2) / 1e9}s \n" + +// s"update weight time is ${(updateWeightTime - aggTime) / 1e9}s") +// +// state("neval") = state[Int]("neval") + 1 +// +// if (count >= numSamples) { +// state("epoch") = state[Int]("epoch") + 1 +// dataset.shuffle() +// iter = dataset.toLocal().data(train = true) +// count = 0 +// } +// +// validate(head) +// checkpoint(wallClockTime) +// } +// +// model +// } +// +// private def checkpoint(wallClockTime: Long): Unit = { +// if (checkpointTrigger.isEmpty || checkpointPath.isEmpty) { +// return +// } +// +// val trigger = checkpointTrigger.get +// if (trigger(state) && checkpointPath.isDefined) { +// logger.info(s"[Wall Clock ${wallClockTime / 1e9}s] Save model to path") +// saveModel(model, checkpointPath, isOverWrite, s".${state[Int]("neval")}") +// saveState(state, checkpointPath, isOverWrite, s".${state[Int]("neval")}") +// } +// } +// +// private def validate(header: String): Unit = { +//// if (validationTrigger.isEmpty || validationDataSet.isEmpty) { +//// return +//// } +//// val trigger = validationTrigger.get +//// if (!trigger(state)) { +//// return +//// } +//// val vMethods = validationMethods.get +//// val vMethodsArr = (1 to subModelNumber).map(i => vMethods.map(_.clone())).toArray +//// val dataIter = validationDataSet.get.toLocal().data(train = false) +//// logger.info(s"$header Validate model...") +//// +//// workingModels.foreach(_.evaluate()) +//// +//// var count = 0 +//// dataIter.map(batch => { +//// val stackSize = batch.size() / subModelNumber +//// val extraSize = batch.size() % subModelNumber +//// val parallelism = if (stackSize == 0) extraSize else subModelNumber +//// val start = System.nanoTime() +//// val result = Engine.default.invokeAndWait( +//// (0 until parallelism).map(b => +//// () => { +//// val offset = b * stackSize + math.min(b, extraSize) + 1 +//// val length = stackSize + (if (b < extraSize) 1 else 0) +//// val currentMiniBatch = batch.slice(offset, length) +//// val input = currentMiniBatch.getInput() +//// val target = currentMiniBatch.getTarget() +//// val output = workingModels(b).forward(input) +//// val validatMethods = vMethodsArr(b) +//// validatMethods.map(validation => { +//// validation(output, target) +//// }) +//// } +//// ) +//// ).reduce((left, right) => { +//// left.zip(right).map { case (l, r) => +//// l + r +//// } +//// }) +//// count += batch.size() +//// logger.info(s"$header Throughput is ${ +//// batch.size() / ((System.nanoTime() - start) / 1e9) +//// } record / sec") +//// result +//// }).reduce((left, right) => { +//// left.zip(right).map { case (l, r) => +//// l + r +//// } +//// }).zip(vMethods).foreach(r => { +//// logger.info(s"$header ${r._2} is ${r._1}") +//// }) +// } +//} +// +//object NCFOptimizer { +// val logger = Logger.getLogger(this.getClass) +// +// def initModel[T: ClassTag](model: Module[T], copies: Int, +// shareGradient: Boolean)( +// implicit ev: TensorNumeric[T]): Array[Module[T]] = { +// val (wb, grad) = Util.getAndClearWeightBiasGrad(model.parameters()) +// +// val models = (1 to copies).map(i => { +// logger.info(s"Clone $i model...") +// val m = if (i == copies) { +// model +// } 
else { +// model.cloneModule() +// } +// Util.putWeightBias(wb, m) +// if (shareGradient) { +// Util.putGradWeightBias(grad, m) +// } else { +// Util.initGradWeightBias(wb, m) +// } +// m +// }).toArray +// models +// } +//} + From 0dac18fa4e4c309b30414c2663bcaff7832cfab0 Mon Sep 17 00:00:00 2001 From: qiuxin2012 Date: Tue, 10 Jul 2018 16:03:19 +0800 Subject: [PATCH 02/11] some changes --- .../example/recommendation/NcfPerf.scala | 92 ++++++++++++++++--- 1 file changed, 78 insertions(+), 14 deletions(-) diff --git a/spark/dl/src/main/scala/com/intel/analytics/bigdl/example/recommendation/NcfPerf.scala b/spark/dl/src/main/scala/com/intel/analytics/bigdl/example/recommendation/NcfPerf.scala index 71602eb116b..b52ee45219f 100644 --- a/spark/dl/src/main/scala/com/intel/analytics/bigdl/example/recommendation/NcfPerf.scala +++ b/spark/dl/src/main/scala/com/intel/analytics/bigdl/example/recommendation/NcfPerf.scala @@ -17,6 +17,7 @@ package com.intel.analytics.bigdl.example.recommendation import com.intel.analytics.bigdl.nn.BCECriterion +import com.intel.analytics.bigdl.optim.Adam import com.intel.analytics.bigdl.tensor.Tensor import com.intel.analytics.bigdl.utils.Engine @@ -32,36 +33,70 @@ object NcfPerf { val userCount = 138493 val itemCount = 26744 - val model = NeuralCFV2[Float](userCount, itemCount, 1, 64, 128, hiddenLayers = Array(128, 64)) + val model = NeuralCFV2[Float](userCount, itemCount, 1, 128, 128, + hiddenLayers = Array(128, 64), + mfEmbed = 64) .buildModel() val criterion = BCECriterion[Float]() + val optimMethod = new Adam[Float]() val input = Tensor[Float](batchSize * core, 2) input.select(2, 1).apply1(_ => Random.nextInt(userCount) + 1) input.select(2, 2).apply1(_ => Random.nextInt(itemCount) + 1) val target = Tensor[Float](batchSize * core, 1).apply1(_ => Random.nextInt(2)) - var modelForwardTime = new Array[Long](core) - var modelBackwardTime = new Array[Long](core) - var criterionForwardTime = new Array[Long](core) - var criterionBackwardTime = new Array[Long](core) + val modelForwardTime = new Array[Long](core) + val modelBackwardTime = new Array[Long](core) + val criterionForwardTime = new Array[Long](core) + val criterionBackwardTime = new Array[Long](core) + var accgradientTime = 0L + var updateWeightTime = 0L + + val (w, g) = model.getParameters() - val models = (0 until core).map{i => + println(s"model weight length ${w.nElement()}") + val workingModels = (0 until core).map{i => val newmodel = model.cloneModule() newmodel.getParameters()._1.set(w) newmodel } + val workingModelWAndG = workingModels.map(_.getParameters()) + + val subModelNumber = core + val parallelism = core + val gradLength = g.nElement() + val syncGradTaskSize = gradLength / subModelNumber + val syncGradExtraTask = gradLength % subModelNumber + val syncGradParallelNum = + if (syncGradTaskSize == 0) syncGradExtraTask else subModelNumber + + // warm up + (0 until 5).foreach{i => + Engine.default.invokeAndWait((0 until core).map { tid => + () => + val currentInput = input.narrow(1, tid * batchSize + 1, batchSize) + val currentTarget = target.narrow(1, tid * batchSize + 1, batchSize) + val currentModel = workingModels(tid) + + val output = currentModel.forward(currentInput) + val loss = criterion.forward(output, currentTarget) + val gradOutput = criterion.backward(output, currentTarget) + val gradInput = currentModel.backward(currentInput, gradOutput) + + }) + } (0 until iteration).foreach { i => input.select(2, 1).apply1(_ => Random.nextInt(userCount) + 1) input.select(2, 2).apply1(_ => 
Random.nextInt(itemCount) + 1) target.apply1(_ => Random.nextInt(2)) + println(i) Engine.default.invokeAndWait((0 until core).map { tid => () => val currentInput = input.narrow(1, tid * batchSize + 1, batchSize) val currentTarget = target.narrow(1, tid * batchSize + 1, batchSize) - val currentModel = models(tid) + val currentModel = workingModels(tid) var start = System.nanoTime() @@ -82,15 +117,44 @@ object NcfPerf { modelBackwardTime(tid) += System.nanoTime() - start }) - } - - println(s"${modelForwardTime.sum / 1e6 / iteration}ms") - println(s"${criterionForwardTime.sum / 1e6 / iteration}ms") - println(s"${criterionBackwardTime.sum / 1e6 / iteration}ms") - println(s"${modelBackwardTime.sum / 1e6 / iteration}ms") - + var start = System.nanoTime() + val grad = g + Engine.default.invokeAndWait( + (0 until syncGradParallelNum).map(tid => + () => { + val offset = tid * syncGradTaskSize + math.min(tid, syncGradExtraTask) + val length = syncGradTaskSize + (if (tid < syncGradExtraTask) 1 else 0) + var i = 0 + while (i < parallelism) { + val sliceG = workingModelWAndG(i)._2.narrow(1, offset + 1, length) + if (i == 0) { + grad.narrow(1, offset + 1, length) + .copy(sliceG) + sliceG.zero() + } else { + grad.narrow(1, offset + 1, length) + .add(sliceG) + sliceG.zero() + } + i += 1 + } + }) + ) + grad.div(parallelism) + accgradientTime += System.nanoTime() - start + + start = System.nanoTime() + optimMethod.optimize(_ => (1, grad), w) + updateWeightTime += System.nanoTime() - start + } + println(s"${modelForwardTime.max / 1e6 / iteration}ms") + println(s"${criterionForwardTime.max / 1e6 / iteration}ms") + println(s"${criterionBackwardTime.max / 1e6 / iteration}ms") + println(s"${modelBackwardTime.max / 1e6 / iteration}ms") + println(s"${accgradientTime / 1e6 / iteration}ms") + println(s"${updateWeightTime / 1e6 / iteration}ms") } From e35ff04d6622b4bac29f1948095813bf82933802 Mon Sep 17 00:00:00 2001 From: qiuxin2012 Date: Thu, 12 Jul 2018 14:20:37 +0800 Subject: [PATCH 03/11] ncfOptimizer --- .../analytics/bigdl/optim/NCFOptimizer.scala | 636 +++++++++--------- .../intel/analytics/bigdl/utils/Util.scala | 92 ++- 2 files changed, 380 insertions(+), 348 deletions(-) diff --git a/spark/dl/src/main/scala/com/intel/analytics/bigdl/optim/NCFOptimizer.scala b/spark/dl/src/main/scala/com/intel/analytics/bigdl/optim/NCFOptimizer.scala index 402d8ce8cde..7e448e067ac 100644 --- a/spark/dl/src/main/scala/com/intel/analytics/bigdl/optim/NCFOptimizer.scala +++ b/spark/dl/src/main/scala/com/intel/analytics/bigdl/optim/NCFOptimizer.scala @@ -19,8 +19,7 @@ package com.intel.analytics.bigdl.optim import com.intel.analytics.bigdl.dataset.{LocalDataSet, MiniBatch} import com.intel.analytics.bigdl._ import com.intel.analytics.bigdl.example.recommendation.NeuralCFV2 -import com.intel.analytics.bigdl.models.utils.ModelBroadcast -import com.intel.analytics.bigdl.nn.{ConcatTable, Graph, Utils} +import com.intel.analytics.bigdl.nn.{Graph, Utils} import com.intel.analytics.bigdl.nn.abstractnn.{AbstractModule, Activity} import com.intel.analytics.bigdl.tensor.Tensor import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric @@ -36,329 +35,332 @@ import scala.reflect.ClassTag * @param dataset data set * @param criterion criterion to be used */ -//class NCFOptimizer[T: ClassTag] ( -// model: Module[T], -// dataset: LocalDataSet[MiniBatch[T]], -// criterion: Criterion[T] -//)(implicit ev: TensorNumeric[T]) -// extends Optimizer[T, MiniBatch[T]]( -// model, dataset, criterion) { -// -// import NCFOptimizer._ -// import 
Optimizer._ -// -// private val coreNumber = Engine.coreNumber() -// -// private val subModelNumber = Engine.getEngineType match { -// case MklBlas => coreNumber -// case _ => throw new IllegalArgumentException -// } -// -// val ncfModel = model.asInstanceOf[NeuralCFV2[T]] -// -// // TODO: sharing failed -// private val workingEmbeddingModels = initModel(ncfModel.embeddingModel, -// subModelNumber, true) -// private val workingLinears = initModel(ncfModel.ncfLayers, -// subModelNumber, false) -// -// workingEmbeddingModels(0).parameters()._2.apply(0).setValue(1, 1, ev.fromType(0.01f)) -// workingEmbeddingModels(0).parameters()._1.apply(0).setValue(1, 1, ev.fromType(1.01f)) -// -// workingEmbeddingModels(0).getParameters()._2.setValue(1, ev.fromType(0.1f)) -// workingEmbeddingModels(0).getParameters()._1.setValue(1, ev.fromType(1.1f)) -// private val (embeddingWeight, embeddingGrad) = ncfModel.embeddingModel.getParameters() -// private val (linearsWeight, linearsGrad) = ncfModel.ncfLayers.getParameters() -// workingEmbeddingModels(0).getParameters()._2.setValue(1, ev.fromType(0.2f)) -// workingEmbeddingModels(0).getParameters()._1.setValue(1, ev.fromType(1.2f)) -// -// private val workingEmbeddingModelWAndG = workingEmbeddingModels.map(_.getParameters()) -// private val workingLinearModelWAndG = workingLinears.map(_.getParameters()) -// -// private val linearGradLength = linearsGrad.nElement() -// private val linearSyncGradTaskSize = linearGradLength / subModelNumber -// private val linearSyncGradExtraTask = linearGradLength % subModelNumber -// private val linearSyncGradParallelNum = -// if (linearSyncGradTaskSize == 0) linearSyncGradExtraTask else subModelNumber -// -// private val embeddingGradLength = embeddingGrad.nElement() -// private val embeddingSyncGradTaskSize = embeddingGradLength / subModelNumber -// private val embeddingSyncGradExtraTask = embeddingGradLength % subModelNumber -// private val embeddingSyncGradParallelNum = -// if (embeddingSyncGradTaskSize == 0) linearSyncGradExtraTask else subModelNumber -// -// private val workingCriterion = -// (1 to subModelNumber).map(_ => criterion.cloneCriterion()).toArray -// -// override def optimize(): Module[T] = { -// var wallClockTime = 0L -// var count = 0 -// optimMethod.clearHistory() -// optimMethod.loadFromTable(state) -// state("epoch") = state.get[Int]("epoch").getOrElse(1) -// state("neval") = state.get[Int]("neval").getOrElse(1) -// state("isLayerwiseScaled") = Utils.isLayerwiseScaled(model) -// val optimMethod2: OptimMethod[T] = optimMethod.clone() -// dataset.shuffle() -// val numSamples = dataset.data(train = false).map(_.size()).reduce(_ + _) -// var iter = dataset.data(train = true) -// logger.info("model thread pool size is " + Engine.model.getPoolSize) -// while (!endWhen(state)) { -// val start = System.nanoTime() -// println("start") -// -// val tasks = Engine.default.invoke((0 until embeddingSyncGradParallelNum).map(tid => -// () => { -// val offset = tid * embeddingSyncGradTaskSize + math.min(tid, embeddingSyncGradExtraTask) -// val length = embeddingSyncGradTaskSize + -// (if (tid < embeddingSyncGradExtraTask) 1 else 0) -// embeddingGrad.narrow(1, offset + 1, length).zero() -// })) -// -// // Fetch data and prepare tensors -// val batch = iter.next() -// var b = 0 -// val stackSize = batch.size() / subModelNumber -// val extraSize = batch.size() % subModelNumber -// val parallelism = if (stackSize == 0) extraSize else subModelNumber -// val miniBatchBuffer = new Array[MiniBatch[T]](parallelism) -// while (b < 
parallelism) { -// val offset = b * stackSize + math.min(b, extraSize) + 1 -// val length = stackSize + (if (b < extraSize) 1 else 0) -// miniBatchBuffer(b) = batch.slice(offset, length) -// b += 1 -// } -// val dataFetchTime = System.nanoTime() -// println("dataFetch") -// val lossSum = Engine.default.invokeAndWait( -// (0 until parallelism).map(i => -// () => { -// val localEmbedding = workingEmbeddingModels(i) -// val localLinears = workingLinears(i) -//// localEmbedding.zeroGradParameters() -// localEmbedding.training() -// localLinears.training() -// localLinears.zeroGradParameters() -// val localCriterion = workingCriterion(i) -// val input = miniBatchBuffer(i).getInput() -// val target = miniBatchBuffer(i).getTarget() -// -// val embeddingOutput = localEmbedding.forward(input) -// val output = localLinears.forward(embeddingOutput) -// val _loss = ev.toType[Double](localCriterion.forward(output, target)) -// val errors = localCriterion.backward(output, target) -// localEmbedding.updateGradInput(input, -// localLinears.backward(localEmbedding.output, errors)) -// _loss -// }) -// ).sum -// -// val loss = lossSum / parallelism -// -// val computingTime = System.nanoTime() -// println("computingTime") -// -// Engine.default.sync(tasks) -//// Engine.default.invokeAndWait( -//// (0 until embeddingSyncGradParallelNum).map(tid => -//// () => { -//// val offset = tid * embeddingSyncGradTaskSize + math.min(tid, embeddingSyncGradExtraTask) -//// val length = embeddingSyncGradTaskSize + -//// (if (tid < embeddingSyncGradExtraTask) 1 else 0) -//// embeddingGrad.narrow(1, offset + 1, length).zero() -//// }) -//// ) -// -// val zeroGradTime = System.nanoTime() -// println("zeroGrad") -// -// (0 until parallelism).foreach { i => -// val input = miniBatchBuffer(i).getInput() -// val localEmbedding = workingEmbeddingModels(i).asInstanceOf[Graph[T]] -// val input1 = localEmbedding("userId").get.output.asInstanceOf[Tensor[T]] -// val input2 = localEmbedding("itemId").get.output.asInstanceOf[Tensor[T]] -// val mlpUserEmbedding = localEmbedding("mlpUserEmbedding").get -// .asInstanceOf[AbstractModule[Activity, Activity, T]] -// val mlpItemEmbedding = localEmbedding("mlpItemEmbedding").get -// .asInstanceOf[AbstractModule[Activity, Activity, T]] -// val mfUserEmbedding = localEmbedding("mfUserEmbedding").get -// .asInstanceOf[AbstractModule[Activity, Activity, T]] -// val mfItemEmbedding = localEmbedding("mfItemEmbedding").get -// .asInstanceOf[AbstractModule[Activity, Activity, T]] -// val localLinears = workingLinears(i) -// val a = Seq( -// (mlpUserEmbedding, input1, localLinears.gradInput.toTable[Tensor[T]](1)), -// (mlpItemEmbedding, input2, localLinears.gradInput.toTable[Tensor[T]](2)), -// (mfUserEmbedding, input1, localLinears.gradInput.toTable[Tensor[T]](3)), -// (mfItemEmbedding, input2, localLinears.gradInput.toTable[Tensor[T]](4))) -// Engine.default.invokeAndWait(a.map(v => () => { -// v._1.accGradParameters(v._2, v._3) -// })) -// } -// -// val computingTime2 = System.nanoTime() -// println("computing2") -// -// -// // copy multi-model gradient to the buffer +class NCFOptimizer[T: ClassTag] ( + model: Module[T], + dataset: LocalDataSet[MiniBatch[T]], + criterion: Criterion[T] +)(implicit ev: TensorNumeric[T]) + extends Optimizer[T, MiniBatch[T]]( + model, dataset, criterion) { + + import NCFOptimizer._ + import Optimizer._ + + private val coreNumber = Engine.coreNumber() + + private val subModelNumber = Engine.getEngineType match { + case MklBlas => coreNumber + case _ => throw new 
IllegalArgumentException + } + + val ncfModel = model.asInstanceOf[NeuralCFV2[T]] + + // TODO: sharing failed + private val workingEmbeddingModels = initModel(ncfModel.embeddingModel, + subModelNumber, true) + private val workingLinears = initModel(ncfModel.ncfLayers, + subModelNumber, false) + + workingEmbeddingModels(0).parameters()._2.apply(0).setValue(1, 1, ev.fromType(0.01f)) + workingEmbeddingModels(0).parameters()._1.apply(0).setValue(1, 1, ev.fromType(1.01f)) + + workingEmbeddingModels(0).getParameters()._2.setValue(1, ev.fromType(0.1f)) + workingEmbeddingModels(0).getParameters()._1.setValue(1, ev.fromType(1.1f)) + private val (embeddingWeight, embeddingGrad) = ncfModel.embeddingModel.getParameters() + private val (linearsWeight, linearsGrad) = ncfModel.ncfLayers.getParameters() + workingEmbeddingModels(0).getParameters()._2.setValue(1, ev.fromType(0.2f)) + workingEmbeddingModels(0).getParameters()._1.setValue(1, ev.fromType(1.2f)) + + private val workingEmbeddingModelWAndG = workingEmbeddingModels.map(_.getParameters()) + private val workingLinearModelWAndG = workingLinears.map(_.getParameters()) + + private val linearGradLength = linearsGrad.nElement() + private val linearSyncGradTaskSize = linearGradLength / subModelNumber + private val linearSyncGradExtraTask = linearGradLength % subModelNumber + private val linearSyncGradParallelNum = + if (linearSyncGradTaskSize == 0) linearSyncGradExtraTask else subModelNumber + + private val embeddingGradLength = embeddingGrad.nElement() + private val embeddingSyncGradTaskSize = embeddingGradLength / subModelNumber + private val embeddingSyncGradExtraTask = embeddingGradLength % subModelNumber + private val embeddingSyncGradParallelNum = + if (embeddingSyncGradTaskSize == 0) linearSyncGradExtraTask else subModelNumber + + private val workingCriterion = + (1 to subModelNumber).map(_ => criterion.cloneCriterion()).toArray + + override def optimize(): Module[T] = { + var wallClockTime = 0L + var count = 0 + optimMethods.values.foreach { optimMethod => + optimMethod.clearHistory() + } + state("epoch") = state.get[Int]("epoch").getOrElse(1) + state("neval") = state.get[Int]("neval").getOrElse(1) + state("isLayerwiseScaled") = Utils.isLayerwiseScaled(model) + val optimMethod: OptimMethod[T] = optimMethods(model.getName()) + val optimMethod2: OptimMethod[T] = optimMethods(model.getName()).clone() + dataset.shuffle() + val numSamples = dataset.data(train = false).map(_.size()).reduce(_ + _) + var iter = dataset.data(train = true) + logger.info("model thread pool size is " + Engine.model.getPoolSize) + while (!endWhen(state)) { + val start = System.nanoTime() + println("start") + + val tasks = Engine.default.invoke((0 until embeddingSyncGradParallelNum).map(tid => + () => { + val offset = tid * embeddingSyncGradTaskSize + math.min(tid, embeddingSyncGradExtraTask) + val length = embeddingSyncGradTaskSize + + (if (tid < embeddingSyncGradExtraTask) 1 else 0) + embeddingGrad.narrow(1, offset + 1, length).zero() + })) + + // Fetch data and prepare tensors + val batch = iter.next() + var b = 0 + val stackSize = batch.size() / subModelNumber + val extraSize = batch.size() % subModelNumber + val parallelism = if (stackSize == 0) extraSize else subModelNumber + val miniBatchBuffer = new Array[MiniBatch[T]](parallelism) + while (b < parallelism) { + val offset = b * stackSize + math.min(b, extraSize) + 1 + val length = stackSize + (if (b < extraSize) 1 else 0) + miniBatchBuffer(b) = batch.slice(offset, length) + b += 1 + } + val dataFetchTime = 
System.nanoTime() + println("dataFetch") + val lossSum = Engine.default.invokeAndWait( + (0 until parallelism).map(i => + () => { + val localEmbedding = workingEmbeddingModels(i) + val localLinears = workingLinears(i) +// localEmbedding.zeroGradParameters() + localEmbedding.training() + localLinears.training() + localLinears.zeroGradParameters() + val localCriterion = workingCriterion(i) + val input = miniBatchBuffer(i).getInput() + val target = miniBatchBuffer(i).getTarget() + + val embeddingOutput = localEmbedding.forward(input) + val output = localLinears.forward(embeddingOutput) + val _loss = ev.toType[Double](localCriterion.forward(output, target)) + val errors = localCriterion.backward(output, target) + localEmbedding.updateGradInput(input, + localLinears.backward(localEmbedding.output, errors)) + _loss + }) + ).sum + + val loss = lossSum / parallelism + + val computingTime = System.nanoTime() + println("computingTime") + + Engine.default.sync(tasks) // Engine.default.invokeAndWait( -// (0 until linearSyncGradParallelNum).map(tid => +// (0 until embeddingSyncGradParallelNum).map(tid => // () => { -// val offset = tid * linearSyncGradTaskSize + math.min(tid, linearSyncGradExtraTask) -// val length = linearSyncGradTaskSize + (if (tid < linearSyncGradExtraTask) 1 else 0) -// var i = 0 -// while (i < parallelism) { -// if (i == 0) { -// linearsGrad.narrow(1, offset + 1, length) -// .copy(workingLinearModelWAndG(i)._2.narrow(1, offset + 1, length)) -// } else { -// linearsGrad.narrow(1, offset + 1, length) -// .add(workingLinearModelWAndG(i)._2.narrow(1, offset + 1, length)) -// } -// i += 1 -// } +// val offset = tid * embeddingSyncGradTaskSize +// + math.min(tid, embeddingSyncGradExtraTask) +// val length = embeddingSyncGradTaskSize + +// (if (tid < embeddingSyncGradExtraTask) 1 else 0) +// embeddingGrad.narrow(1, offset + 1, length).zero() // }) // ) -// -// val aggTime = System.nanoTime() -// println("agg") -// -// optimMethod.state.update("epoch", state.get("epoch")) -// optimMethod.state.update("neval", state.get("neval")) -// optimMethod.optimize(_ => (ev.fromType(loss), linearsGrad), linearsWeight) -// -// optimMethod2.state.update("epoch", state.get("epoch")) -// optimMethod2.state.update("neval", state.get("neval")) -// optimMethod2.optimize(_ => (ev.fromType(loss), embeddingGrad), embeddingWeight) -// -// val updateWeightTime = System.nanoTime() -// println("update weight") -// val end = System.nanoTime() -// wallClockTime += end - start -// count += batch.size() -// val head = header(state[Int]("epoch"), count, numSamples, state[Int]("neval"), wallClockTime) -// logger.info(s"$head " + -// s"loss is $loss, iteration time is ${(end - start) / 1e9}s " + -// s"train time ${(end - dataFetchTime) / 1e9}s. " + -// s"Throughput is ${batch.size().toDouble / (end - start) * 1e9} record / second. 
" + -// optimMethod.getHyperParameter() -// ) -// logger.debug( s"data fetch time is ${(dataFetchTime - start) / 1e9}s \n" + -// s"model computing time is ${(computingTime - dataFetchTime) / 1e9}s \n" + -// s"zero grad time is ${(zeroGradTime - computingTime) / 1e9}s \n" + -// s"acc embedding time is ${(computingTime2 - zeroGradTime) / 1e9}s \n" + -// s"aggregate linear is ${(aggTime - computingTime2) / 1e9}s \n" + -// s"update weight time is ${(updateWeightTime - aggTime) / 1e9}s") -// -// state("neval") = state[Int]("neval") + 1 -// -// if (count >= numSamples) { -// state("epoch") = state[Int]("epoch") + 1 -// dataset.shuffle() -// iter = dataset.toLocal().data(train = true) -// count = 0 -// } -// -// validate(head) -// checkpoint(wallClockTime) -// } -// -// model -// } -// -// private def checkpoint(wallClockTime: Long): Unit = { -// if (checkpointTrigger.isEmpty || checkpointPath.isEmpty) { + + val zeroGradTime = System.nanoTime() + println("zeroGrad") + + (0 until parallelism).foreach { i => + val input = miniBatchBuffer(i).getInput() + val localEmbedding = workingEmbeddingModels(i).asInstanceOf[Graph[T]] + val input1 = localEmbedding("userId").get.output.asInstanceOf[Tensor[T]] + val input2 = localEmbedding("itemId").get.output.asInstanceOf[Tensor[T]] + val mlpUserEmbedding = localEmbedding("mlpUserEmbedding").get + .asInstanceOf[AbstractModule[Activity, Activity, T]] + val mlpItemEmbedding = localEmbedding("mlpItemEmbedding").get + .asInstanceOf[AbstractModule[Activity, Activity, T]] + val mfUserEmbedding = localEmbedding("mfUserEmbedding").get + .asInstanceOf[AbstractModule[Activity, Activity, T]] + val mfItemEmbedding = localEmbedding("mfItemEmbedding").get + .asInstanceOf[AbstractModule[Activity, Activity, T]] + val localLinears = workingLinears(i) + val a = Seq( + (mlpUserEmbedding, input1, localLinears.gradInput.toTable[Tensor[T]](1)), + (mlpItemEmbedding, input2, localLinears.gradInput.toTable[Tensor[T]](2)), + (mfUserEmbedding, input1, localLinears.gradInput.toTable[Tensor[T]](3)), + (mfItemEmbedding, input2, localLinears.gradInput.toTable[Tensor[T]](4))) + Engine.default.invokeAndWait(a.map(v => () => { + v._1.accGradParameters(v._2, v._3) + })) + } + + val computingTime2 = System.nanoTime() + println("computing2") + + + // copy multi-model gradient to the buffer + Engine.default.invokeAndWait( + (0 until linearSyncGradParallelNum).map(tid => + () => { + val offset = tid * linearSyncGradTaskSize + math.min(tid, linearSyncGradExtraTask) + val length = linearSyncGradTaskSize + (if (tid < linearSyncGradExtraTask) 1 else 0) + var i = 0 + while (i < parallelism) { + if (i == 0) { + linearsGrad.narrow(1, offset + 1, length) + .copy(workingLinearModelWAndG(i)._2.narrow(1, offset + 1, length)) + } else { + linearsGrad.narrow(1, offset + 1, length) + .add(workingLinearModelWAndG(i)._2.narrow(1, offset + 1, length)) + } + i += 1 + } + }) + ) + + val aggTime = System.nanoTime() + println("agg") + + optimMethod.state.update("epoch", state.get("epoch")) + optimMethod.state.update("neval", state.get("neval")) + optimMethod.optimize(_ => (ev.fromType(loss), linearsGrad), linearsWeight) + + optimMethod2.state.update("epoch", state.get("epoch")) + optimMethod2.state.update("neval", state.get("neval")) + optimMethod2.optimize(_ => (ev.fromType(loss), embeddingGrad), embeddingWeight) + + val updateWeightTime = System.nanoTime() + println("update weight") + val end = System.nanoTime() + wallClockTime += end - start + count += batch.size() + val head = header(state[Int]("epoch"), count, 
numSamples, state[Int]("neval"), wallClockTime) + logger.info(s"$head " + + s"loss is $loss, iteration time is ${(end - start) / 1e9}s " + + s"train time ${(end - dataFetchTime) / 1e9}s. " + + s"Throughput is ${batch.size().toDouble / (end - start) * 1e9} record / second. " + + optimMethod.getHyperParameter() + ) + logger.debug( s"data fetch time is ${(dataFetchTime - start) / 1e9}s \n" + + s"model computing time is ${(computingTime - dataFetchTime) / 1e9}s \n" + + s"zero grad time is ${(zeroGradTime - computingTime) / 1e9}s \n" + + s"acc embedding time is ${(computingTime2 - zeroGradTime) / 1e9}s \n" + + s"aggregate linear is ${(aggTime - computingTime2) / 1e9}s \n" + + s"update weight time is ${(updateWeightTime - aggTime) / 1e9}s") + + state("neval") = state[Int]("neval") + 1 + + if (count >= numSamples) { + state("epoch") = state[Int]("epoch") + 1 + dataset.shuffle() + iter = dataset.toLocal().data(train = true) + count = 0 + } + + validate(head) + checkpoint(wallClockTime) + } + + model + } + + private def checkpoint(wallClockTime: Long): Unit = { + if (checkpointTrigger.isEmpty || checkpointPath.isEmpty) { + return + } + + val trigger = checkpointTrigger.get + if (trigger(state) && checkpointPath.isDefined) { + logger.info(s"[Wall Clock ${wallClockTime / 1e9}s] Save model to path") + saveModel(model, checkpointPath, isOverWrite, s".${state[Int]("neval")}") + saveState(state, checkpointPath, isOverWrite, s".${state[Int]("neval")}") + } + } + + private def validate(header: String): Unit = { +// if (validationTrigger.isEmpty || validationDataSet.isEmpty) { // return // } -// -// val trigger = checkpointTrigger.get -// if (trigger(state) && checkpointPath.isDefined) { -// logger.info(s"[Wall Clock ${wallClockTime / 1e9}s] Save model to path") -// saveModel(model, checkpointPath, isOverWrite, s".${state[Int]("neval")}") -// saveState(state, checkpointPath, isOverWrite, s".${state[Int]("neval")}") +// val trigger = validationTrigger.get +// if (!trigger(state)) { +// return // } -// } +// val vMethods = validationMethods.get +// val vMethodsArr = (1 to subModelNumber).map(i => vMethods.map(_.clone())).toArray +// val dataIter = validationDataSet.get.toLocal().data(train = false) +// logger.info(s"$header Validate model...") // -// private def validate(header: String): Unit = { -//// if (validationTrigger.isEmpty || validationDataSet.isEmpty) { -//// return -//// } -//// val trigger = validationTrigger.get -//// if (!trigger(state)) { -//// return -//// } -//// val vMethods = validationMethods.get -//// val vMethodsArr = (1 to subModelNumber).map(i => vMethods.map(_.clone())).toArray -//// val dataIter = validationDataSet.get.toLocal().data(train = false) -//// logger.info(s"$header Validate model...") -//// -//// workingModels.foreach(_.evaluate()) -//// -//// var count = 0 -//// dataIter.map(batch => { -//// val stackSize = batch.size() / subModelNumber -//// val extraSize = batch.size() % subModelNumber -//// val parallelism = if (stackSize == 0) extraSize else subModelNumber -//// val start = System.nanoTime() -//// val result = Engine.default.invokeAndWait( -//// (0 until parallelism).map(b => -//// () => { -//// val offset = b * stackSize + math.min(b, extraSize) + 1 -//// val length = stackSize + (if (b < extraSize) 1 else 0) -//// val currentMiniBatch = batch.slice(offset, length) -//// val input = currentMiniBatch.getInput() -//// val target = currentMiniBatch.getTarget() -//// val output = workingModels(b).forward(input) -//// val validatMethods = vMethodsArr(b) -//// 
validatMethods.map(validation => { -//// validation(output, target) -//// }) -//// } -//// ) -//// ).reduce((left, right) => { -//// left.zip(right).map { case (l, r) => -//// l + r -//// } -//// }) -//// count += batch.size() -//// logger.info(s"$header Throughput is ${ -//// batch.size() / ((System.nanoTime() - start) / 1e9) -//// } record / sec") -//// result -//// }).reduce((left, right) => { -//// left.zip(right).map { case (l, r) => -//// l + r -//// } -//// }).zip(vMethods).foreach(r => { -//// logger.info(s"$header ${r._2} is ${r._1}") -//// }) -// } -//} +// workingModels.foreach(_.evaluate()) // -//object NCFOptimizer { -// val logger = Logger.getLogger(this.getClass) -// -// def initModel[T: ClassTag](model: Module[T], copies: Int, -// shareGradient: Boolean)( -// implicit ev: TensorNumeric[T]): Array[Module[T]] = { -// val (wb, grad) = Util.getAndClearWeightBiasGrad(model.parameters()) -// -// val models = (1 to copies).map(i => { -// logger.info(s"Clone $i model...") -// val m = if (i == copies) { -// model -// } else { -// model.cloneModule() -// } -// Util.putWeightBias(wb, m) -// if (shareGradient) { -// Util.putGradWeightBias(grad, m) -// } else { -// Util.initGradWeightBias(wb, m) +// var count = 0 +// dataIter.map(batch => { +// val stackSize = batch.size() / subModelNumber +// val extraSize = batch.size() % subModelNumber +// val parallelism = if (stackSize == 0) extraSize else subModelNumber +// val start = System.nanoTime() +// val result = Engine.default.invokeAndWait( +// (0 until parallelism).map(b => +// () => { +// val offset = b * stackSize + math.min(b, extraSize) + 1 +// val length = stackSize + (if (b < extraSize) 1 else 0) +// val currentMiniBatch = batch.slice(offset, length) +// val input = currentMiniBatch.getInput() +// val target = currentMiniBatch.getTarget() +// val output = workingModels(b).forward(input) +// val validatMethods = vMethodsArr(b) +// validatMethods.map(validation => { +// validation(output, target) +// }) +// } +// ) +// ).reduce((left, right) => { +// left.zip(right).map { case (l, r) => +// l + r +// } +// }) +// count += batch.size() +// logger.info(s"$header Throughput is ${ +// batch.size() / ((System.nanoTime() - start) / 1e9) +// } record / sec") +// result +// }).reduce((left, right) => { +// left.zip(right).map { case (l, r) => +// l + r // } -// m -// }).toArray -// models -// } -//} +// }).zip(vMethods).foreach(r => { +// logger.info(s"$header ${r._2} is ${r._1}") +// }) + } +} + +object NCFOptimizer { + val logger = Logger.getLogger(this.getClass) + + def initModel[T: ClassTag](model: Module[T], copies: Int, + shareGradient: Boolean)( + implicit ev: TensorNumeric[T]): Array[Module[T]] = { + val (wb, grad) = Util.getAndClearWeightBiasGrad(model.parameters()) + + val models = (1 to copies).map(i => { + logger.info(s"Clone $i model...") + val m = if (i == copies) { + model + } else { + model.cloneModule() + } + Util.putWeightBias(wb, m) + if (shareGradient) { + Util.putGradWeightBias(grad, m) + } else { + Util.initGradWeightBias(wb, m) + } + m + }).toArray + models + } +} diff --git a/spark/dl/src/main/scala/com/intel/analytics/bigdl/utils/Util.scala b/spark/dl/src/main/scala/com/intel/analytics/bigdl/utils/Util.scala index 3b539b6fcd7..91c9463f178 100644 --- a/spark/dl/src/main/scala/com/intel/analytics/bigdl/utils/Util.scala +++ b/spark/dl/src/main/scala/com/intel/analytics/bigdl/utils/Util.scala @@ -91,49 +91,64 @@ object Util { } } - - private[bigdl] def getAndClearWeightBias[T: ClassTag] - (parameters: (Array[Tensor[T]], 
Array[Tensor[T]]))(implicit ev: TensorNumeric[T]) - : Array[Tensor[T]] = { - if (parameters._1.length != 0) { - var i = 0 - val weightsBias = new Array[Tensor[T]](parameters._1.length) - val isQuantized = parameters._1.exists(_.getTensorType == QuantizedType) + private def getAndClear[T: ClassTag](tensors: Array[Tensor[T]]) + (implicit ev: TensorNumeric[T]): Array[Tensor[T]] = { + if (tensors.length != 0) { + val newTensors = new Array[Tensor[T]](tensors.length) + val isQuantized = tensors.exists(_.getTensorType == QuantizedType) val (isCompacted, storage) = if (!isQuantized) { - val storage = Storage(parameters._1(0).storage.array()) - (parameters._1.map(_.nElement()).sum == storage.length(), storage) + val storageArray = tensors(0).storage.array() + tensors.map(_.storage().array().eq(storageArray)) + .reduce(_ & _) + val storage = Storage(storageArray) + (tensors.map(_.nElement()).sum < storage.length(), storage) } else { (false, null) } + var i = 0 // get weight and bias - while (i < parameters._1.length) { - if (parameters._1(i) != null) { - val wb = parameters._1(i) - wb.getTensorType match { + while (i < tensors.length) { + if (tensors(i) != null) { + val ithTensor = tensors(i) + ithTensor.getTensorType match { case QuantizedType => - val quantTensor = wb.asInstanceOf[QuantizedTensor[T]] - weightsBias(i) = QuantizedTensor[T](quantTensor.getStorage, quantTensor.maxOfRow, + val quantTensor = ithTensor.asInstanceOf[QuantizedTensor[T]] + newTensors(i) = QuantizedTensor[T](quantTensor.getStorage, quantTensor.maxOfRow, quantTensor.minOfRow, quantTensor.sumOfRow, quantTensor.size(), quantTensor.params) case _ => - weightsBias(i) = if (isCompacted) { - Tensor[T](storage, wb.storageOffset(), wb.size(), wb.stride()) + newTensors(i) = if (isCompacted) { + Tensor[T](storage, ithTensor.storageOffset(), ithTensor.size(), ithTensor.stride()) } else { - Tensor[T](Storage(wb.storage().array()), wb.storageOffset(), wb.size(), wb.stride()) + Tensor[T](Storage(ithTensor.storage().array()), + ithTensor.storageOffset(), ithTensor.size(), ithTensor.stride()) } } i += 1 } } // clear parameters - clearTensor(parameters._1) - clearTensor(parameters._2) + clearTensor(tensors) - weightsBias + newTensors } else { // just return an empty array when parameters is empty. 
Array() } + + } + + private[bigdl] def getAndClearWeightBiasGrad[T: ClassTag] + (parameters: (Array[Tensor[T]], Array[Tensor[T]]))(implicit ev: TensorNumeric[T]) + : (Array[Tensor[T]], Array[Tensor[T]]) = { + (getAndClear(parameters._1), getAndClear(parameters._2)) + } + + private[bigdl] def getAndClearWeightBias[T: ClassTag] + (parameters: (Array[Tensor[T]], Array[Tensor[T]]))(implicit ev: TensorNumeric[T]) + : Array[Tensor[T]] = { + clearTensor(parameters._2) + getAndClear(parameters._1) } private[bigdl] def getAndClearConsts[T: ClassTag]( @@ -176,6 +191,19 @@ object Util { } } + private[bigdl] def putGradWeightBias[T: ClassTag]( + broadcastGradWeightBias: Array[Tensor[T]], + localModel: Module[T])(implicit ev: TensorNumeric[T]): Unit = { + val localWeightBias = localModel.parameters()._2 + var i = 0 + while (i < localWeightBias.length) { + if (localWeightBias(i) != null) { + clearAndSet(localWeightBias(i), broadcastGradWeightBias(i)) + } + i += 1 + } + } + private[bigdl] def putWeightBias[T: ClassTag]( broadcastWeightBias: Array[Tensor[T]], localModel: Module[T])(implicit ev: TensorNumeric[T]): Unit = { @@ -187,19 +215,21 @@ object Util { } i += 1 } + } - def clearAndSet(old: Tensor[T], other: Tensor[T]): Unit = { - if (old.getTensorType == QuantizedType && other.getTensorType == QuantizedType) { - val quantOld = old.asInstanceOf[QuantizedTensor[T]] - val quantOther = other.asInstanceOf[QuantizedTensor[T]] + private def clearAndSet[T: ClassTag]( + old: Tensor[T], + other: Tensor[T])(implicit ev: TensorNumeric[T]): Unit = { + if (old.getTensorType == QuantizedType && other.getTensorType == QuantizedType) { + val quantOld = old.asInstanceOf[QuantizedTensor[T]] + val quantOther = other.asInstanceOf[QuantizedTensor[T]] - if (quantOld.getNativeStorage != quantOther.getNativeStorage) { - quantOld.release() - } + if (quantOld.getNativeStorage != quantOther.getNativeStorage) { + quantOld.release() } - - old.set(other) } + + old.set(other) } private[bigdl] def initGradWeightBias[T: ClassTag]( From e2d71481656c3d2d2897aec68e6e3c37c2237375 Mon Sep 17 00:00:00 2001 From: qiuxin2012 Date: Mon, 16 Jul 2018 14:59:24 +0800 Subject: [PATCH 04/11] some update --- .../example/recommendation/NcfPerf.scala | 2 +- .../example/recommendation/NeuralCFV2.scala | 2 +- .../intel/analytics/bigdl/optim/Adam.scala | 104 +++++++++++------- 3 files changed, 66 insertions(+), 42 deletions(-) diff --git a/spark/dl/src/main/scala/com/intel/analytics/bigdl/example/recommendation/NcfPerf.scala b/spark/dl/src/main/scala/com/intel/analytics/bigdl/example/recommendation/NcfPerf.scala index b52ee45219f..a97102da3c4 100644 --- a/spark/dl/src/main/scala/com/intel/analytics/bigdl/example/recommendation/NcfPerf.scala +++ b/spark/dl/src/main/scala/com/intel/analytics/bigdl/example/recommendation/NcfPerf.scala @@ -1,5 +1,5 @@ /* - * Copyright 2018 Analytics Zoo Authors. + * Copyright 2016 The BigDL Authors. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/spark/dl/src/main/scala/com/intel/analytics/bigdl/example/recommendation/NeuralCFV2.scala b/spark/dl/src/main/scala/com/intel/analytics/bigdl/example/recommendation/NeuralCFV2.scala index 4344face725..11ce8d7baf8 100644 --- a/spark/dl/src/main/scala/com/intel/analytics/bigdl/example/recommendation/NeuralCFV2.scala +++ b/spark/dl/src/main/scala/com/intel/analytics/bigdl/example/recommendation/NeuralCFV2.scala @@ -1,5 +1,5 @@ /* - * Copyright 2018 Analytics Zoo Authors. 
+ * Copyright 2016 The BigDL Authors. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/spark/dl/src/main/scala/com/intel/analytics/bigdl/optim/Adam.scala b/spark/dl/src/main/scala/com/intel/analytics/bigdl/optim/Adam.scala index 93cd36aa40b..92a9b4c98fc 100644 --- a/spark/dl/src/main/scala/com/intel/analytics/bigdl/optim/Adam.scala +++ b/spark/dl/src/main/scala/com/intel/analytics/bigdl/optim/Adam.scala @@ -16,10 +16,9 @@ package com.intel.analytics.bigdl.optim -import breeze.linalg.* import com.intel.analytics.bigdl.tensor.Tensor import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric -import com.intel.analytics.bigdl.utils.{T, Table} +import com.intel.analytics.bigdl.utils.{Engine, T, Table} import scala.math._ import scala.reflect.ClassTag @@ -34,14 +33,14 @@ import scala.reflect.ClassTag * @tparam T */ class Adam[@specialized(Float, Double) T: ClassTag]( - var learningRate: Double = 1e-3, - var learningRateDecay: Double = 0.0, - var beta1: Double = 0.9, - var beta2: Double = 0.999, - var Epsilon: Double = 1e-8)(implicit ev: TensorNumeric[T]) extends OptimMethod[T] { + var learningRate: Double = 1e-3, + var learningRateDecay: Double = 0.0, + var beta1: Double = 0.9, + var beta2: Double = 0.999, + var Epsilon: Double = 1e-8)(implicit ev: TensorNumeric[T]) extends OptimMethod[T] { @transient - private var buffer: Tensor[T] = null + private var ones: Tensor[T] = null /** * An implementation of Adam http://arxiv.org/pdf/1412.6980.pdf @@ -52,8 +51,7 @@ class Adam[@specialized(Float, Double) T: ClassTag]( * @return the new x vector and the function list {fx}, evaluated before the update */ override def optimize(feval: (Tensor[T]) => (T, Tensor[T]), - parameter: Tensor[T]): (Tensor[T], Array[T]) = { - if (buffer == null) buffer = Tensor[T]() + parameter: Tensor[T]): (Tensor[T], Array[T]) = { val lr = this.learningRate val lrd = this.learningRateDecay val beta1 = this.beta1 @@ -64,43 +62,44 @@ class Adam[@specialized(Float, Double) T: ClassTag]( var timestep = state.getOrElse[Int]("evalCounter", 0) - val (_s, _r, _denom) = - if (state.get[Tensor[T]]("s").isDefined) { - (state.get[Tensor[T]]("s").get, state.get[Tensor[T]]("r").get, - state.get[Tensor[T]]("denom").get.resizeAs(dfdx)) - } else { - (Tensor[T]().resizeAs(dfdx).zero(), Tensor[T]().resizeAs(dfdx).zero(), - Tensor[T]().resizeAs(dfdx).zero()) - } - val clr = lr / (1 + timestep*lrd) timestep = timestep + 1 - /** - * m_t = beta_1 * m_t-1 + (1 - beta_1) * g_t - * v_t = beta_2 * v_t-1 + (1 - beta_2) * g_t * g_t - */ - _s.mul(ev.fromType[Double](beta1)).add(ev.fromType[Double](1-beta1), dfdx) - // buffer = dfdx * dfdx - buffer.resizeAs(dfdx).cmul(dfdx, dfdx) - _r.mul(ev.fromType[Double](beta2)).add(ev.fromType[Double](1-beta2), buffer) - _denom.sqrt(_r) - - // used as MKL.axpy: 1 * a + y = y, and fill buffer with one - buffer.fill(ev.one) - _denom.add(ev.fromType(eps), buffer) + val parallelNum = Engine.coreNumber() + val gradLength = parameter.nElement() + val taskSize = gradLength / parallelNum + val extraTask = gradLength % parallelNum + if (ones == null || ones.nElement() < taskSize + 1) { + ones = Tensor[T]().resize(taskSize + 1).fill(ev.one) + } + + Engine.default.invokeAndWait((0 until parallelNum).map(tid => () => { + val offset = tid * taskSize + math.min(tid, extraTask) + val length = taskSize + (if (tid < extraTask) 1 else 0) + val currentDfdx = dfdx.narrow(1, offset + 1, length) + val currentParameter = 
parameter.narrow(1, offset + 1, length) + val currentOnes = ones.narrow(1, 1, length) + val (_s, _r, _denom) = + if (state.get[Tensor[T]](s"s$tid").isDefined && state.get[Tensor[T]](s"r$tid").isDefined + && state.get[Tensor[T]](s"denom$tid").isDefined) { + (state.get[Tensor[T]](s"s$tid").get, state.get[Tensor[T]](s"r$tid").get, + state.get[Tensor[T]](s"denom$tid").get) + } else { + (Tensor[T]().resizeAs(currentParameter).zero(), + Tensor[T]().resizeAs(currentParameter).zero(), + Tensor[T]().resizeAs(currentParameter).zero()) + } + Adam.updateFrame(_s, _r, _denom, clr, currentDfdx, currentParameter, + beta1, beta2, timestep, currentOnes, eps) + + state(s"s$tid") = _s // 1st moment variables + state(s"r$tid") = _r // 2nd moment variables + state(s"denom$tid") = _denom // 3nd moment variables + })) - // efficiency improved upon by changing the order of computation, at expense of clarity - val biasCorrection1 = 1 - pow(beta1, timestep) - val biasCorrection2 = 1 - pow(beta2, timestep) - val stepSize = clr * sqrt(biasCorrection2) / biasCorrection1 - parameter.addcdiv(ev.fromType[Double](-stepSize), _s, _denom) state("evalCounter") = timestep // A tmp tensor to hold the sqrt(v) + epsilon - state("s") = _s // 1st moment variables - state("r") = _r // 2nd moment variables - state("denom") = _denom // 3nd moment variables (parameter, Array(fx)) } @@ -122,3 +121,28 @@ class Adam[@specialized(Float, Double) T: ClassTag]( override def getLearningRate(): Double = this.learningRate } + +object Adam { + private[optim] def updateFrame[T: ClassTag](_s: Tensor[T], _r: Tensor[T], _denom: Tensor[T], + clr: Double, dfdx: Tensor[T], parameter: Tensor[T], + beta1: Double, beta2: Double, timestep: Int, + ones: Tensor[T], eps: Double)( + implicit ev: TensorNumeric[T]): Unit = { + /** + * m_t = beta_1 * m_t-1 + (1 - beta_1) * g_t + * v_t = beta_2 * v_t-1 + (1 - beta_2) * g_t * g_t + */ + _s.mul(ev.fromType[Double](beta1)).add(ev.fromType[Double](1-beta1), dfdx) + _r.mul(ev.fromType[Double](beta2)).addcmul(ev.fromType[Double](1-beta2), dfdx, dfdx) + _denom.sqrt(_r) + + // used as MKL.axpy: 1 * a + y = y, and fill buffer with one + _denom.add(ev.fromType(eps), ones) + + // efficiency improved upon by changing the order of computation, at expense of clarity + val biasCorrection1 = 1 - pow(beta1, timestep) + val biasCorrection2 = 1 - pow(beta2, timestep) + val stepSize = clr * sqrt(biasCorrection2) / biasCorrection1 + parameter.addcdiv(ev.fromType[Double](-stepSize), _s, _denom) + } +} From 2a60c0f6ad62e9ce1b17c57f8aa432484f0acd10 Mon Sep 17 00:00:00 2001 From: qiuxin2012 Date: Fri, 20 Jul 2018 12:37:12 +0800 Subject: [PATCH 05/11] inplace addcdiv --- .../example/recommendation/AdamPerf.scala | 60 +++++++++++++++++++ .../intel/analytics/bigdl/optim/Adam.scala | 13 +++- .../analytics/bigdl/optim/NCFOptimizer.scala | 7 ++- .../analytics/bigdl/optim/AdamSpec.scala | 8 ++- 4 files changed, 83 insertions(+), 5 deletions(-) create mode 100644 spark/dl/src/main/scala/com/intel/analytics/bigdl/example/recommendation/AdamPerf.scala diff --git a/spark/dl/src/main/scala/com/intel/analytics/bigdl/example/recommendation/AdamPerf.scala b/spark/dl/src/main/scala/com/intel/analytics/bigdl/example/recommendation/AdamPerf.scala new file mode 100644 index 00000000000..755b3c52a68 --- /dev/null +++ b/spark/dl/src/main/scala/com/intel/analytics/bigdl/example/recommendation/AdamPerf.scala @@ -0,0 +1,60 @@ +/* + * Copyright 2016 The BigDL Authors. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.intel.analytics.bigdl.example.recommendation + +import com.intel.analytics.bigdl.optim.Adam +import com.intel.analytics.bigdl.tensor.Tensor +import com.intel.analytics.bigdl.utils.Engine + +import scala.util.Random + +object AdamPerf { + def main(args: Array[String]): Unit = { + val iteration = args(0).toInt + val batchSize = args(1).toInt + val core = args(2).toInt + System.setProperty("bigdl.localMode", "true") + Engine.init(1, core, false) + val userCount = 138493 + val itemCount = 26744 + + val model = NeuralCFV2[Float](userCount, itemCount, 1, 128, 128, + hiddenLayers = Array(128, 64), + mfEmbed = 64) + .buildModel() + val (w, g) = model.getParameters() + val optimMethod = new Adam[Float]() + g.randn() + + // warm up + (0 until 5).foreach{i => + optimMethod.optimize(_ => (1, g), w) + } + + var count = 0L + (0 until iteration).foreach { i => + println(i) + g.randn() + val start = System.nanoTime() + optimMethod.optimize(_ => (1, g), w) + val end = System.nanoTime() + println(s"time is ${(end - start) / 1e6.toLong}") + count += end - start + } + } + +} diff --git a/spark/dl/src/main/scala/com/intel/analytics/bigdl/optim/Adam.scala b/spark/dl/src/main/scala/com/intel/analytics/bigdl/optim/Adam.scala index 92a9b4c98fc..3d4914db6cb 100644 --- a/spark/dl/src/main/scala/com/intel/analytics/bigdl/optim/Adam.scala +++ b/spark/dl/src/main/scala/com/intel/analytics/bigdl/optim/Adam.scala @@ -19,6 +19,7 @@ package com.intel.analytics.bigdl.optim import com.intel.analytics.bigdl.tensor.Tensor import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric import com.intel.analytics.bigdl.utils.{Engine, T, Table} +import org.apache.log4j.Logger import scala.math._ import scala.reflect.ClassTag @@ -74,7 +75,10 @@ class Adam[@specialized(Float, Double) T: ClassTag]( ones = Tensor[T]().resize(taskSize + 1).fill(ev.one) } + val times = new Array[Long](parallelNum) + Engine.default.invokeAndWait((0 until parallelNum).map(tid => () => { + val start = System.nanoTime() val offset = tid * taskSize + math.min(tid, extraTask) val length = taskSize + (if (tid < extraTask) 1 else 0) val currentDfdx = dfdx.narrow(1, offset + 1, length) @@ -96,8 +100,12 @@ class Adam[@specialized(Float, Double) T: ClassTag]( state(s"s$tid") = _s // 1st moment variables state(s"r$tid") = _r // 2nd moment variables state(s"denom$tid") = _denom // 3nd moment variables + times(tid) = (System.nanoTime() - start) / 1000000L })) + Adam.logger.info(s"update ${parameter.nElement()} parameters, maximum time is ${times.max} ms") + Adam.logger.info(s"Time is ${times.mkString("\t")} ms") + state("evalCounter") = timestep // A tmp tensor to hold the sqrt(v) + epsilon @@ -123,6 +131,8 @@ class Adam[@specialized(Float, Double) T: ClassTag]( } object Adam { + val logger = Logger.getLogger(this.getClass) + private[optim] def updateFrame[T: ClassTag](_s: Tensor[T], _r: Tensor[T], _denom: Tensor[T], clr: Double, dfdx: Tensor[T], parameter: Tensor[T], beta1: Double, 
beta2: Double, timestep: Int, @@ -143,6 +153,7 @@ object Adam { val biasCorrection1 = 1 - pow(beta1, timestep) val biasCorrection2 = 1 - pow(beta2, timestep) val stepSize = clr * sqrt(biasCorrection2) / biasCorrection1 - parameter.addcdiv(ev.fromType[Double](-stepSize), _s, _denom) + _denom.cdiv(_s, _denom) + parameter.add(ev.fromType[Double](-stepSize), _denom) } } diff --git a/spark/dl/src/main/scala/com/intel/analytics/bigdl/optim/NCFOptimizer.scala b/spark/dl/src/main/scala/com/intel/analytics/bigdl/optim/NCFOptimizer.scala index 7e448e067ac..777c0972aae 100644 --- a/spark/dl/src/main/scala/com/intel/analytics/bigdl/optim/NCFOptimizer.scala +++ b/spark/dl/src/main/scala/com/intel/analytics/bigdl/optim/NCFOptimizer.scala @@ -229,11 +229,13 @@ class NCFOptimizer[T: ClassTag] ( optimMethod.state.update("neval", state.get("neval")) optimMethod.optimize(_ => (ev.fromType(loss), linearsGrad), linearsWeight) + val updateWeightTime1 = System.nanoTime() + optimMethod2.state.update("epoch", state.get("epoch")) optimMethod2.state.update("neval", state.get("neval")) optimMethod2.optimize(_ => (ev.fromType(loss), embeddingGrad), embeddingWeight) - val updateWeightTime = System.nanoTime() + val updateWeightTime2 = System.nanoTime() println("update weight") val end = System.nanoTime() wallClockTime += end - start @@ -250,7 +252,8 @@ class NCFOptimizer[T: ClassTag] ( s"zero grad time is ${(zeroGradTime - computingTime) / 1e9}s \n" + s"acc embedding time is ${(computingTime2 - zeroGradTime) / 1e9}s \n" + s"aggregate linear is ${(aggTime - computingTime2) / 1e9}s \n" + - s"update weight time is ${(updateWeightTime - aggTime) / 1e9}s") + s"update linear time is ${(updateWeightTime1 - aggTime) / 1e9}s" + + s"update embedding time is ${(updateWeightTime2 - updateWeightTime1) / 1e9}s") state("neval") = state[Int]("neval") + 1 diff --git a/spark/dl/src/test/scala/com/intel/analytics/bigdl/optim/AdamSpec.scala b/spark/dl/src/test/scala/com/intel/analytics/bigdl/optim/AdamSpec.scala index c02260d2ed6..8b85b99bf13 100644 --- a/spark/dl/src/test/scala/com/intel/analytics/bigdl/optim/AdamSpec.scala +++ b/spark/dl/src/test/scala/com/intel/analytics/bigdl/optim/AdamSpec.scala @@ -18,7 +18,7 @@ package com.intel.analytics.bigdl.optim import com.intel.analytics.bigdl.nn.{CrossEntropyCriterion, Linear, Sequential} import com.intel.analytics.bigdl.tensor.Tensor -import com.intel.analytics.bigdl.utils.{RandomGenerator, T, TestUtils} +import com.intel.analytics.bigdl.utils.{Engine, RandomGenerator, T, TestUtils} import org.scalatest.{FlatSpec, Matchers} import scala.collection.mutable.ArrayBuffer @@ -28,8 +28,10 @@ import scala.util.Random class AdamSpec extends FlatSpec with Matchers { val start = System.currentTimeMillis() "adam" should "perform well on rosenbrock function" in { + System.setProperty("bigdl.localMode", "true") + Engine.init(1, 2, false) val x = Tensor[Double](2).fill(0) - val config = T("learningRate" -> 0.002) + val config = T("learningRate" -> 0.005) val optm = new Adam[Double] var fx = new ArrayBuffer[Double] for (i <- 1 to 10001) { @@ -53,6 +55,8 @@ class AdamSpec extends FlatSpec with Matchers { x(Array(2)) should be(1.0 +- 0.01) } "adam" should " work fast with MKL" in { + System.setProperty("bigdl.localMode", "true") + Engine.init(1, 2, false) RandomGenerator.RNG.setSeed(100) val inputSize = 500 val hiddenSize = 500 From 07404809426ac2d797943a0540cb93c49c337974 Mon Sep 17 00:00:00 2001 From: qiuxin2012 Date: Fri, 20 Jul 2018 16:43:59 +0800 Subject: [PATCH 06/11] update adam --- 
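Editor's note (placed between the commit message and the diffstat, where format-patch notes are ignored by git am): this commit extends Adam with a lazy, per-index update path for embedding rows whose gradient is zero in a mini-batch, catching those rows up only when they are next touched. Below is a minimal scalar sketch of that catch-up rule for illustration only; the object and method names (LazyAdamSketch, lazyAdamCatchUp), the default hyper-parameters, and the exact iteration bounds are assumptions, not code from this patch, and learning-rate decay is omitted.

object LazyAdamSketch {
  /** Replay the Adam steps a parameter slice missed while its gradient was zero,
    * from step lastStep + 1 up to and including currentStep (bounds illustrative). */
  def lazyAdamCatchUp(
      param: Double, m0: Double, v0: Double,
      lastStep: Int, currentStep: Int,
      lr: Double = 1e-3, beta1: Double = 0.9, beta2: Double = 0.999,
      eps: Double = 1e-8): (Double, Double, Double) = {
    var p = param
    var m = m0
    var v = v0
    var t = lastStep + 1
    while (t <= currentStep) {
      // With a zero gradient the moment updates reduce to pure decay:
      m *= beta1 // m_t = beta1 * m_{t-1} + (1 - beta1) * 0
      v *= beta2 // v_t = beta2 * v_{t-1} + (1 - beta2) * 0 * 0
      // Bias-corrected step size, as in the standard Adam update
      val stepSize = lr * math.sqrt(1 - math.pow(beta2, t)) / (1 - math.pow(beta1, t))
      p -= stepSize * m / (math.sqrt(v) + eps)
      t += 1
    }
    (p, m, v)
  }
}

For example, an embedding row untouched for three iterations would call lazyAdamCatchUp(w, m, v, lastStep = t - 3, currentStep = t) once before its next real gradient step, instead of being updated on every iteration.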
.../intel/analytics/bigdl/optim/Adam.scala | 163 +++++++++++++++++- 1 file changed, 162 insertions(+), 1 deletion(-) diff --git a/spark/dl/src/main/scala/com/intel/analytics/bigdl/optim/Adam.scala b/spark/dl/src/main/scala/com/intel/analytics/bigdl/optim/Adam.scala index 3d4914db6cb..d0a0c2d718b 100644 --- a/spark/dl/src/main/scala/com/intel/analytics/bigdl/optim/Adam.scala +++ b/spark/dl/src/main/scala/com/intel/analytics/bigdl/optim/Adam.scala @@ -16,7 +16,7 @@ package com.intel.analytics.bigdl.optim -import com.intel.analytics.bigdl.tensor.Tensor +import com.intel.analytics.bigdl.tensor.{Storage, Tensor} import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric import com.intel.analytics.bigdl.utils.{Engine, T, Table} import org.apache.log4j.Logger @@ -112,6 +112,138 @@ class Adam[@specialized(Float, Double) T: ClassTag]( (parameter, Array(fx)) } + val embeddingSize = 64 + val numEmbedding = 128 + val lastUpdated = collection.mutable.HashMap( + Array.tabulate(numEmbedding)(i => (i + 1, 0)): _*) + + def updateNograd(indices: Tensor[T], parameter: Tensor[T]): Unit = { + val ones = Tensor(embeddingSize).fill(ev.one) + + val lr = this.learningRate + val lrd = this.learningRateDecay + val beta1 = this.beta1 + val beta2 = this.beta2 + val eps = this.Epsilon + + val uniqueIndices = Tensor[T](Storage(indices.storage().array().distinct)) + + var timestep = state.getOrElse[Int]("evalCounter", 0) + + val clr = lr / (1 + timestep*lrd) + + timestep = timestep + 1 + + val parallelNum = Engine.coreNumber() + val gradLength = uniqueIndices.nElement() + val taskSize = gradLength / parallelNum + val extraTask = gradLength % parallelNum + + val times = new Array[Long](parallelNum) + + Engine.default.invokeAndWait((0 until parallelNum).map(tid => () => { + val start = System.nanoTime() + val offset = tid * taskSize + math.min(tid, extraTask) + val length = taskSize + (if (tid < extraTask) 1 else 0) + val currentDfdx = uniqueIndices.narrow(1, offset + 1, length) + val currentParameter = parameter.narrow(1, offset + 1, length) + var i = 1 + while(i <= currentDfdx.nElement()) { + val index = ev.toType[Int](currentDfdx.valueAt(i)) + val (_s, _r, _denom) = + if (state.get[Tensor[T]](s"s$index").isDefined && + state.get[Tensor[T]](s"r$index").isDefined + && state.get[Tensor[T]](s"denom$index").isDefined) { + (state.get[Tensor[T]](s"s$index").get, + state.get[Tensor[T]](s"r$index").get, + state.get[Tensor[T]](s"denom$index").get) + } else { + (Tensor[T](embeddingSize).zero(), + Tensor[T](embeddingSize).zero(), + Tensor[T](embeddingSize).zero()) + } + val indexThParameter = parameter.narrow(1, index * embeddingSize + 1, embeddingSize) + Adam.updateFrameZeroGrad( + timestep, lastUpdated(index), + _s, _r, _denom, clr, indexThParameter, + beta1, beta2, ones, eps) + state(s"s$index") = _s // 1st moment variables + state(s"r$index") = _r // 2nd moment variables + state(s"denom$index") = _denom // 3nd moment variables + lastUpdated(index) = timestep + i += 1 + } + + times(tid) = (System.nanoTime() - start) / 1000000L + })) + Adam.logger.info(s"update ${parameter.nElement()} parameters, maximum time is ${times.max} ms") + Adam.logger.info(s"Time is ${times.mkString("\t")} ms") + + + state("evalCounter") = timestep // A tmp tensor to hold the sqrt(v) + epsilon + } + + override def optimize(feval: (Tensor[T]) => Array[(Tensor[T], Tensor[T])], + parameter: Tensor[T]): (Tensor[T], Array[T]) = { + val lr = this.learningRate + val lrd = this.learningRateDecay + val beta1 = this.beta1 + val beta2 = this.beta2 + 
val eps = this.Epsilon + + val (fx, dfdx) = feval(parameter) + + var timestep = state.getOrElse[Int]("evalCounter", 0) + + val clr = lr / (1 + timestep*lrd) + + timestep = timestep + 1 + + val parallelNum = Engine.coreNumber() + val gradLength = parameter.nElement() + val taskSize = gradLength / parallelNum + val extraTask = gradLength % parallelNum + if (ones == null || ones.nElement() < taskSize + 1) { + ones = Tensor[T]().resize(taskSize + 1).fill(ev.one) + } + + val times = new Array[Long](parallelNum) + + Engine.default.invokeAndWait((0 until parallelNum).map(tid => () => { + val start = System.nanoTime() + val offset = tid * taskSize + math.min(tid, extraTask) + val length = taskSize + (if (tid < extraTask) 1 else 0) + val currentDfdx = dfdx.narrow(1, offset + 1, length) + val currentParameter = parameter.narrow(1, offset + 1, length) + val currentOnes = ones.narrow(1, 1, length) + val (_s, _r, _denom) = + if (state.get[Tensor[T]](s"s$tid").isDefined && state.get[Tensor[T]](s"r$tid").isDefined + && state.get[Tensor[T]](s"denom$tid").isDefined) { + (state.get[Tensor[T]](s"s$tid").get, state.get[Tensor[T]](s"r$tid").get, + state.get[Tensor[T]](s"denom$tid").get) + } else { + (Tensor[T]().resizeAs(currentParameter).zero(), + Tensor[T]().resizeAs(currentParameter).zero(), + Tensor[T]().resizeAs(currentParameter).zero()) + } + Adam.updateFrame(_s, _r, _denom, clr, currentDfdx, currentParameter, + beta1, beta2, timestep, currentOnes, eps) + + state(s"s$tid") = _s // 1st moment variables + state(s"r$tid") = _r // 2nd moment variables + state(s"denom$tid") = _denom // 3nd moment variables + times(tid) = (System.nanoTime() - start) / 1000000L + })) + + Adam.logger.info(s"update ${parameter.nElement()} parameters, maximum time is ${times.max} ms") + Adam.logger.info(s"Time is ${times.mkString("\t")} ms") + + + state("evalCounter") = timestep // A tmp tensor to hold the sqrt(v) + epsilon + + (parameter, Array(fx)) + } + override def loadFromTable(config: Table): this.type = { this.learningRate = config.get[Double]("learningRate").getOrElse(this.learningRate) this.learningRateDecay = config.get[Double]("learningRateDecay") @@ -156,4 +288,33 @@ object Adam { _denom.cdiv(_s, _denom) parameter.add(ev.fromType[Double](-stepSize), _denom) } + + + private[optim] def updateFrameZeroGrad[T: ClassTag]( + currentIteration: Int, lastUpdatedIteration: Int, + _s: Tensor[T], _r: Tensor[T], _denom: Tensor[T], + clr: Double, parameter: Tensor[T], + beta1: Double, beta2: Double, + ones: Tensor[T], eps: Double)( + implicit ev: TensorNumeric[T]): Unit = { + (lastUpdatedIteration until currentIteration).foreach{timestep => + /** + * m_t = beta_1 * m_t-1 + * v_t = beta_2 * v_t-1 + */ + _s.mul(ev.fromType[Double](beta1)) + _r.mul(ev.fromType[Double](beta2)) + _denom.sqrt(_r) + + // used as MKL.axpy: 1 * a + y = y + _denom.add(ev.fromType(eps), ones) + + // efficiency improved upon by changing the order of computation, at expense of clarity + val biasCorrection1 = 1 - pow(beta1, timestep) + val biasCorrection2 = 1 - pow(beta2, timestep) + val stepSize = clr * sqrt(biasCorrection2) / biasCorrection1 + _denom.cdiv(_s, _denom) + parameter.add(ev.fromType[Double](-stepSize), _denom) + } + } } From 915acb18a4e959d38695b827d2f9199410db9280 Mon Sep 17 00:00:00 2001 From: qiuxin2012 Date: Tue, 24 Jul 2018 15:37:34 +0800 Subject: [PATCH 07/11] add embedding adam --- .../example/recommendation/AdamPerf.scala | 81 +++- .../intel/analytics/bigdl/optim/Adam.scala | 150 +------- .../analytics/bigdl/optim/EmbeddingAdam.scala | 357 
++++++++++++++++++ 3 files changed, 432 insertions(+), 156 deletions(-) create mode 100644 spark/dl/src/main/scala/com/intel/analytics/bigdl/optim/EmbeddingAdam.scala diff --git a/spark/dl/src/main/scala/com/intel/analytics/bigdl/example/recommendation/AdamPerf.scala b/spark/dl/src/main/scala/com/intel/analytics/bigdl/example/recommendation/AdamPerf.scala index 755b3c52a68..2cef814d54b 100644 --- a/spark/dl/src/main/scala/com/intel/analytics/bigdl/example/recommendation/AdamPerf.scala +++ b/spark/dl/src/main/scala/com/intel/analytics/bigdl/example/recommendation/AdamPerf.scala @@ -16,9 +16,11 @@ package com.intel.analytics.bigdl.example.recommendation -import com.intel.analytics.bigdl.optim.Adam +import com.intel.analytics.bigdl.nn.LookupTable +import com.intel.analytics.bigdl.optim.{Adam, EmbeddingAdam} import com.intel.analytics.bigdl.tensor.Tensor import com.intel.analytics.bigdl.utils.Engine +import com.intel.analytics.bigdl.numeric.NumericFloat import scala.util.Random @@ -28,6 +30,7 @@ object AdamPerf { val batchSize = args(1).toInt val core = args(2).toInt System.setProperty("bigdl.localMode", "true") + val sparse = args(3) == "1" Engine.init(1, core, false) val userCount = 138493 val itemCount = 26744 @@ -36,24 +39,72 @@ object AdamPerf { hiddenLayers = Array(128, 64), mfEmbed = 64) .buildModel() - val (w, g) = model.getParameters() - val optimMethod = new Adam[Float]() - g.randn() - // warm up - (0 until 5).foreach{i => - optimMethod.optimize(_ => (1, g), w) - } + if (sparse) { + val embeddings = model.embeddingModel.findModules("LookupTable") + .map(_.asInstanceOf[LookupTable[Float]]) + val input = Tensor[Float].range(1, batchSize * core) + val embeddingsGradient = embeddings.map{embedding => + val inputAndGradient = Array.tabulate(core)(c => + (input.narrow(1, batchSize * c + 1, batchSize), + Tensor[Float](batchSize, embedding.nOutput).rand())) + val optimMethod = new EmbeddingAdam[Float]() + optimMethod.setNOutput(embedding.nIndex, embedding.nOutput) + val parameter = embedding.getParameters()._1 + (inputAndGradient, optimMethod, parameter) + } + + def update(): Unit = { + embeddingsGradient.foreach {v => + val inputAndGradient = v._1 + val optimMethod = v._2 + val parameter = v._3 + + optimMethod.updateNograd(input, parameter) + optimMethod.optimizeEmbedding(inputAndGradient, parameter) + } + } + + // warm up + (0 until 5).foreach { i => + update() + } - var count = 0L - (0 until iteration).foreach { i => - println(i) - g.randn() val start = System.nanoTime() - optimMethod.optimize(_ => (1, g), w) + var count = 0L + (0 until iteration).foreach { i => + println(i) + val start = System.nanoTime() + update() + val end = System.nanoTime() + println(s"sparse time is ${(end - start) / 1e6.toLong}") + count += end - start + } val end = System.nanoTime() - println(s"time is ${(end - start) / 1e6.toLong}") - count += end - start + println(s"average sparse time is ${(end - start) / 1e6.toLong / iteration}") + + + } else { + // update with dense gradient + val (w, g) = model.embeddingModel.getParameters() + val optimMethod = new Adam[Float]() + g.randn() + + // warm up + (0 until 5).foreach { i => + optimMethod.optimize(_ => (1, g), w) + } + + var count = 0L + (0 until iteration).foreach { i => + println(i) + g.randn() + val start = System.nanoTime() + optimMethod.optimize(_ => (1, g), w) + val end = System.nanoTime() + println(s"time is ${(end - start) / 1e6.toLong}") + count += end - start + } } } diff --git a/spark/dl/src/main/scala/com/intel/analytics/bigdl/optim/Adam.scala 
b/spark/dl/src/main/scala/com/intel/analytics/bigdl/optim/Adam.scala index d0a0c2d718b..84868ceb0d9 100644 --- a/spark/dl/src/main/scala/com/intel/analytics/bigdl/optim/Adam.scala +++ b/spark/dl/src/main/scala/com/intel/analytics/bigdl/optim/Adam.scala @@ -77,138 +77,14 @@ class Adam[@specialized(Float, Double) T: ClassTag]( val times = new Array[Long](parallelNum) - Engine.default.invokeAndWait((0 until parallelNum).map(tid => () => { - val start = System.nanoTime() - val offset = tid * taskSize + math.min(tid, extraTask) - val length = taskSize + (if (tid < extraTask) 1 else 0) - val currentDfdx = dfdx.narrow(1, offset + 1, length) - val currentParameter = parameter.narrow(1, offset + 1, length) - val currentOnes = ones.narrow(1, 1, length) - val (_s, _r, _denom) = - if (state.get[Tensor[T]](s"s$tid").isDefined && state.get[Tensor[T]](s"r$tid").isDefined - && state.get[Tensor[T]](s"denom$tid").isDefined) { - (state.get[Tensor[T]](s"s$tid").get, state.get[Tensor[T]](s"r$tid").get, - state.get[Tensor[T]](s"denom$tid").get) - } else { - (Tensor[T]().resizeAs(currentParameter).zero(), - Tensor[T]().resizeAs(currentParameter).zero(), - Tensor[T]().resizeAs(currentParameter).zero()) - } - Adam.updateFrame(_s, _r, _denom, clr, currentDfdx, currentParameter, - beta1, beta2, timestep, currentOnes, eps) - - state(s"s$tid") = _s // 1st moment variables - state(s"r$tid") = _r // 2nd moment variables - state(s"denom$tid") = _denom // 3nd moment variables - times(tid) = (System.nanoTime() - start) / 1000000L - })) - - Adam.logger.info(s"update ${parameter.nElement()} parameters, maximum time is ${times.max} ms") - Adam.logger.info(s"Time is ${times.mkString("\t")} ms") - - - state("evalCounter") = timestep // A tmp tensor to hold the sqrt(v) + epsilon - - (parameter, Array(fx)) - } - - val embeddingSize = 64 - val numEmbedding = 128 - val lastUpdated = collection.mutable.HashMap( - Array.tabulate(numEmbedding)(i => (i + 1, 0)): _*) - - def updateNograd(indices: Tensor[T], parameter: Tensor[T]): Unit = { - val ones = Tensor(embeddingSize).fill(ev.one) - - val lr = this.learningRate - val lrd = this.learningRateDecay - val beta1 = this.beta1 - val beta2 = this.beta2 - val eps = this.Epsilon - - val uniqueIndices = Tensor[T](Storage(indices.storage().array().distinct)) - - var timestep = state.getOrElse[Int]("evalCounter", 0) - - val clr = lr / (1 + timestep*lrd) - - timestep = timestep + 1 - - val parallelNum = Engine.coreNumber() - val gradLength = uniqueIndices.nElement() - val taskSize = gradLength / parallelNum - val extraTask = gradLength % parallelNum - - val times = new Array[Long](parallelNum) - - Engine.default.invokeAndWait((0 until parallelNum).map(tid => () => { - val start = System.nanoTime() - val offset = tid * taskSize + math.min(tid, extraTask) - val length = taskSize + (if (tid < extraTask) 1 else 0) - val currentDfdx = uniqueIndices.narrow(1, offset + 1, length) - val currentParameter = parameter.narrow(1, offset + 1, length) - var i = 1 - while(i <= currentDfdx.nElement()) { - val index = ev.toType[Int](currentDfdx.valueAt(i)) - val (_s, _r, _denom) = - if (state.get[Tensor[T]](s"s$index").isDefined && - state.get[Tensor[T]](s"r$index").isDefined - && state.get[Tensor[T]](s"denom$index").isDefined) { - (state.get[Tensor[T]](s"s$index").get, - state.get[Tensor[T]](s"r$index").get, - state.get[Tensor[T]](s"denom$index").get) - } else { - (Tensor[T](embeddingSize).zero(), - Tensor[T](embeddingSize).zero(), - Tensor[T](embeddingSize).zero()) - } - val indexThParameter = 
parameter.narrow(1, index * embeddingSize + 1, embeddingSize) - Adam.updateFrameZeroGrad( - timestep, lastUpdated(index), - _s, _r, _denom, clr, indexThParameter, - beta1, beta2, ones, eps) - state(s"s$index") = _s // 1st moment variables - state(s"r$index") = _r // 2nd moment variables - state(s"denom$index") = _denom // 3nd moment variables - lastUpdated(index) = timestep - i += 1 + (0 until parallelNum).foreach{tid => + if (state.get[Tensor[T]](s"s$tid").isEmpty) { + state(s"s$tid") = Tensor[T]() + state(s"r$tid") = Tensor[T]() + state(s"denom$tid") = Tensor[T]() } - - times(tid) = (System.nanoTime() - start) / 1000000L - })) - Adam.logger.info(s"update ${parameter.nElement()} parameters, maximum time is ${times.max} ms") - Adam.logger.info(s"Time is ${times.mkString("\t")} ms") - - - state("evalCounter") = timestep // A tmp tensor to hold the sqrt(v) + epsilon - } - - override def optimize(feval: (Tensor[T]) => Array[(Tensor[T], Tensor[T])], - parameter: Tensor[T]): (Tensor[T], Array[T]) = { - val lr = this.learningRate - val lrd = this.learningRateDecay - val beta1 = this.beta1 - val beta2 = this.beta2 - val eps = this.Epsilon - - val (fx, dfdx) = feval(parameter) - - var timestep = state.getOrElse[Int]("evalCounter", 0) - - val clr = lr / (1 + timestep*lrd) - - timestep = timestep + 1 - - val parallelNum = Engine.coreNumber() - val gradLength = parameter.nElement() - val taskSize = gradLength / parallelNum - val extraTask = gradLength % parallelNum - if (ones == null || ones.nElement() < taskSize + 1) { - ones = Tensor[T]().resize(taskSize + 1).fill(ev.one) } - val times = new Array[Long](parallelNum) - Engine.default.invokeAndWait((0 until parallelNum).map(tid => () => { val start = System.nanoTime() val offset = tid * taskSize + math.min(tid, extraTask) @@ -217,21 +93,13 @@ class Adam[@specialized(Float, Double) T: ClassTag]( val currentParameter = parameter.narrow(1, offset + 1, length) val currentOnes = ones.narrow(1, 1, length) val (_s, _r, _denom) = - if (state.get[Tensor[T]](s"s$tid").isDefined && state.get[Tensor[T]](s"r$tid").isDefined - && state.get[Tensor[T]](s"denom$tid").isDefined) { - (state.get[Tensor[T]](s"s$tid").get, state.get[Tensor[T]](s"r$tid").get, - state.get[Tensor[T]](s"denom$tid").get) - } else { - (Tensor[T]().resizeAs(currentParameter).zero(), - Tensor[T]().resizeAs(currentParameter).zero(), - Tensor[T]().resizeAs(currentParameter).zero()) - } + (state.get[Tensor[T]](s"s$tid").get.resizeAs(currentParameter), + state.get[Tensor[T]](s"r$tid").get.resizeAs(currentParameter), + state.get[Tensor[T]](s"denom$tid").get.resizeAs(currentParameter)) + Adam.updateFrame(_s, _r, _denom, clr, currentDfdx, currentParameter, beta1, beta2, timestep, currentOnes, eps) - state(s"s$tid") = _s // 1st moment variables - state(s"r$tid") = _r // 2nd moment variables - state(s"denom$tid") = _denom // 3nd moment variables times(tid) = (System.nanoTime() - start) / 1000000L })) diff --git a/spark/dl/src/main/scala/com/intel/analytics/bigdl/optim/EmbeddingAdam.scala b/spark/dl/src/main/scala/com/intel/analytics/bigdl/optim/EmbeddingAdam.scala new file mode 100644 index 00000000000..80c0106593c --- /dev/null +++ b/spark/dl/src/main/scala/com/intel/analytics/bigdl/optim/EmbeddingAdam.scala @@ -0,0 +1,357 @@ +/* + * Copyright 2016 The BigDL Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.intel.analytics.bigdl.optim + +import com.intel.analytics.bigdl.tensor.{Storage, Tensor} +import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric +import com.intel.analytics.bigdl.utils.{Engine, T, Table} +import org.apache.log4j.Logger + +import scala.math._ +import scala.reflect.ClassTag + +/** + * An implementation of Adam http://arxiv.org/pdf/1412.6980.pdf + * @param learningRate learning rate + * @param learningRateDecay learning rate decay + * @param beta1 first moment coefficient + * @param beta2 second moment coefficient + * @param Epsilon for numerical stability + * @tparam T + */ +class EmbeddingAdam[@specialized(Float, Double) T: ClassTag]( + var learningRate: Double = 1e-3, + var learningRateDecay: Double = 0.0, + var beta1: Double = 0.9, + var beta2: Double = 0.999, + var Epsilon: Double = 1e-8)(implicit ev: TensorNumeric[T]) extends OptimMethod[T] { + + @transient + private var ones: Tensor[T] = null + + /** + * An implementation of Adam http://arxiv.org/pdf/1412.6980.pdf + * + * @param feval a function that takes a single input (X), the point of a evaluation, and + * returns f(X) and df/dX + * @param parameter the initial point + * @return the new x vector and the function list {fx}, evaluated before the update + */ + override def optimize(feval: (Tensor[T]) => (T, Tensor[T]), + parameter: Tensor[T]): (Tensor[T], Array[T]) = { + val lr = this.learningRate + val lrd = this.learningRateDecay + val beta1 = this.beta1 + val beta2 = this.beta2 + val eps = this.Epsilon + + val (fx, dfdx) = feval(parameter) + + var timestep = state.getOrElse[Int]("evalCounter", 0) + + val clr = lr / (1 + timestep*lrd) + + timestep = timestep + 1 + + val parallelNum = Engine.coreNumber() + val gradLength = parameter.nElement() + val taskSize = gradLength / parallelNum + val extraTask = gradLength % parallelNum + if (ones == null || ones.nElement() < taskSize + 1) { + ones = Tensor[T]().resize(taskSize + 1).fill(ev.one) + } + + val times = new Array[Long](parallelNum) + + Engine.default.invokeAndWait((0 until parallelNum).map(tid => () => { + val start = System.nanoTime() + val offset = tid * taskSize + math.min(tid, extraTask) + val length = taskSize + (if (tid < extraTask) 1 else 0) + val currentDfdx = dfdx.narrow(1, offset + 1, length) + val currentParameter = parameter.narrow(1, offset + 1, length) + val currentOnes = ones.narrow(1, 1, length) + val (_s, _r, _denom) = + if (state.get[Tensor[T]](s"s$tid").isDefined && state.get[Tensor[T]](s"r$tid").isDefined + && state.get[Tensor[T]](s"denom$tid").isDefined) { + (state.get[Tensor[T]](s"s$tid").get, state.get[Tensor[T]](s"r$tid").get, + state.get[Tensor[T]](s"denom$tid").get) + } else { + (Tensor[T]().resizeAs(currentParameter).zero(), + Tensor[T]().resizeAs(currentParameter).zero(), + Tensor[T]().resizeAs(currentParameter).zero()) + } + Adam.updateFrame(_s, _r, _denom, clr, currentDfdx, currentParameter, + beta1, beta2, timestep, currentOnes, eps) + + state(s"s$tid") = _s // 1st moment variables + state(s"r$tid") = _r // 2nd moment variables + state(s"denom$tid") = _denom // 3nd moment variables 
+ times(tid) = (System.nanoTime() - start) / 1000000L + })) + + Adam.logger.info(s"update ${parameter.nElement()} parameters, maximum time is ${times.max} ms") + Adam.logger.info(s"Time is ${times.mkString("\t")} ms") + + + state("evalCounter") = timestep // A tmp tensor to hold the sqrt(v) + epsilon + + (parameter, Array(fx)) + } + + + var embeddingNoutput = 0 + var embeddingNIndex = 0 + var lastUpdated = collection.mutable.HashMap( + Array.tabulate(embeddingNoutput)(i => (i + 1, 0)): _*) + + // TODO: clear before saving + var s: Array[Tensor[T]] = _ + var r: Array[Tensor[T]] = _ + var denom: Array[Tensor[T]] = _ + def setNOutput(nIndex: Int, nOutput: Int): Unit = { + embeddingNoutput = nOutput + embeddingNIndex = nIndex + lastUpdated = collection.mutable.HashMap( + Array.tabulate(nIndex)(i => (i + 1, 1)): _*) + ones = Tensor(embeddingNoutput).fill(ev.one) + s = Array.tabulate(nIndex)(_ => Tensor[T](nOutput)) + r = Array.tabulate(nIndex)(_ => Tensor[T](nOutput)) + denom = Array.tabulate(nIndex)(_ => Tensor[T](nOutput)) + (1 to nIndex).foreach{i => + state(s"s$i") = Tensor[Float]() // 1st moment variables + state(s"r$i") = Tensor[Float]() // 2nd moment variables + state(s"denom$i") = Tensor[Float]() // 3nd moment variables + } + } + + def updateNograd(indices: Tensor[T], parameter: Tensor[T]): Unit = { + val lr = this.learningRate + val lrd = this.learningRateDecay + val beta1 = this.beta1 + val beta2 = this.beta2 + val eps = this.Epsilon + + val uniqueIndices = Tensor[T](Storage(indices.storage().array().distinct)) + + var timestep = state.getOrElse[Int]("evalCounter", 1) + + val clr = lr / (1 + timestep*lrd) + + timestep = timestep + 1 + + val parallelNum = Engine.coreNumber() + val gradLength = uniqueIndices.nElement() + val taskSize = gradLength / parallelNum + val extraTask = gradLength % parallelNum + +// val times = new Array[Long](parallelNum) + + Engine.default.invokeAndWait((0 until parallelNum).map(tid => () => { + val start = System.nanoTime() + val offset = tid * taskSize + math.min(tid, extraTask) + val length = taskSize + (if (tid < extraTask) 1 else 0) + val currentIndex = uniqueIndices.narrow(1, offset + 1, length) + var i = 1 + while(i <= currentIndex.nElement()) { + val index = ev.toType[Int](currentIndex.valueAt(i)) + val (_s, _r, _denom) = (s(index - 1), r(index - 1), denom(index - 1)) +// (state.get[Tensor[T]](s"s$index").get.resize(embeddingNoutput), +// state.get[Tensor[T]](s"r$index").get.resize(embeddingNoutput), +// state.get[Tensor[T]](s"denom$index").get.resize(embeddingNoutput)) +// if (state.get[Tensor[T]](s"s$index").isDefined) { +// (state.get[Tensor[T]](s"s$index").get, +// state.get[Tensor[T]](s"r$index").get, +// state.get[Tensor[T]](s"denom$index").get) +// } else { +// (Tensor[T](embeddingNoutput), +// Tensor[T](embeddingNoutput), +// Tensor[T](embeddingNoutput)) +// } + val indexThParameter = parameter.narrow(1, + (index - 1) * embeddingNoutput + 1, embeddingNoutput) + Adam.updateFrameZeroGrad( + timestep, lastUpdated(index), + _s, _r, _denom, clr, indexThParameter, + beta1, beta2, ones, eps) +// state(s"s$index") = _s // 1st moment variables +// state(s"r$index") = _r // 2nd moment variables +// state(s"denom$index") = _denom // 3nd moment variables + lastUpdated(index) = timestep +// println(index) + i += 1 + } +// Adam.logger.info(s"zero grad${tid} $i ${ev.toType[Int](currentIndex.valueAt(i - 1))}") +// times(tid) = (System.nanoTime() - start) / 1000000L + })) +// Adam.logger.info(s"update ${parameter.nElement()} parameters, maximum time is 
${times.max} ms") +// Adam.logger.info(s"Time is ${times.mkString("\t")} ms") + + + state("evalCounter") = timestep // A tmp tensor to hold the sqrt(v) + epsilon + } + + /** + * update embedding gradient + * @param dfdx input -> gradOutput + * @param parameter + * @return + */ + def optimizeEmbedding( + dfdx: Array[(Tensor[T], Tensor[T])], + parameter: Tensor[T]): Unit = { + // TODO: assert input is unique + val lr = this.learningRate + val lrd = this.learningRateDecay + val beta1 = this.beta1 + val beta2 = this.beta2 + val eps = this.Epsilon + + var timestep = state.getOrElse[Int]("evalCounter", 1) + + val clr = lr / (1 + timestep*lrd) + +// timestep = timestep + 1 + + val parallelNum = Engine.coreNumber() +// val gradLength = parameter.nElement() +// val taskSize = gradLength / parallelNum +// val extraTask = gradLength % parallelNum +// if (ones == null || ones.nElement() < taskSize + 1) { +// ones = Tensor[T]().resize(taskSize + 1).fill(ev.one) +// } + + val times = new Array[Long](parallelNum) + + Engine.default.invokeAndWait((0 until parallelNum).map(tid => () => { +// val start = System.nanoTime() + val currentGradient = dfdx(tid) + val currentIndex = currentGradient._1 + val currentDfdx = currentGradient._2 + var i = 1 + while(i <= currentIndex.nElement()) { + val index = ev.toType[Int](currentIndex.valueAt(i)) + val (_s, _r, _denom) = (s(index - 1), r(index - 1), denom(index - 1)) +// (state.get[Tensor[T]](s"s$index").get.resize(embeddingNoutput), +// state.get[Tensor[T]](s"r$index").get.resize(embeddingNoutput), +// state.get[Tensor[T]](s"denom$index").get.resize(embeddingNoutput)) +// if (state.get[Tensor[T]](s"s$index").isDefined && +// state.get[Tensor[T]](s"r$index").isDefined +// && state.get[Tensor[T]](s"denom$index").isDefined) { +// (state.get[Tensor[T]](s"s$index").get, +// state.get[Tensor[T]](s"r$index").get, +// state.get[Tensor[T]](s"denom$index").get) +// } else { +// (Tensor[T](embeddingNoutput).zero(), +// Tensor[T](embeddingNoutput).zero(), +// Tensor[T](embeddingNoutput).zero()) +// } + val indexThParameter = parameter.narrow(1, + (index - 1) * embeddingNoutput + 1, embeddingNoutput) + val iThGradient = currentDfdx.select(1, i) + Adam.updateFrame( + _s, _r, _denom, clr, iThGradient, indexThParameter, + beta1, beta2, timestep, ones, eps) +// state(s"s$index") = _s // 1st moment variables +// state(s"r$index") = _r // 2nd moment variables +// state(s"denom$index") = _denom // 3nd moment variables + lastUpdated(index) = timestep + i += 1 + } +// Adam.logger.info(s"update grad${tid} $i ${ev.toType[Int](currentIndex.valueAt(i - 1))}") + })) + +// Adam.logger.info(s"update ${parameter.nElement()} parameters, maximum time is ${times.max} ms") +// Adam.logger.info(s"Time is ${times.mkString("\t")} ms") + + + state("evalCounter") = timestep // A tmp tensor to hold the sqrt(v) + epsilon + } + + override def loadFromTable(config: Table): this.type = { + this.learningRate = config.get[Double]("learningRate").getOrElse(this.learningRate) + this.learningRateDecay = config.get[Double]("learningRateDecay") + .getOrElse(this.learningRateDecay) + this.beta1 = config.get[Double]("beta1").getOrElse(this.beta1) + this.beta2 = config.get[Double]("beta2").getOrElse(this.beta2) + this.Epsilon = config.get[Double]("Epsilon").getOrElse(this.Epsilon) + this + } + + override def clearHistory(): Unit = { + state.delete("s") + state.delete("r") + } + + override def getLearningRate(): Double = this.learningRate +} + +//object Adam { +// val logger = Logger.getLogger(this.getClass) +// +// 
private[optim] def updateFrame[T: ClassTag](_s: Tensor[T], _r: Tensor[T], _denom: Tensor[T], +// clr: Double, dfdx: Tensor[T], parameter: Tensor[T], +// beta1: Double, beta2: Double, timestep: Int, +// ones: Tensor[T], eps: Double)( +// implicit ev: TensorNumeric[T]): Unit = { +// /** +// * m_t = beta_1 * m_t-1 + (1 - beta_1) * g_t +// * v_t = beta_2 * v_t-1 + (1 - beta_2) * g_t * g_t +// */ +// _s.mul(ev.fromType[Double](beta1)).add(ev.fromType[Double](1-beta1), dfdx) +// _r.mul(ev.fromType[Double](beta2)).addcmul(ev.fromType[Double](1-beta2), dfdx, dfdx) +// _denom.sqrt(_r) +// +// // used as MKL.axpy: 1 * a + y = y, and fill buffer with one +// _denom.add(ev.fromType(eps), ones) +// +// // efficiency improved upon by changing the order of computation, at expense of clarity +// val biasCorrection1 = 1 - pow(beta1, timestep) +// val biasCorrection2 = 1 - pow(beta2, timestep) +// val stepSize = clr * sqrt(biasCorrection2) / biasCorrection1 +// _denom.cdiv(_s, _denom) +// parameter.add(ev.fromType[Double](-stepSize), _denom) +// } +// +// +// private[optim] def updateFrameZeroGrad[T: ClassTag]( +// currentIteration: Int, lastUpdatedIteration: Int, +// _s: Tensor[T], _r: Tensor[T], _denom: Tensor[T], +// clr: Double, parameter: Tensor[T], +// beta1: Double, beta2: Double, +// ones: Tensor[T], eps: Double)( +// implicit ev: TensorNumeric[T]): Unit = { +// (lastUpdatedIteration until currentIteration).foreach{timestep => +// /** +// * m_t = beta_1 * m_t-1 +// * v_t = beta_2 * v_t-1 +// */ +// _s.mul(ev.fromType[Double](beta1)) +// _r.mul(ev.fromType[Double](beta2)) +// _denom.sqrt(_r) +// +// // used as MKL.axpy: 1 * a + y = y +// _denom.add(ev.fromType(eps), ones) +// +// // efficiency improved upon by changing the order of computation, at expense of clarity +// val biasCorrection1 = 1 - pow(beta1, timestep) +// val biasCorrection2 = 1 - pow(beta2, timestep) +// val stepSize = clr * sqrt(biasCorrection2) / biasCorrection1 +// _denom.cdiv(_s, _denom) +// parameter.add(ev.fromType[Double](-stepSize), _denom) +// } +// } +//} From 2f66d36561274da68b42e6917d01ccdf80fef677 Mon Sep 17 00:00:00 2001 From: qiuxin2012 Date: Tue, 24 Jul 2018 17:06:47 +0800 Subject: [PATCH 08/11] some fix --- .../example/recommendation/AdamPerf.scala | 6 +- .../intel/analytics/bigdl/optim/Adam.scala | 2 +- .../analytics/bigdl/optim/EmbeddingAdam.scala | 70 ++++++------------- 3 files changed, 24 insertions(+), 54 deletions(-) diff --git a/spark/dl/src/main/scala/com/intel/analytics/bigdl/example/recommendation/AdamPerf.scala b/spark/dl/src/main/scala/com/intel/analytics/bigdl/example/recommendation/AdamPerf.scala index 2cef814d54b..12e8162c8d4 100644 --- a/spark/dl/src/main/scala/com/intel/analytics/bigdl/example/recommendation/AdamPerf.scala +++ b/spark/dl/src/main/scala/com/intel/analytics/bigdl/example/recommendation/AdamPerf.scala @@ -70,9 +70,10 @@ object AdamPerf { update() } - val start = System.nanoTime() var count = 0L (0 until iteration).foreach { i => + val n = i % 10 + input.range(1 + n * batchSize * core, 1 + (n + 1) * batchSize * core) println(i) val start = System.nanoTime() update() @@ -80,9 +81,8 @@ object AdamPerf { println(s"sparse time is ${(end - start) / 1e6.toLong}") count += end - start } - val end = System.nanoTime() - println(s"average sparse time is ${(end - start) / 1e6.toLong / iteration}") + println(s"average sparse time is ${count / 1e6.toLong / iteration}") } else { // update with dense gradient diff --git a/spark/dl/src/main/scala/com/intel/analytics/bigdl/optim/Adam.scala 
b/spark/dl/src/main/scala/com/intel/analytics/bigdl/optim/Adam.scala index 84868ceb0d9..81ee18a30f2 100644 --- a/spark/dl/src/main/scala/com/intel/analytics/bigdl/optim/Adam.scala +++ b/spark/dl/src/main/scala/com/intel/analytics/bigdl/optim/Adam.scala @@ -165,7 +165,7 @@ object Adam { beta1: Double, beta2: Double, ones: Tensor[T], eps: Double)( implicit ev: TensorNumeric[T]): Unit = { - (lastUpdatedIteration until currentIteration).foreach{timestep => + (lastUpdatedIteration until (currentIteration - 1)).foreach{timestep => /** * m_t = beta_1 * m_t-1 * v_t = beta_2 * v_t-1 diff --git a/spark/dl/src/main/scala/com/intel/analytics/bigdl/optim/EmbeddingAdam.scala b/spark/dl/src/main/scala/com/intel/analytics/bigdl/optim/EmbeddingAdam.scala index 80c0106593c..59fb4dacb39 100644 --- a/spark/dl/src/main/scala/com/intel/analytics/bigdl/optim/EmbeddingAdam.scala +++ b/spark/dl/src/main/scala/com/intel/analytics/bigdl/optim/EmbeddingAdam.scala @@ -115,8 +115,9 @@ class EmbeddingAdam[@specialized(Float, Double) T: ClassTag]( var embeddingNoutput = 0 var embeddingNIndex = 0 - var lastUpdated = collection.mutable.HashMap( - Array.tabulate(embeddingNoutput)(i => (i + 1, 0)): _*) + var lastUpdated = Array.tabulate(embeddingNIndex)(_ => 1) +// var lastUpdated = collection.mutable.HashMap( +// Array.tabulate(embeddingNoutput)(i => (i + 1, 0)): _*) // TODO: clear before saving var s: Array[Tensor[T]] = _ @@ -125,8 +126,9 @@ class EmbeddingAdam[@specialized(Float, Double) T: ClassTag]( def setNOutput(nIndex: Int, nOutput: Int): Unit = { embeddingNoutput = nOutput embeddingNIndex = nIndex - lastUpdated = collection.mutable.HashMap( - Array.tabulate(nIndex)(i => (i + 1, 1)): _*) +// lastUpdated = collection.mutable.HashMap( +// Array.tabulate(nIndex)(i => (i + 1, 1)): _*) + lastUpdated = Array.tabulate(nIndex)(_ => 1) ones = Tensor(embeddingNoutput).fill(ev.one) s = Array.tabulate(nIndex)(_ => Tensor[T](nOutput)) r = Array.tabulate(nIndex)(_ => Tensor[T](nOutput)) @@ -149,9 +151,7 @@ class EmbeddingAdam[@specialized(Float, Double) T: ClassTag]( var timestep = state.getOrElse[Int]("evalCounter", 1) - val clr = lr / (1 + timestep*lrd) - - timestep = timestep + 1 + val clr = lr / (1 + (timestep - 1) *lrd) val parallelNum = Engine.coreNumber() val gradLength = uniqueIndices.nElement() @@ -161,36 +161,23 @@ class EmbeddingAdam[@specialized(Float, Double) T: ClassTag]( // val times = new Array[Long](parallelNum) Engine.default.invokeAndWait((0 until parallelNum).map(tid => () => { - val start = System.nanoTime() val offset = tid * taskSize + math.min(tid, extraTask) val length = taskSize + (if (tid < extraTask) 1 else 0) val currentIndex = uniqueIndices.narrow(1, offset + 1, length) var i = 1 while(i <= currentIndex.nElement()) { val index = ev.toType[Int](currentIndex.valueAt(i)) - val (_s, _r, _denom) = (s(index - 1), r(index - 1), denom(index - 1)) -// (state.get[Tensor[T]](s"s$index").get.resize(embeddingNoutput), -// state.get[Tensor[T]](s"r$index").get.resize(embeddingNoutput), -// state.get[Tensor[T]](s"denom$index").get.resize(embeddingNoutput)) -// if (state.get[Tensor[T]](s"s$index").isDefined) { -// (state.get[Tensor[T]](s"s$index").get, -// state.get[Tensor[T]](s"r$index").get, -// state.get[Tensor[T]](s"denom$index").get) -// } else { -// (Tensor[T](embeddingNoutput), -// Tensor[T](embeddingNoutput), -// Tensor[T](embeddingNoutput)) -// } - val indexThParameter = parameter.narrow(1, - (index - 1) * embeddingNoutput + 1, embeddingNoutput) - Adam.updateFrameZeroGrad( - timestep, lastUpdated(index), - 
_s, _r, _denom, clr, indexThParameter, - beta1, beta2, ones, eps) -// state(s"s$index") = _s // 1st moment variables -// state(s"r$index") = _r // 2nd moment variables -// state(s"denom$index") = _denom // 3nd moment variables - lastUpdated(index) = timestep + if (timestep > lastUpdated(index - 1)) { + val (_s, _r, _denom) = (s(index - 1), r(index - 1), denom(index - 1)) + val indexThParameter = parameter.narrow(1, + (index - 1) * embeddingNoutput + 1, embeddingNoutput) + println(s"update index ${index}") + Adam.updateFrameZeroGrad( + timestep, lastUpdated(index - 1), + _s, _r, _denom, clr, indexThParameter, + beta1, beta2, ones, eps) + lastUpdated(index - 1) = timestep + } // println(index) i += 1 } @@ -245,30 +232,13 @@ class EmbeddingAdam[@specialized(Float, Double) T: ClassTag]( while(i <= currentIndex.nElement()) { val index = ev.toType[Int](currentIndex.valueAt(i)) val (_s, _r, _denom) = (s(index - 1), r(index - 1), denom(index - 1)) -// (state.get[Tensor[T]](s"s$index").get.resize(embeddingNoutput), -// state.get[Tensor[T]](s"r$index").get.resize(embeddingNoutput), -// state.get[Tensor[T]](s"denom$index").get.resize(embeddingNoutput)) -// if (state.get[Tensor[T]](s"s$index").isDefined && -// state.get[Tensor[T]](s"r$index").isDefined -// && state.get[Tensor[T]](s"denom$index").isDefined) { -// (state.get[Tensor[T]](s"s$index").get, -// state.get[Tensor[T]](s"r$index").get, -// state.get[Tensor[T]](s"denom$index").get) -// } else { -// (Tensor[T](embeddingNoutput).zero(), -// Tensor[T](embeddingNoutput).zero(), -// Tensor[T](embeddingNoutput).zero()) -// } val indexThParameter = parameter.narrow(1, (index - 1) * embeddingNoutput + 1, embeddingNoutput) val iThGradient = currentDfdx.select(1, i) Adam.updateFrame( _s, _r, _denom, clr, iThGradient, indexThParameter, beta1, beta2, timestep, ones, eps) -// state(s"s$index") = _s // 1st moment variables -// state(s"r$index") = _r // 2nd moment variables -// state(s"denom$index") = _denom // 3nd moment variables - lastUpdated(index) = timestep + lastUpdated(index - 1) = timestep + 1 i += 1 } // Adam.logger.info(s"update grad${tid} $i ${ev.toType[Int](currentIndex.valueAt(i - 1))}") @@ -277,7 +247,7 @@ class EmbeddingAdam[@specialized(Float, Double) T: ClassTag]( // Adam.logger.info(s"update ${parameter.nElement()} parameters, maximum time is ${times.max} ms") // Adam.logger.info(s"Time is ${times.mkString("\t")} ms") - + timestep = timestep + 1 state("evalCounter") = timestep // A tmp tensor to hold the sqrt(v) + epsilon } From 254d64bf08132fa58f58af8a0daae2b96e9f7b5f Mon Sep 17 00:00:00 2001 From: qiuxin2012 Date: Fri, 3 Aug 2018 13:42:56 +0800 Subject: [PATCH 09/11] checkpoint --- .../example/recommendation/AdamPerf.scala | 29 +++- .../example/recommendation/NcfPerf.scala | 161 ++++++++++-------- .../intel/analytics/bigdl/optim/Adam.scala | 142 +++++---------- .../analytics/bigdl/optim/EmbeddingAdam.scala | 139 +++++++-------- .../analytics/bigdl/optim/AdamSpec.scala | 102 +++++++++++ 5 files changed, 330 insertions(+), 243 deletions(-) diff --git a/spark/dl/src/main/scala/com/intel/analytics/bigdl/example/recommendation/AdamPerf.scala b/spark/dl/src/main/scala/com/intel/analytics/bigdl/example/recommendation/AdamPerf.scala index 12e8162c8d4..a5983011689 100644 --- a/spark/dl/src/main/scala/com/intel/analytics/bigdl/example/recommendation/AdamPerf.scala +++ b/spark/dl/src/main/scala/com/intel/analytics/bigdl/example/recommendation/AdamPerf.scala @@ -31,6 +31,7 @@ object AdamPerf { val core = args(2).toInt 
System.setProperty("bigdl.localMode", "true") val sparse = args(3) == "1" + val rand = args(4) == "1" Engine.init(1, core, false) val userCount = 138493 val itemCount = 26744 @@ -51,29 +52,49 @@ object AdamPerf { val optimMethod = new EmbeddingAdam[Float]() optimMethod.setNOutput(embedding.nIndex, embedding.nOutput) val parameter = embedding.getParameters()._1 - (inputAndGradient, optimMethod, parameter) + val parameterArray = Array.tabulate(embedding.nIndex)(i => + embedding.weight.select(1, i + 1) + ) + (inputAndGradient, optimMethod, parameter, parameterArray) } def update(): Unit = { - embeddingsGradient.foreach {v => + var i = 0 + while (i < embeddingsGradient.size) { + val v = embeddingsGradient(i) val inputAndGradient = v._1 val optimMethod = v._2 val parameter = v._3 + val parameterArray = v._4 + var start = System.nanoTime() +// optimMethod.updateNograd(input, parameterArray) +// println(s"${i}update parameter array ${parameterArray.length} " + +// s"Nograd ${System.nanoTime() - start}") optimMethod.updateNograd(input, parameter) + println(s"${i}update parameter ${parameter.nElement()} " + + s"Nograd ${System.nanoTime() - start}") + start = System.nanoTime() optimMethod.optimizeEmbedding(inputAndGradient, parameter) + println(s"${i}update parameter ${parameter.nElement()} " + + s"Embedding ${System.nanoTime() - start}") + + i += 1 } + } // warm up - (0 until 5).foreach { i => + (0 until 20).foreach { i => + val n = i % 10 + if (rand) input.range(1 + n * batchSize * core, (n + 1) * batchSize * core) update() } var count = 0L (0 until iteration).foreach { i => val n = i % 10 - input.range(1 + n * batchSize * core, 1 + (n + 1) * batchSize * core) + if (rand) input.range(1 + n * batchSize * core, (n + 1) * batchSize * core) println(i) val start = System.nanoTime() update() diff --git a/spark/dl/src/main/scala/com/intel/analytics/bigdl/example/recommendation/NcfPerf.scala b/spark/dl/src/main/scala/com/intel/analytics/bigdl/example/recommendation/NcfPerf.scala index a97102da3c4..a0b4bf4dcc5 100644 --- a/spark/dl/src/main/scala/com/intel/analytics/bigdl/example/recommendation/NcfPerf.scala +++ b/spark/dl/src/main/scala/com/intel/analytics/bigdl/example/recommendation/NcfPerf.scala @@ -17,7 +17,7 @@ package com.intel.analytics.bigdl.example.recommendation import com.intel.analytics.bigdl.nn.BCECriterion -import com.intel.analytics.bigdl.optim.Adam +import com.intel.analytics.bigdl.optim.{Adam, ParallelAdam} import com.intel.analytics.bigdl.tensor.Tensor import com.intel.analytics.bigdl.utils.Engine @@ -28,6 +28,7 @@ object NcfPerf { val iteration = args(0).toInt val batchSize = args(1).toInt val core = args(2).toInt + val train = args(3).toInt == 1 System.setProperty("bigdl.localMode", "true") Engine.init(1, core, false) val userCount = 138493 @@ -86,76 +87,100 @@ object NcfPerf { }) } - (0 until iteration).foreach { i => - input.select(2, 1).apply1(_ => Random.nextInt(userCount) + 1) - input.select(2, 2).apply1(_ => Random.nextInt(itemCount) + 1) - target.apply1(_ => Random.nextInt(2)) - println(i) - - Engine.default.invokeAndWait((0 until core).map { tid => - () => - val currentInput = input.narrow(1, tid * batchSize + 1, batchSize) - val currentTarget = target.narrow(1, tid * batchSize + 1, batchSize) - val currentModel = workingModels(tid) - - - var start = System.nanoTime() - - val output = currentModel.forward(currentInput) - modelForwardTime(tid) += System.nanoTime() - start - - start = System.nanoTime() - val loss = criterion.forward(output, currentTarget) - 
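
updateNograd, whose cost the harness above prints per call, first deduplicates the batch indices so each touched embedding row is caught up at most once. A short REPL-style sketch of that step, using the same Tensor/Storage calls the patch itself uses (the index values are made up):

    import com.intel.analytics.bigdl.tensor.{Storage, Tensor}
    import com.intel.analytics.bigdl.utils.T

    // A batch usually hits the same embedding row several times; only the distinct
    // indices need the zero-gradient catch-up, so the batch is deduplicated first.
    val indices = Tensor[Float](T(3.0f, 1.0f, 3.0f, 5.0f, 1.0f))
    val unique = Tensor[Float](Storage(indices.storage().array().distinct))
    // unique holds (3, 1, 5): the only rows the catch-up pass has to touch for this batch.
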
criterionForwardTime(tid) += System.nanoTime() - start - - start = System.nanoTime() - val gradOutput = criterion.backward(output, currentTarget) - criterionBackwardTime(tid) += System.nanoTime() - start - - start = System.nanoTime() - val gradInput = currentModel.backward(currentInput, gradOutput) - modelBackwardTime(tid) += System.nanoTime() - start - - }) - - var start = System.nanoTime() - val grad = g - Engine.default.invokeAndWait( - (0 until syncGradParallelNum).map(tid => - () => { - val offset = tid * syncGradTaskSize + math.min(tid, syncGradExtraTask) - val length = syncGradTaskSize + (if (tid < syncGradExtraTask) 1 else 0) - var i = 0 - while (i < parallelism) { - val sliceG = workingModelWAndG(i)._2.narrow(1, offset + 1, length) - if (i == 0) { - grad.narrow(1, offset + 1, length) - .copy(sliceG) - sliceG.zero() - } else { - grad.narrow(1, offset + 1, length) - .add(sliceG) - sliceG.zero() + if (train) { + (0 until iteration).foreach { i => + input.select(2, 1).apply1(_ => Random.nextInt(userCount) + 1) + input.select(2, 2).apply1(_ => Random.nextInt(itemCount) + 1) + target.apply1(_ => Random.nextInt(2)) + println(i) + + Engine.default.invokeAndWait((0 until core).map { tid => + () => + val currentInput = input.narrow(1, tid * batchSize + 1, batchSize) + val currentTarget = target.narrow(1, tid * batchSize + 1, batchSize) + val currentModel = workingModels(tid) + + + var start = System.nanoTime() + + val output = currentModel.forward(currentInput) + modelForwardTime(tid) += System.nanoTime() - start + + start = System.nanoTime() + val loss = criterion.forward(output, currentTarget) + criterionForwardTime(tid) += System.nanoTime() - start + + start = System.nanoTime() + val gradOutput = criterion.backward(output, currentTarget) + criterionBackwardTime(tid) += System.nanoTime() - start + + start = System.nanoTime() + val gradInput = currentModel.backward(currentInput, gradOutput) + modelBackwardTime(tid) += System.nanoTime() - start + + }) + + var start = System.nanoTime() + val grad = g + Engine.default.invokeAndWait( + (0 until syncGradParallelNum).map(tid => + () => { + val offset = tid * syncGradTaskSize + math.min(tid, syncGradExtraTask) + val length = syncGradTaskSize + (if (tid < syncGradExtraTask) 1 else 0) + var i = 0 + while (i < parallelism) { + val sliceG = workingModelWAndG(i)._2.narrow(1, offset + 1, length) + if (i == 0) { + grad.narrow(1, offset + 1, length) + .copy(sliceG) + sliceG.zero() + } else { + grad.narrow(1, offset + 1, length) + .add(sliceG) + sliceG.zero() + } + i += 1 } - i += 1 - } - }) - ) - grad.div(parallelism) - accgradientTime += System.nanoTime() - start - - start = System.nanoTime() - optimMethod.optimize(_ => (1, grad), w) - updateWeightTime += System.nanoTime() - start + }) + ) + grad.div(parallelism) + accgradientTime += System.nanoTime() - start + + start = System.nanoTime() + optimMethod.optimize(_ => (1, grad), w) + updateWeightTime += System.nanoTime() - start + } + + println(s"${modelForwardTime.max / 1e6 / iteration}ms") + println(s"${criterionForwardTime.max / 1e6 / iteration}ms") + println(s"${criterionBackwardTime.max / 1e6 / iteration}ms") + println(s"${modelBackwardTime.max / 1e6 / iteration}ms") + println(s"${accgradientTime / 1e6 / iteration}ms") + println(s"${updateWeightTime / 1e6 / iteration}ms") + } else { + var computingTime = 0L + (0 until iteration).foreach { i => + input.select(2, 1).apply1(_ => Random.nextInt(userCount) + 1) + input.select(2, 2).apply1(_ => Random.nextInt(itemCount) + 1) + target.apply1(_ => 
Random.nextInt(2)) + println(i) + + var start = System.nanoTime() + Engine.default.invokeAndWait((0 until core).map { tid => + () => + val currentInput = input.narrow(1, tid * batchSize + 1, batchSize) + val currentTarget = target.narrow(1, tid * batchSize + 1, batchSize) + val currentModel = workingModels(tid) + + val output = currentModel.forward(currentInput) + modelForwardTime(tid) += System.nanoTime() - start + }) + computingTime += System.nanoTime() - start + } + + println(s"Throughput is ${batchSize * core * iteration * 1e9 / computingTime} records/s") } - println(s"${modelForwardTime.max / 1e6 / iteration}ms") - println(s"${criterionForwardTime.max / 1e6 / iteration}ms") - println(s"${criterionBackwardTime.max / 1e6 / iteration}ms") - println(s"${modelBackwardTime.max / 1e6 / iteration}ms") - println(s"${accgradientTime / 1e6 / iteration}ms") - println(s"${updateWeightTime / 1e6 / iteration}ms") - } } diff --git a/spark/dl/src/main/scala/com/intel/analytics/bigdl/optim/Adam.scala b/spark/dl/src/main/scala/com/intel/analytics/bigdl/optim/Adam.scala index 81ee18a30f2..ef573e820e6 100644 --- a/spark/dl/src/main/scala/com/intel/analytics/bigdl/optim/Adam.scala +++ b/spark/dl/src/main/scala/com/intel/analytics/bigdl/optim/Adam.scala @@ -16,10 +16,10 @@ package com.intel.analytics.bigdl.optim -import com.intel.analytics.bigdl.tensor.{Storage, Tensor} +import breeze.linalg.* +import com.intel.analytics.bigdl.tensor.Tensor import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric -import com.intel.analytics.bigdl.utils.{Engine, T, Table} -import org.apache.log4j.Logger +import com.intel.analytics.bigdl.utils.{T, Table} import scala.math._ import scala.reflect.ClassTag @@ -34,14 +34,14 @@ import scala.reflect.ClassTag * @tparam T */ class Adam[@specialized(Float, Double) T: ClassTag]( - var learningRate: Double = 1e-3, - var learningRateDecay: Double = 0.0, - var beta1: Double = 0.9, - var beta2: Double = 0.999, - var Epsilon: Double = 1e-8)(implicit ev: TensorNumeric[T]) extends OptimMethod[T] { + var learningRate: Double = 1e-3, + var learningRateDecay: Double = 0.0, + var beta1: Double = 0.9, + var beta2: Double = 0.999, + var Epsilon: Double = 1e-8)(implicit ev: TensorNumeric[T]) extends OptimMethod[T] { @transient - private var ones: Tensor[T] = null + private var buffer: Tensor[T] = null /** * An implementation of Adam http://arxiv.org/pdf/1412.6980.pdf @@ -52,7 +52,8 @@ class Adam[@specialized(Float, Double) T: ClassTag]( * @return the new x vector and the function list {fx}, evaluated before the update */ override def optimize(feval: (Tensor[T]) => (T, Tensor[T]), - parameter: Tensor[T]): (Tensor[T], Array[T]) = { + parameter: Tensor[T]): (Tensor[T], Array[T]) = { + if (buffer == null) buffer = Tensor[T]() val lr = this.learningRate val lrd = this.learningRateDecay val beta1 = this.beta1 @@ -63,51 +64,43 @@ class Adam[@specialized(Float, Double) T: ClassTag]( var timestep = state.getOrElse[Int]("evalCounter", 0) + val (_s, _r, _denom) = + if (state.get[Tensor[T]]("s").isDefined) { + (state.get[Tensor[T]]("s").get, state.get[Tensor[T]]("r").get, + state.get[Tensor[T]]("denom").get.resizeAs(dfdx)) + } else { + (Tensor[T]().resizeAs(dfdx).zero(), Tensor[T]().resizeAs(dfdx).zero(), + Tensor[T]().resizeAs(dfdx).zero()) + } + val clr = lr / (1 + timestep*lrd) timestep = timestep + 1 - val parallelNum = Engine.coreNumber() - val gradLength = parameter.nElement() - val taskSize = gradLength / parallelNum - val extraTask = gradLength % parallelNum - if (ones == null || 
ones.nElement() < taskSize + 1) { - ones = Tensor[T]().resize(taskSize + 1).fill(ev.one) - } - - val times = new Array[Long](parallelNum) - - (0 until parallelNum).foreach{tid => - if (state.get[Tensor[T]](s"s$tid").isEmpty) { - state(s"s$tid") = Tensor[T]() - state(s"r$tid") = Tensor[T]() - state(s"denom$tid") = Tensor[T]() - } - } - - Engine.default.invokeAndWait((0 until parallelNum).map(tid => () => { - val start = System.nanoTime() - val offset = tid * taskSize + math.min(tid, extraTask) - val length = taskSize + (if (tid < extraTask) 1 else 0) - val currentDfdx = dfdx.narrow(1, offset + 1, length) - val currentParameter = parameter.narrow(1, offset + 1, length) - val currentOnes = ones.narrow(1, 1, length) - val (_s, _r, _denom) = - (state.get[Tensor[T]](s"s$tid").get.resizeAs(currentParameter), - state.get[Tensor[T]](s"r$tid").get.resizeAs(currentParameter), - state.get[Tensor[T]](s"denom$tid").get.resizeAs(currentParameter)) - - Adam.updateFrame(_s, _r, _denom, clr, currentDfdx, currentParameter, - beta1, beta2, timestep, currentOnes, eps) + /** + * m_t = beta_1 * m_t-1 + (1 - beta_1) * g_t + * v_t = beta_2 * v_t-1 + (1 - beta_2) * g_t * g_t + */ + _s.mul(ev.fromType[Double](beta1)).add(ev.fromType[Double](1-beta1), dfdx) + _r.mul(ev.fromType[Double](beta2)).addcmul(ev.fromType[Double](1-beta2), dfdx, dfdx) + _denom.sqrt(_r) - times(tid) = (System.nanoTime() - start) / 1000000L - })) + // used as MKL.axpy: 1 * a + y = y, and fill buffer with one + buffer.resizeAs(dfdx).fill(ev.one) + _denom.add(ev.fromType(eps), buffer) - Adam.logger.info(s"update ${parameter.nElement()} parameters, maximum time is ${times.max} ms") - Adam.logger.info(s"Time is ${times.mkString("\t")} ms") + // efficiency improved upon by changing the order of computation, at expense of clarity + val biasCorrection1 = 1 - pow(beta1, timestep) + val biasCorrection2 = 1 - pow(beta2, timestep) + val stepSize = clr * sqrt(biasCorrection2) / biasCorrection1 + _denom.cdiv(_s, _denom) + parameter.add(ev.fromType[Double](-stepSize), _denom) state("evalCounter") = timestep // A tmp tensor to hold the sqrt(v) + epsilon + state("s") = _s // 1st moment variables + state("r") = _r // 2nd moment variables + state("denom") = _denom // 3nd moment variables (parameter, Array(fx)) } @@ -129,60 +122,3 @@ class Adam[@specialized(Float, Double) T: ClassTag]( override def getLearningRate(): Double = this.learningRate } - -object Adam { - val logger = Logger.getLogger(this.getClass) - - private[optim] def updateFrame[T: ClassTag](_s: Tensor[T], _r: Tensor[T], _denom: Tensor[T], - clr: Double, dfdx: Tensor[T], parameter: Tensor[T], - beta1: Double, beta2: Double, timestep: Int, - ones: Tensor[T], eps: Double)( - implicit ev: TensorNumeric[T]): Unit = { - /** - * m_t = beta_1 * m_t-1 + (1 - beta_1) * g_t - * v_t = beta_2 * v_t-1 + (1 - beta_2) * g_t * g_t - */ - _s.mul(ev.fromType[Double](beta1)).add(ev.fromType[Double](1-beta1), dfdx) - _r.mul(ev.fromType[Double](beta2)).addcmul(ev.fromType[Double](1-beta2), dfdx, dfdx) - _denom.sqrt(_r) - - // used as MKL.axpy: 1 * a + y = y, and fill buffer with one - _denom.add(ev.fromType(eps), ones) - - // efficiency improved upon by changing the order of computation, at expense of clarity - val biasCorrection1 = 1 - pow(beta1, timestep) - val biasCorrection2 = 1 - pow(beta2, timestep) - val stepSize = clr * sqrt(biasCorrection2) / biasCorrection1 - _denom.cdiv(_s, _denom) - parameter.add(ev.fromType[Double](-stepSize), _denom) - } - - - private[optim] def updateFrameZeroGrad[T: ClassTag]( - 
currentIteration: Int, lastUpdatedIteration: Int, - _s: Tensor[T], _r: Tensor[T], _denom: Tensor[T], - clr: Double, parameter: Tensor[T], - beta1: Double, beta2: Double, - ones: Tensor[T], eps: Double)( - implicit ev: TensorNumeric[T]): Unit = { - (lastUpdatedIteration until (currentIteration - 1)).foreach{timestep => - /** - * m_t = beta_1 * m_t-1 - * v_t = beta_2 * v_t-1 - */ - _s.mul(ev.fromType[Double](beta1)) - _r.mul(ev.fromType[Double](beta2)) - _denom.sqrt(_r) - - // used as MKL.axpy: 1 * a + y = y - _denom.add(ev.fromType(eps), ones) - - // efficiency improved upon by changing the order of computation, at expense of clarity - val biasCorrection1 = 1 - pow(beta1, timestep) - val biasCorrection2 = 1 - pow(beta2, timestep) - val stepSize = clr * sqrt(biasCorrection2) / biasCorrection1 - _denom.cdiv(_s, _denom) - parameter.add(ev.fromType[Double](-stepSize), _denom) - } - } -} diff --git a/spark/dl/src/main/scala/com/intel/analytics/bigdl/optim/EmbeddingAdam.scala b/spark/dl/src/main/scala/com/intel/analytics/bigdl/optim/EmbeddingAdam.scala index 59fb4dacb39..1aecc55ca0b 100644 --- a/spark/dl/src/main/scala/com/intel/analytics/bigdl/optim/EmbeddingAdam.scala +++ b/spark/dl/src/main/scala/com/intel/analytics/bigdl/optim/EmbeddingAdam.scala @@ -94,7 +94,7 @@ class EmbeddingAdam[@specialized(Float, Double) T: ClassTag]( Tensor[T]().resizeAs(currentParameter).zero(), Tensor[T]().resizeAs(currentParameter).zero()) } - Adam.updateFrame(_s, _r, _denom, clr, currentDfdx, currentParameter, + ParallelAdam.updateFrame(_s, _r, _denom, clr, currentDfdx, currentParameter, beta1, beta2, timestep, currentOnes, eps) state(s"s$tid") = _s // 1st moment variables @@ -103,8 +103,9 @@ class EmbeddingAdam[@specialized(Float, Double) T: ClassTag]( times(tid) = (System.nanoTime() - start) / 1000000L })) - Adam.logger.info(s"update ${parameter.nElement()} parameters, maximum time is ${times.max} ms") - Adam.logger.info(s"Time is ${times.mkString("\t")} ms") + ParallelAdam.logger + .info(s"update ${parameter.nElement()} parameters, maximum time is ${times.max} ms") + ParallelAdam.logger.info(s"Time is ${times.mkString("\t")} ms") state("evalCounter") = timestep // A tmp tensor to hold the sqrt(v) + epsilon @@ -123,6 +124,7 @@ class EmbeddingAdam[@specialized(Float, Double) T: ClassTag]( var s: Array[Tensor[T]] = _ var r: Array[Tensor[T]] = _ var denom: Array[Tensor[T]] = _ + var buffer: Array[Tensor[T]] = _ def setNOutput(nIndex: Int, nOutput: Int): Unit = { embeddingNoutput = nOutput embeddingNIndex = nIndex @@ -133,6 +135,7 @@ class EmbeddingAdam[@specialized(Float, Double) T: ClassTag]( s = Array.tabulate(nIndex)(_ => Tensor[T](nOutput)) r = Array.tabulate(nIndex)(_ => Tensor[T](nOutput)) denom = Array.tabulate(nIndex)(_ => Tensor[T](nOutput)) + buffer = Array.tabulate(nIndex)(_ => Tensor[T](nOutput)) (1 to nIndex).foreach{i => state(s"s$i") = Tensor[Float]() // 1st moment variables state(s"r$i") = Tensor[Float]() // 2nd moment variables @@ -147,7 +150,9 @@ class EmbeddingAdam[@specialized(Float, Double) T: ClassTag]( val beta2 = this.beta2 val eps = this.Epsilon + val uniqueStart = System.nanoTime() val uniqueIndices = Tensor[T](Storage(indices.storage().array().distinct)) + println(s"unique indices ${System.nanoTime() - uniqueStart}") var timestep = state.getOrElse[Int]("evalCounter", 1) @@ -160,7 +165,9 @@ class EmbeddingAdam[@specialized(Float, Double) T: ClassTag]( // val times = new Array[Long](parallelNum) - Engine.default.invokeAndWait((0 until parallelNum).map(tid => () => { + var updateTime = 
System.nanoTime() + (0 until parallelNum).map(tid => { +// Engine.default.invokeAndWait((0 until parallelNum).map(tid => () => { val offset = tid * taskSize + math.min(tid, extraTask) val length = taskSize + (if (tid < extraTask) 1 else 0) val currentIndex = uniqueIndices.narrow(1, offset + 1, length) @@ -168,13 +175,14 @@ class EmbeddingAdam[@specialized(Float, Double) T: ClassTag]( while(i <= currentIndex.nElement()) { val index = ev.toType[Int](currentIndex.valueAt(i)) if (timestep > lastUpdated(index - 1)) { - val (_s, _r, _denom) = (s(index - 1), r(index - 1), denom(index - 1)) + val (_s, _r, _denom, _buffer) = + (s(index - 1), r(index - 1), denom(index - 1), buffer(index - 1)) val indexThParameter = parameter.narrow(1, (index - 1) * embeddingNoutput + 1, embeddingNoutput) - println(s"update index ${index}") - Adam.updateFrameZeroGrad( +// println(s"update index ${index}") + ParallelAdam.updateFrameZeroGrad( timestep, lastUpdated(index - 1), - _s, _r, _denom, clr, indexThParameter, + _s, _r, _denom, _buffer, clr, indexThParameter, beta1, beta2, ones, eps) lastUpdated(index - 1) = timestep } @@ -183,14 +191,66 @@ class EmbeddingAdam[@specialized(Float, Double) T: ClassTag]( } // Adam.logger.info(s"zero grad${tid} $i ${ev.toType[Int](currentIndex.valueAt(i - 1))}") // times(tid) = (System.nanoTime() - start) / 1000000L - })) + }) // Adam.logger.info(s"update ${parameter.nElement()} parameters, maximum time is ${times.max} ms") // Adam.logger.info(s"Time is ${times.mkString("\t")} ms") + println(s"${parallelNum}nograd update frame time cost ${System.nanoTime() - updateTime}") state("evalCounter") = timestep // A tmp tensor to hold the sqrt(v) + epsilon } +// def updateNograd(indices: Tensor[T], parameter: Array[Tensor[T]]): Unit = { +// val lr = this.learningRate +// val lrd = this.learningRateDecay +// val beta1 = this.beta1 +// val beta2 = this.beta2 +// val eps = this.Epsilon +// +// val uniqueStart = System.nanoTime() +// val uniqueIndices = Tensor[T](Storage(indices.storage().array().distinct)) +// println(s"unique indices ${System.nanoTime() - uniqueStart}") +// +// var timestep = state.getOrElse[Int]("evalCounter", 1) +// +// val clr = lr / (1 + (timestep - 1) *lrd) +// +// val parallelNum = Engine.coreNumber() +// val gradLength = uniqueIndices.nElement() +// val taskSize = gradLength / parallelNum +// val extraTask = gradLength % parallelNum +// +// // val times = new Array[Long](parallelNum) +// +// var updateTime = System.nanoTime() +// Engine.default.invokeAndWait((0 until parallelNum).map(tid => () => { +// val offset = tid * taskSize + math.min(tid, extraTask) +// val length = taskSize + (if (tid < extraTask) 1 else 0) +// val currentIndex = uniqueIndices.narrow(1, offset + 1, length) +// var i = 1 +// while(i <= currentIndex.nElement()) { +// val index = ev.toType[Int](currentIndex.valueAt(i)) +// if (timestep > lastUpdated(index - 1)) { +// val (_s, _r, _denom, _buffer) = +// (s(index - 1), r(index - 1), denom(index - 1), buffer(index - 1)) +// val indexThParameter = parameter(index - 1) +// // println(s"update index ${index}") +// ParallelAdam.updateFrameZeroGrad( +// timestep, lastUpdated(index - 1), +// _s, _r, _denom, _buffer, clr, indexThParameter, +// beta1, beta2, ones, eps) +// lastUpdated(index - 1) = timestep +// } +// // println(index) +// i += 1 +// } +// })) +// println(s"${parallelNum}nograd update frame time cost ${System.nanoTime() - updateTime}") +// +// +// state("evalCounter") = timestep // A tmp tensor to hold the sqrt(v) + epsilon +// } + /** * update 
embedding gradient * @param dfdx input -> gradOutput @@ -235,7 +295,7 @@ class EmbeddingAdam[@specialized(Float, Double) T: ClassTag]( val indexThParameter = parameter.narrow(1, (index - 1) * embeddingNoutput + 1, embeddingNoutput) val iThGradient = currentDfdx.select(1, i) - Adam.updateFrame( + ParallelAdam.updateFrame( _s, _r, _denom, clr, iThGradient, indexThParameter, beta1, beta2, timestep, ones, eps) lastUpdated(index - 1) = timestep + 1 @@ -244,9 +304,8 @@ class EmbeddingAdam[@specialized(Float, Double) T: ClassTag]( // Adam.logger.info(s"update grad${tid} $i ${ev.toType[Int](currentIndex.valueAt(i - 1))}") })) -// Adam.logger.info(s"update ${parameter.nElement()} parameters, maximum time is ${times.max} ms") -// Adam.logger.info(s"Time is ${times.mkString("\t")} ms") + // TODO: timestep timestep = timestep + 1 state("evalCounter") = timestep // A tmp tensor to hold the sqrt(v) + epsilon } @@ -269,59 +328,3 @@ class EmbeddingAdam[@specialized(Float, Double) T: ClassTag]( override def getLearningRate(): Double = this.learningRate } -//object Adam { -// val logger = Logger.getLogger(this.getClass) -// -// private[optim] def updateFrame[T: ClassTag](_s: Tensor[T], _r: Tensor[T], _denom: Tensor[T], -// clr: Double, dfdx: Tensor[T], parameter: Tensor[T], -// beta1: Double, beta2: Double, timestep: Int, -// ones: Tensor[T], eps: Double)( -// implicit ev: TensorNumeric[T]): Unit = { -// /** -// * m_t = beta_1 * m_t-1 + (1 - beta_1) * g_t -// * v_t = beta_2 * v_t-1 + (1 - beta_2) * g_t * g_t -// */ -// _s.mul(ev.fromType[Double](beta1)).add(ev.fromType[Double](1-beta1), dfdx) -// _r.mul(ev.fromType[Double](beta2)).addcmul(ev.fromType[Double](1-beta2), dfdx, dfdx) -// _denom.sqrt(_r) -// -// // used as MKL.axpy: 1 * a + y = y, and fill buffer with one -// _denom.add(ev.fromType(eps), ones) -// -// // efficiency improved upon by changing the order of computation, at expense of clarity -// val biasCorrection1 = 1 - pow(beta1, timestep) -// val biasCorrection2 = 1 - pow(beta2, timestep) -// val stepSize = clr * sqrt(biasCorrection2) / biasCorrection1 -// _denom.cdiv(_s, _denom) -// parameter.add(ev.fromType[Double](-stepSize), _denom) -// } -// -// -// private[optim] def updateFrameZeroGrad[T: ClassTag]( -// currentIteration: Int, lastUpdatedIteration: Int, -// _s: Tensor[T], _r: Tensor[T], _denom: Tensor[T], -// clr: Double, parameter: Tensor[T], -// beta1: Double, beta2: Double, -// ones: Tensor[T], eps: Double)( -// implicit ev: TensorNumeric[T]): Unit = { -// (lastUpdatedIteration until currentIteration).foreach{timestep => -// /** -// * m_t = beta_1 * m_t-1 -// * v_t = beta_2 * v_t-1 -// */ -// _s.mul(ev.fromType[Double](beta1)) -// _r.mul(ev.fromType[Double](beta2)) -// _denom.sqrt(_r) -// -// // used as MKL.axpy: 1 * a + y = y -// _denom.add(ev.fromType(eps), ones) -// -// // efficiency improved upon by changing the order of computation, at expense of clarity -// val biasCorrection1 = 1 - pow(beta1, timestep) -// val biasCorrection2 = 1 - pow(beta2, timestep) -// val stepSize = clr * sqrt(biasCorrection2) / biasCorrection1 -// _denom.cdiv(_s, _denom) -// parameter.add(ev.fromType[Double](-stepSize), _denom) -// } -// } -//} diff --git a/spark/dl/src/test/scala/com/intel/analytics/bigdl/optim/AdamSpec.scala b/spark/dl/src/test/scala/com/intel/analytics/bigdl/optim/AdamSpec.scala index 8b85b99bf13..add285d7247 100644 --- a/spark/dl/src/test/scala/com/intel/analytics/bigdl/optim/AdamSpec.scala +++ b/spark/dl/src/test/scala/com/intel/analytics/bigdl/optim/AdamSpec.scala @@ -110,5 +110,107 @@ 
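
The per-row updates above all address the flattened embedding weights through narrow, where row i of an nIndex x nOutput table occupies nOutput consecutive elements. A short sketch of that layout, with illustrative sizes (narrow returns a view, so writes go through to the underlying weights):

    import com.intel.analytics.bigdl.tensor.Tensor

    val nIndex = 6
    val nOutput = 4
    val flat = Tensor[Float](nIndex * nOutput).randn()
    // 1-based row index -> a view over that row's nOutput elements of the flat parameter
    def row(i: Int): Tensor[Float] = flat.narrow(1, (i - 1) * nOutput + 1, nOutput)
    row(3).fill(0.0f)   // zeroes elements 9..12 of flat, i.e. the third embedding row
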
class AdamSpec extends FlatSpec with Matchers { } println(s"average eta = ${sum / iter} seconds") } + + "adam for embedding" should "works the same with adam" in { + System.setProperty("bigdl.localMode", "true") + Engine.init(1, 2, false) + val t = Tensor[Float](T(1.0f, 2.0f, 4.0f, 6.0f)) + val input = Array(t.narrow(1, 1, 2), t.narrow(1, 3, 2)) + val gradient = Array(Tensor[Float](2, 4).range(1, 8, 1), + Tensor[Float](2, 4).range(9, 16, 1)) + val weightEmbedding = Tensor[Float](24).randn() // 6 * 4 + val weightDense = weightEmbedding.clone() + val denseGradient = Tensor[Float](24) + + val adam = new Adam[Float]() + val adamEm = new EmbeddingAdam[Float]() + adamEm.setNOutput(6, 4) + + (0 until 100).foreach {i => + denseGradient.zero() + var j = 1 + input(0).apply1{v => + val index = v % 6 + 1 + denseGradient.narrow(1, (index.toInt - 1) * 4 + 1, 4).copy( + gradient(0).select(1, j) + ) + j += 1 + index + } + j = 1 + input(1).apply1{v => + val index = v % 6 + 1 + denseGradient.narrow(1, (index.toInt - 1) * 4 + 1, 4).copy( + gradient(1).select(1, j) + ) + j += 1 + index + } + +// adamEm.updateNograd(input(0), weightEmbedding) +// adamEm.updateNograd(input(1), weightEmbedding) + adamEm.updateNograd(t, weightEmbedding) + weightEmbedding should be (weightDense) + adamEm.optimizeEmbedding(input.zip(gradient), weightEmbedding) + + adam.optimize(_ => (1, denseGradient), weightDense) + + + if (weightEmbedding != weightDense) { + println + } + } + } + + "adam for embedding" should "works the same with adam 2" in { + System.setProperty("bigdl.localMode", "true") + Engine.init(1, 2, false) + val t = Tensor[Float](T(1.0f, 2.0f, 6.0f, 7.0f)) + val input = Array(t.narrow(1, 1, 2), t.narrow(1, 3, 2)) + val gradient = Array(Tensor[Float](2, 4).range(1, 8, 1), + Tensor[Float](2, 4).range(9, 16, 1)) + val weightEmbedding = Tensor[Float](32).randn() // 8 * 4 + val weightDense = weightEmbedding.clone() + val denseGradient = Tensor[Float](32) + + val adam = new Adam[Float]() + val adamEm = new EmbeddingAdam[Float]() + adamEm.setNOutput(8, 4) + + (0 until 100).foreach {i => + denseGradient.zero() + var j = 1 + input(0).apply1{v => + val index = v % 8 + 1 + denseGradient.narrow(1, (index.toInt - 1) * 4 + 1, 4).copy( + gradient(0).select(1, j) + ) + j += 1 + index + } + j = 1 + input(1).apply1{v => + val index = v % 8 + 1 + denseGradient.narrow(1, (index.toInt - 1) * 4 + 1, 4).copy( + gradient(1).select(1, j) + ) + j += 1 + index + } + + // adamEm.updateNograd(input(0), weightEmbedding) + // adamEm.updateNograd(input(1), weightEmbedding) + adamEm.updateNograd(t, weightEmbedding) +// weightEmbedding should be (weightDense) + adamEm.optimizeEmbedding(input.zip(gradient), weightEmbedding) + + adam.optimize(_ => (1, denseGradient), weightDense) + + + if (weightEmbedding != weightDense) { + println + } + } + } } From b4d17674c82c816612951cba91b1414326af9de4 Mon Sep 17 00:00:00 2001 From: qiuxin2012 Date: Fri, 3 Aug 2018 13:43:07 +0800 Subject: [PATCH 10/11] checkpoint --- .../analytics/bigdl/optim/ParallelAdam.scala | 193 ++++++++++++++++++ 1 file changed, 193 insertions(+) create mode 100644 spark/dl/src/main/scala/com/intel/analytics/bigdl/optim/ParallelAdam.scala diff --git a/spark/dl/src/main/scala/com/intel/analytics/bigdl/optim/ParallelAdam.scala b/spark/dl/src/main/scala/com/intel/analytics/bigdl/optim/ParallelAdam.scala new file mode 100644 index 00000000000..5878656ce0b --- /dev/null +++ b/spark/dl/src/main/scala/com/intel/analytics/bigdl/optim/ParallelAdam.scala @@ -0,0 +1,193 @@ +/* + * Copyright 2016 The 
BigDL Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.intel.analytics.bigdl.optim + +import com.intel.analytics.bigdl.tensor.{Storage, Tensor} +import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric +import com.intel.analytics.bigdl.utils.{Engine, T, Table} +import org.apache.log4j.Logger + +import scala.math._ +import scala.reflect.ClassTag + +/** + * An implementation of Adam http://arxiv.org/pdf/1412.6980.pdf + * @param learningRate learning rate + * @param learningRateDecay learning rate decay + * @param beta1 first moment coefficient + * @param beta2 second moment coefficient + * @param Epsilon for numerical stability + * @tparam T + */ +class ParallelAdam[@specialized(Float, Double) T: ClassTag]( + var learningRate: Double = 1e-3, + var learningRateDecay: Double = 0.0, + var beta1: Double = 0.9, + var beta2: Double = 0.999, + var Epsilon: Double = 1e-8)(implicit ev: TensorNumeric[T]) extends OptimMethod[T] { + + @transient + private var ones: Tensor[T] = null + + /** + * An implementation of Adam http://arxiv.org/pdf/1412.6980.pdf + * + * @param feval a function that takes a single input (X), the point of a evaluation, and + * returns f(X) and df/dX + * @param parameter the initial point + * @return the new x vector and the function list {fx}, evaluated before the update + */ + override def optimize(feval: (Tensor[T]) => (T, Tensor[T]), + parameter: Tensor[T]): (Tensor[T], Array[T]) = { + val lr = this.learningRate + val lrd = this.learningRateDecay + val beta1 = this.beta1 + val beta2 = this.beta2 + val eps = this.Epsilon + + val (fx, dfdx) = feval(parameter) + + var timestep = state.getOrElse[Int]("evalCounter", 0) + + val clr = lr / (1 + timestep*lrd) + + timestep = timestep + 1 + + val parallelNum = Engine.coreNumber() + val gradLength = parameter.nElement() + val taskSize = gradLength / parallelNum + val extraTask = gradLength % parallelNum + if (ones == null || ones.nElement() < taskSize + 1) { + ones = Tensor[T]().resize(taskSize + 1).fill(ev.one) + } + + val times = new Array[Long](parallelNum) + + (0 until parallelNum).foreach{tid => + if (state.get[Tensor[T]](s"s$tid").isEmpty) { + state(s"s$tid") = Tensor[T]() + state(s"r$tid") = Tensor[T]() + state(s"denom$tid") = Tensor[T]() + } + } + + Engine.default.invokeAndWait((0 until parallelNum).map(tid => () => { + val start = System.nanoTime() + val offset = tid * taskSize + math.min(tid, extraTask) + val length = taskSize + (if (tid < extraTask) 1 else 0) + val currentDfdx = dfdx.narrow(1, offset + 1, length) + val currentParameter = parameter.narrow(1, offset + 1, length) + val currentOnes = ones.narrow(1, 1, length) + val (_s, _r, _denom) = + (state.get[Tensor[T]](s"s$tid").get.resizeAs(currentParameter), + state.get[Tensor[T]](s"r$tid").get.resizeAs(currentParameter), + state.get[Tensor[T]](s"denom$tid").get.resizeAs(currentParameter)) + + ParallelAdam.updateFrame(_s, _r, _denom, clr, currentDfdx, currentParameter, + beta1, beta2, timestep, currentOnes, eps) + + 
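
updateFrame folds both bias corrections into a single stepSize before dividing the first moment by sqrt(v) + eps. A scalar sanity check with made-up numbers (REPL-style, not part of the patch), showing that this reordering matches the textbook m-hat / v-hat form apart from how eps is scaled:

    val (beta1, beta2, clr, eps) = (0.9, 0.999, 1e-3, 1e-8)
    val t = 10
    val (g, m0, v0) = (0.5, 0.1, 0.01)
    val m = beta1 * m0 + (1 - beta1) * g                 // m_t
    val v = beta2 * v0 + (1 - beta2) * g * g             // v_t
    val stepSize = clr * math.sqrt(1 - math.pow(beta2, t)) / (1 - math.pow(beta1, t))
    val patchForm = stepSize * m / (math.sqrt(v) + eps)  // what the kernel computes
    val mHat = m / (1 - math.pow(beta1, t))
    val vHat = v / (1 - math.pow(beta2, t))
    val textbookForm = clr * mHat / (math.sqrt(vHat) + eps)
    // patchForm and textbookForm differ here by roughly 1e-11; eps enters slightly differently.
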
times(tid) = (System.nanoTime() - start) / 1000000L + })) + + ParallelAdam.logger. + info(s"update ${parameter.nElement()} parameters, maximum time is ${times.max} ms") + ParallelAdam.logger.info(s"Time is ${times.mkString("\t")} ms") + + + state("evalCounter") = timestep // A tmp tensor to hold the sqrt(v) + epsilon + + (parameter, Array(fx)) + } + + override def loadFromTable(config: Table): this.type = { + this.learningRate = config.get[Double]("learningRate").getOrElse(this.learningRate) + this.learningRateDecay = config.get[Double]("learningRateDecay") + .getOrElse(this.learningRateDecay) + this.beta1 = config.get[Double]("beta1").getOrElse(this.beta1) + this.beta2 = config.get[Double]("beta2").getOrElse(this.beta2) + this.Epsilon = config.get[Double]("Epsilon").getOrElse(this.Epsilon) + this + } + + override def clearHistory(): Unit = { + state.delete("s") + state.delete("r") + } + + override def getLearningRate(): Double = this.learningRate +} + +object ParallelAdam { + val logger = Logger.getLogger(this.getClass) + + private[optim] def updateFrame[T: ClassTag](_s: Tensor[T], _r: Tensor[T], _denom: Tensor[T], + clr: Double, dfdx: Tensor[T], parameter: Tensor[T], + beta1: Double, beta2: Double, timestep: Int, + ones: Tensor[T], eps: Double)( + implicit ev: TensorNumeric[T]): Unit = { + /** + * m_t = beta_1 * m_t-1 + (1 - beta_1) * g_t + * v_t = beta_2 * v_t-1 + (1 - beta_2) * g_t * g_t + */ + _s.mul(ev.fromType[Double](beta1)).add(ev.fromType[Double](1-beta1), dfdx) + _denom.cmul(dfdx, dfdx) + _r.mul(ev.fromType[Double](beta2)).add(ev.fromType[Double](1-beta2), _denom) + _denom.sqrt(_r) + + // used as MKL.axpy: 1 * a + y = y, and fill buffer with one + _denom.add(ev.fromType(eps), ones) + + // efficiency improved upon by changing the order of computation, at expense of clarity + val biasCorrection1 = 1 - pow(beta1, timestep) + val biasCorrection2 = 1 - pow(beta2, timestep) + val stepSize = clr * sqrt(biasCorrection2) / biasCorrection1 + _denom.cdiv(_s, _denom) + parameter.add(ev.fromType[Double](-stepSize), _denom) + } + + + private[optim] def updateFrameZeroGrad[T: ClassTag]( + currentIteration: Int, lastUpdatedIteration: Int, + _s: Tensor[T], _r: Tensor[T], _denom: Tensor[T], _buffer: Tensor[T], + clr: Double, parameter: Tensor[T], + beta1: Double, beta2: Double, + ones: Tensor[T], eps: Double)( + implicit ev: TensorNumeric[T]): Unit = { + + var timestep = lastUpdatedIteration + while(timestep < currentIteration) { + val biasCorrection1 = 1 - pow(beta1, timestep) + val biasCorrection2 = 1 - pow(beta2, timestep) + val stepSize = clr * sqrt(biasCorrection2) / biasCorrection1 + /** + * m_t = beta_1 * m_t-1 + * v_t = beta_2 * v_t-1 + */ + _s.mul(ev.fromType[Double](beta1)) + _r.mul(ev.fromType[Double](beta2)) + _denom.sqrt(_r) + + // used as MKL.axpy: 1 * a + y = y + _denom.add(ev.fromType(eps), ones) + + _denom.cdiv(_s, _denom) + parameter.add(ev.fromType[Double](-stepSize), _denom) + + timestep += 1 + } + } +} From 0dfeb5832b4f437188b2d1aee4586640b1219a93 Mon Sep 17 00:00:00 2001 From: qiuxin2012 Date: Wed, 22 Aug 2018 14:31:04 +0800 Subject: [PATCH 11/11] dense --- .../com/intel/analytics/bigdl/optim/NCFOptimizer.scala | 6 ++++++ .../com/intel/analytics/bigdl/optim/ParallelAdam.scala | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/spark/dl/src/main/scala/com/intel/analytics/bigdl/optim/NCFOptimizer.scala b/spark/dl/src/main/scala/com/intel/analytics/bigdl/optim/NCFOptimizer.scala index 777c0972aae..168bb777e3d 100644 --- 
a/spark/dl/src/main/scala/com/intel/analytics/bigdl/optim/NCFOptimizer.scala +++ b/spark/dl/src/main/scala/com/intel/analytics/bigdl/optim/NCFOptimizer.scala @@ -131,9 +131,11 @@ class NCFOptimizer[T: ClassTag] ( } val dataFetchTime = System.nanoTime() println("dataFetch") + val modelTimeArray = new Array[Long](parallelism) val lossSum = Engine.default.invokeAndWait( (0 until parallelism).map(i => () => { + val start = System.nanoTime() val localEmbedding = workingEmbeddingModels(i) val localLinears = workingLinears(i) // localEmbedding.zeroGradParameters() @@ -150,10 +152,14 @@ class NCFOptimizer[T: ClassTag] ( val errors = localCriterion.backward(output, target) localEmbedding.updateGradInput(input, localLinears.backward(localEmbedding.output, errors)) + modelTimeArray(i) = System.nanoTime() - start _loss }) ).sum + logger.info(s"Max model time is ${modelTimeArray.max}," + + s"Time is ${modelTimeArray.sortWith((a, b) => a > b).mkString("\t")} ms") + val loss = lossSum / parallelism val computingTime = System.nanoTime() diff --git a/spark/dl/src/main/scala/com/intel/analytics/bigdl/optim/ParallelAdam.scala b/spark/dl/src/main/scala/com/intel/analytics/bigdl/optim/ParallelAdam.scala index 5878656ce0b..091266ee848 100644 --- a/spark/dl/src/main/scala/com/intel/analytics/bigdl/optim/ParallelAdam.scala +++ b/spark/dl/src/main/scala/com/intel/analytics/bigdl/optim/ParallelAdam.scala @@ -105,7 +105,7 @@ class ParallelAdam[@specialized(Float, Double) T: ClassTag]( ParallelAdam.logger. info(s"update ${parameter.nElement()} parameters, maximum time is ${times.max} ms") - ParallelAdam.logger.info(s"Time is ${times.mkString("\t")} ms") + ParallelAdam.logger.info(s"Time is ${times.sortWith((a, b) => a > b).mkString("\t")} ms") state("evalCounter") = timestep // A tmp tensor to hold the sqrt(v) + epsilon
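
The per-thread timing added to NCFOptimizer and ParallelAdam follows one pattern: record nanoseconds around each task, convert to milliseconds, and log the maximum plus the sorted per-thread times. A hedged sketch of that pattern, assuming Engine.init has already been called; timeParallel and runSlice are illustrative names, not part of the patch:

    import com.intel.analytics.bigdl.utils.Engine

    // Runs `work` once per thread id and returns the per-thread wall time in ms;
    // `work` stands in for a forward/backward pass or a per-slice Adam update.
    def timeParallel(parallelism: Int)(work: Int => Unit): Array[Long] = {
      val times = new Array[Long](parallelism)
      Engine.default.invokeAndWait((0 until parallelism).map(tid => () => {
        val start = System.nanoTime()
        work(tid)
        times(tid) = (System.nanoTime() - start) / 1000000L   // milliseconds
      }))
      times
    }

    // val times = timeParallel(4)(tid => runSlice(tid))       // runSlice is hypothetical
    // println(s"max ${times.max} ms, sorted ${times.sortWith(_ > _).mkString("\t")} ms")
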
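For reference, the gradient synchronization step that both NcfPerf and NCFOptimizer rely on (seen in the training branch earlier in this series) can be summarized as: split the flat gradient into disjoint slices, accumulate every replica's slice in parallel, clear the replica buffers, then average. A sketch under those assumptions; syncGradients is an illustrative name and Engine.init is assumed to have run:

    import com.intel.analytics.bigdl.tensor.Tensor
    import com.intel.analytics.bigdl.utils.Engine

    // Every worker owns a full-size gradient; disjoint slices of the global gradient
    // are accumulated in parallel and the per-worker buffers are cleared for reuse.
    def syncGradients(globalGrad: Tensor[Float], workerGrads: Array[Tensor[Float]],
                      syncTasks: Int): Unit = {
      val n = globalGrad.nElement()
      val taskSize = n / syncTasks
      val extraTask = n % syncTasks
      Engine.default.invokeAndWait((0 until syncTasks).map(tid => () => {
        val offset = tid * taskSize + math.min(tid, extraTask)
        val length = taskSize + (if (tid < extraTask) 1 else 0)
        val out = globalGrad.narrow(1, offset + 1, length)
        var i = 0
        while (i < workerGrads.length) {
          val slice = workerGrads(i).narrow(1, offset + 1, length)
          if (i == 0) out.copy(slice) else out.add(slice)
          slice.zero()                       // ready for the next iteration
          i += 1
        }
      }))
      globalGrad.div(workerGrads.length.toFloat)   // average over the model replicas
    }
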