[WIP] Ncf perf dense #20

Adds two local micro-benchmarks for the NCF example: AdamPerf, which compares dense Adam updates of the embedding tables against sparse EmbeddingAdam updates, and NcfPerf, which times the model forward/backward passes, gradient aggregation across cores, and the weight update.

Status: Open. Wants to merge 11 commits into base: master.
@@ -0,0 +1,132 @@
/*
* Copyright 2016 The BigDL Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.intel.analytics.bigdl.example.recommendation

import com.intel.analytics.bigdl.nn.LookupTable
import com.intel.analytics.bigdl.optim.{Adam, EmbeddingAdam}
import com.intel.analytics.bigdl.tensor.Tensor
import com.intel.analytics.bigdl.utils.Engine
import com.intel.analytics.bigdl.numeric.NumericFloat

object AdamPerf {
def main(args: Array[String]): Unit = {
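// args: <iterations> <batchSize per core> <cores> <sparse: "1" to benchmark EmbeddingAdam> <rand: "1" to vary the looked-up indices each step>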
val iteration = args(0).toInt
val batchSize = args(1).toInt
val core = args(2).toInt
System.setProperty("bigdl.localMode", "true")
val sparse = args(3) == "1"
val rand = args(4) == "1"
Engine.init(1, core, false)
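// user/item counts of the MovieLens-20M dataset used by the NCF example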
val userCount = 138493
val itemCount = 26744

val model = NeuralCFV2[Float](userCount, itemCount, 1, 128, 128,
hiddenLayers = Array(128, 64),
mfEmbed = 64)
.buildModel()

if (sparse) {
val embeddings = model.embeddingModel.findModules("LookupTable")
.map(_.asInstanceOf[LookupTable[Float]])
val input = Tensor[Float]().range(1, batchSize * core)
val embeddingsGradient = embeddings.map{embedding =>
val inputAndGradient = Array.tabulate(core)(c =>
(input.narrow(1, batchSize * c + 1, batchSize),
Tensor[Float](batchSize, embedding.nOutput).rand()))
val optimMethod = new EmbeddingAdam[Float]()
optimMethod.setNOutput(embedding.nIndex, embedding.nOutput)
val parameter = embedding.getParameters()._1
val parameterArray = Array.tabulate(embedding.nIndex)(i =>
embedding.weight.select(1, i + 1)
)
(inputAndGradient, optimMethod, parameter, parameterArray)
}

def update(): Unit = {
var i = 0
while (i < embeddingsGradient.size) {
val v = embeddingsGradient(i)
val inputAndGradient = v._1
val optimMethod = v._2
val parameter = v._3
val parameterArray = v._4

var start = System.nanoTime()
// alternative API that updates the per-row views instead of the flat parameter:
// optimMethod.updateNograd(input, parameterArray)
optimMethod.updateNograd(input, parameter)
println(s"$i: update parameter ${parameter.nElement()} " +
s"Nograd ${System.nanoTime() - start} ns")
start = System.nanoTime()
optimMethod.optimizeEmbedding(inputAndGradient, parameter)
println(s"$i: update parameter ${parameter.nElement()} " +
s"Embedding ${System.nanoTime() - start} ns")

i += 1
}

}

// warm up
(0 until 20).foreach { i =>
val n = i % 10
if (rand) input.range(1 + n * batchSize * core, (n + 1) * batchSize * core)
update()
}

var count = 0L
(0 until iteration).foreach { i =>
val n = i % 10
if (rand) input.range(1 + n * batchSize * core, (n + 1) * batchSize * core)
println(i)
val start = System.nanoTime()
update()
val end = System.nanoTime()
println(s"sparse time is ${(end - start) / 1e6.toLong}")
count += end - start
}

println(s"average sparse time is ${count / 1e6.toLong / iteration}")

} else {
// update with dense gradient
val (w, g) = model.embeddingModel.getParameters()
val optimMethod = new Adam[Float]()
g.randn()

// warm up
(0 until 5).foreach { i =>
optimMethod.optimize(_ => (1, g), w)
}

var count = 0L
(0 until iteration).foreach { i =>
println(i)
g.randn()
val start = System.nanoTime()
optimMethod.optimize(_ => (1, g), w)
val end = System.nanoTime()
println(s"dense time is ${(end - start) / 1e6.toLong} ms")
count += end - start
}
println(s"average dense time is ${count / 1e6.toLong / iteration} ms")
}
}

}
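
Aside for reviewers: a minimal standalone sketch (plain Scala, not BigDL code; all names are illustrative) of why the sparse path above can win. A dense Adam step walks every row of the embedding table, while a sparse step only does full moment-and-weight updates for the rows the batch actually touched. Judging by the method names, EmbeddingAdam's updateNograd/optimizeEmbedding split appears to separate the no-gradient bookkeeping from the indexed-row updates, but that reading is a guess from the names.

object SparseVsDenseAdamSketch {
  final case class AdamState(m: Array[Float], v: Array[Float])

  // One Adam step on a single embedding row.
  def stepRow(w: Array[Float], g: Array[Float], s: AdamState,
      lr: Float = 1e-3f, b1: Float = 0.9f, b2: Float = 0.999f, eps: Float = 1e-8f): Unit = {
    var j = 0
    while (j < w.length) {
      s.m(j) = b1 * s.m(j) + (1 - b1) * g(j)
      s.v(j) = b2 * s.v(j) + (1 - b2) * g(j) * g(j)
      w(j) -= lr * s.m(j) / (math.sqrt(s.v(j)).toFloat + eps)
      j += 1
    }
  }

  def main(args: Array[String]): Unit = {
    val nIndex = 138493 // MovieLens-20M user count; shrink for a quick run
    val nOutput = 64
    val table = Array.fill(nIndex)(Array.fill(nOutput)(0.1f))
    val state = Array.fill(nIndex)(AdamState(new Array[Float](nOutput), new Array[Float](nOutput)))
    val batch = Array(3, 42, 99, 42) // indices seen in this mini-batch
    val grads = batch.map(_ => Array.fill(nOutput)(0.01f))

    // Sparse: accumulate gradients per distinct row, then update only those rows.
    batch.distinct.foreach { row =>
      val g = new Array[Float](nOutput)
      batch.zip(grads).foreach { case (idx, gi) =>
        if (idx == row) { var j = 0; while (j < nOutput) { g(j) += gi(j); j += 1 } }
      }
      stepRow(table(row), g, state(row))
    }
    // A dense optimizer would instead call stepRow on all nIndex rows,
    // mostly with zero gradients, which is the cost AdamPerf measures above.
  }
}
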
@@ -0,0 +1,186 @@
/*
* Copyright 2016 The BigDL Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.intel.analytics.bigdl.example.recommendation

import com.intel.analytics.bigdl.nn.BCECriterion
import com.intel.analytics.bigdl.optim.{Adam, ParallelAdam}
import com.intel.analytics.bigdl.tensor.Tensor
import com.intel.analytics.bigdl.utils.Engine

import scala.util.Random

object NcfPerf {
def main(args: Array[String]): Unit = {
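// args: <iterations> <batchSize per core> <cores> <train: 1 to time training, 0 for forward-only throughput>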
val iteration = args(0).toInt
val batchSize = args(1).toInt
val core = args(2).toInt
val train = args(3).toInt == 1
System.setProperty("bigdl.localMode", "true")
Engine.init(1, core, false)
val userCount = 138493
val itemCount = 26744

val model = NeuralCFV2[Float](userCount, itemCount, 1, 128, 128,
hiddenLayers = Array(128, 64),
mfEmbed = 64)
.buildModel()
val optimMethod = new Adam[Float]()

val input = Tensor[Float](batchSize * core, 2)
input.select(2, 1).apply1(_ => Random.nextInt(userCount) + 1)
input.select(2, 2).apply1(_ => Random.nextInt(itemCount) + 1)
val target = Tensor[Float](batchSize * core, 1).apply1(_ => Random.nextInt(2))

val modelForwardTime = new Array[Long](core)
val modelBackwardTime = new Array[Long](core)
val criterionForwardTime = new Array[Long](core)
val criterionBackwardTime = new Array[Long](core)
var accGradientTime = 0L
var updateWeightTime = 0L


val (w, g) = model.getParameters()
println(s"model weight length ${w.nElement()}")
val workingModels = (0 until core).map { _ =>
val newModel = model.cloneModule()
newModel.getParameters()._1.set(w)
newModel
}
// criterions keep internal buffers, so each thread needs its own instance
val workingCriterions = (0 until core).map(_ => BCECriterion[Float]())
val workingModelWAndG = workingModels.map(_.getParameters())

val subModelNumber = core
val parallelism = core
// split the flat gradient into near-equal slices; each task later sums
// one disjoint slice across all model replicas
val gradLength = g.nElement()
val syncGradTaskSize = gradLength / subModelNumber
val syncGradExtraTask = gradLength % subModelNumber
val syncGradParallelNum =
if (syncGradTaskSize == 0) syncGradExtraTask else subModelNumber

// warm up
(0 until 5).foreach { _ =>
Engine.default.invokeAndWait((0 until core).map { tid =>
() =>
val currentInput = input.narrow(1, tid * batchSize + 1, batchSize)
val currentTarget = target.narrow(1, tid * batchSize + 1, batchSize)
val currentModel = workingModels(tid)
val currentCriterion = workingCriterions(tid)

val output = currentModel.forward(currentInput)
val loss = currentCriterion.forward(output, currentTarget)
val gradOutput = currentCriterion.backward(output, currentTarget)
val gradInput = currentModel.backward(currentInput, gradOutput)
})
}

if (train) {
(0 until iteration).foreach { i =>
input.select(2, 1).apply1(_ => Random.nextInt(userCount) + 1)
input.select(2, 2).apply1(_ => Random.nextInt(itemCount) + 1)
target.apply1(_ => Random.nextInt(2))
println(i)

Engine.default.invokeAndWait((0 until core).map { tid =>
() =>
val currentInput = input.narrow(1, tid * batchSize + 1, batchSize)
val currentTarget = target.narrow(1, tid * batchSize + 1, batchSize)
val currentModel = workingModels(tid)
val currentCriterion = workingCriterions(tid)

var start = System.nanoTime()
val output = currentModel.forward(currentInput)
modelForwardTime(tid) += System.nanoTime() - start

start = System.nanoTime()
val loss = currentCriterion.forward(output, currentTarget)
criterionForwardTime(tid) += System.nanoTime() - start

start = System.nanoTime()
val gradOutput = currentCriterion.backward(output, currentTarget)
criterionBackwardTime(tid) += System.nanoTime() - start

start = System.nanoTime()
val gradInput = currentModel.backward(currentInput, gradOutput)
modelBackwardTime(tid) += System.nanoTime() - start

})

var start = System.nanoTime()
val grad = g
Engine.default.invokeAndWait(
(0 until syncGradParallelNum).map(tid =>
() => {
val offset = tid * syncGradTaskSize + math.min(tid, syncGradExtraTask)
val length = syncGradTaskSize + (if (tid < syncGradExtraTask) 1 else 0)
var i = 0
while (i < parallelism) {
val sliceG = workingModelWAndG(i)._2.narrow(1, offset + 1, length)
if (i == 0) {
grad.narrow(1, offset + 1, length)
.copy(sliceG)
sliceG.zero()
} else {
grad.narrow(1, offset + 1, length)
.add(sliceG)
sliceG.zero()
}
i += 1
}
})
)
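// average the summed replica gradients before the Adam step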
grad.div(parallelism)
accGradientTime += System.nanoTime() - start

start = System.nanoTime()
optimMethod.optimize(_ => (1, grad), w)
updateWeightTime += System.nanoTime() - start
}

println(s"${modelForwardTime.max / 1e6 / iteration}ms")
println(s"${criterionForwardTime.max / 1e6 / iteration}ms")
println(s"${criterionBackwardTime.max / 1e6 / iteration}ms")
println(s"${modelBackwardTime.max / 1e6 / iteration}ms")
println(s"${accgradientTime / 1e6 / iteration}ms")
println(s"${updateWeightTime / 1e6 / iteration}ms")
} else {
var computingTime = 0L
(0 until iteration).foreach { i =>
input.select(2, 1).apply1(_ => Random.nextInt(userCount) + 1)
input.select(2, 2).apply1(_ => Random.nextInt(itemCount) + 1)
target.apply1(_ => Random.nextInt(2))
println(i)

val start = System.nanoTime()
Engine.default.invokeAndWait((0 until core).map { tid =>
() =>
val currentInput = input.narrow(1, tid * batchSize + 1, batchSize)
val currentModel = workingModels(tid)
val output = currentModel.forward(currentInput)
})
computingTime += System.nanoTime() - start
}

println(s"Throughput is ${batchSize * core * iteration * 1e9 / computingTime} records/s")
}

}

}
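
Aside for reviewers: a minimal standalone sketch (plain Scala, not BigDL code; names are illustrative) of the gradient-aggregation scheme used above. Each task owns a disjoint slice of the flat gradient vector and sums that slice across all model replicas, so no two threads ever write the same element. This version folds the divide-by-parallelism into the same pass, where the benchmark does it as a separate grad.div(parallelism).

object GradAggregationSketch {
  // Sum `replicaGrads` slice-wise into `out` using `parallelism` threads,
  // then average. The slicing mirrors the taskSize/extraTask arithmetic above.
  def aggregate(replicaGrads: Array[Array[Float]], out: Array[Float], parallelism: Int): Unit = {
    val length = out.length
    val taskSize = length / parallelism
    val extra = length % parallelism
    val threads = (0 until parallelism).map { tid =>
      new Thread(() => {
        val offset = tid * taskSize + math.min(tid, extra)
        val sliceLen = taskSize + (if (tid < extra) 1 else 0)
        var j = offset
        while (j < offset + sliceLen) {
          var sum = 0f
          var r = 0
          while (r < replicaGrads.length) { sum += replicaGrads(r)(j); r += 1 }
          out(j) = sum / replicaGrads.length // average across replicas
          j += 1
        }
      })
    }
    threads.foreach(_.start())
    threads.foreach(_.join())
  }

  def main(args: Array[String]): Unit = {
    val replicas = Array.fill(4)(Array.fill(10)(1f))
    val out = new Array[Float](10)
    aggregate(replicas, out, parallelism = 3)
    println(out.mkString(", ")) // prints 1.0 ten times: the mean of identical gradients
  }
}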