Skip to content

Commit

Permalink
cleanup
Browse files — browse the repository at this point in the history
  • Author: jkbradley, committed Jun 12, 2017
1 parent 2048c00 commit 6bcf66f
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 12 deletions.
18 changes: 12 additions & 6 deletions mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
Original file line number Diff line number Diff line change
Expand Up @@ -340,8 +340,10 @@ object Word2VecModel extends MLReadable[Word2VecModel] {
val wordVectors = instance.wordVectors.getVectors
val dataSeq = wordVectors.toSeq.map { case (word, vector) => Data(word, vector) }
val dataPath = new Path(path, "data").toString
val bufferSizeInBytes = Utils.byteStringAsBytes(
sc.conf.get("spark.kryoserializer.buffer.max", "64m"))
val numPartitions = Word2VecModelWriter.calculateNumberOfPartitions(
sc, instance.wordVectors.wordIndex.size, instance.getVectorSize)
bufferSizeInBytes, instance.wordVectors.wordIndex.size, instance.getVectorSize)
sparkSession.createDataFrame(dataSeq)
.repartition(numPartitions)
.write
Expand All @@ -351,16 +353,20 @@ object Word2VecModel extends MLReadable[Word2VecModel] {

private[feature]
object Word2VecModelWriter {
/**
* Calculate the number of partitions to use in saving the model.
* [SPARK-11994] - We want to partition the model in partitions smaller than
* spark.kryoserializer.buffer.max
* @param bufferSizeInBytes Set to spark.kryoserializer.buffer.max
* @param numWords Vocab size
* @param vectorSize Vector length for each word
*/
def calculateNumberOfPartitions(
sc: SparkContext,
bufferSizeInBytes: Long,
numWords: Int,
vectorSize: Int): Int = {
val floatSize = 4L // Use Long to help avoid overflow
val averageWordSize = 15
// [SPARK-11994] - We want to partition the model in partitions smaller than
// spark.kryoserializer.buffer.max
val bufferSizeInBytes = Utils.byteStringAsBytes(
sc.conf.get("spark.kryoserializer.buffer.max", "64m"))
// Calculate the approximate size of the model.
// Assuming an average word size of 15 bytes, the formula is:
// (floatSize * vectorSize + 15) * numWords
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import org.apache.spark.ml.util.TestingUtils._
import org.apache.spark.mllib.feature.{Word2VecModel => OldWord2VecModel}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.Row
import org.apache.spark.util.Utils

class Word2VecSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

Expand Down Expand Up @@ -189,12 +190,12 @@ class Word2VecSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul
}

test("Word2Vec read/write numPartitions calculation") {
val tinyModelNumPartitions = Word2VecModel.Word2VecModelWriter.calculateNumberOfPartitions(
sc, numWords = 10, vectorSize = 5)
assert(tinyModelNumPartitions === 1)
val mediumModelNumPartitions = Word2VecModel.Word2VecModelWriter.calculateNumberOfPartitions(
sc, numWords = 1000000, vectorSize = 5000)
assert(mediumModelNumPartitions > 1)
val smallModelNumPartitions = Word2VecModel.Word2VecModelWriter.calculateNumberOfPartitions(
Utils.byteStringAsBytes("64m"), numWords = 10, vectorSize = 5)
assert(smallModelNumPartitions === 1)
val largeModelNumPartitions = Word2VecModel.Word2VecModelWriter.calculateNumberOfPartitions(
Utils.byteStringAsBytes("64m"), numWords = 1000000, vectorSize = 5000)
assert(largeModelNumPartitions > 1)
}

test("Word2Vec read/write") {
Expand Down

0 comments on commit 6bcf66f

Please sign in to comment.