alibaba-archive · tosky001 · Feb 6, 2018 · Feb 6, 2018
diff --git a/spark/dl/src/main/scala/com/intel/analytics/bigdl/nn/ops/CategoricalColVocaList.scala b/spark/dl/src/main/scala/com/intel/analytics/bigdl/nn/ops/CategoricalColVocaList.scala
@@ -0,0 +1,98 @@
+/*
+ * Copyright 2016 The BigDL Authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.intel.analytics.bigdl.nn.ops
+
+import com.intel.analytics.bigdl.tensor.Tensor
+import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric
+
+import scala.collection.mutable.ArrayBuffer
+import scala.reflect.ClassTag
+import scala.util.hashing.MurmurHash3
+
+/**
+ * Maps delimited string features to integer ids through an in-memory vocabulary list.
+ * @param vocaList      vocabulary; a feature's id is its index in this array
+ * @param strDelimiter  literal (non-regex) separator between features in one input cell
+ * @param defaultValue  id for unknown features when numOovBuckets is 0
+ * @param numOovBuckets number of hash buckets (ids after the vocabulary) for unknown features
+ */
+class CategoricalColVocaList[T: ClassTag](
+  val vocaList: Array[String],
+  val strDelimiter: String = ",",
+  val defaultValue: Int = -1,
+  val numOovBuckets: Int = 0
+) (implicit ev: TensorNumeric[T])
+  extends Operation[Tensor[String], Tensor[Int], T]{
+
+  private val vocaLen = vocaList.length
+  private val vocaMap = vocaList.zipWithIndex.toMap
+
+  require(numOovBuckets >= 0, "numOovBuckets is a negative integer")
+  require(!(defaultValue != -1 && numOovBuckets != 0),
+    "defaultValue and numOovBuckets are both specified")
+  require(vocaLen > 0, "the vocabulary list is empty")
+  require(vocaLen == vocaMap.size, "the vocabulary list contains duplicate keys")
+  output = Tensor[Int]()
+
+  /** Id for an unknown feature: defaultValue, or a hash bucket after the vocabulary. */
+  private def oovId(feature: String): Int =
+    if (numOovBuckets == 0) defaultValue
+    else {
+      val bucket = MurmurHash3.stringHash(feature) % numOovBuckets
+      vocaLen + (if (bucket < 0) bucket + numOovBuckets else bucket)
+    }
+
+  /** Splits column 1 of each row; sparse entry (row, position) holds that feature's id. */
+  override def updateOutput(input: Tensor[String]): Tensor[Int] = {
+    val rows = input.size(dim = 1)
+    val cols = if (numOovBuckets == 0) vocaLen + 1 else vocaLen + numOovBuckets
+    // String.split takes a regex; quote so delimiters such as "|" or "." stay literal.
+    val delimiterRegex = java.util.regex.Pattern.quote(strDelimiter)
+    val rowIndices = new ArrayBuffer[Int]()
+    val colIndices = new ArrayBuffer[Int]()
+    val values = new ArrayBuffer[Int]()
+    var i = 1
+    while (i <= rows) {
+      val features = input.valueAt(i, 1).split(delimiterRegex)
+      var j = 0
+      while (j < features.length) {
+        rowIndices += i - 1  // valueAt is 1-based, sparse indices are 0-based
+        colIndices += j
+        values += vocaMap.getOrElse(features(j), oovId(features(j)))
+        j += 1
+      }
+      i += 1
+    }
+    output = Tensor.sparse(
+      Array(rowIndices.toArray, colIndices.toArray), values.toArray, Array(rows, cols))
+    output
+  }
+}
+
+/**
+ * Factory for [[CategoricalColVocaList]]: `apply` forwards its arguments, in declaration
+ * order, to the class constructor, so `CategoricalColVocaList[T](...)` works without `new`.
+ */
+object CategoricalColVocaList {
+  def apply[T: ClassTag](
+    vocaList: Array[String],
+    strDelimiter: String = ",",
+    defaultValue: Int = -1,
+    numOovBuckets: Int = 0
+  ) (implicit ev: TensorNumeric[T]): CategoricalColVocaList[T] = {
+    new CategoricalColVocaList[T](vocaList, strDelimiter, defaultValue, numOovBuckets)
+  }
+}
diff --git a/spark/dl/src/test/scala/com/intel/analytics/bigdl/nn/ops/CategoricalColVocaListSpec.scala b/spark/dl/src/test/scala/com/intel/analytics/bigdl/nn/ops/CategoricalColVocaListSpec.scala
@@ -0,0 +1,60 @@
+/*
+ * Copyright 2016 The BigDL Authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.intel.analytics.bigdl.nn.ops
+
+import com.intel.analytics.bigdl.tensor.Tensor
+import com.intel.analytics.bigdl.utils.T
+import org.scalatest.{FlatSpec, Matchers}
+
+class CategoricalColVocaListSpec extends FlatSpec with Matchers{
+
+  "CategoricalColVocaList operation with default value" should "work correctly" in {
+    // "D" is not in the vocabulary, so it is expected to map to defaultValue (-1).
+    val op = CategoricalColVocaList[Double](
+      vocaList = Array("A", "B", "C"),
+      strDelimiter = ",",
+      defaultValue = -1,
+      numOovBuckets = 0
+    )
+    val features = Tensor[String](T(T("A"), T("B"), T("C"), T("D")))
+    val expected = Tensor.sparse(
+      Array(Array(0, 1, 2, 3), Array(0, 0, 0, 0)),
+      Array(0, 1, 2, -1),
+      Array(4, 4)
+    )
+
+    op.forward(features) should be(expected)
+  }
+
+  "CategoricalColVocaList operation with numOvvBucket" should "work correctly" in {
+    // Two OOV buckets follow the three vocabulary ids; "D" is expected in bucket id 4.
+    val op = CategoricalColVocaList[Double](
+      vocaList = Array("A", "B", "C"),
+      strDelimiter = ",",
+      numOovBuckets = 2
+    )
+    val features = Tensor[String](T(T("A,B"), T("C"), T("B,C,D"), T("A,D")))
+    val rowIdx = Array(0, 0, 1, 2, 2, 2, 3, 3)
+    val colIdx = Array(0, 1, 0, 0, 1, 2, 0, 1)
+    val expected = Tensor.sparse(
+      Array(rowIdx, colIdx),
+      Array(0, 1, 2, 1, 2, 4, 0, 4),
+      Array(4, 5)
+    )
+
+    op.forward(features) should be(expected)
+  }
+}
diff --git a/...l/src/test/scala/com/intel/analytics/bigdl/utils/serializer/OperationSerializerSpec.scala b/...l/src/test/scala/com/intel/analytics/bigdl/utils/serializer/OperationSerializerSpec.scala
@@ -21,7 +21,7 @@ import java.io.{File => JFile}
 import com.google.protobuf.{ByteString, CodedOutputStream}
 import com.intel.analytics.bigdl.nn._
 import com.intel.analytics.bigdl.nn.abstractnn.DataFormat
-import com.intel.analytics.bigdl.nn.ops.{All, Any, ApproximateEqual, ArgMax, Assert, Assign, AssignGrad, AvgPoolGrad, BatchMatMul, BiasAddGrad, BroadcastGradientArgs, Cast, CategoricalColHashBucket, Ceil, ControlNodes, Conv2D, Conv2DBackFilter, Conv2DTranspose, Conv3D, Conv3DBackpropFilter, Conv3DBackpropFilterV2, Conv3DBackpropInput, Conv3DBackpropInputV2, CrossEntropy, DecodeImage, DepthwiseConv2D, DepthwiseConv2DBackpropFilter, DepthwiseConv2DBackpropInput, Digamma, Dilation2D, Dilation2DBackpropFilter, Dilation2DBackpropInput, EluGrad, Equal, Erf, Erfc, Expm1, Floor, FloorDiv, FloorMod, FusedBatchNorm, FusedBatchNormGrad, Greater, GreaterEqual, InTopK, Inv, InvGrad, IsFinite, IsInf, IsNan, Kv2Tensor, L2Loss, LRNGrad, Less, LessEqual, Lgamma, LogicalAnd, LogicalNot, LogicalOr, MaxPool, MaxPoolGrad, Maximum, MergeOps, Minimum, Mod, ModuleToOperation, NotEqual, OneHot, Pad, ParseExample, Prod, RandomUniform, RangeOps, Rank, Relu6Grad, ReluGrad, ResizeBilinearGrad, ResizeBilinearOps, Rint, Round, RsqrtGrad, SegmentSum, SigmoidGrad, Sign, Slice, SoftplusGrad, SoftsignGrad, SqrtGrad, SquaredDifference, Substr, SwitchOps, TanhGrad, TopK, TruncateDiv, TruncatedNormal, Add => AddOps, DecodeGif => DecodeGifOps, DecodeJpeg => DecodeJpegOps, DecodePng => DecodePngOps, DecodeRaw => DecodeRawOps, Exp => ExpOps, Pow => PowOps, Select => SelectOps, Sum => SumOps, Tile => TileOps}
+import com.intel.analytics.bigdl.nn.ops.{All, Any, ApproximateEqual, ArgMax, Assert, Assign, AssignGrad, AvgPoolGrad, BatchMatMul, BiasAddGrad, BroadcastGradientArgs, Cast, CategoricalColHashBucket, CategoricalColVocaList, Ceil, ControlNodes, Conv2D, Conv2DBackFilter, Conv2DTranspose, Conv3D, Conv3DBackpropFilter, Conv3DBackpropFilterV2, Conv3DBackpropInput, Conv3DBackpropInputV2, CrossEntropy, DecodeImage, DepthwiseConv2D, DepthwiseConv2DBackpropFilter, DepthwiseConv2DBackpropInput, Digamma, Dilation2D, Dilation2DBackpropFilter, Dilation2DBackpropInput, EluGrad, Equal, Erf, Erfc, Expm1, Floor, FloorDiv, FloorMod, FusedBatchNorm, FusedBatchNormGrad, Greater, GreaterEqual, InTopK, Inv, InvGrad, IsFinite, IsInf, IsNan, Kv2Tensor, L2Loss, LRNGrad, Less, LessEqual, Lgamma, LogicalAnd, LogicalNot, LogicalOr, MaxPool, MaxPoolGrad, Maximum, MergeOps, Minimum, Mod, ModuleToOperation, NotEqual, OneHot, Pad, ParseExample, Prod, RandomUniform, RangeOps, Rank, Relu6Grad, ReluGrad, ResizeBilinearGrad, ResizeBilinearOps, Rint, Round, RsqrtGrad, SegmentSum, SigmoidGrad, Sign, Slice, SoftplusGrad, SoftsignGrad, SqrtGrad, SquaredDifference, Substr, SwitchOps, TanhGrad, TopK, TruncateDiv, TruncatedNormal, Add => AddOps, DecodeGif => DecodeGifOps, DecodeJpeg => DecodeJpegOps, DecodePng => DecodePngOps, DecodeRaw => DecodeRawOps, Exp => ExpOps, Pow => PowOps, Select => SelectOps, Sum => SumOps, Tile => TileOps}
 import com.intel.analytics.bigdl.nn.tf._
 import com.intel.analytics.bigdl.nn.{SoftPlus => BigDLSoftPlus}
 import com.intel.analytics.bigdl.tensor._
@@ -466,6 +466,17 @@ class OperationSerializerSpec extends SerializerSpecHelper {
     runSerializationTest(categoricalColHashBucket, input)
   }
 
+  "CategoricalColVocaList" should "work properly" in {
+    // Runs the serialization check on the default-value configuration (no OOV buckets);
+    // the last input row, "D", is not in the vocabulary.
+    val vocabulary = Array("A", "B", "C")
+    val op = CategoricalColVocaList[Float](
+      vocabulary, strDelimiter = ",", defaultValue = -1, numOovBuckets = 0
+    ).setName("categoricalColVocaList")
+    val batch = Tensor[String](T(T("A"), T("B"), T("C"), T("D")))
+    runSerializationTest(op, batch)
+  }
+
   "LessEqual serializer" should "work properly" in {
     val lessEqual = LessEqual[Float]().setName("lessEqual")
     val input1 = Tensor[Float](5).apply1(_ => Random.nextFloat())