From b81d2e8b2c5f350f6106ca95ca3e96b4fade0620 Mon Sep 17 00:00:00 2001
From: tosky001
Date: Tue, 6 Feb 2018 17:30:14 +0800
Subject: [PATCH] solve conflict

---
 .../bigdl/nn/ops/CategoricalColVocaList.scala | 142 ++++++++++++++++++
 .../nn/ops/CategoricalColVocaListSpec.scala   |  60 ++++++++
 .../serializer/OperationSerializerSpec.scala  |  13 +-
 3 files changed, 214 insertions(+), 1 deletion(-)
 create mode 100644 spark/dl/src/main/scala/com/intel/analytics/bigdl/nn/ops/CategoricalColVocaList.scala
 create mode 100644 spark/dl/src/test/scala/com/intel/analytics/bigdl/nn/ops/CategoricalColVocaListSpec.scala

diff --git a/spark/dl/src/main/scala/com/intel/analytics/bigdl/nn/ops/CategoricalColVocaList.scala b/spark/dl/src/main/scala/com/intel/analytics/bigdl/nn/ops/CategoricalColVocaList.scala
new file mode 100644
index 00000000000..1de23cf06ef
--- /dev/null
+++ b/spark/dl/src/main/scala/com/intel/analytics/bigdl/nn/ops/CategoricalColVocaList.scala
@@ -0,0 +1,142 @@
+/*
+ * Copyright 2016 The BigDL Authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.intel.analytics.bigdl.nn.ops
+
+import com.intel.analytics.bigdl.tensor.Tensor
+import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric
+
+import scala.collection.mutable.ArrayBuffer
+import scala.reflect.ClassTag
+import scala.util.hashing.MurmurHash3
+
+/**
+ * Looks up delimiter-separated string features in an in-memory vocabulary list and emits
+ * their integer ids as a sparse tensor.
+ *
+ * For each row of the input, the string in the first column is split with `strDelimiter`
+ * and every resulting token is mapped to an id:
+ *  - a token contained in `vocaList` maps to its position in `vocaList`;
+ *  - any other token maps to `defaultValue` when `numOovBuckets` is 0, otherwise to
+ *    `vocaList.length + b`, where `b` is the MurmurHash3 string hash of the token reduced
+ *    to the range `[0, numOovBuckets)`.
+ *
+ * The output has shape `(rows, cols)` with `cols = vocaList.length + 1` when `numOovBuckets`
+ * is 0 and `cols = vocaList.length + numOovBuckets` otherwise. The id of the j-th token of
+ * the i-th row is stored at the zero-based index `(i, j)`.
+ *
+ * NOTE(review): the token position `j` is not checked against `cols`, so a row holding more
+ * tokens than `cols` yields a column index beyond the declared shape - confirm this is
+ * acceptable for downstream consumers.
+ *
+ * @param vocaList the vocabulary; must be non-empty and must not contain duplicates
+ * @param strDelimiter separator passed to `String.split`, which treats it as a regular
+ *                     expression
+ * @param defaultValue id assigned to out-of-vocabulary tokens when `numOovBuckets` is 0;
+ *                     it may only differ from -1 if `numOovBuckets` is 0
+ * @param numOovBuckets number of hash buckets for out-of-vocabulary tokens; must be >= 0
+ * @tparam T numeric type of the operation
+ * @throws IllegalArgumentException if one of the constraints above is violated
+ */
+class CategoricalColVocaList[T: ClassTag](
+  val vocaList: Array[String],
+  val strDelimiter: String = ",",
+  val defaultValue: Int = -1,
+  val numOovBuckets: Int = 0
+) (implicit ev: TensorNumeric[T])
+  extends Operation[Tensor[String], Tensor[Int], T]{
+
+  private val vocaLen = vocaList.length
+  private val vocaMap = vocaList.zipWithIndex.toMap
+
+  require(numOovBuckets >= 0,
+    "numOovBuckets is a negative integer")
+  require(!(defaultValue != -1 && numOovBuckets != 0),
+    "defaultValue and numOovBuckets are both specified")
+  require(vocaLen > 0,
+    "the vocabulary list is empty")
+  require(vocaLen == vocaMap.size,
+    "the vocabulary list contains duplicate keys")
+
+  output = Tensor[Int]()
+
+  /**
+   * Converts every token of `input` into its id.
+   *
+   * @param input string tensor; rows are taken along dimension 1 and only column 1 is read
+   * @return sparse tensor of shape `(rows, cols)` holding one id per token
+   */
+  override def updateOutput(input: Tensor[String]): Tensor[Int] = {
+    val rows = input.size(dim = 1)
+    val cols = if (numOovBuckets == 0) vocaLen + 1 else vocaLen + numOovBuckets
+    val shape = Array(rows, cols)
+    val rowIndices = new ArrayBuffer[Int]()
+    val colIndices = new ArrayBuffer[Int]()
+    val ids = new ArrayBuffer[Int]()
+
+    // Rows are read at positions 1..rows; the sparse indices emitted below start at 0.
+    var i = 1
+    while (i <= rows) {
+      val tokens = input.valueAt(i, 1).split(strDelimiter)
+      var j = 0
+      while (j < tokens.length) {
+        val id = lookup(tokens(j))
+        rowIndices += i - 1
+        colIndices += j
+        ids += id
+        j += 1
+      }
+      i += 1
+    }
+    output = Tensor.sparse(Array(rowIndices.toArray, colIndices.toArray), ids.toArray, shape)
+    output
+  }
+
+  /**
+   * Resolves one token to its id: the vocabulary position if the token is known, otherwise
+   * `defaultValue` (no OOV buckets) or a hash bucket id starting at `vocaLen`.
+   */
+  private def lookup(token: String): Int = {
+    vocaMap.getOrElse(token, {
+      if (numOovBuckets == 0) {
+        defaultValue
+      } else {
+        // `%` keeps the sign of the hash, so negative remainders are shifted into range.
+        val remainder = MurmurHash3.stringHash(token) % numOovBuckets
+        vocaLen + (if (remainder < 0) remainder + numOovBuckets else remainder)
+      }
+    })
+  }
+}
+
+/** Factory for [[CategoricalColVocaList]]. */
+object CategoricalColVocaList {
+  /**
+   * Creates a [[CategoricalColVocaList]] operation; the parameters are forwarded unchanged
+   * to the class constructor.
+   */
+  def apply[T: ClassTag](
+    vocaList: Array[String],
+    strDelimiter: String = ",",
+    defaultValue: Int = -1,
+    numOovBuckets: Int = 0
+  ) (implicit ev: TensorNumeric[T]): CategoricalColVocaList[T]
+  = new CategoricalColVocaList[T](
+    vocaList = vocaList,
+    strDelimiter = strDelimiter,
+    defaultValue = defaultValue,
+    numOovBuckets = numOovBuckets
+  )
+}
diff --git a/spark/dl/src/test/scala/com/intel/analytics/bigdl/nn/ops/CategoricalColVocaListSpec.scala b/spark/dl/src/test/scala/com/intel/analytics/bigdl/nn/ops/CategoricalColVocaListSpec.scala
new file mode 100644
index 00000000000..5739bc4dd4b
--- /dev/null
+++ b/spark/dl/src/test/scala/com/intel/analytics/bigdl/nn/ops/CategoricalColVocaListSpec.scala
@@ -0,0 +1,60 @@
+/*
+ * Copyright 2016 The BigDL Authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.intel.analytics.bigdl.nn.ops
+
+import com.intel.analytics.bigdl.tensor.Tensor
+import com.intel.analytics.bigdl.utils.T
+import org.scalatest.{FlatSpec, Matchers}
+
+class CategoricalColVocaListSpec extends FlatSpec with Matchers{
+
+  "CategoricalColVocaList operation with default value" should "work correctly" in {
+    // "D" is absent from the vocabulary, so its expected id is the
+    // configured default value (-1).
+    val features = Tensor[String](T(T("A"), T("B"), T("C"), T("D")))
+    val lookupOp = CategoricalColVocaList[Double](
+      vocaList = Array("A", "B", "C"),
+      strDelimiter = ",",
+      defaultValue = -1,
+      numOovBuckets = 0
+    )
+
+    val rowIdx = Array(0, 1, 2, 3)
+    val colIdx = Array(0, 0, 0, 0)
+    val expectedIds = Array(0, 1, 2, -1)
+    val expected = Tensor.sparse(Array(rowIdx, colIdx), expectedIds, Array(4, 4))
+
+    lookupOp.forward(features) should be(expected)
+  }
+
+  "CategoricalColVocaList operation with numOvvBucket" should "work correctly" in {
+    // Rows carry comma separated tokens; the expected ids place the
+    // unknown token "D" at id 4, i.e. past the three vocabulary entries.
+    val features = Tensor[String](T(T("A,B"), T("C"), T("B,C,D"), T("A,D")))
+    val lookupOp = CategoricalColVocaList[Double](
+      vocaList = Array("A", "B", "C"),
+      strDelimiter = ",",
+      numOovBuckets = 2
+    )
+
+    val rowIdx = Array(0, 0, 1, 2, 2, 2, 3, 3)
+    val colIdx = Array(0, 1, 0, 0, 1, 2, 0, 1)
+    val expectedIds = Array(0, 1, 2, 1, 2, 4, 0, 4)
+    val expected = Tensor.sparse(Array(rowIdx, colIdx), expectedIds, Array(4, 5))
+
+    lookupOp.forward(features) should be(expected)
+  }
+}
diff --git a/spark/dl/src/test/scala/com/intel/analytics/bigdl/utils/serializer/OperationSerializerSpec.scala b/spark/dl/src/test/scala/com/intel/analytics/bigdl/utils/serializer/OperationSerializerSpec.scala
index d6a977e466b..11b28474d4f 100644
--- a/spark/dl/src/test/scala/com/intel/analytics/bigdl/utils/serializer/OperationSerializerSpec.scala
+++ b/spark/dl/src/test/scala/com/intel/analytics/bigdl/utils/serializer/OperationSerializerSpec.scala
@@ -21,7 +21,7 @@ import java.io.{File => JFile}
 import com.google.protobuf.{ByteString, CodedOutputStream}
 import com.intel.analytics.bigdl.nn._
 import com.intel.analytics.bigdl.nn.abstractnn.DataFormat
-import com.intel.analytics.bigdl.nn.ops.{All, Any, ApproximateEqual, ArgMax, Assert, Assign, AssignGrad, AvgPoolGrad, BatchMatMul, BiasAddGrad, BroadcastGradientArgs, Cast, CategoricalColHashBucket, Ceil, ControlNodes, Conv2D, Conv2DBackFilter, Conv2DTranspose, Conv3D, Conv3DBackpropFilter, Conv3DBackpropFilterV2, Conv3DBackpropInput, Conv3DBackpropInputV2, CrossEntropy, DecodeImage, DepthwiseConv2D, DepthwiseConv2DBackpropFilter, DepthwiseConv2DBackpropInput, Digamma, Dilation2D, Dilation2DBackpropFilter, Dilation2DBackpropInput, EluGrad, Equal, Erf, Erfc, Expm1, Floor, FloorDiv, FloorMod, FusedBatchNorm, FusedBatchNormGrad, Greater, GreaterEqual, InTopK, Inv, InvGrad, IsFinite, IsInf, IsNan, Kv2Tensor, L2Loss, LRNGrad, Less, LessEqual, Lgamma, LogicalAnd, LogicalNot, LogicalOr, MaxPool, MaxPoolGrad, Maximum, MergeOps, Minimum, Mod, ModuleToOperation, NotEqual, OneHot, Pad, ParseExample, Prod, RandomUniform, RangeOps, Rank, Relu6Grad, ReluGrad, ResizeBilinearGrad, ResizeBilinearOps, Rint, Round, RsqrtGrad, SegmentSum, SigmoidGrad, Sign, Slice, SoftplusGrad, SoftsignGrad, SqrtGrad, SquaredDifference, Substr, SwitchOps, TanhGrad, TopK, TruncateDiv, TruncatedNormal, Add => AddOps, DecodeGif => DecodeGifOps, DecodeJpeg => DecodeJpegOps, DecodePng => DecodePngOps, DecodeRaw => DecodeRawOps, Exp => ExpOps, Pow => PowOps, Select => SelectOps, Sum => SumOps, Tile => TileOps}
+import com.intel.analytics.bigdl.nn.ops.{All, Any, ApproximateEqual, ArgMax, Assert, Assign, AssignGrad, AvgPoolGrad, BatchMatMul, BiasAddGrad, BroadcastGradientArgs, Cast, CategoricalColHashBucket, CategoricalColVocaList, Ceil, ControlNodes, Conv2D, Conv2DBackFilter, Conv2DTranspose, Conv3D, Conv3DBackpropFilter, Conv3DBackpropFilterV2, Conv3DBackpropInput, Conv3DBackpropInputV2, CrossEntropy, DecodeImage, DepthwiseConv2D, DepthwiseConv2DBackpropFilter, DepthwiseConv2DBackpropInput, Digamma, Dilation2D, Dilation2DBackpropFilter, Dilation2DBackpropInput, EluGrad, Equal, Erf, Erfc, Expm1, Floor, FloorDiv, FloorMod, FusedBatchNorm, FusedBatchNormGrad, Greater, GreaterEqual, InTopK, Inv, InvGrad, IsFinite, IsInf, IsNan, Kv2Tensor, L2Loss, LRNGrad, Less, LessEqual, Lgamma, LogicalAnd, LogicalNot, LogicalOr, MaxPool, MaxPoolGrad, Maximum, MergeOps, Minimum, Mod, ModuleToOperation, NotEqual, OneHot, Pad, ParseExample, Prod, RandomUniform, RangeOps, Rank, Relu6Grad, ReluGrad, ResizeBilinearGrad, ResizeBilinearOps, Rint, Round, RsqrtGrad, SegmentSum, SigmoidGrad, Sign, Slice, SoftplusGrad, SoftsignGrad, SqrtGrad, SquaredDifference, Substr, SwitchOps, TanhGrad, TopK, TruncateDiv, TruncatedNormal, Add => AddOps, DecodeGif => DecodeGifOps, DecodeJpeg => DecodeJpegOps, DecodePng => DecodePngOps, DecodeRaw => DecodeRawOps, Exp => ExpOps, Pow => PowOps, Select => SelectOps, Sum => SumOps, Tile => TileOps}
 import com.intel.analytics.bigdl.nn.tf._
 import com.intel.analytics.bigdl.nn.{SoftPlus => BigDLSoftPlus}
 import com.intel.analytics.bigdl.tensor._
@@ -466,6 +466,17 @@ class OperationSerializerSpec extends SerializerSpecHelper {
     runSerializationTest(categoricalColHashBucket, input)
   }
 
+  "CategoricalColVocaList" should "work properly" in {
+    // One sample token ("D") is outside the vocabulary handed to the op.
+    val input = Tensor[String](T(T("A"), T("B"), T("C"), T("D")))
+    val vocabulary = Array("A", "B", "C")
+    val op = CategoricalColVocaList[Float](
+      vocaList = vocabulary, strDelimiter = ",", defaultValue = -1, numOovBuckets = 0
+    )
+    op.setName("categoricalColVocaList")
+    runSerializationTest(op, input)
+  }
+
   "LessEqual serializer" should "work properly" in {
     val lessEqual = LessEqual[Float]().setName("lessEqual")
     val input1 = Tensor[Float](5).apply1(_ => Random.nextFloat())