apache · MrBago · Jan 17, 2018 · Jan 17, 2018 · Jan 17, 2018 · Jan 17, 2018
diff --git a/docs/ml-features.md b/docs/ml-features.md
@@ -1283,6 +1283,56 @@ for more details on the API.
 </div>
 </div>
 
+## VectorSizeHint
+
+It can sometimes be useful to explicitly specify the size of the vectors for a column of
+`VectorType`. For example, `VectorAssembler` uses size information from its input columns to
+produce size information and metadata for its output column. While in some cases this information
+can be obtained by inspecting the contents of the column, in a streaming dataframe the contents are
+not available until the stream is started. `VectorSizeHint` allows a user to explicitly specify the
+vector size for a column so that `VectorAssembler`, or other transformers that might
+need to know vector size, can use that column as an input.
+
+To use `VectorSizeHint` a user must set the `inputCol` and `size` parameters. Applying this
+transformer to a dataframe produces a new dataframe with updated metadata for `inputCol` specifying
+the vector size. Downstream operations on the resulting dataframe can get this size using the
+metadata.
+
+`VectorSizeHint` can also take an optional `handleInvalid` parameter which controls its
+behaviour when the vector column contains nulls or vectors of the wrong size. By default
+`handleInvalid` is set to "error", indicating an exception should be thrown. This parameter can
+also be set to "skip", indicating that rows containing invalid values should be filtered out from
+the resulting dataframe, or "optimistic", indicating that all rows should be kept. When
+`handleInvalid` is set to "optimistic", the user takes responsibility for ensuring that the column
+has no invalid values (values that don't match the column's metadata), or for dealing with
+those invalid values downstream.
+
+<div class="codetabs">
+<div data-lang="scala" markdown="1">
+
+Refer to the [VectorSizeHint Scala docs](api/scala/index.html#org.apache.spark.ml.feature.VectorSizeHint)
+for more details on the API.
+
+{% include_example scala/org/apache/spark/examples/ml/VectorSizeHintExample.scala %}
+</div>
+
+<div data-lang="java" markdown="1">
+
+Refer to the [VectorSizeHint Java docs](api/java/org/apache/spark/ml/feature/VectorSizeHint.html)
+for more details on the API.
+
+{% include_example java/org/apache/spark/examples/ml/JavaVectorSizeHintExample.java %}
+</div>
+
+<div data-lang="python" markdown="1">
+
+Refer to the [VectorSizeHint Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.VectorSizeHint)
+for more details on the API.
+
+{% include_example python/ml/vector_size_hint_example.py %}
+</div>
+</div>
+
 ## QuantileDiscretizer
 
 `QuantileDiscretizer` takes a column with continuous features and outputs a column with binned

diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSizeHintExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSizeHintExample.java
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.sql.SparkSession;
+
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.ml.feature.VectorAssembler;
+import org.apache.spark.ml.feature.VectorSizeHint;
+import org.apache.spark.ml.linalg.VectorUDT;
+import org.apache.spark.ml.linalg.Vectors;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+import static org.apache.spark.sql.types.DataTypes.*;
+// $example off$
+
+/**
+ * Example: use {@code VectorSizeHint} to declare the size of a vector column, dropping rows
+ * that do not conform, before combining columns with {@code VectorAssembler}.
+ */
+public class JavaVectorSizeHintExample {
+  public static void main(String[] args) {
+    SparkSession spark = SparkSession
+      .builder()
+      .appName("JavaVectorSizeHintExample")
+      .getOrCreate();
+
+    // $example on$
+    StructType schema = createStructType(new StructField[]{
+      createStructField("id", IntegerType, false),
+      createStructField("hour", IntegerType, false),
+      createStructField("mobile", DoubleType, false),
+      createStructField("userFeatures", new VectorUDT(), false),
+      createStructField("clicked", DoubleType, false)
+    });
+    Row row0 = RowFactory.create(0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0);
+    // This row's vector has only 2 elements, so it does not match the size hint below.
+    Row row1 = RowFactory.create(0, 18, 1.0, Vectors.dense(0.0, 10.0), 0.0);
+    Dataset<Row> dataset = spark.createDataFrame(Arrays.asList(row0, row1), schema);
+
+    // "skip" drops rows whose 'userFeatures' vector is null or not of size 3.
+    VectorSizeHint sizeHint = new VectorSizeHint()
+      .setInputCol("userFeatures")
+      .setHandleInvalid("skip")
+      .setSize(3);
+
+    Dataset<Row> datasetWithSize = sizeHint.transform(dataset);
+    System.out.println("Rows where 'userFeatures' is not the right size are filtered out");
+    datasetWithSize.show(false);
+
+    VectorAssembler assembler = new VectorAssembler()
+      .setInputCols(new String[]{"hour", "mobile", "userFeatures"})
+      .setOutputCol("features");
+
+    // This dataframe can be used by downstream transformers as before
+    Dataset<Row> output = assembler.transform(datasetWithSize);
+    System.out.println("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column " +
+        "'features'");
+    output.select("features", "clicked").show(false);
+    // $example off$
+
+    spark.stop();
+  }
+}
+
diff --git a/examples/src/main/python/ml/vector_size_hint_example.py b/examples/src/main/python/ml/vector_size_hint_example.py
@@ -0,0 +1,57 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+# $example on$
+from pyspark.ml.linalg import Vectors
+from pyspark.ml.feature import (VectorSizeHint, VectorAssembler)
+# $example off$
+from pyspark.sql import SparkSession
+
+if __name__ == "__main__":
+    spark = SparkSession\
+        .builder\
+        .appName("VectorSizeHintExample")\
+        .getOrCreate()
+
+    # $example on$
+    dataset = spark.createDataFrame(
+        [(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0),
+         (0, 18, 1.0, Vectors.dense([0.0, 10.0]), 0.0)],  # 2 elements: does not match size=3
+        ["id", "hour", "mobile", "userFeatures", "clicked"])
+
+    sizeHint = VectorSizeHint(
+        inputCol="userFeatures",
+        handleInvalid="skip",  # filter out rows whose vector is null or the wrong size
+        size=3)
+
+    datasetWithSize = sizeHint.transform(dataset)
+    print("Rows where 'userFeatures' is not the right size are filtered out")
+    datasetWithSize.show(truncate=False)
+
+    assembler = VectorAssembler(
+        inputCols=["hour", "mobile", "userFeatures"],
+        outputCol="features")
+
+    # This dataframe can be used by downstream transformers as before
+    output = assembler.transform(datasetWithSize)
+    print("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'")
+    output.select("features", "clicked").show(truncate=False)
+    # $example off$
+
+    spark.stop()
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/VectorSizeHintExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/VectorSizeHintExample.scala
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.{VectorAssembler, VectorSizeHint}
+import org.apache.spark.ml.linalg.Vectors
+// $example off$
+import org.apache.spark.sql.SparkSession
+
+object VectorSizeHintExample {
+  def main(args: Array[String]): Unit = {
+    val spark = SparkSession
+      .builder
+      .appName("VectorSizeHintExample")
+      .getOrCreate()
+
+    // $example on$
+    val dataset = spark.createDataFrame(
+      Seq(
+        (0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0),
+        (0, 18, 1.0, Vectors.dense(0.0, 10.0), 0.0)) // 2 elements: does not match size 3
+    ).toDF("id", "hour", "mobile", "userFeatures", "clicked")
+
+    val sizeHint = new VectorSizeHint()
+      .setInputCol("userFeatures")
+      .setHandleInvalid("skip") // filter out rows whose vector is null or the wrong size
+      .setSize(3)
+
+    val datasetWithSize = sizeHint.transform(dataset)
+    println("Rows where 'userFeatures' is not the right size are filtered out")
+    datasetWithSize.show(false)
+
+    val assembler = new VectorAssembler()
+      .setInputCols(Array("hour", "mobile", "userFeatures"))
+      .setOutputCol("features")
+
+    // This dataframe can be used by downstream transformers as before
+    val output = assembler.transform(datasetWithSize)
+    println("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'")
+    output.select("features", "clicked").show(false)
+    // $example off$
+
+    spark.stop()
+  }
+}
+// scalastyle:on println