diff --git a/docs/ml-features.md b/docs/ml-features.md
index 72643137d96b1..1e46be474d9bd 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -1283,6 +1283,48 @@ for more details on the API.
+## VectorSizeHint
+
+It can sometimes be useful to explicitly specify the size of the vectors in a column of
+`VectorType`. For example, `VectorAssembler` uses size information from its input columns to
+produce size information and metadata for its output column. While in some cases this information
+can be obtained by inspecting the contents of the column, in a streaming dataframe the contents are
+not available until the stream is started. `VectorSizeHint` allows a user to explicitly specify the
+vector size for a column so that `VectorAssembler`, or other transformers that might
+need to know vector size, can use that column as an input.
+
+To use `VectorSizeHint` a user must set the `inputCol` and `size` parameters. Applying this
+transformer to a dataframe produces a new dataframe with updated metadata for `inputCol` specifying
+the vector size. Downstream operations on the resulting dataframe can get this size using the
+metadata.
+
+`VectorSizeHint` can also take an optional `handleInvalid` parameter which controls its
+behaviour when the vector column contains nulls or vectors of the wrong size. By default
+`handleInvalid` is set to "error", indicating an exception should be thrown. This parameter can
+also be set to "skip", indicating that rows containing invalid values should be filtered out from
+the resulting dataframe, or "optimistic", indicating that all rows should be kept. When
+`handleInvalid` is set to "optimistic", the user takes responsibility for ensuring that the column
+does not have invalid values (i.e. values that don't match the column's metadata), or for dealing
+with those invalid values downstream.
+
+
+
+
+Refer to the [VectorSizeHint Scala docs](api/scala/index.html#org.apache.spark.ml.feature.VectorSizeHint)
+for more details on the API.
+
+{% include_example scala/org/apache/spark/examples/ml/VectorSizeHintExample.scala %}
+
+
+
+
+Refer to the [VectorSizeHint Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.VectorSizeHint)
+for more details on the API.
+
+{% include_example python/ml/vector_size_hint_example.py %}
+
+
+
## QuantileDiscretizer
`QuantileDiscretizer` takes a column with continuous features and outputs a column with binned
diff --git a/examples/src/main/python/ml/vector_size_hint_example.py b/examples/src/main/python/ml/vector_size_hint_example.py
new file mode 100644
index 0000000000000..a91bff6e3545a
--- /dev/null
+++ b/examples/src/main/python/ml/vector_size_hint_example.py
@@ -0,0 +1,55 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+# $example on$
+from pyspark.ml.linalg import Vectors
+from pyspark.ml.feature import (VectorSizeHint, VectorAssembler)
+# $example off$
+from pyspark.sql import SparkSession
+
+if __name__ == "__main__":
+ spark = SparkSession\
+ .builder\
+ .appName("VectorAssemblerExample")\
+ .getOrCreate()
+
+ # $example on$
+ dataset = spark.createDataFrame(
+ [(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0),
+ (0, 18, 1.0, Vectors.dense([0.0, 10.0]), 0.0)],
+ ["id", "hour", "mobile", "userFeatures", "clicked"])
+
+ sizeHint = VectorSizeHint(
+ inputCol="userFeatures",
+ handleInvalid="sip",
+ size=3)
+
+ datasetWithSize = sizeHint.transform(dataset)
+ datasetWithSize.show(truncate=False)
+
+ assembler = VectorAssembler(
+ inputCols=["hour", "mobile", "userFeatures"],
+ outputCol="features")
+
+ output = assembler.transform(datasetWithSize)
+ print("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'")
+ output.select("features", "clicked").show(truncate=False)
+ # $example off$
+
+ spark.stop()
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/VectorSizeHintExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/VectorSizeHintExample.scala
new file mode 100644
index 0000000000000..92e1c0200546a
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/VectorSizeHintExample.scala
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.{VectorAssembler, VectorSizeHint}
+import org.apache.spark.ml.linalg.Vectors
+// $example off$
+import org.apache.spark.sql.SparkSession
+
+object VectorSizeHintExample {
+  def main(args: Array[String]): Unit = {
+    val spark = SparkSession
+      .builder
+      .appName("VectorSizeHintExample")
+      .getOrCreate()
+
+    // $example on$
+    val dataset = spark.createDataFrame(
+      Seq(
+        (0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0),
+        (0, 18, 1.0, Vectors.dense(0.0, 10.0), 0.0)) // this vector has 2 elements, not 3
+    ).toDF("id", "hour", "mobile", "userFeatures", "clicked")
+
+    val sizeHint = new VectorSizeHint()
+      .setInputCol("userFeatures")
+      .setHandleInvalid("skip") // filter out rows whose vector is null or of the wrong size
+      .setSize(3)
+
+    val datasetWithSize = sizeHint.transform(dataset)
+    datasetWithSize.show(false)
+
+    val assembler = new VectorAssembler()
+      .setInputCols(Array("hour", "mobile", "userFeatures"))
+      .setOutputCol("features")
+
+    val output = assembler.transform(datasetWithSize)
+    output.select("features", "clicked").show(false)
+    // $example off$
+
+    spark.stop()
+  }
+}
+// scalastyle:on println