From bc48a179d9cbb730bd5910efdf9d53cb1a9b2120 Mon Sep 17 00:00:00 2001
From: Matt Watson
Date: Wed, 30 Jun 2021 11:25:00 -0700
Subject: [PATCH] Fix TextVectorization with output_sequence_length on unknown
 input shapes

When output_sequence_length is set, for dense tensor input, we would
error out when the static shape contains None on any dimension. We can
fix that by using the dynamic shape to calculate padding for
output_sequence_length.

PiperOrigin-RevId: 382347472
---
 .../preprocessing/text_vectorization.py      | 44 ++++++++--------
 .../preprocessing/text_vectorization_test.py | 52 +++++++++----------
 2 files changed, 48 insertions(+), 48 deletions(-)

diff --git a/keras/layers/preprocessing/text_vectorization.py b/keras/layers/preprocessing/text_vectorization.py
index c14091c339e..80c1a01d839 100644
--- a/keras/layers/preprocessing/text_vectorization.py
+++ b/keras/layers/preprocessing/text_vectorization.py
@@ -335,10 +335,6 @@ def compute_output_signature(self, input_spec):
     return tf.TensorSpec(shape=output_shape, dtype=output_dtype)
 
   def update_state(self, data):
-    if isinstance(data, (list, tuple, np.ndarray)):
-      data = tf.convert_to_tensor(data)
-    if data.shape.rank == 1:
-      data = tf.expand_dims(data, axis=-1)
     self._index_lookup_layer.update_state(self._preprocess(data))
 
   def finalize_state(self):
@@ -485,23 +481,29 @@ def call(self, inputs):
       return inputs
 
     lookup_data = self._index_lookup_layer(inputs)
-    if self._output_mode == INT:
-
-      # Maybe trim the output (NOOP if self._output_sequence_length is None).
-      output_tensor = lookup_data[..., :self._output_sequence_length]
-
-      output_shape = output_tensor.shape.as_list()
-      output_shape[-1] = self._output_sequence_length
-
-      # If it is a ragged tensor, convert it to dense with correct shape.
-      if tf_utils.is_ragged(output_tensor):
-        return output_tensor.to_tensor(default_value=0, shape=output_shape)
-
-      if self._output_sequence_length is None:
-        return output_tensor
-
-      padding, _ = tf.required_space_to_batch_paddings(
-          output_tensor.shape, output_shape)
-      return tf.pad(output_tensor, padding)
+
+    # For any non-int output, we can return directly from the underlying layer.
+    if self._output_mode is not INT:
+      return lookup_data
+
+    # If we have a ragged tensor, we can pad during the conversion to dense.
+    if tf_utils.is_ragged(lookup_data):
+      shape = lookup_data.shape.as_list()
+      # If output sequence length is None, to_tensor will pad the last dimension
+      # to the bounding shape of the ragged dimension.
+      shape[-1] = self._output_sequence_length
+      return lookup_data.to_tensor(default_value=0, shape=shape)
+
+    # If we have a dense tensor, we need to pad/trim directly.
+    if self._output_sequence_length is not None:
+      # Maybe trim the output.
+      lookup_data = lookup_data[..., :self._output_sequence_length]
+
+      # Maybe pad the output. We need to be careful to use dynamic shape here as
+      # required_space_to_batch_paddings requires a fully known shape.
+      shape = tf.shape(lookup_data)
+      padded_shape = tf.concat((shape[:-1], [self._output_sequence_length]), 0)
+      padding, _ = tf.required_space_to_batch_paddings(shape, padded_shape)
+      return tf.pad(lookup_data, padding)
 
     return lookup_data
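Note: the following standalone sketch is not part of the patch; it illustrates
the dynamic-shape padding used in call() above. It assumes TF 2.x, and the
helper name pad_to_sequence_length is hypothetical.

    import tensorflow as tf

    def pad_to_sequence_length(lookup_data, output_sequence_length):
      # Trim anything past the target length (a no-op for shorter inputs).
      lookup_data = lookup_data[..., :output_sequence_length]
      # tf.shape() is the dynamic shape: fully known at runtime even when the
      # static lookup_data.shape contains None (e.g. inside Dataset.map).
      shape = tf.shape(lookup_data)
      padded_shape = tf.concat((shape[:-1], [output_sequence_length]), 0)
      # required_space_to_batch_paddings returns the minimal end-padding that
      # makes `shape` divisible by `padded_shape`; since the trimmed input is
      # never longer than the target, this pads the last dimension to exactly
      # output_sequence_length.
      padding, _ = tf.required_space_to_batch_paddings(shape, padded_shape)
      return tf.pad(lookup_data, padding)

    # Example: pad_to_sequence_length(tf.constant([[2, 3]]), 5)
    # evaluates to [[2, 3, 0, 0, 0]].

Deriving padded_shape from tf.shape() rather than from the static shape is
what lifts the old requirement that every dimension be known when the layer
is traced.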
diff --git a/keras/layers/preprocessing/text_vectorization_test.py b/keras/layers/preprocessing/text_vectorization_test.py
index 8b8f53b81ab..5b0bbdc78f0 100644
--- a/keras/layers/preprocessing/text_vectorization_test.py
+++ b/keras/layers/preprocessing/text_vectorization_test.py
@@ -318,12 +318,10 @@ def test_scalar_input_int_mode_no_len_limit(self):
     layer = text_vectorization.TextVectorization()
     layer.adapt(vocab_data)
     out = layer(input_data)
-    if tf.executing_eagerly():
-      self.assertAllClose(out.numpy(), [2, 3, 4, 5, 5, 4, 2, 1])
+    self.assertAllClose(out.numpy(), [2, 3, 4, 5, 5, 4, 2, 1])
     layer.set_vocabulary(["earth", "wind", "and", "fire"])
     out = layer(input_data)
-    if tf.executing_eagerly():
-      self.assertAllClose(out.numpy(), [2, 3, 4, 5, 5, 4, 2, 1])
+    self.assertAllClose(out.numpy(), [2, 3, 4, 5, 5, 4, 2, 1])
 
   def test_scalar_input_int_mode_trim_to_len_limit(self):
     vocab_data = [
@@ -333,12 +331,10 @@ def test_scalar_input_int_mode_trim_to_len_limit(self):
     layer = text_vectorization.TextVectorization(output_sequence_length=3)
     layer.adapt(vocab_data)
     out = layer(input_data)
-    if tf.executing_eagerly():
-      self.assertAllClose(out.numpy(), [2, 3, 4])
+    self.assertAllClose(out.numpy(), [2, 3, 4])
     layer.set_vocabulary(["earth", "wind", "and", "fire"])
     out = layer(input_data)
-    if tf.executing_eagerly():
-      self.assertAllClose(out.numpy(), [2, 3, 4])
+    self.assertAllClose(out.numpy(), [2, 3, 4])
 
   def test_scalar_input_int_pad_to_len_limit(self):
     vocab_data = [
@@ -348,12 +344,10 @@ def test_scalar_input_int_pad_to_len_limit(self):
     layer = text_vectorization.TextVectorization(output_sequence_length=10)
     layer.adapt(vocab_data)
     out = layer(input_data)
-    if tf.executing_eagerly():
-      self.assertAllClose(out.numpy(), [2, 3, 4, 5, 5, 4, 2, 1, 0, 0])
+    self.assertAllClose(out.numpy(), [2, 3, 4, 5, 5, 4, 2, 1, 0, 0])
     layer.set_vocabulary(["earth", "wind", "and", "fire"])
     out = layer(input_data)
-    if tf.executing_eagerly():
-      self.assertAllClose(out.numpy(), [2, 3, 4, 5, 5, 4, 2, 1, 0, 0])
+    self.assertAllClose(out.numpy(), [2, 3, 4, 5, 5, 4, 2, 1, 0, 0])
 
   def test_list_inputs_1d(self):
     vocab_data = ["two two two", "two three three", "three four four five"]
@@ -361,12 +355,10 @@ def test_list_inputs_1d(self):
     layer = text_vectorization.TextVectorization()
     layer.adapt(vocab_data)
     out = layer(input_data)
-    if tf.executing_eagerly():
-      self.assertAllClose(out.numpy(), [[2, 3], [4, 5]])
+    self.assertAllClose(out.numpy(), [[2, 3], [4, 5]])
     layer.set_vocabulary(["two", "three", "four", "five"])
     out = layer(input_data)
-    if tf.executing_eagerly():
-      self.assertAllClose(out.numpy(), [[2, 3], [4, 5]])
+    self.assertAllClose(out.numpy(), [[2, 3], [4, 5]])
 
   def test_tensor_inputs(self):
     vocab_data = tf.constant(
@@ -375,12 +367,10 @@ def test_tensor_inputs(self):
     layer = text_vectorization.TextVectorization()
     layer.adapt(vocab_data)
     out = layer(input_data)
-    if tf.executing_eagerly():
-      self.assertAllClose(out.numpy(), [[2, 3], [4, 5]])
+    self.assertAllClose(out.numpy(), [[2, 3], [4, 5]])
     layer.set_vocabulary(["two", "three", "four", "five"])
     out = layer(input_data)
-    if tf.executing_eagerly():
-      self.assertAllClose(out.numpy(), [[2, 3], [4, 5]])
+    self.assertAllClose(out.numpy(), [[2, 3], [4, 5]])
 
   def test_list_inputs_2d(self):
     vocab_data = [
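The final hunk below adds coverage for the case this patch fixes: mapping the
layer over an unbatched tf.data.Dataset, where each element is a scalar string
whose tokenized length is statically unknown, forcing the dense padding path
onto the dynamic shape. A minimal usage sketch, assuming the public tf.keras
export of the layer (tf.keras.layers.experimental.preprocessing.TextVectorization
around TF 2.5):

    import tensorflow as tf

    vocab_ds = tf.data.Dataset.from_tensor_slices(
        ["two two two", "two three three", "three four four five"])
    input_ds = tf.data.Dataset.from_tensor_slices(["two three", "four five"])

    layer = tf.keras.layers.experimental.preprocessing.TextVectorization(
        output_sequence_length=3)
    layer.adapt(vocab_ds)

    # Inside map(), the layer traces with an unknown token count; before this
    # change, the dense padding path raised on the partially unknown shape.
    out_ds = input_ds.map(layer)
    print(list(out_ds.as_numpy_iterator()))  # expected: [[2, 3, 0], [4, 5, 0]]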
@@ -389,22 +379,30 @@ def test_list_inputs_2d(self):
     layer = text_vectorization.TextVectorization()
     layer.adapt(vocab_data)
     out = layer(input_data)
-    if tf.executing_eagerly():
-      self.assertAllClose(out.numpy(), [[2, 3], [4, 5]])
+    self.assertAllClose(out.numpy(), [[2, 3], [4, 5]])
     layer.set_vocabulary(["two", "three", "four", "five"])
     out = layer(input_data)
-    if tf.executing_eagerly():
-      self.assertAllClose(out.numpy(), [[2, 3], [4, 5]])
+    self.assertAllClose(out.numpy(), [[2, 3], [4, 5]])
 
   def test_dataset_of_single_strings(self):
     vocab_data = ["two two two", "two three three", "three four four five"]
     input_data = ["two three", "four five"]
     vocab_ds = tf.data.Dataset.from_tensor_slices(vocab_data)  # unbatched
+    input_ds = tf.data.Dataset.from_tensor_slices(input_data)  # unbatched
     layer = text_vectorization.TextVectorization()
     layer.adapt(vocab_ds)
-    out = layer(input_data)
-    if tf.executing_eagerly():
-      self.assertAllClose(out.numpy(), [[2, 3], [4, 5]])
+    out = input_ds.map(layer)
+    self.assertAllClose(list(out.as_numpy_iterator()), [[2, 3], [4, 5]])
+
+  def test_dataset_of_single_strings_with_output_sequence(self):
+    vocab_data = ["two two two", "two three three", "three four four five"]
+    input_data = ["two three", "four five"]
+    vocab_ds = tf.data.Dataset.from_tensor_slices(vocab_data)  # unbatched
+    input_ds = tf.data.Dataset.from_tensor_slices(input_data)  # unbatched
+    layer = text_vectorization.TextVectorization(output_sequence_length=3)
+    layer.adapt(vocab_ds)
+    out = input_ds.map(layer)
+    self.assertAllClose(list(out.as_numpy_iterator()), [[2, 3, 0], [4, 5, 0]])
 
   @parameterized.named_parameters(
       {