Fix TextVectorization with output_sequence_length on unknown input shapes

When output_sequence_length is set and the input is a dense tensor, we would
error out whenever the static shape contained None in any dimension. Fix this
by using the dynamic shape to calculate the padding for output_sequence_length.

PiperOrigin-RevId: 382347472
mattdangerw authored and tensorflower-gardener committed Jun 30, 2021
1 parent f3f23fb commit bc48a17
Showing 2 changed files with 48 additions and 48 deletions.
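A minimal repro sketch of the failure mode, not part of the commit: it assumes a TF 2.5-era build where the layer is exported as tf.keras.layers.experimental.preprocessing.TextVectorization. Splitting a scalar string yields a dense token tensor of static shape (None,), so mapping the layer over an unbatched dataset of scalar strings hits exactly the dense-input, unknown-static-shape case named in the commit message:

    import tensorflow as tf

    # Hypothetical repro: before this fix, tracing the layer inside
    # Dataset.map failed because padding was computed from the static
    # shape (None,); after it, the dynamic shape is used instead.
    layer = tf.keras.layers.experimental.preprocessing.TextVectorization(
        output_sequence_length=3)
    layer.adapt(["two two two", "two three three", "three four four five"])

    ds = tf.data.Dataset.from_tensor_slices(["two three", "four five"])
    for out in ds.map(layer):  # errored before the fix; now pads to length 3
      print(out.numpy())       # [2 3 0], then [4 5 0]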
44 changes: 23 additions & 21 deletions keras/layers/preprocessing/text_vectorization.py
@@ -335,10 +335,6 @@ def compute_output_signature(self, input_spec):
     return tf.TensorSpec(shape=output_shape, dtype=output_dtype)
 
   def update_state(self, data):
-    if isinstance(data, (list, tuple, np.ndarray)):
-      data = tf.convert_to_tensor(data)
-    if data.shape.rank == 1:
-      data = tf.expand_dims(data, axis=-1)
     self._index_lookup_layer.update_state(self._preprocess(data))
 
   def finalize_state(self):
@@ -485,23 +481,29 @@ def call(self, inputs):
       return inputs
 
     lookup_data = self._index_lookup_layer(inputs)
-    if self._output_mode == INT:
-
-      # Maybe trim the output (NOOP if self._output_sequence_length is None).
-      output_tensor = lookup_data[..., :self._output_sequence_length]
-
-      output_shape = output_tensor.shape.as_list()
-      output_shape[-1] = self._output_sequence_length
-
-      # If it is a ragged tensor, convert it to dense with correct shape.
-      if tf_utils.is_ragged(output_tensor):
-        return output_tensor.to_tensor(default_value=0, shape=output_shape)
-
-      if self._output_sequence_length is None:
-        return output_tensor
-
-      padding, _ = tf.required_space_to_batch_paddings(
-          output_tensor.shape, output_shape)
-      return tf.pad(output_tensor, padding)
+    # For any non-int output, we can return directly from the underlying layer.
+    if self._output_mode is not INT:
+      return lookup_data
+
+    # If we have a ragged tensor, we can pad during the conversion to dense.
+    if tf_utils.is_ragged(lookup_data):
+      shape = lookup_data.shape.as_list()
+      # If output sequence length is None, to_tensor will pad the last dimension
+      # to the bounding shape of the ragged dimension.
+      shape[-1] = self._output_sequence_length
+      return lookup_data.to_tensor(default_value=0, shape=shape)
+
+    # If we have a dense tensor, we need to pad/trim directly.
+    if self._output_sequence_length is not None:
+      # Maybe trim the output.
+      lookup_data = lookup_data[..., :self._output_sequence_length]
+
+      # Maybe pad the output. We need to be careful to use dynamic shape here as
+      # required_space_to_batch_paddings requires a fully known shape.
+      shape = tf.shape(lookup_data)
+      padded_shape = tf.concat((shape[:-1], [self._output_sequence_length]), 0)
+      padding, _ = tf.required_space_to_batch_paddings(shape, padded_shape)
+      return tf.pad(lookup_data, padding)
 
     return lookup_data
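The crux of the new dense branch is computing the target shape from tf.shape(...) (dynamic, always fully defined at runtime) rather than from Tensor.shape (static, possibly containing None). A standalone sketch of the same pattern, independent of the layer — pad_last_dim and f are illustrative names, not APIs from the commit:

    import tensorflow as tf

    def pad_last_dim(t, length):
      # Trim, then zero-pad the last dimension of a dense tensor to `length`.
      # Using tf.shape keeps this traceable even when static dims are None.
      t = t[..., :length]
      shape = tf.shape(t)
      padded_shape = tf.concat((shape[:-1], [length]), 0)
      # required_space_to_batch_paddings pads each dim up to the next multiple
      # of the "block" shape; with block == padded_shape this pads the last
      # dim to exactly `length` and leaves the other dims untouched.
      padding, _ = tf.required_space_to_batch_paddings(shape, padded_shape)
      return tf.pad(t, padding)

    @tf.function(input_signature=[tf.TensorSpec([None, None], tf.int64)])
    def f(t):
      return pad_last_dim(t, 5)  # a static shape of (None, None) is fine here

    print(f(tf.constant([[1, 2], [3, 4]], dtype=tf.int64)).numpy())
    # [[1 2 0 0 0]
    #  [3 4 0 0 0]]

The trim-before-pad order means sequences longer than the target length are truncated and shorter ones are zero-padded, which matches the layer's documented output_sequence_length behavior.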
52 changes: 25 additions & 27 deletions keras/layers/preprocessing/text_vectorization_test.py
@@ -318,12 +318,10 @@ def test_scalar_input_int_mode_no_len_limit(self):
     layer = text_vectorization.TextVectorization()
     layer.adapt(vocab_data)
     out = layer(input_data)
-    if tf.executing_eagerly():
-      self.assertAllClose(out.numpy(), [2, 3, 4, 5, 5, 4, 2, 1])
+    self.assertAllClose(out.numpy(), [2, 3, 4, 5, 5, 4, 2, 1])
     layer.set_vocabulary(["earth", "wind", "and", "fire"])
     out = layer(input_data)
-    if tf.executing_eagerly():
-      self.assertAllClose(out.numpy(), [2, 3, 4, 5, 5, 4, 2, 1])
+    self.assertAllClose(out.numpy(), [2, 3, 4, 5, 5, 4, 2, 1])
 
   def test_scalar_input_int_mode_trim_to_len_limit(self):
     vocab_data = [
@@ -333,12 +331,10 @@ def test_scalar_input_int_mode_trim_to_len_limit(self):
     layer = text_vectorization.TextVectorization(output_sequence_length=3)
     layer.adapt(vocab_data)
     out = layer(input_data)
-    if tf.executing_eagerly():
-      self.assertAllClose(out.numpy(), [2, 3, 4])
+    self.assertAllClose(out.numpy(), [2, 3, 4])
     layer.set_vocabulary(["earth", "wind", "and", "fire"])
     out = layer(input_data)
-    if tf.executing_eagerly():
-      self.assertAllClose(out.numpy(), [2, 3, 4])
+    self.assertAllClose(out.numpy(), [2, 3, 4])
 
   def test_scalar_input_int_pad_to_len_limit(self):
     vocab_data = [
@@ -348,25 +344,21 @@ def test_scalar_input_int_pad_to_len_limit(self):
     layer = text_vectorization.TextVectorization(output_sequence_length=10)
     layer.adapt(vocab_data)
     out = layer(input_data)
-    if tf.executing_eagerly():
-      self.assertAllClose(out.numpy(), [2, 3, 4, 5, 5, 4, 2, 1, 0, 0])
+    self.assertAllClose(out.numpy(), [2, 3, 4, 5, 5, 4, 2, 1, 0, 0])
     layer.set_vocabulary(["earth", "wind", "and", "fire"])
     out = layer(input_data)
-    if tf.executing_eagerly():
-      self.assertAllClose(out.numpy(), [2, 3, 4, 5, 5, 4, 2, 1, 0, 0])
+    self.assertAllClose(out.numpy(), [2, 3, 4, 5, 5, 4, 2, 1, 0, 0])
 
   def test_list_inputs_1d(self):
     vocab_data = ["two two two", "two three three", "three four four five"]
     input_data = ["two three", "four five"]
     layer = text_vectorization.TextVectorization()
     layer.adapt(vocab_data)
     out = layer(input_data)
-    if tf.executing_eagerly():
-      self.assertAllClose(out.numpy(), [[2, 3], [4, 5]])
+    self.assertAllClose(out.numpy(), [[2, 3], [4, 5]])
     layer.set_vocabulary(["two", "three", "four", "five"])
     out = layer(input_data)
-    if tf.executing_eagerly():
-      self.assertAllClose(out.numpy(), [[2, 3], [4, 5]])
+    self.assertAllClose(out.numpy(), [[2, 3], [4, 5]])
 
   def test_tensor_inputs(self):
     vocab_data = tf.constant(
@@ -375,12 +367,10 @@ def test_tensor_inputs(self):
     layer = text_vectorization.TextVectorization()
     layer.adapt(vocab_data)
     out = layer(input_data)
-    if tf.executing_eagerly():
-      self.assertAllClose(out.numpy(), [[2, 3], [4, 5]])
+    self.assertAllClose(out.numpy(), [[2, 3], [4, 5]])
     layer.set_vocabulary(["two", "three", "four", "five"])
     out = layer(input_data)
-    if tf.executing_eagerly():
-      self.assertAllClose(out.numpy(), [[2, 3], [4, 5]])
+    self.assertAllClose(out.numpy(), [[2, 3], [4, 5]])
 
   def test_list_inputs_2d(self):
     vocab_data = [
@@ -389,22 +379,30 @@ def test_list_inputs_2d(self):
     layer = text_vectorization.TextVectorization()
     layer.adapt(vocab_data)
     out = layer(input_data)
-    if tf.executing_eagerly():
-      self.assertAllClose(out.numpy(), [[2, 3], [4, 5]])
+    self.assertAllClose(out.numpy(), [[2, 3], [4, 5]])
     layer.set_vocabulary(["two", "three", "four", "five"])
     out = layer(input_data)
-    if tf.executing_eagerly():
-      self.assertAllClose(out.numpy(), [[2, 3], [4, 5]])
+    self.assertAllClose(out.numpy(), [[2, 3], [4, 5]])
 
   def test_dataset_of_single_strings(self):
     vocab_data = ["two two two", "two three three", "three four four five"]
     input_data = ["two three", "four five"]
     vocab_ds = tf.data.Dataset.from_tensor_slices(vocab_data)  # unbatched
+    input_ds = tf.data.Dataset.from_tensor_slices(input_data)  # unbatched
     layer = text_vectorization.TextVectorization()
     layer.adapt(vocab_ds)
-    out = layer(input_data)
-    if tf.executing_eagerly():
-      self.assertAllClose(out.numpy(), [[2, 3], [4, 5]])
+    out = input_ds.map(layer)
+    self.assertAllClose(list(out.as_numpy_iterator()), [[2, 3], [4, 5]])
+
+  def test_dataset_of_single_strings_with_output_sequence(self):
+    vocab_data = ["two two two", "two three three", "three four four five"]
+    input_data = ["two three", "four five"]
+    vocab_ds = tf.data.Dataset.from_tensor_slices(vocab_data)  # unbatched
+    input_ds = tf.data.Dataset.from_tensor_slices(input_data)  # unbatched
+    layer = text_vectorization.TextVectorization(output_sequence_length=3)
+    layer.adapt(vocab_ds)
+    out = input_ds.map(layer)
+    self.assertAllClose(list(out.as_numpy_iterator()), [[2, 3, 0], [4, 5, 0]])
 
   @parameterized.named_parameters(
       {
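The reworked dataset tests are what give this fix graph-mode coverage: input_ds.map(layer) traces the layer into a tf.data graph, where token counts are not statically known, while the dropped if tf.executing_eagerly(): guards had silently skipped the assertions outside eager execution. For contrast with that dense path, the ragged branch in call() delegates pad/trim to RaggedTensor.to_tensor; a small sketch of that behavior, with illustrative values not taken from the test file:

    import tensorflow as tf

    # to_tensor pads short rows with default_value and trims long rows when a
    # fixed last dimension is given; None leaves a dimension at its actual size.
    rt = tf.ragged.constant([[2, 3], [4, 5, 5, 4]], dtype=tf.int64)
    print(rt.to_tensor(default_value=0, shape=[None, 3]).numpy())
    # [[2 3 0]
    #  [4 5 5]]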
