From 864df8d701e96aa8a7c679b4a345e8a038130efb Mon Sep 17 00:00:00 2001
From: kamilakesbi <kamil@huggingface.co>
Date: Thu, 2 May 2024 12:22:48 +0200
Subject: [PATCH 01/25] fix input to generate in pipeline

---
 src/transformers/pipelines/automatic_speech_recognition.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/transformers/pipelines/automatic_speech_recognition.py b/src/transformers/pipelines/automatic_speech_recognition.py
index 2e8682d96a65e0..eb5a75e522c7fc 100644
--- a/src/transformers/pipelines/automatic_speech_recognition.py
+++ b/src/transformers/pipelines/automatic_speech_recognition.py
@@ -509,6 +509,7 @@ def _forward(self, model_inputs, return_timestamps=False, **generate_kwargs):
                 generate_kwargs["encoder_outputs"] = encoder(inputs, attention_mask=attention_mask)
 
             tokens = self.model.generate(
+                inputs=inputs, 
                 attention_mask=attention_mask,
                 **generate_kwargs,
             )

From ff0c6381635d3b5ae2d776127d43f6d6cdeca083 Mon Sep 17 00:00:00 2001
From: kamilakesbi <kamil@huggingface.co>
Date: Thu, 2 May 2024 12:25:15 +0200
Subject: [PATCH 02/25] fixup

---
 src/transformers/pipelines/automatic_speech_recognition.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/pipelines/automatic_speech_recognition.py b/src/transformers/pipelines/automatic_speech_recognition.py
index eb5a75e522c7fc..5b38e5f7c983e6 100644
--- a/src/transformers/pipelines/automatic_speech_recognition.py
+++ b/src/transformers/pipelines/automatic_speech_recognition.py
@@ -509,7 +509,7 @@ def _forward(self, model_inputs, return_timestamps=False, **generate_kwargs):
                 generate_kwargs["encoder_outputs"] = encoder(inputs, attention_mask=attention_mask)
 
             tokens = self.model.generate(
-                inputs=inputs, 
+                inputs=inputs,
                 attention_mask=attention_mask,
                 **generate_kwargs,
             )

From 749cfaa5fdcfd5fb1e8de0f887e98431ffea986d Mon Sep 17 00:00:00 2001
From: kamilakesbi <kamil@huggingface.co>
Date: Fri, 3 May 2024 16:27:11 +0200
Subject: [PATCH 03/25] pass input_features to generate with assistant

---
 src/transformers/pipelines/automatic_speech_recognition.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/transformers/pipelines/automatic_speech_recognition.py b/src/transformers/pipelines/automatic_speech_recognition.py
index 5b38e5f7c983e6..784a9cd01a5baf 100644
--- a/src/transformers/pipelines/automatic_speech_recognition.py
+++ b/src/transformers/pipelines/automatic_speech_recognition.py
@@ -508,8 +508,10 @@ def _forward(self, model_inputs, return_timestamps=False, **generate_kwargs):
             else:
                 generate_kwargs["encoder_outputs"] = encoder(inputs, attention_mask=attention_mask)
 
+            if generate_kwargs["assistant_model"]:
+                generate_kwargs["input_features"] = inputs
+
             tokens = self.model.generate(
-                inputs=inputs,
                 attention_mask=attention_mask,
                 **generate_kwargs,
             )

From f3011b060a34fa6e9460a3320e4d5af9bd2e262b Mon Sep 17 00:00:00 2001
From: kamilakesbi <kamil@huggingface.co>
Date: Fri, 3 May 2024 17:23:41 +0200
Subject: [PATCH 04/25] error if model and assistant encoder sizes differ

---
 src/transformers/models/whisper/generation_whisper.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/transformers/models/whisper/generation_whisper.py b/src/transformers/models/whisper/generation_whisper.py
index a42d7b7dec3626..7e30541296a9f2 100644
--- a/src/transformers/models/whisper/generation_whisper.py
+++ b/src/transformers/models/whisper/generation_whisper.py
@@ -480,6 +480,14 @@ def generate(
                 FutureWarning,
             )
 
+        if "assistant_model" in kwargs:
+            if kwargs["assistant_model"].config.encoder_attention_heads != self.model.config.encoder_attention_heads:
+                if not type(kwargs["assistant_model"]).__name__ == "WhisperForConditionalGeneration":
+                    raise ValueError(
+                        "The main model and the assistant don't have encoders of the same size.",
+                        "please load the assistant using WhisperForConditionalGeneration",
+                    )
+
         # 1. prepare generation config
         generation_config, kwargs = self._prepare_generation_config(generation_config, **kwargs)
 

From 404f67b8419623016330b70f5cd88118199097bf Mon Sep 17 00:00:00 2001
From: kamilakesbi <kamil@huggingface.co>
Date: Fri, 3 May 2024 17:28:40 +0200
Subject: [PATCH 05/25] fix

---
 src/transformers/pipelines/automatic_speech_recognition.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/pipelines/automatic_speech_recognition.py b/src/transformers/pipelines/automatic_speech_recognition.py
index 784a9cd01a5baf..1597c71afd2711 100644
--- a/src/transformers/pipelines/automatic_speech_recognition.py
+++ b/src/transformers/pipelines/automatic_speech_recognition.py
@@ -508,7 +508,7 @@ def _forward(self, model_inputs, return_timestamps=False, **generate_kwargs):
             else:
                 generate_kwargs["encoder_outputs"] = encoder(inputs, attention_mask=attention_mask)
 
-            if generate_kwargs["assistant_model"]:
+            if "assistant_model" in generate_kwargs:
                 generate_kwargs["input_features"] = inputs
 
             tokens = self.model.generate(

From fd492a756f4189e1c0cc6aca41d96f879f47468d Mon Sep 17 00:00:00 2001
From: kamilakesbi <kamil@huggingface.co>
Date: Mon, 6 May 2024 13:12:03 +0200
Subject: [PATCH 06/25] apply review suggestions

---
 src/transformers/generation/utils.py          | 19 +++++++++++++++++++
 .../models/whisper/generation_whisper.py      |  8 --------
 .../pipelines/automatic_speech_recognition.py |  5 ++---
 3 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py
index ab8e6019062b78..47978384525d13 100644
--- a/src/transformers/generation/utils.py
+++ b/src/transformers/generation/utils.py
@@ -1097,6 +1097,24 @@ def _validate_model_class(self):
                 exception_message += f" Please use one of the following classes instead: {generate_compatible_classes}"
             raise TypeError(exception_message)
 
+    def _validate_assistant(self, assistant_model):
+        if assistant_model is not None:
+            if type(self).__name__ in MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING._model_mapping.values():
+                if type(assistant_model).__name__ in MODEL_FOR_CAUSAL_LM_MAPPING._model_mapping.values():
+                    attributes_to_check = [attr for attr in dir(self.config) if attr.startswith("encoder_")]
+                    are_equal = all(
+                        getattr(self.config, attr) == getattr(assistant_model.config, attr)
+                        for attr in attributes_to_check
+                    )
+                    if not are_equal:
+                        raise ValueError(
+                            "The main model and the assistant don't have encoders of the same size. "
+                            "Load the assistant with AutoModelForSpeechSeq2Seq",
+                        )
+
+                if not self.config.vocab_size == assistant_model.config.vocab_size:
+                    raise ValueError("Make sure the main and assistant model use the same tokenizer")
+
     def _validate_model_kwargs(self, model_kwargs: Dict[str, Any]):
         """Validates model kwargs for generation. Generate argument typos will also be caught here."""
         # If a `Cache` instance is passed, checks whether the model is compatible with it
@@ -1547,6 +1565,7 @@ def generate(
         tokenizer = kwargs.pop("tokenizer", None)  # Pull this out first, we only use it for stopping criteria
         generation_config, model_kwargs = self._prepare_generation_config(generation_config, **kwargs)
         self._validate_model_kwargs(model_kwargs.copy())
+        self._validate_assistant(assistant_model)
 
         # 2. Set generation parameters if not already defined
         if synced_gpus is None:
diff --git a/src/transformers/models/whisper/generation_whisper.py b/src/transformers/models/whisper/generation_whisper.py
index 7e30541296a9f2..a42d7b7dec3626 100644
--- a/src/transformers/models/whisper/generation_whisper.py
+++ b/src/transformers/models/whisper/generation_whisper.py
@@ -480,14 +480,6 @@ def generate(
                 FutureWarning,
             )
 
-        if "assistant_model" in kwargs:
-            if kwargs["assistant_model"].config.encoder_attention_heads != self.model.config.encoder_attention_heads:
-                if not type(kwargs["assistant_model"]).__name__ == "WhisperForConditionalGeneration":
-                    raise ValueError(
-                        "The main model and the assistant don't have encoders of the same size.",
-                        "please load the assistant using WhisperForConditionalGeneration",
-                    )
-
         # 1. prepare generation config
         generation_config, kwargs = self._prepare_generation_config(generation_config, **kwargs)
 
diff --git a/src/transformers/pipelines/automatic_speech_recognition.py b/src/transformers/pipelines/automatic_speech_recognition.py
index 1597c71afd2711..07c157f92674f3 100644
--- a/src/transformers/pipelines/automatic_speech_recognition.py
+++ b/src/transformers/pipelines/automatic_speech_recognition.py
@@ -486,6 +486,8 @@ def _forward(self, model_inputs, return_timestamps=False, **generate_kwargs):
                     "Seq2Seq speech recognition model requires either a "
                     f"`input_features` or `input_values` key, but only has {model_inputs.keys()}"
                 )
+            if "assistant_model" in generate_kwargs:
+                generate_kwargs["input_features"] = inputs
 
             # custom processing for Whisper timestamps and word-level timestamps
             if return_timestamps and self.type == "seq2seq_whisper":
@@ -508,9 +510,6 @@ def _forward(self, model_inputs, return_timestamps=False, **generate_kwargs):
             else:
                 generate_kwargs["encoder_outputs"] = encoder(inputs, attention_mask=attention_mask)
 
-            if "assistant_model" in generate_kwargs:
-                generate_kwargs["input_features"] = inputs
-
             tokens = self.model.generate(
                 attention_mask=attention_mask,
                 **generate_kwargs,

From e41d51976d4f7f3351d51a8b0163560fb03a2f24 Mon Sep 17 00:00:00 2001
From: kamilakesbi <kamil@huggingface.co>
Date: Fri, 10 May 2024 11:21:52 +0200
Subject: [PATCH 07/25] use self.config.is_encoder_decoder

---
 src/transformers/generation/utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py
index 47978384525d13..c7730ae8a6e5ee 100644
--- a/src/transformers/generation/utils.py
+++ b/src/transformers/generation/utils.py
@@ -1099,8 +1099,8 @@ def _validate_model_class(self):
 
     def _validate_assistant(self, assistant_model):
         if assistant_model is not None:
-            if type(self).__name__ in MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING._model_mapping.values():
-                if type(assistant_model).__name__ in MODEL_FOR_CAUSAL_LM_MAPPING._model_mapping.values():
+            if self.config.is_encoder_decoder:
+                if not assistant_model.config.is_encoder_decoder:
                     attributes_to_check = [attr for attr in dir(self.config) if attr.startswith("encoder_")]
                     are_equal = all(
                         getattr(self.config, attr) == getattr(assistant_model.config, attr)

From 27242a633babd908a2763cdc5705c4b5de1ba158 Mon Sep 17 00:00:00 2001
From: kamilakesbi <kamil@huggingface.co>
Date: Fri, 10 May 2024 11:29:13 +0200
Subject: [PATCH 08/25] pass inputs to generate directly

---
 .../pipelines/automatic_speech_recognition.py        | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/src/transformers/pipelines/automatic_speech_recognition.py b/src/transformers/pipelines/automatic_speech_recognition.py
index 07c157f92674f3..f077c7d2c7f11c 100644
--- a/src/transformers/pipelines/automatic_speech_recognition.py
+++ b/src/transformers/pipelines/automatic_speech_recognition.py
@@ -474,7 +474,6 @@ def _forward(self, model_inputs, return_timestamps=False, **generate_kwargs):
             raise ValueError("num_frames must be used only when stride is None")
 
         if self.type in {"seq2seq", "seq2seq_whisper"}:
-            encoder = self.model.get_encoder()
             # Consume values so we can let extra information flow freely through
             # the pipeline (important for `partial` in microphone)
             if "input_features" in model_inputs:
@@ -486,8 +485,6 @@ def _forward(self, model_inputs, return_timestamps=False, **generate_kwargs):
                     "Seq2Seq speech recognition model requires either a "
                     f"`input_features` or `input_values` key, but only has {model_inputs.keys()}"
                 )
-            if "assistant_model" in generate_kwargs:
-                generate_kwargs["input_features"] = inputs
 
             # custom processing for Whisper timestamps and word-level timestamps
             if return_timestamps and self.type == "seq2seq_whisper":
@@ -502,15 +499,8 @@ def _forward(self, model_inputs, return_timestamps=False, **generate_kwargs):
                         else:
                             generate_kwargs["num_frames"] = [s[0] // self.feature_extractor.hop_length for s in stride]
 
-                    else:
-                        generate_kwargs["num_frames"] = num_frames
-
-            if self.type == "seq2seq_whisper" and inputs.shape[-1] > self.feature_extractor.nb_max_frames:
-                generate_kwargs["input_features"] = inputs
-            else:
-                generate_kwargs["encoder_outputs"] = encoder(inputs, attention_mask=attention_mask)
-
             tokens = self.model.generate(
+                inputs=inputs,
                 attention_mask=attention_mask,
                 **generate_kwargs,
             )

From 726f53faf1d5eb42bb910ee2fa48456d73fa5e5d Mon Sep 17 00:00:00 2001
From: kamilakesbi <kamil@huggingface.co>
Date: Fri, 10 May 2024 12:15:35 +0200
Subject: [PATCH 09/25] add slow tests

---
 ..._pipelines_automatic_speech_recognition.py | 116 ++++++++++++++++++
 1 file changed, 116 insertions(+)

diff --git a/tests/pipelines/test_pipelines_automatic_speech_recognition.py b/tests/pipelines/test_pipelines_automatic_speech_recognition.py
index 5ab18e81d56854..41524b22cb1f72 100644
--- a/tests/pipelines/test_pipelines_automatic_speech_recognition.py
+++ b/tests/pipelines/test_pipelines_automatic_speech_recognition.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import time
 import unittest
 
 import numpy as np
@@ -23,6 +24,8 @@
     MODEL_FOR_CTC_MAPPING,
     MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING,
     AutoFeatureExtractor,
+    AutoModelForCausalLM,
+    AutoModelForSpeechSeq2Seq,
     AutoProcessor,
     AutoTokenizer,
     Speech2TextForConditionalGeneration,
@@ -1138,6 +1141,119 @@ def test_whisper_language(self):
             {"text": " Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel."},
         )
 
+    @slow
+    def test_speculative_decoding_whisper_non_distil(self):
+        # Load data:
+        dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:1]")
+        sample = dataset[0]
+
+        # Load model:
+        model_id = "openai/whisper-large-v2"
+        processor = AutoProcessor.from_pretrained(model_id)
+        model = AutoModelForSpeechSeq2Seq.from_pretrained(
+            model_id,
+            low_cpu_mem_usage=True,
+            use_safetensors=True,
+        )
+
+        # Load assistant:
+        assistant_model_id = "openai/whisper-tiny"
+        assistant_model = AutoModelForSpeechSeq2Seq.from_pretrained(
+            assistant_model_id,
+            low_cpu_mem_usage=True,
+            use_safetensors=True,
+        )
+
+        # Load pipeline:
+        pipe = AutomaticSpeechRecognitionPipeline(
+            model=model,
+            tokenizer=processor.tokenizer,
+            feature_extractor=processor.feature_extractor,
+            generate_kwargs={"language": "en"},
+        )
+
+        inputs = {
+            "sampling_rate": sample["audio"]["sampling_rate"],
+            "raw": np.array(sample["audio"]["array"]),
+        }
+
+        start_time = time.time()
+        transcription_non_ass = pipe(inputs=inputs, generate_kwargs={"assistant_model": assistant_model})["text"]
+        total_time_assist = time.time() - start_time
+
+        inputs = {
+            "sampling_rate": sample["audio"]["sampling_rate"],
+            "raw": np.array(sample["audio"]["array"]),
+        }
+
+        start_time = time.time()
+        transcription_ass = pipe(inputs=inputs)["text"]
+        total_time_non_assist = time.time() - start_time
+
+        assert transcription_ass == transcription_non_ass
+        assert (
+            transcription_ass
+            == " Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel."
+        )
+
+        assert total_time_non_assist > total_time_assist, "Make sure that assistant decoding is faster"
+
+    @slow
+    def test_speculative_decoding_whisper_distil(self):
+        # Load data:
+        dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:1]")
+        sample = dataset[0]
+
+        # Load model:
+        model_id = "openai/whisper-large-v2"
+        processor = AutoProcessor.from_pretrained(model_id)
+        model = AutoModelForSpeechSeq2Seq.from_pretrained(
+            model_id,
+            low_cpu_mem_usage=True,
+            use_safetensors=True,
+        )
+
+        # Load assistant:
+        assistant_model_id = "distil-whisper/distil-large-v2"
+        assistant_model = AutoModelForCausalLM.from_pretrained(
+            assistant_model_id,
+            low_cpu_mem_usage=True,
+            use_safetensors=True,
+        )
+
+        # Load pipeline:
+        pipe = AutomaticSpeechRecognitionPipeline(
+            model=model,
+            tokenizer=processor.tokenizer,
+            feature_extractor=processor.feature_extractor,
+            generate_kwargs={"language": "en"},
+        )
+
+        inputs = {
+            "sampling_rate": sample["audio"]["sampling_rate"],
+            "raw": np.array(sample["audio"]["array"]),
+        }
+
+        start_time = time.time()
+        transcription_non_ass = pipe(inputs=inputs, generate_kwargs={"assistant_model": assistant_model})["text"]
+        total_time_assist = time.time() - start_time
+
+        inputs = {
+            "sampling_rate": sample["audio"]["sampling_rate"],
+            "raw": np.array(sample["audio"]["array"]),
+        }
+
+        start_time = time.time()
+        transcription_ass = pipe(inputs=inputs)["text"]
+        total_time_non_assist = time.time() - start_time
+
+        assert transcription_ass == transcription_non_ass
+        assert (
+            transcription_ass
+            == " Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel."
+        )
+        assert total_time_non_assist > total_time_assist, "Make sure that assistant decoding is faster"
+
     @slow
     @require_torch
     @require_torchaudio

From c7f3f1ccaef095b3bd1a819e7ab7a410dd53c5c2 Mon Sep 17 00:00:00 2001
From: Kamil Akesbi <45195979+kamilakesbi@users.noreply.github.com>
Date: Fri, 10 May 2024 18:38:18 +0200
Subject: [PATCH 10/25] Update src/transformers/generation/utils.py

Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com>
---
 src/transformers/generation/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py
index c7730ae8a6e5ee..8273c4b9c7a188 100644
--- a/src/transformers/generation/utils.py
+++ b/src/transformers/generation/utils.py
@@ -1109,7 +1109,7 @@ def _validate_assistant(self, assistant_model):
                     if not are_equal:
                         raise ValueError(
                             "The main model and the assistant don't have encoders of the same size. "
-                            "Load the assistant with AutoModelForSpeechSeq2Seq",
+                            "Ensure you load the assistant with the correct encoder-decoder class, e.g. `AutoModelForSpeechSeq2Seq` for Whisper."
                         )
 
                 if not self.config.vocab_size == assistant_model.config.vocab_size:

From 405606c918d9227411b319af0319aa640179b587 Mon Sep 17 00:00:00 2001
From: Kamil Akesbi <45195979+kamilakesbi@users.noreply.github.com>
Date: Fri, 10 May 2024 18:40:00 +0200
Subject: [PATCH 11/25] Update
 tests/pipelines/test_pipelines_automatic_speech_recognition.py

Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com>
---
 .../pipelines/test_pipelines_automatic_speech_recognition.py  | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/tests/pipelines/test_pipelines_automatic_speech_recognition.py b/tests/pipelines/test_pipelines_automatic_speech_recognition.py
index 41524b22cb1f72..b87943a22efcb3 100644
--- a/tests/pipelines/test_pipelines_automatic_speech_recognition.py
+++ b/tests/pipelines/test_pipelines_automatic_speech_recognition.py
@@ -1172,10 +1172,6 @@ def test_speculative_decoding_whisper_non_distil(self):
             generate_kwargs={"language": "en"},
         )
 
-        inputs = {
-            "sampling_rate": sample["audio"]["sampling_rate"],
-            "raw": np.array(sample["audio"]["array"]),
-        }
 
         start_time = time.time()
         transcription_non_ass = pipe(inputs=inputs, generate_kwargs={"assistant_model": assistant_model})["text"]

From 2c8c0391144317f6d3e44d3ff3d7fa3ded29f3d0 Mon Sep 17 00:00:00 2001
From: Kamil Akesbi <45195979+kamilakesbi@users.noreply.github.com>
Date: Fri, 10 May 2024 18:40:17 +0200
Subject: [PATCH 12/25] Update
 tests/pipelines/test_pipelines_automatic_speech_recognition.py

Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com>
---
 tests/pipelines/test_pipelines_automatic_speech_recognition.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/pipelines/test_pipelines_automatic_speech_recognition.py b/tests/pipelines/test_pipelines_automatic_speech_recognition.py
index b87943a22efcb3..79c17a0a53a844 100644
--- a/tests/pipelines/test_pipelines_automatic_speech_recognition.py
+++ b/tests/pipelines/test_pipelines_automatic_speech_recognition.py
@@ -1174,7 +1174,7 @@ def test_speculative_decoding_whisper_non_distil(self):
 
 
         start_time = time.time()
-        transcription_non_ass = pipe(inputs=inputs, generate_kwargs={"assistant_model": assistant_model})["text"]
+        transcription_non_ass = pipe(sample.copy(), generate_kwargs={"assistant_model": assistant_model})["text"]
         total_time_assist = time.time() - start_time
 
         inputs = {

From 5b6f297ae91a3c5d394d42dac9b6a7d0d837e0a1 Mon Sep 17 00:00:00 2001
From: Kamil Akesbi <45195979+kamilakesbi@users.noreply.github.com>
Date: Fri, 10 May 2024 18:40:26 +0200
Subject: [PATCH 13/25] Update
 tests/pipelines/test_pipelines_automatic_speech_recognition.py

Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com>
---
 .../pipelines/test_pipelines_automatic_speech_recognition.py  | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/tests/pipelines/test_pipelines_automatic_speech_recognition.py b/tests/pipelines/test_pipelines_automatic_speech_recognition.py
index 79c17a0a53a844..e2dbb960ccac31 100644
--- a/tests/pipelines/test_pipelines_automatic_speech_recognition.py
+++ b/tests/pipelines/test_pipelines_automatic_speech_recognition.py
@@ -1177,10 +1177,6 @@ def test_speculative_decoding_whisper_non_distil(self):
         transcription_non_ass = pipe(sample.copy(), generate_kwargs={"assistant_model": assistant_model})["text"]
         total_time_assist = time.time() - start_time
 
-        inputs = {
-            "sampling_rate": sample["audio"]["sampling_rate"],
-            "raw": np.array(sample["audio"]["array"]),
-        }
 
         start_time = time.time()
         transcription_ass = pipe(inputs=inputs)["text"]

From 03d2c3e52afd2edd24528a0c6e21be66edb1ca98 Mon Sep 17 00:00:00 2001
From: Kamil Akesbi <45195979+kamilakesbi@users.noreply.github.com>
Date: Fri, 10 May 2024 18:40:36 +0200
Subject: [PATCH 14/25] Update
 tests/pipelines/test_pipelines_automatic_speech_recognition.py

Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com>
---
 tests/pipelines/test_pipelines_automatic_speech_recognition.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/pipelines/test_pipelines_automatic_speech_recognition.py b/tests/pipelines/test_pipelines_automatic_speech_recognition.py
index e2dbb960ccac31..b64bc5a7011ff8 100644
--- a/tests/pipelines/test_pipelines_automatic_speech_recognition.py
+++ b/tests/pipelines/test_pipelines_automatic_speech_recognition.py
@@ -1179,7 +1179,7 @@ def test_speculative_decoding_whisper_non_distil(self):
 
 
         start_time = time.time()
-        transcription_ass = pipe(inputs=inputs)["text"]
+        transcription_ass = pipe(sample)["text"]
         total_time_non_assist = time.time() - start_time
 
         assert transcription_ass == transcription_non_ass

From f1c8c8abf46b8b62ff95aa5b049837960a8f6ad8 Mon Sep 17 00:00:00 2001
From: Kamil Akesbi <45195979+kamilakesbi@users.noreply.github.com>
Date: Fri, 10 May 2024 18:40:48 +0200
Subject: [PATCH 15/25] Update
 tests/pipelines/test_pipelines_automatic_speech_recognition.py

Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com>
---
 tests/pipelines/test_pipelines_automatic_speech_recognition.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/pipelines/test_pipelines_automatic_speech_recognition.py b/tests/pipelines/test_pipelines_automatic_speech_recognition.py
index b64bc5a7011ff8..de0822b7544fa3 100644
--- a/tests/pipelines/test_pipelines_automatic_speech_recognition.py
+++ b/tests/pipelines/test_pipelines_automatic_speech_recognition.py
@@ -1145,7 +1145,7 @@ def test_whisper_language(self):
     def test_speculative_decoding_whisper_non_distil(self):
         # Load data:
         dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:1]")
-        sample = dataset[0]
+        sample = dataset[0]["audio"]
 
         # Load model:
         model_id = "openai/whisper-large-v2"

From d1571a9c32eb1ee92b7ff8353d86a400283cce55 Mon Sep 17 00:00:00 2001
From: kamilakesbi <kamil@huggingface.co>
Date: Fri, 10 May 2024 18:46:56 +0200
Subject: [PATCH 16/25] apply review

---
 ..._pipelines_automatic_speech_recognition.py | 22 +++----------------
 1 file changed, 3 insertions(+), 19 deletions(-)

diff --git a/tests/pipelines/test_pipelines_automatic_speech_recognition.py b/tests/pipelines/test_pipelines_automatic_speech_recognition.py
index de0822b7544fa3..9f79d2ec72b71a 100644
--- a/tests/pipelines/test_pipelines_automatic_speech_recognition.py
+++ b/tests/pipelines/test_pipelines_automatic_speech_recognition.py
@@ -1152,7 +1152,6 @@ def test_speculative_decoding_whisper_non_distil(self):
         processor = AutoProcessor.from_pretrained(model_id)
         model = AutoModelForSpeechSeq2Seq.from_pretrained(
             model_id,
-            low_cpu_mem_usage=True,
             use_safetensors=True,
         )
 
@@ -1160,7 +1159,6 @@ def test_speculative_decoding_whisper_non_distil(self):
         assistant_model_id = "openai/whisper-tiny"
         assistant_model = AutoModelForSpeechSeq2Seq.from_pretrained(
             assistant_model_id,
-            low_cpu_mem_usage=True,
             use_safetensors=True,
         )
 
@@ -1172,12 +1170,10 @@ def test_speculative_decoding_whisper_non_distil(self):
             generate_kwargs={"language": "en"},
         )
 
-
         start_time = time.time()
         transcription_non_ass = pipe(sample.copy(), generate_kwargs={"assistant_model": assistant_model})["text"]
         total_time_assist = time.time() - start_time
 
-
         start_time = time.time()
         transcription_ass = pipe(sample)["text"]
         total_time_non_assist = time.time() - start_time
@@ -1194,14 +1190,13 @@ def test_speculative_decoding_whisper_non_distil(self):
     def test_speculative_decoding_whisper_distil(self):
         # Load data:
         dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:1]")
-        sample = dataset[0]
+        sample = dataset[0]["audio"]
 
         # Load model:
         model_id = "openai/whisper-large-v2"
         processor = AutoProcessor.from_pretrained(model_id)
         model = AutoModelForSpeechSeq2Seq.from_pretrained(
             model_id,
-            low_cpu_mem_usage=True,
             use_safetensors=True,
         )
 
@@ -1209,7 +1204,6 @@ def test_speculative_decoding_whisper_distil(self):
         assistant_model_id = "distil-whisper/distil-large-v2"
         assistant_model = AutoModelForCausalLM.from_pretrained(
             assistant_model_id,
-            low_cpu_mem_usage=True,
             use_safetensors=True,
         )
 
@@ -1221,22 +1215,12 @@ def test_speculative_decoding_whisper_distil(self):
             generate_kwargs={"language": "en"},
         )
 
-        inputs = {
-            "sampling_rate": sample["audio"]["sampling_rate"],
-            "raw": np.array(sample["audio"]["array"]),
-        }
-
         start_time = time.time()
-        transcription_non_ass = pipe(inputs=inputs, generate_kwargs={"assistant_model": assistant_model})["text"]
+        transcription_non_ass = pipe(sample.copy(), generate_kwargs={"assistant_model": assistant_model})["text"]
         total_time_assist = time.time() - start_time
 
-        inputs = {
-            "sampling_rate": sample["audio"]["sampling_rate"],
-            "raw": np.array(sample["audio"]["array"]),
-        }
-
         start_time = time.time()
-        transcription_ass = pipe(inputs=inputs)["text"]
+        transcription_ass = pipe(sample)["text"]
         total_time_non_assist = time.time() - start_time
 
         assert transcription_ass == transcription_non_ass

From 83e17f659cf31f34a3bd47e20f8dbee7497483b5 Mon Sep 17 00:00:00 2001
From: Kamil Akesbi <45195979+kamilakesbi@users.noreply.github.com>
Date: Wed, 15 May 2024 15:51:37 +0200
Subject: [PATCH 17/25] Update src/transformers/generation/utils.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
---
 src/transformers/generation/utils.py | 31 ++++++++++++++--------------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py
index 8273c4b9c7a188..65911bc5c84f1c 100644
--- a/src/transformers/generation/utils.py
+++ b/src/transformers/generation/utils.py
@@ -1098,22 +1098,23 @@ def _validate_model_class(self):
             raise TypeError(exception_message)
 
     def _validate_assistant(self, assistant_model):
-        if assistant_model is not None:
-            if self.config.is_encoder_decoder:
-                if not assistant_model.config.is_encoder_decoder:
-                    attributes_to_check = [attr for attr in dir(self.config) if attr.startswith("encoder_")]
-                    are_equal = all(
-                        getattr(self.config, attr) == getattr(assistant_model.config, attr)
-                        for attr in attributes_to_check
-                    )
-                    if not are_equal:
-                        raise ValueError(
-                            "The main model and the assistant don't have encoders of the same size. "
-                            "Ensure you load the assistant with the correct encoder-decoder class, e.g. `AutoModelForSpeechSeq2Seq` for Whisper."
-                        )
+        if assistant_model is None or not self.config.is_encoder_decoder:
+            return
+
+        if not assistant_model.config.is_encoder_decoder:
+            attributes_to_check = [attr for attr in dir(self.config) if attr.startswith("encoder_")]
+            are_equal = all(
+                getattr(self.config, attr) == getattr(assistant_model.config, attr)
+                for attr in attributes_to_check
+            )
+            if not are_equal:
+                raise ValueError(
+                    "The main model and the assistant don't have encoders of the same size. "
+                    "Ensure you load the assistant with the correct encoder-decoder class, e.g. `AutoModelForSpeechSeq2Seq` for Whisper."
+                )
 
-                if not self.config.vocab_size == assistant_model.config.vocab_size:
-                    raise ValueError("Make sure the main and assistant model use the same tokenizer")
+        if not self.config.vocab_size == assistant_model.config.vocab_size:
+            raise ValueError("Make sure the main and assistant model use the same tokenizer")
 
     def _validate_model_kwargs(self, model_kwargs: Dict[str, Any]):
         """Validates model kwargs for generation. Generate argument typos will also be caught here."""

From 29046c6c926aeb4b1eb60d7c51549516f72d71f1 Mon Sep 17 00:00:00 2001
From: Kamil Akesbi <45195979+kamilakesbi@users.noreply.github.com>
Date: Wed, 15 May 2024 15:51:57 +0200
Subject: [PATCH 18/25] Update
 tests/pipelines/test_pipelines_automatic_speech_recognition.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
---
 .../test_pipelines_automatic_speech_recognition.py    | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/tests/pipelines/test_pipelines_automatic_speech_recognition.py b/tests/pipelines/test_pipelines_automatic_speech_recognition.py
index 9f79d2ec72b71a..9b8a3713b89759 100644
--- a/tests/pipelines/test_pipelines_automatic_speech_recognition.py
+++ b/tests/pipelines/test_pipelines_automatic_speech_recognition.py
@@ -1178,13 +1178,12 @@ def test_speculative_decoding_whisper_non_distil(self):
         transcription_ass = pipe(sample)["text"]
         total_time_non_assist = time.time() - start_time
 
-        assert transcription_ass == transcription_non_ass
-        assert (
-            transcription_ass
-            == " Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel."
+        self.assertEqual(transcription_ass, transcription_non_ass)
+        self.assertEqual(
+            transcription_ass,
+            " Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel."
         )
-
-        assert total_time_non_assist > total_time_assist, "Make sure that assistant decoding is faster"
+        self.assertTrue(total_time_non_assist > total_time_assist, "Make sure that assistant decoding is faster") 
 
     @slow
     def test_speculative_decoding_whisper_distil(self):

From d21637648483af6e3220b4134cf01b19da24dab8 Mon Sep 17 00:00:00 2001
From: kamilakesbi <kamil@huggingface.co>
Date: Wed, 15 May 2024 16:02:02 +0200
Subject: [PATCH 19/25] apply code review

---
 src/transformers/generation/utils.py               |  3 +--
 .../test_pipelines_automatic_speech_recognition.py | 14 +++++++-------
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py
index 65911bc5c84f1c..b5b6e82d16b29a 100644
--- a/src/transformers/generation/utils.py
+++ b/src/transformers/generation/utils.py
@@ -1104,8 +1104,7 @@ def _validate_assistant(self, assistant_model):
         if not assistant_model.config.is_encoder_decoder:
             attributes_to_check = [attr for attr in dir(self.config) if attr.startswith("encoder_")]
             are_equal = all(
-                getattr(self.config, attr) == getattr(assistant_model.config, attr)
-                for attr in attributes_to_check
+                getattr(self.config, attr) == getattr(assistant_model.config, attr) for attr in attributes_to_check
             )
             if not are_equal:
                 raise ValueError(
diff --git a/tests/pipelines/test_pipelines_automatic_speech_recognition.py b/tests/pipelines/test_pipelines_automatic_speech_recognition.py
index 9b8a3713b89759..430666990fe5c2 100644
--- a/tests/pipelines/test_pipelines_automatic_speech_recognition.py
+++ b/tests/pipelines/test_pipelines_automatic_speech_recognition.py
@@ -1181,9 +1181,9 @@ def test_speculative_decoding_whisper_non_distil(self):
         self.assertEqual(transcription_ass, transcription_non_ass)
         self.assertEqual(
             transcription_ass,
-            " Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel."
+            " Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.",
         )
-        self.assertTrue(total_time_non_assist > total_time_assist, "Make sure that assistant decoding is faster") 
+        self.assertTrue(total_time_non_assist > total_time_assist, "Make sure that assistant decoding is faster")
 
     @slow
     def test_speculative_decoding_whisper_distil(self):
@@ -1222,12 +1222,12 @@ def test_speculative_decoding_whisper_distil(self):
         transcription_ass = pipe(sample)["text"]
         total_time_non_assist = time.time() - start_time
 
-        assert transcription_ass == transcription_non_ass
-        assert (
-            transcription_ass
-            == " Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel."
+        self.assertEqual(transcription_ass, transcription_non_ass)
+        self.assertEqual(
+            transcription_ass,
+            " Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.",
         )
-        assert total_time_non_assist > total_time_assist, "Make sure that assistant decoding is faster"
+        self.assertTrue(total_time_non_assist > total_time_assist, "Make sure that assistant decoding is faster")
 
     @slow
     @require_torch

From 87b08e97b0171b5c3cf9169ed4777c1392f5e563 Mon Sep 17 00:00:00 2001
From: kamilakesbi <kamil@huggingface.co>
Date: Wed, 15 May 2024 19:40:45 +0200
Subject: [PATCH 20/25] update attributes encoder_xyz to check

---
 src/transformers/generation/utils.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py
index b5b6e82d16b29a..45e61b3da3029a 100644
--- a/src/transformers/generation/utils.py
+++ b/src/transformers/generation/utils.py
@@ -1102,7 +1102,8 @@ def _validate_assistant(self, assistant_model):
             return
 
         if not assistant_model.config.is_encoder_decoder:
-            attributes_to_check = [attr for attr in dir(self.config) if attr.startswith("encoder_")]
+            attributes_to_check = ["encoder_attention_heads", "encoder_ffn_dim", "encoder_layers"]
+            attributes_to_check = [attr for attr in dir(self.config) if attr in attributes_to_check]
             are_equal = all(
                 getattr(self.config, attr) == getattr(assistant_model.config, attr) for attr in attributes_to_check
             )

From a43e202b6e54e8aa21c105dfa81e00b0ec287cd1 Mon Sep 17 00:00:00 2001
From: Kamil Akesbi <45195979+kamilakesbi@users.noreply.github.com>
Date: Mon, 20 May 2024 14:01:05 +0200
Subject: [PATCH 21/25] Update src/transformers/generation/utils.py

Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>
---
 src/transformers/generation/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py
index 45e61b3da3029a..1987f35d525d3d 100644
--- a/src/transformers/generation/utils.py
+++ b/src/transformers/generation/utils.py
@@ -1109,7 +1109,7 @@ def _validate_assistant(self, assistant_model):
             )
             if not are_equal:
                 raise ValueError(
-                    "The main model and the assistant don't have encoders of the same size. "
+                    "The main model and the assistant don't have compatible encoder-dependent input shapes. "
                     "Ensure you load the assistant with the correct encoder-decoder class, e.g. `AutoModelForSpeechSeq2Seq` for Whisper."
                 )
 

From b23f1f3f7fb46b1f9a672bbe425b66abd67fc532 Mon Sep 17 00:00:00 2001
From: Kamil Akesbi <45195979+kamilakesbi@users.noreply.github.com>
Date: Mon, 20 May 2024 14:02:45 +0200
Subject: [PATCH 22/25] Update src/transformers/generation/utils.py

Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>
---
 src/transformers/generation/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py
index 1987f35d525d3d..6f0ea47d9836dc 100644
--- a/src/transformers/generation/utils.py
+++ b/src/transformers/generation/utils.py
@@ -1103,7 +1103,7 @@ def _validate_assistant(self, assistant_model):
 
         if not assistant_model.config.is_encoder_decoder:
             attributes_to_check = ["encoder_attention_heads", "encoder_ffn_dim", "encoder_layers"]
-            attributes_to_check = [attr for attr in dir(self.config) if attr in attributes_to_check]
+            attributes_to_check = [attr for attr in dir(assistant_model.config) if attr in attributes_to_check]
             are_equal = all(
                 getattr(self.config, attr) == getattr(assistant_model.config, attr) for attr in attributes_to_check
             )

From 5547aef75bd1080c6bdc207e0dc6061b4cf804aa Mon Sep 17 00:00:00 2001
From: Kamil Akesbi <45195979+kamilakesbi@users.noreply.github.com>
Date: Mon, 20 May 2024 14:04:03 +0200
Subject: [PATCH 23/25] Update src/transformers/generation/utils.py

Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>
---
 src/transformers/generation/utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py
index 6f0ea47d9836dc..149ce144e66272 100644
--- a/src/transformers/generation/utils.py
+++ b/src/transformers/generation/utils.py
@@ -1098,10 +1098,10 @@ def _validate_model_class(self):
             raise TypeError(exception_message)
 
     def _validate_assistant(self, assistant_model):
-        if assistant_model is None or not self.config.is_encoder_decoder:
+        if assistant_model is None:
             return
 
-        if not assistant_model.config.is_encoder_decoder:
+        if self.config.is_encoder_decoder and not assistant_model.config.is_encoder_decoder:
             attributes_to_check = ["encoder_attention_heads", "encoder_ffn_dim", "encoder_layers"]
             attributes_to_check = [attr for attr in dir(assistant_model.config) if attr in attributes_to_check]
             are_equal = all(

From e2bdde1a62cd82448098ecd24430fd8e828f405f Mon Sep 17 00:00:00 2001
From: kamilakesbi <kamil@huggingface.co>
Date: Tue, 21 May 2024 13:07:52 +0200
Subject: [PATCH 24/25] add slow test

---
 tests/generation/test_utils.py | 62 ++++++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)

diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py
index b8e90a5b8ed18e..840b64e17db010 100644
--- a/tests/generation/test_utils.py
+++ b/tests/generation/test_utils.py
@@ -45,6 +45,7 @@
         AutoModelForSeq2SeqLM,
         AutoModelForSpeechSeq2Seq,
         AutoModelForVision2Seq,
+        AutoProcessor,
         AutoTokenizer,
         BartForCausalLM,
         BartForConditionalGeneration,
@@ -2919,6 +2920,67 @@ def test_assisted_decoding_num_assistant_tokens_heuristic_transient_schedule(sel
         # update_candidate_strategy is called once but assistant_model.generation_config.num_assistant_tokens should stay 5
         self.assertEqual(assistant_model.generation_config.num_assistant_tokens, 5)
 
+    @slow
+    def test_validate_assistant(self):
+        # Generate a random sample:
+        inputs = np.random.rand(160000)
+
+        # Load a main encoder-decoder model:
+        model_id = "openai/whisper-large-v2"
+        processor = AutoProcessor.from_pretrained(model_id)
+        model = AutoModelForSpeechSeq2Seq.from_pretrained(
+            model_id,
+            low_cpu_mem_usage=True,
+            use_safetensors=True,
+        )
+        model.to(torch_device)
+
+        # process the input:
+        features = processor(inputs, return_tensors="pt").to(torch_device)
+
+        # Load an encoder-decoder assistant with same encoder as the main model:
+        assistant_distil_model_id = "distil-whisper/distil-large-v2"
+        assistant_seq_to_seq = AutoModelForSpeechSeq2Seq.from_pretrained(
+            assistant_distil_model_id,
+            use_safetensors=True,
+        ).to(torch_device)
+        self.assertTrue(model.generate(**features, assistant_model=assistant_seq_to_seq).sum())
+
+        # Load its decoder only version:
+        assistant_causal_lm = AutoModelForCausalLM.from_pretrained(
+            assistant_distil_model_id,
+            low_cpu_mem_usage=True,
+            use_safetensors=True,
+        ).to(torch_device)
+        self.assertTrue(model.generate(**features, assistant_model=assistant_causal_lm).sum())
+
+        # Load an encoder-decoder assistant with a different encoder than the main model:
+        assistant_distil_model_id = "openai/whisper-tiny"
+        assistant_seq_to_seq = AutoModelForSpeechSeq2Seq.from_pretrained(
+            assistant_distil_model_id,
+            use_safetensors=True,
+        ).to(torch_device)
+        self.assertTrue(model.generate(**features, assistant_model=assistant_seq_to_seq).sum())
+
+        # Load its decoder only version:
+        assistant_causal_lm = AutoModelForCausalLM.from_pretrained(
+            assistant_distil_model_id,
+            low_cpu_mem_usage=True,
+            use_safetensors=True,
+        ).to(torch_device)
+        # This should raise an error as the encoders of the main and assistant models are not compatible:
+        with self.assertRaises(ValueError):
+            model.generate(**features, assistant_model=assistant_causal_lm)
+
+        # Load an encoder-decoder model with a different tokenizer than the main model:
+        assistant_distil_model_id = "hf-internal-testing/tiny-random-SeamlessM4Tv2ForSpeechToText"
+        assistant_seq_to_seq = AutoModelForSpeechSeq2Seq.from_pretrained(
+            assistant_distil_model_id,
+        ).to(torch_device)
+        # This should raise an error as the main and assistant models don't use the same tokenizer:
+        with self.assertRaises(ValueError):
+            model.generate(**features, assistant_model=assistant_seq_to_seq)
+
     def test_compare_unprocessed_logit_scores(self):
         # Get unprocessed logit scores back from model generate function.
         # Assert that unprocessed logits from generate() are same as those from modal eval()

From 3a3514547299c44af7676f3a08b86aaf7e96e6a0 Mon Sep 17 00:00:00 2001
From: kamilakesbi <kamil@huggingface.co>
Date: Wed, 22 May 2024 13:07:28 +0200
Subject: [PATCH 25/25] solve conflicts

---
 src/transformers/pipelines/automatic_speech_recognition.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/transformers/pipelines/automatic_speech_recognition.py b/src/transformers/pipelines/automatic_speech_recognition.py
index f077c7d2c7f11c..01faab6d74adac 100644
--- a/src/transformers/pipelines/automatic_speech_recognition.py
+++ b/src/transformers/pipelines/automatic_speech_recognition.py
@@ -498,6 +498,8 @@ def _forward(self, model_inputs, return_timestamps=False, **generate_kwargs):
                             generate_kwargs["num_frames"] = stride[0] // self.feature_extractor.hop_length
                         else:
                             generate_kwargs["num_frames"] = [s[0] // self.feature_extractor.hop_length for s in stride]
+                    else:
+                        generate_kwargs["num_frames"] = num_frames
 
             tokens = self.model.generate(
                 inputs=inputs,