From 864df8d701e96aa8a7c679b4a345e8a038130efb Mon Sep 17 00:00:00 2001 From: kamilakesbi Date: Thu, 2 May 2024 12:22:48 +0200 Subject: [PATCH 01/25] fiw input to generate in pipeline --- src/transformers/pipelines/automatic_speech_recognition.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/pipelines/automatic_speech_recognition.py b/src/transformers/pipelines/automatic_speech_recognition.py index 2e8682d96a65e0..eb5a75e522c7fc 100644 --- a/src/transformers/pipelines/automatic_speech_recognition.py +++ b/src/transformers/pipelines/automatic_speech_recognition.py @@ -509,6 +509,7 @@ def _forward(self, model_inputs, return_timestamps=False, **generate_kwargs): generate_kwargs["encoder_outputs"] = encoder(inputs, attention_mask=attention_mask) tokens = self.model.generate( + inputs=inputs, attention_mask=attention_mask, **generate_kwargs, ) From ff0c6381635d3b5ae2d776127d43f6d6cdeca083 Mon Sep 17 00:00:00 2001 From: kamilakesbi Date: Thu, 2 May 2024 12:25:15 +0200 Subject: [PATCH 02/25] fixup --- src/transformers/pipelines/automatic_speech_recognition.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/pipelines/automatic_speech_recognition.py b/src/transformers/pipelines/automatic_speech_recognition.py index eb5a75e522c7fc..5b38e5f7c983e6 100644 --- a/src/transformers/pipelines/automatic_speech_recognition.py +++ b/src/transformers/pipelines/automatic_speech_recognition.py @@ -509,7 +509,7 @@ def _forward(self, model_inputs, return_timestamps=False, **generate_kwargs): generate_kwargs["encoder_outputs"] = encoder(inputs, attention_mask=attention_mask) tokens = self.model.generate( - inputs=inputs, + inputs=inputs, attention_mask=attention_mask, **generate_kwargs, ) From 749cfaa5fdcfd5fb1e8de0f887e98431ffea986d Mon Sep 17 00:00:00 2001 From: kamilakesbi Date: Fri, 3 May 2024 16:27:11 +0200 Subject: [PATCH 03/25] pass input_features to generate with assistant --- 
src/transformers/pipelines/automatic_speech_recognition.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/transformers/pipelines/automatic_speech_recognition.py b/src/transformers/pipelines/automatic_speech_recognition.py index 5b38e5f7c983e6..784a9cd01a5baf 100644 --- a/src/transformers/pipelines/automatic_speech_recognition.py +++ b/src/transformers/pipelines/automatic_speech_recognition.py @@ -508,8 +508,10 @@ def _forward(self, model_inputs, return_timestamps=False, **generate_kwargs): else: generate_kwargs["encoder_outputs"] = encoder(inputs, attention_mask=attention_mask) + if generate_kwargs["assistant_model"]: + generate_kwargs["input_features"] = inputs + tokens = self.model.generate( - inputs=inputs, attention_mask=attention_mask, **generate_kwargs, ) From f3011b060a34fa6e9460a3320e4d5af9bd2e262b Mon Sep 17 00:00:00 2001 From: kamilakesbi Date: Fri, 3 May 2024 17:23:41 +0200 Subject: [PATCH 04/25] error if model and assistant with different enc size --- src/transformers/models/whisper/generation_whisper.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/transformers/models/whisper/generation_whisper.py b/src/transformers/models/whisper/generation_whisper.py index a42d7b7dec3626..7e30541296a9f2 100644 --- a/src/transformers/models/whisper/generation_whisper.py +++ b/src/transformers/models/whisper/generation_whisper.py @@ -480,6 +480,14 @@ def generate( FutureWarning, ) + if "assistant_model" in kwargs: + if kwargs["assistant_model"].config.encoder_attention_heads != self.model.config.encoder_attention_heads: + if not type(kwargs["assistant_model"]).__name__ == "WhisperForConditionalGeneration": + raise ValueError( + "The main model and the assistant don't have encoders of the same size.", + "please load the assistant using WhisperForConditionalGeneration", + ) + # 1. 
prepare generation config generation_config, kwargs = self._prepare_generation_config(generation_config, **kwargs) From 404f67b8419623016330b70f5cd88118199097bf Mon Sep 17 00:00:00 2001 From: kamilakesbi Date: Fri, 3 May 2024 17:28:40 +0200 Subject: [PATCH 05/25] fix --- src/transformers/pipelines/automatic_speech_recognition.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/pipelines/automatic_speech_recognition.py b/src/transformers/pipelines/automatic_speech_recognition.py index 784a9cd01a5baf..1597c71afd2711 100644 --- a/src/transformers/pipelines/automatic_speech_recognition.py +++ b/src/transformers/pipelines/automatic_speech_recognition.py @@ -508,7 +508,7 @@ def _forward(self, model_inputs, return_timestamps=False, **generate_kwargs): else: generate_kwargs["encoder_outputs"] = encoder(inputs, attention_mask=attention_mask) - if generate_kwargs["assistant_model"]: + if "assistant_model" in generate_kwargs: generate_kwargs["input_features"] = inputs tokens = self.model.generate( From fd492a756f4189e1c0cc6aca41d96f879f47468d Mon Sep 17 00:00:00 2001 From: kamilakesbi Date: Mon, 6 May 2024 13:12:03 +0200 Subject: [PATCH 06/25] apply review suggestions --- src/transformers/generation/utils.py | 19 +++++++++++++++++++ .../models/whisper/generation_whisper.py | 8 -------- .../pipelines/automatic_speech_recognition.py | 5 ++--- 3 files changed, 21 insertions(+), 11 deletions(-) diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index ab8e6019062b78..47978384525d13 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -1097,6 +1097,24 @@ def _validate_model_class(self): exception_message += f" Please use one of the following classes instead: {generate_compatible_classes}" raise TypeError(exception_message) + def _validate_assistant(self, assistant_model): + if assistant_model is not None: + if type(self).__name__ in 
MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING._model_mapping.values(): + if type(assistant_model).__name__ in MODEL_FOR_CAUSAL_LM_MAPPING._model_mapping.values(): + attributes_to_check = [attr for attr in dir(self.config) if attr.startswith("encoder_")] + are_equal = all( + getattr(self.config, attr) == getattr(assistant_model.config, attr) + for attr in attributes_to_check + ) + if not are_equal: + raise ValueError( + "The main model and the assistant don't have encoders of the same size. " + "Load the assistant with AutoModelForSpeechSeq2Seq", + ) + + if not self.config.vocab_size == assistant_model.config.vocab_size: + raise ValueError("Make sure the main and assistant model use the same tokenizer") + def _validate_model_kwargs(self, model_kwargs: Dict[str, Any]): """Validates model kwargs for generation. Generate argument typos will also be caught here.""" # If a `Cache` instance is passed, checks whether the model is compatible with it @@ -1547,6 +1565,7 @@ def generate( tokenizer = kwargs.pop("tokenizer", None) # Pull this out first, we only use it for stopping criteria generation_config, model_kwargs = self._prepare_generation_config(generation_config, **kwargs) self._validate_model_kwargs(model_kwargs.copy()) + self._validate_assistant(assistant_model) # 2. 
Set generation parameters if not already defined if synced_gpus is None: diff --git a/src/transformers/models/whisper/generation_whisper.py b/src/transformers/models/whisper/generation_whisper.py index 7e30541296a9f2..a42d7b7dec3626 100644 --- a/src/transformers/models/whisper/generation_whisper.py +++ b/src/transformers/models/whisper/generation_whisper.py @@ -480,14 +480,6 @@ def generate( FutureWarning, ) - if "assistant_model" in kwargs: - if kwargs["assistant_model"].config.encoder_attention_heads != self.model.config.encoder_attention_heads: - if not type(kwargs["assistant_model"]).__name__ == "WhisperForConditionalGeneration": - raise ValueError( - "The main model and the assistant don't have encoders of the same size.", - "please load the assistant using WhisperForConditionalGeneration", - ) - # 1. prepare generation config generation_config, kwargs = self._prepare_generation_config(generation_config, **kwargs) diff --git a/src/transformers/pipelines/automatic_speech_recognition.py b/src/transformers/pipelines/automatic_speech_recognition.py index 1597c71afd2711..07c157f92674f3 100644 --- a/src/transformers/pipelines/automatic_speech_recognition.py +++ b/src/transformers/pipelines/automatic_speech_recognition.py @@ -486,6 +486,8 @@ def _forward(self, model_inputs, return_timestamps=False, **generate_kwargs): "Seq2Seq speech recognition model requires either a " f"`input_features` or `input_values` key, but only has {model_inputs.keys()}" ) + if "assistant_model" in generate_kwargs: + generate_kwargs["input_features"] = inputs # custom processing for Whisper timestamps and word-level timestamps if return_timestamps and self.type == "seq2seq_whisper": @@ -508,9 +510,6 @@ def _forward(self, model_inputs, return_timestamps=False, **generate_kwargs): else: generate_kwargs["encoder_outputs"] = encoder(inputs, attention_mask=attention_mask) - if "assistant_model" in generate_kwargs: - generate_kwargs["input_features"] = inputs - tokens = self.model.generate( 
attention_mask=attention_mask, **generate_kwargs, From e41d51976d4f7f3351d51a8b0163560fb03a2f24 Mon Sep 17 00:00:00 2001 From: kamilakesbi Date: Fri, 10 May 2024 11:21:52 +0200 Subject: [PATCH 07/25] use self.config.is_encoder_decoder --- src/transformers/generation/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 47978384525d13..c7730ae8a6e5ee 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -1099,8 +1099,8 @@ def _validate_model_class(self): def _validate_assistant(self, assistant_model): if assistant_model is not None: - if type(self).__name__ in MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING._model_mapping.values(): - if type(assistant_model).__name__ in MODEL_FOR_CAUSAL_LM_MAPPING._model_mapping.values(): + if self.config.is_encoder_decoder: + if not assistant_model.config.is_encoder_decoder: attributes_to_check = [attr for attr in dir(self.config) if attr.startswith("encoder_")] are_equal = all( getattr(self.config, attr) == getattr(assistant_model.config, attr) From 27242a633babd908a2763cdc5705c4b5de1ba158 Mon Sep 17 00:00:00 2001 From: kamilakesbi Date: Fri, 10 May 2024 11:29:13 +0200 Subject: [PATCH 08/25] pass inputs to generate directly --- .../pipelines/automatic_speech_recognition.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/src/transformers/pipelines/automatic_speech_recognition.py b/src/transformers/pipelines/automatic_speech_recognition.py index 07c157f92674f3..f077c7d2c7f11c 100644 --- a/src/transformers/pipelines/automatic_speech_recognition.py +++ b/src/transformers/pipelines/automatic_speech_recognition.py @@ -474,7 +474,6 @@ def _forward(self, model_inputs, return_timestamps=False, **generate_kwargs): raise ValueError("num_frames must be used only when stride is None") if self.type in {"seq2seq", "seq2seq_whisper"}: - encoder = self.model.get_encoder() # Consume 
values so we can let extra information flow freely through # the pipeline (important for `partial` in microphone) if "input_features" in model_inputs: @@ -486,8 +485,6 @@ def _forward(self, model_inputs, return_timestamps=False, **generate_kwargs): "Seq2Seq speech recognition model requires either a " f"`input_features` or `input_values` key, but only has {model_inputs.keys()}" ) - if "assistant_model" in generate_kwargs: - generate_kwargs["input_features"] = inputs # custom processing for Whisper timestamps and word-level timestamps if return_timestamps and self.type == "seq2seq_whisper": @@ -502,15 +499,8 @@ def _forward(self, model_inputs, return_timestamps=False, **generate_kwargs): else: generate_kwargs["num_frames"] = [s[0] // self.feature_extractor.hop_length for s in stride] - else: - generate_kwargs["num_frames"] = num_frames - - if self.type == "seq2seq_whisper" and inputs.shape[-1] > self.feature_extractor.nb_max_frames: - generate_kwargs["input_features"] = inputs - else: - generate_kwargs["encoder_outputs"] = encoder(inputs, attention_mask=attention_mask) - tokens = self.model.generate( + inputs=inputs, attention_mask=attention_mask, **generate_kwargs, ) From 726f53faf1d5eb42bb910ee2fa48456d73fa5e5d Mon Sep 17 00:00:00 2001 From: kamilakesbi Date: Fri, 10 May 2024 12:15:35 +0200 Subject: [PATCH 09/25] add slow tests --- ..._pipelines_automatic_speech_recognition.py | 116 ++++++++++++++++++ 1 file changed, 116 insertions(+) diff --git a/tests/pipelines/test_pipelines_automatic_speech_recognition.py b/tests/pipelines/test_pipelines_automatic_speech_recognition.py index 5ab18e81d56854..41524b22cb1f72 100644 --- a/tests/pipelines/test_pipelines_automatic_speech_recognition.py +++ b/tests/pipelines/test_pipelines_automatic_speech_recognition.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import time import unittest import numpy as np @@ -23,6 +24,8 @@ MODEL_FOR_CTC_MAPPING, MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING, AutoFeatureExtractor, + AutoModelForCausalLM, + AutoModelForSpeechSeq2Seq, AutoProcessor, AutoTokenizer, Speech2TextForConditionalGeneration, @@ -1138,6 +1141,119 @@ def test_whisper_language(self): {"text": " Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel."}, ) + @slow + def test_speculative_decoding_whisper_non_distil(self): + # Load data: + dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:1]") + sample = dataset[0] + + # Load model: + model_id = "openai/whisper-large-v2" + processor = AutoProcessor.from_pretrained(model_id) + model = AutoModelForSpeechSeq2Seq.from_pretrained( + model_id, + low_cpu_mem_usage=True, + use_safetensors=True, + ) + + # Load assistant: + assistant_model_id = "openai/whisper-tiny" + assistant_model = AutoModelForSpeechSeq2Seq.from_pretrained( + assistant_model_id, + low_cpu_mem_usage=True, + use_safetensors=True, + ) + + # Load pipeline: + pipe = AutomaticSpeechRecognitionPipeline( + model=model, + tokenizer=processor.tokenizer, + feature_extractor=processor.feature_extractor, + generate_kwargs={"language": "en"}, + ) + + inputs = { + "sampling_rate": sample["audio"]["sampling_rate"], + "raw": np.array(sample["audio"]["array"]), + } + + start_time = time.time() + transcription_non_ass = pipe(inputs=inputs, generate_kwargs={"assistant_model": assistant_model})["text"] + total_time_assist = time.time() - start_time + + inputs = { + "sampling_rate": sample["audio"]["sampling_rate"], + "raw": np.array(sample["audio"]["array"]), + } + + start_time = time.time() + transcription_ass = pipe(inputs=inputs)["text"] + total_time_non_assist = time.time() - start_time + + assert transcription_ass == transcription_non_ass + assert ( + transcription_ass + == " Mr. 
Quilter is the apostle of the middle classes and we are glad to welcome his gospel." + ) + + assert total_time_non_assist > total_time_assist, "Make sure that assistant decoding is faster" + + @slow + def test_speculative_decoding_whisper_distil(self): + # Load data: + dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:1]") + sample = dataset[0] + + # Load model: + model_id = "openai/whisper-large-v2" + processor = AutoProcessor.from_pretrained(model_id) + model = AutoModelForSpeechSeq2Seq.from_pretrained( + model_id, + low_cpu_mem_usage=True, + use_safetensors=True, + ) + + # Load assistant: + assistant_model_id = "distil-whisper/distil-large-v2" + assistant_model = AutoModelForCausalLM.from_pretrained( + assistant_model_id, + low_cpu_mem_usage=True, + use_safetensors=True, + ) + + # Load pipeline: + pipe = AutomaticSpeechRecognitionPipeline( + model=model, + tokenizer=processor.tokenizer, + feature_extractor=processor.feature_extractor, + generate_kwargs={"language": "en"}, + ) + + inputs = { + "sampling_rate": sample["audio"]["sampling_rate"], + "raw": np.array(sample["audio"]["array"]), + } + + start_time = time.time() + transcription_non_ass = pipe(inputs=inputs, generate_kwargs={"assistant_model": assistant_model})["text"] + total_time_assist = time.time() - start_time + + inputs = { + "sampling_rate": sample["audio"]["sampling_rate"], + "raw": np.array(sample["audio"]["array"]), + } + + start_time = time.time() + transcription_ass = pipe(inputs=inputs)["text"] + total_time_non_assist = time.time() - start_time + + assert transcription_ass == transcription_non_ass + assert ( + transcription_ass + == " Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel." 
+ ) + assert total_time_non_assist > total_time_assist, "Make sure that assistant decoding is faster" + @slow @require_torch @require_torchaudio From c7f3f1ccaef095b3bd1a819e7ab7a410dd53c5c2 Mon Sep 17 00:00:00 2001 From: Kamil Akesbi <45195979+kamilakesbi@users.noreply.github.com> Date: Fri, 10 May 2024 18:38:18 +0200 Subject: [PATCH 10/25] Update src/transformers/generation/utils.py Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com> --- src/transformers/generation/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index c7730ae8a6e5ee..8273c4b9c7a188 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -1109,7 +1109,7 @@ def _validate_assistant(self, assistant_model): if not are_equal: raise ValueError( "The main model and the assistant don't have encoders of the same size. " - "Load the assistant with AutoModelForSpeechSeq2Seq", + "Ensure you load the assistant with the correct encoder-decoder class, e.g. `AutoModelForSpeechSeq2Seq` for Whisper." 
) if not self.config.vocab_size == assistant_model.config.vocab_size: From 405606c918d9227411b319af0319aa640179b587 Mon Sep 17 00:00:00 2001 From: Kamil Akesbi <45195979+kamilakesbi@users.noreply.github.com> Date: Fri, 10 May 2024 18:40:00 +0200 Subject: [PATCH 11/25] Update tests/pipelines/test_pipelines_automatic_speech_recognition.py Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com> --- .../pipelines/test_pipelines_automatic_speech_recognition.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/pipelines/test_pipelines_automatic_speech_recognition.py b/tests/pipelines/test_pipelines_automatic_speech_recognition.py index 41524b22cb1f72..b87943a22efcb3 100644 --- a/tests/pipelines/test_pipelines_automatic_speech_recognition.py +++ b/tests/pipelines/test_pipelines_automatic_speech_recognition.py @@ -1172,10 +1172,6 @@ def test_speculative_decoding_whisper_non_distil(self): generate_kwargs={"language": "en"}, ) - inputs = { - "sampling_rate": sample["audio"]["sampling_rate"], - "raw": np.array(sample["audio"]["array"]), - } start_time = time.time() transcription_non_ass = pipe(inputs=inputs, generate_kwargs={"assistant_model": assistant_model})["text"] From 2c8c0391144317f6d3e44d3ff3d7fa3ded29f3d0 Mon Sep 17 00:00:00 2001 From: Kamil Akesbi <45195979+kamilakesbi@users.noreply.github.com> Date: Fri, 10 May 2024 18:40:17 +0200 Subject: [PATCH 12/25] Update tests/pipelines/test_pipelines_automatic_speech_recognition.py Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com> --- tests/pipelines/test_pipelines_automatic_speech_recognition.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pipelines/test_pipelines_automatic_speech_recognition.py b/tests/pipelines/test_pipelines_automatic_speech_recognition.py index b87943a22efcb3..79c17a0a53a844 100644 --- a/tests/pipelines/test_pipelines_automatic_speech_recognition.py +++ 
b/tests/pipelines/test_pipelines_automatic_speech_recognition.py @@ -1174,7 +1174,7 @@ def test_speculative_decoding_whisper_non_distil(self): start_time = time.time() - transcription_non_ass = pipe(inputs=inputs, generate_kwargs={"assistant_model": assistant_model})["text"] + transcription_non_ass = pipe(sample.copy(), generate_kwargs={"assistant_model": assistant_model})["text"] total_time_assist = time.time() - start_time inputs = { From 5b6f297ae91a3c5d394d42dac9b6a7d0d837e0a1 Mon Sep 17 00:00:00 2001 From: Kamil Akesbi <45195979+kamilakesbi@users.noreply.github.com> Date: Fri, 10 May 2024 18:40:26 +0200 Subject: [PATCH 13/25] Update tests/pipelines/test_pipelines_automatic_speech_recognition.py Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com> --- .../pipelines/test_pipelines_automatic_speech_recognition.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/pipelines/test_pipelines_automatic_speech_recognition.py b/tests/pipelines/test_pipelines_automatic_speech_recognition.py index 79c17a0a53a844..e2dbb960ccac31 100644 --- a/tests/pipelines/test_pipelines_automatic_speech_recognition.py +++ b/tests/pipelines/test_pipelines_automatic_speech_recognition.py @@ -1177,10 +1177,6 @@ def test_speculative_decoding_whisper_non_distil(self): transcription_non_ass = pipe(sample.copy(), generate_kwargs={"assistant_model": assistant_model})["text"] total_time_assist = time.time() - start_time - inputs = { - "sampling_rate": sample["audio"]["sampling_rate"], - "raw": np.array(sample["audio"]["array"]), - } start_time = time.time() transcription_ass = pipe(inputs=inputs)["text"] From 03d2c3e52afd2edd24528a0c6e21be66edb1ca98 Mon Sep 17 00:00:00 2001 From: Kamil Akesbi <45195979+kamilakesbi@users.noreply.github.com> Date: Fri, 10 May 2024 18:40:36 +0200 Subject: [PATCH 14/25] Update tests/pipelines/test_pipelines_automatic_speech_recognition.py Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com> --- 
tests/pipelines/test_pipelines_automatic_speech_recognition.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pipelines/test_pipelines_automatic_speech_recognition.py b/tests/pipelines/test_pipelines_automatic_speech_recognition.py index e2dbb960ccac31..b64bc5a7011ff8 100644 --- a/tests/pipelines/test_pipelines_automatic_speech_recognition.py +++ b/tests/pipelines/test_pipelines_automatic_speech_recognition.py @@ -1179,7 +1179,7 @@ def test_speculative_decoding_whisper_non_distil(self): start_time = time.time() - transcription_ass = pipe(inputs=inputs)["text"] + transcription_ass = pipe(sample)["text"] total_time_non_assist = time.time() - start_time assert transcription_ass == transcription_non_ass From f1c8c8abf46b8b62ff95aa5b049837960a8f6ad8 Mon Sep 17 00:00:00 2001 From: Kamil Akesbi <45195979+kamilakesbi@users.noreply.github.com> Date: Fri, 10 May 2024 18:40:48 +0200 Subject: [PATCH 15/25] Update tests/pipelines/test_pipelines_automatic_speech_recognition.py Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com> --- tests/pipelines/test_pipelines_automatic_speech_recognition.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pipelines/test_pipelines_automatic_speech_recognition.py b/tests/pipelines/test_pipelines_automatic_speech_recognition.py index b64bc5a7011ff8..de0822b7544fa3 100644 --- a/tests/pipelines/test_pipelines_automatic_speech_recognition.py +++ b/tests/pipelines/test_pipelines_automatic_speech_recognition.py @@ -1145,7 +1145,7 @@ def test_whisper_language(self): def test_speculative_decoding_whisper_non_distil(self): # Load data: dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:1]") - sample = dataset[0] + sample = dataset[0]["audio"] # Load model: model_id = "openai/whisper-large-v2" From d1571a9c32eb1ee92b7ff8353d86a400283cce55 Mon Sep 17 00:00:00 2001 From: kamilakesbi Date: Fri, 10 May 2024 18:46:56 +0200 Subject: [PATCH 
16/25] apply review --- ..._pipelines_automatic_speech_recognition.py | 22 +++---------------- 1 file changed, 3 insertions(+), 19 deletions(-) diff --git a/tests/pipelines/test_pipelines_automatic_speech_recognition.py b/tests/pipelines/test_pipelines_automatic_speech_recognition.py index de0822b7544fa3..9f79d2ec72b71a 100644 --- a/tests/pipelines/test_pipelines_automatic_speech_recognition.py +++ b/tests/pipelines/test_pipelines_automatic_speech_recognition.py @@ -1152,7 +1152,6 @@ def test_speculative_decoding_whisper_non_distil(self): processor = AutoProcessor.from_pretrained(model_id) model = AutoModelForSpeechSeq2Seq.from_pretrained( model_id, - low_cpu_mem_usage=True, use_safetensors=True, ) @@ -1160,7 +1159,6 @@ def test_speculative_decoding_whisper_non_distil(self): assistant_model_id = "openai/whisper-tiny" assistant_model = AutoModelForSpeechSeq2Seq.from_pretrained( assistant_model_id, - low_cpu_mem_usage=True, use_safetensors=True, ) @@ -1172,12 +1170,10 @@ def test_speculative_decoding_whisper_non_distil(self): generate_kwargs={"language": "en"}, ) - start_time = time.time() transcription_non_ass = pipe(sample.copy(), generate_kwargs={"assistant_model": assistant_model})["text"] total_time_assist = time.time() - start_time - start_time = time.time() transcription_ass = pipe(sample)["text"] total_time_non_assist = time.time() - start_time @@ -1194,14 +1190,13 @@ def test_speculative_decoding_whisper_non_distil(self): def test_speculative_decoding_whisper_distil(self): # Load data: dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:1]") - sample = dataset[0] + sample = dataset[0]["audio"] # Load model: model_id = "openai/whisper-large-v2" processor = AutoProcessor.from_pretrained(model_id) model = AutoModelForSpeechSeq2Seq.from_pretrained( model_id, - low_cpu_mem_usage=True, use_safetensors=True, ) @@ -1209,7 +1204,6 @@ def test_speculative_decoding_whisper_distil(self): assistant_model_id = 
"distil-whisper/distil-large-v2" assistant_model = AutoModelForCausalLM.from_pretrained( assistant_model_id, - low_cpu_mem_usage=True, use_safetensors=True, ) @@ -1221,22 +1215,12 @@ def test_speculative_decoding_whisper_distil(self): generate_kwargs={"language": "en"}, ) - inputs = { - "sampling_rate": sample["audio"]["sampling_rate"], - "raw": np.array(sample["audio"]["array"]), - } - start_time = time.time() - transcription_non_ass = pipe(inputs=inputs, generate_kwargs={"assistant_model": assistant_model})["text"] + transcription_non_ass = pipe(sample.copy(), generate_kwargs={"assistant_model": assistant_model})["text"] total_time_assist = time.time() - start_time - inputs = { - "sampling_rate": sample["audio"]["sampling_rate"], - "raw": np.array(sample["audio"]["array"]), - } - start_time = time.time() - transcription_ass = pipe(inputs=inputs)["text"] + transcription_ass = pipe(sample)["text"] total_time_non_assist = time.time() - start_time assert transcription_ass == transcription_non_ass From 83e17f659cf31f34a3bd47e20f8dbee7497483b5 Mon Sep 17 00:00:00 2001 From: Kamil Akesbi <45195979+kamilakesbi@users.noreply.github.com> Date: Wed, 15 May 2024 15:51:37 +0200 Subject: [PATCH 17/25] Update src/transformers/generation/utils.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- src/transformers/generation/utils.py | 31 ++++++++++++++-------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 8273c4b9c7a188..65911bc5c84f1c 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -1098,22 +1098,23 @@ def _validate_model_class(self): raise TypeError(exception_message) def _validate_assistant(self, assistant_model): - if assistant_model is not None: - if self.config.is_encoder_decoder: - if not assistant_model.config.is_encoder_decoder: - attributes_to_check = [attr for attr in dir(self.config) if 
attr.startswith("encoder_")] - are_equal = all( - getattr(self.config, attr) == getattr(assistant_model.config, attr) - for attr in attributes_to_check - ) - if not are_equal: - raise ValueError( - "The main model and the assistant don't have encoders of the same size. " - "Ensure you load the assistant with the correct encoder-decoder class, e.g. `AutoModelForSpeechSeq2Seq` for Whisper." - ) + if assistant_model is None or not self.config.is_encoder_decoder: + return + + if not assistant_model.config.is_encoder_decoder: + attributes_to_check = [attr for attr in dir(self.config) if attr.startswith("encoder_")] + are_equal = all( + getattr(self.config, attr) == getattr(assistant_model.config, attr) + for attr in attributes_to_check + ) + if not are_equal: + raise ValueError( + "The main model and the assistant don't have encoders of the same size. " + "Ensure you load the assistant with the correct encoder-decoder class, e.g. `AutoModelForSpeechSeq2Seq` for Whisper." + ) - if not self.config.vocab_size == assistant_model.config.vocab_size: - raise ValueError("Make sure the main and assistant model use the same tokenizer") + if not self.config.vocab_size == assistant_model.config.vocab_size: + raise ValueError("Make sure the main and assistant model use the same tokenizer") def _validate_model_kwargs(self, model_kwargs: Dict[str, Any]): """Validates model kwargs for generation. 
Generate argument typos will also be caught here.""" From 29046c6c926aeb4b1eb60d7c51549516f72d71f1 Mon Sep 17 00:00:00 2001 From: Kamil Akesbi <45195979+kamilakesbi@users.noreply.github.com> Date: Wed, 15 May 2024 15:51:57 +0200 Subject: [PATCH 18/25] Update tests/pipelines/test_pipelines_automatic_speech_recognition.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- .../test_pipelines_automatic_speech_recognition.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tests/pipelines/test_pipelines_automatic_speech_recognition.py b/tests/pipelines/test_pipelines_automatic_speech_recognition.py index 9f79d2ec72b71a..9b8a3713b89759 100644 --- a/tests/pipelines/test_pipelines_automatic_speech_recognition.py +++ b/tests/pipelines/test_pipelines_automatic_speech_recognition.py @@ -1178,13 +1178,12 @@ def test_speculative_decoding_whisper_non_distil(self): transcription_ass = pipe(sample)["text"] total_time_non_assist = time.time() - start_time - assert transcription_ass == transcription_non_ass - assert ( - transcription_ass - == " Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel." + self.assertEqual(transcription_ass, transcription_non_ass) + self.assertEqual( + transcription_ass, + " Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel." 
) - - assert total_time_non_assist > total_time_assist, "Make sure that assistant decoding is faster" + self.assertTrue(total_time_non_assist > total_time_assist, "Make sure that assistant decoding is faster") @slow def test_speculative_decoding_whisper_distil(self): From d21637648483af6e3220b4134cf01b19da24dab8 Mon Sep 17 00:00:00 2001 From: kamilakesbi Date: Wed, 15 May 2024 16:02:02 +0200 Subject: [PATCH 19/25] apply code review --- src/transformers/generation/utils.py | 3 +-- .../test_pipelines_automatic_speech_recognition.py | 14 +++++++------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 65911bc5c84f1c..b5b6e82d16b29a 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -1104,8 +1104,7 @@ def _validate_assistant(self, assistant_model): if not assistant_model.config.is_encoder_decoder: attributes_to_check = [attr for attr in dir(self.config) if attr.startswith("encoder_")] are_equal = all( - getattr(self.config, attr) == getattr(assistant_model.config, attr) - for attr in attributes_to_check + getattr(self.config, attr) == getattr(assistant_model.config, attr) for attr in attributes_to_check ) if not are_equal: raise ValueError( diff --git a/tests/pipelines/test_pipelines_automatic_speech_recognition.py b/tests/pipelines/test_pipelines_automatic_speech_recognition.py index 9b8a3713b89759..430666990fe5c2 100644 --- a/tests/pipelines/test_pipelines_automatic_speech_recognition.py +++ b/tests/pipelines/test_pipelines_automatic_speech_recognition.py @@ -1181,9 +1181,9 @@ def test_speculative_decoding_whisper_non_distil(self): self.assertEqual(transcription_ass, transcription_non_ass) self.assertEqual( transcription_ass, - " Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel." + " Mr. 
Quilter is the apostle of the middle classes and we are glad to welcome his gospel.", ) - self.assertTrue(total_time_non_assist > total_time_assist, "Make sure that assistant decoding is faster") + self.assertTrue(total_time_non_assist > total_time_assist, "Make sure that assistant decoding is faster") @slow def test_speculative_decoding_whisper_distil(self): @@ -1222,12 +1222,12 @@ def test_speculative_decoding_whisper_distil(self): transcription_ass = pipe(sample)["text"] total_time_non_assist = time.time() - start_time - assert transcription_ass == transcription_non_ass - assert ( - transcription_ass - == " Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel." + self.assertEqual(transcription_ass, transcription_non_ass) + self.assertEqual( + transcription_ass, + " Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.", ) - assert total_time_non_assist > total_time_assist, "Make sure that assistant decoding is faster" + self.assertTrue(total_time_non_assist > total_time_assist, "Make sure that assistant decoding is faster") @slow @require_torch From 87b08e97b0171b5c3cf9169ed4777c1392f5e563 Mon Sep 17 00:00:00 2001 From: kamilakesbi Date: Wed, 15 May 2024 19:40:45 +0200 Subject: [PATCH 20/25] update attributes encoder_xyz to check --- src/transformers/generation/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index b5b6e82d16b29a..45e61b3da3029a 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -1102,7 +1102,8 @@ def _validate_assistant(self, assistant_model): return if not assistant_model.config.is_encoder_decoder: - attributes_to_check = [attr for attr in dir(self.config) if attr.startswith("encoder_")] + attributes_to_check = ["encoder_attention_heads", "encoder_ffn_dim", "encoder_layers"] + attributes_to_check = [attr for attr in
dir(self.config) if attr in attributes_to_check] are_equal = all( getattr(self.config, attr) == getattr(assistant_model.config, attr) for attr in attributes_to_check ) From a43e202b6e54e8aa21c105dfa81e00b0ec287cd1 Mon Sep 17 00:00:00 2001 From: Kamil Akesbi <45195979+kamilakesbi@users.noreply.github.com> Date: Mon, 20 May 2024 14:01:05 +0200 Subject: [PATCH 21/25] Update src/transformers/generation/utils.py Co-authored-by: Joao Gante --- src/transformers/generation/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 45e61b3da3029a..1987f35d525d3d 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -1109,7 +1109,7 @@ def _validate_assistant(self, assistant_model): ) if not are_equal: raise ValueError( - "The main model and the assistant don't have encoders of the same size. " + "The main model and the assistant don't have compatible encoder-dependent input shapes. " "Ensure you load the assistant with the correct encoder-decoder class, e.g. `AutoModelForSpeechSeq2Seq` for Whisper." 
) From b23f1f3f7fb46b1f9a672bbe425b66abd67fc532 Mon Sep 17 00:00:00 2001 From: Kamil Akesbi <45195979+kamilakesbi@users.noreply.github.com> Date: Mon, 20 May 2024 14:02:45 +0200 Subject: [PATCH 22/25] Update src/transformers/generation/utils.py Co-authored-by: Joao Gante --- src/transformers/generation/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 1987f35d525d3d..6f0ea47d9836dc 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -1103,7 +1103,7 @@ def _validate_assistant(self, assistant_model): if not assistant_model.config.is_encoder_decoder: attributes_to_check = ["encoder_attention_heads", "encoder_ffn_dim", "encoder_layers"] - attributes_to_check = [attr for attr in dir(self.config) if attr in attributes_to_check] + attributes_to_check = [attr for attr in dir(assistant_model.config) if attr in attributes_to_check] are_equal = all( getattr(self.config, attr) == getattr(assistant_model.config, attr) for attr in attributes_to_check ) From 5547aef75bd1080c6bdc207e0dc6061b4cf804aa Mon Sep 17 00:00:00 2001 From: Kamil Akesbi <45195979+kamilakesbi@users.noreply.github.com> Date: Mon, 20 May 2024 14:04:03 +0200 Subject: [PATCH 23/25] Update src/transformers/generation/utils.py Co-authored-by: Joao Gante --- src/transformers/generation/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 6f0ea47d9836dc..149ce144e66272 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -1098,10 +1098,10 @@ def _validate_model_class(self): raise TypeError(exception_message) def _validate_assistant(self, assistant_model): - if assistant_model is None or not self.config.is_encoder_decoder: + if assistant_model is None: return - if not assistant_model.config.is_encoder_decoder: + if 
self.config.is_encoder_decoder and not assistant_model.config.is_encoder_decoder: attributes_to_check = ["encoder_attention_heads", "encoder_ffn_dim", "encoder_layers"] attributes_to_check = [attr for attr in dir(assistant_model.config) if attr in attributes_to_check] are_equal = all( From e2bdde1a62cd82448098ecd24430fd8e828f405f Mon Sep 17 00:00:00 2001 From: kamilakesbi Date: Tue, 21 May 2024 13:07:52 +0200 Subject: [PATCH 24/25] add slow test --- tests/generation/test_utils.py | 62 ++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index b8e90a5b8ed18e..840b64e17db010 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -45,6 +45,7 @@ AutoModelForSeq2SeqLM, AutoModelForSpeechSeq2Seq, AutoModelForVision2Seq, + AutoProcessor, AutoTokenizer, BartForCausalLM, BartForConditionalGeneration, @@ -2919,6 +2920,67 @@ def test_assisted_decoding_num_assistant_tokens_heuristic_transient_schedule(sel # update_candidate_strategy is called once but assistant_model.generation_config.num_assistant_tokens should stay 5 self.assertEqual(assistant_model.generation_config.num_assistant_tokens, 5) + @slow + def test_validate_assistant(self): + # Generate a random sample: + inputs = np.random.rand(160000) + + # Load a main encoder-decoder model: + model_id = "openai/whisper-large-v2" + processor = AutoProcessor.from_pretrained(model_id) + model = AutoModelForSpeechSeq2Seq.from_pretrained( + model_id, + low_cpu_mem_usage=True, + use_safetensors=True, + ) + model.to(torch_device) + + # process the input: + features = processor(inputs, return_tensors="pt").to(torch_device) + + # Load an encoder-decoder assistant with same encoder as the main model: + assistant_distil_model_id = "distil-whisper/distil-large-v2" + assistant_seq_to_seq = AutoModelForSpeechSeq2Seq.from_pretrained( + assistant_distil_model_id, + use_safetensors=True, + ).to(torch_device) + 
self.assertTrue(model.generate(**features, assistant_model=assistant_seq_to_seq).sum()) + + # Load its decoder only version: + assistant_causal_lm = AutoModelForCausalLM.from_pretrained( + assistant_distil_model_id, + low_cpu_mem_usage=True, + use_safetensors=True, + ).to(torch_device) + self.assertTrue(model.generate(**features, assistant_model=assistant_causal_lm).sum()) + + # Load an encoder-decoder assistant with a different encoder than the main model: + assistant_distil_model_id = "openai/whisper-tiny" + assistant_seq_to_seq = AutoModelForSpeechSeq2Seq.from_pretrained( + assistant_distil_model_id, + use_safetensors=True, + ).to(torch_device) + self.assertTrue(model.generate(**features, assistant_model=assistant_seq_to_seq).sum()) + + # Load its decoder only version: + assistant_causal_lm = AutoModelForCausalLM.from_pretrained( + assistant_distil_model_id, + low_cpu_mem_usage=True, + use_safetensors=True, + ).to(torch_device) + # It will raise an error as the encoder of the main and assistant model are not compatible: + with self.assertRaises(ValueError): + model.generate(**features, assistant_model=assistant_causal_lm) + + # Load an encoder-decoder model with a different tokenizer than the main model: + assistant_distil_model_id = "hf-internal-testing/tiny-random-SeamlessM4Tv2ForSpeechToText" + assistant_seq_to_seq = AutoModelForSpeechSeq2Seq.from_pretrained( + assistant_distil_model_id, + ).to(torch_device) + # This should raise an error as the main and assistant model don't use the same tokenizer: + with self.assertRaises(ValueError): + model.generate(**features, assistant_model=assistant_seq_to_seq) + def test_compare_unprocessed_logit_scores(self): # Get unprocessed logit scores back from model generate function. 
# Assert that unprocessed logits from generate() are same as those from modal eval() From 3a3514547299c44af7676f3a08b86aaf7e96e6a0 Mon Sep 17 00:00:00 2001 From: kamilakesbi Date: Wed, 22 May 2024 13:07:28 +0200 Subject: [PATCH 25/25] solve conflicts --- src/transformers/pipelines/automatic_speech_recognition.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/transformers/pipelines/automatic_speech_recognition.py b/src/transformers/pipelines/automatic_speech_recognition.py index f077c7d2c7f11c..01faab6d74adac 100644 --- a/src/transformers/pipelines/automatic_speech_recognition.py +++ b/src/transformers/pipelines/automatic_speech_recognition.py @@ -498,6 +498,8 @@ def _forward(self, model_inputs, return_timestamps=False, **generate_kwargs): generate_kwargs["num_frames"] = stride[0] // self.feature_extractor.hop_length else: generate_kwargs["num_frames"] = [s[0] // self.feature_extractor.hop_length for s in stride] + else: + generate_kwargs["num_frames"] = num_frames tokens = self.model.generate( inputs=inputs,