From 2c6d67e039a22e32bc43b53533c3f5b27929eea6 Mon Sep 17 00:00:00 2001
From: Alexander Suvorov
Date: Thu, 9 Jan 2025 06:21:51 +0100
Subject: [PATCH] Whisper pipeline: refactor tests, disable `return_timestamps` check (#1496)

Ticket: 160055

---------

Co-authored-by: Ilya Lavrenov
---
 .github/workflows/windows.yml               |   6 +
 samples/export-requirements.txt             |   2 +-
 tests/python_tests/requirements.txt         |   2 +-
 tests/python_tests/test_whisper_pipeline.py | 434 ++++++++------------
 4 files changed, 169 insertions(+), 275 deletions(-)

diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
index 95a713d7a1..8f43af44ae 100644
--- a/.github/workflows/windows.yml
+++ b/.github/workflows/windows.yml
@@ -310,6 +310,12 @@ jobs:
         . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
         python -m pip install . --verbose --find-links ${env:OV_INSTALL_DIR}/wheels
         python -m pip install ./tools/who_what_benchmark --find-links ${env:OV_INSTALL_DIR}/wheels
+
+        # Installs a pinned optimum-intel commit that brings in transformers 4.46.3,
+        # which enables the return_timestamps tests.
+        # This check is enabled on Windows only. Ticket: 160205.
+        python -m pip install git+https://github.com/huggingface/optimum-intel.git@753f84db6e0966580eb9eaa74a808213be730631
+
         python -m pytest -v ./tests/python_tests/test_whisper_pipeline.py -k "not test_smoke"
 
   genai_python_lib_vlm:
diff --git a/samples/export-requirements.txt b/samples/export-requirements.txt
index 2f71891b7b..af38558656 100644
--- a/samples/export-requirements.txt
+++ b/samples/export-requirements.txt
@@ -2,7 +2,7 @@
 --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release
 --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
 openvino-tokenizers~=2025.0.0.0.dev
-optimum-intel @ git+https://github.com/huggingface/optimum-intel.git@753f84db6e0966580eb9eaa74a808213be730631
+optimum-intel @ git+https://github.com/huggingface/optimum-intel.git
 numpy<2.0.0; sys_platform == 'darwin'
 einops==0.8.0 # For Qwen
 transformers_stream_generator==0.0.5 # For Qwen
diff --git a/tests/python_tests/requirements.txt b/tests/python_tests/requirements.txt
index e23eaacc21..c851c71ee5 100644
--- a/tests/python_tests/requirements.txt
+++ b/tests/python_tests/requirements.txt
@@ -1,6 +1,6 @@
 --extra-index-url https://download.pytorch.org/whl/cpu
 diffusers==0.32.1
-optimum-intel @ git+https://github.com/huggingface/optimum-intel.git@753f84db6e0966580eb9eaa74a808213be730631
+optimum-intel @ git+https://github.com/huggingface/optimum-intel.git
 numpy<2.0.0; platform_system == "Darwin" and platform_machine == "x86_64"
 onnx==1.17.0
 pytest
diff --git a/tests/python_tests/test_whisper_pipeline.py b/tests/python_tests/test_whisper_pipeline.py
index aa78666e32..c046d1ae2c 100644
--- a/tests/python_tests/test_whisper_pipeline.py
+++ b/tests/python_tests/test_whisper_pipeline.py
@@ -11,11 +11,13 @@ from optimum.intel.openvino import OVModelForSpeechSeq2Seq
 import gc
 import json
-import time
 import typing
 import numpy as np
 import os
 import pathlib
+import importlib.metadata as metadata
+from packaging.version import parse
+
 
 @pytest.fixture(scope="class", autouse=True)
 def run_gc_after_test():
@@ -27,36 +29,29 @@ def run_gc_after_test():
     gc.collect()
 
 
-def get_whisper_models_list(tiny_only=False, multilingual=False, en_only=False):
-    precommit_models = [
+def get_whisper_models_list(tiny_only=False):
+    model_ids = [
         "openai/whisper-tiny",
-        "openai/whisper-tiny.en",
         "distil-whisper/distil-small.en",
     ]
 
-    if multilingual:
-        precommit_models = ["openai/whisper-tiny"]
-    if en_only:
precommit_models = ["openai/whisper-tiny.en", "distil-whisper/distil-small.en"] - if tiny_only: - precommit_models = ["openai/whisper-tiny"] - - nightly_models = [] - if pytest.run_marker == "precommit": - model_ids = precommit_models - else: - model_ids = nightly_models + if tiny_only: + model_ids = ["openai/whisper-tiny"] if pytest.selected_model_ids: - model_ids = [model_id for model_id in model_ids if model_id in pytest.selected_model_ids.split(' ')] + model_ids = [ + model_id + for model_id in model_ids + if model_id in pytest.selected_model_ids.split(" ") + ] - prefix = pathlib.Path(os.getenv('GENAI_MODELS_PATH_PREFIX', '')) - return [(model_id, prefix / model_id.split('/')[1]) for model_id in model_ids] + prefix = pathlib.Path(os.getenv("GENAI_MODELS_PATH_PREFIX", "")) + return [(model_id, prefix / model_id.split("/")[1]) for model_id in model_ids] # used whisper models are relatively small # cache them in memory to speedup tests -@functools.lru_cache(3) +@functools.lru_cache() def read_whisper_model(params, **tokenizer_kwargs): model_id, path = params @@ -90,6 +85,7 @@ def read_whisper_model(params, **tokenizer_kwargs): model_id, export=True, trust_remote_code=True, + stateful=False, compile=False, device="CPU", load_in_8bit=False, @@ -114,30 +110,39 @@ def read_whisper_model(params, **tokenizer_kwargs): ) -def compare_genai_and_opt_pipelines(opt_pipe, genai_pipe, dataset_id): - ds = datasets.load_dataset(dataset_id, "clean", split="validation") - opt_infer_time = 0 - genai_infer_time = 0 - - for ds_row in ds: - audio_sample = ds_row["audio"] +def run_huggingface( + pipeline, + sample, + config: ov_genai.WhisperGenerationConfig | None = None, +): + if not config: + config = ov_genai.WhisperGenerationConfig() + + return pipeline( + sample, + max_new_tokens=min(config.max_new_tokens, 444), + return_timestamps=config.return_timestamps, + generate_kwargs={"language": config.language, "task": config.task}, + ) - streamer_result = [] - start = time.time() - genai_result = genai_pipe.generate( - audio_sample["array"].tolist(), streamer=lambda x: streamer_result.append(x) - ) - genai_infer_time += time.time() - start +def run_genai( + pipeline: ov_genai.WhisperPipeline, + sample, + config: ov_genai.WhisperGenerationConfig | None = None, + streamer: typing.Callable[[str], bool] | None = None, +): + if not config: + config = ov_genai.WhisperGenerationConfig() - start = time.time() - result = opt_pipe(audio_sample) - opt_infer_time += time.time() - start + genai_config = pipeline.get_generation_config() - assert genai_result.texts[0] == result["text"] - assert "".join(streamer_result) == result["text"] + genai_config.max_new_tokens = config.max_new_tokens + genai_config.return_timestamps = config.return_timestamps + genai_config.task = config.task + genai_config.language = f"<|{config.language}|>" if config.language else None - print(f"Inference time\nOpt: {opt_infer_time}\nGenAI: {genai_infer_time}") + return pipeline.generate(sample, genai_config, streamer=streamer) def get_samples_from_dataset( @@ -166,13 +171,50 @@ def get_samples_from_dataset( return [x["audio"]["array"] for x in ds] -@pytest.mark.parametrize("model_descr", get_whisper_models_list()) -@pytest.mark.parametrize("dataset_id", ["hf-internal-testing/librispeech_asr_dummy"]) -@pytest.mark.precommit -def test_whisper_on_hf_dataset(model_descr, dataset_id): - model_id, path, opt_pipe, genai_pipe = read_whisper_model(model_descr) +def run_pipeline_with_ref( + model_id: str, + tmp_path: str, + sample: np.ndarray | 
+    generation_config: ov_genai.WhisperGenerationConfig | None = None,
+    streamer: typing.Callable[[str], bool] | None = None,
+):
+    _, _, hf_pipe, genai_pipe = read_whisper_model((model_id, tmp_path))
+
+    if type(sample) is np.ndarray and len(sample.shape) == 1:
+        sample = np.expand_dims(sample, 0)
+
+    for _sample in sample:
+        genai_result = run_genai(genai_pipe, _sample, generation_config, streamer)
+        hf_result = run_huggingface(hf_pipe, _sample, generation_config)
+
+        compare_results(hf_result, genai_result)
+
 
-    compare_genai_and_opt_pipelines(opt_pipe, genai_pipe, dataset_id)
+def compare_results(hf_result, genai_result):
+    assert genai_result.texts[0] == hf_result["text"]
+
+    # transformers 4.47 updated the return_timestamps implementation;
+    # remove once the genai implementation is aligned with transformers. Ticket 160205.
+    transformers_version_greater_4_47 = parse(
+        metadata.version("transformers")
+    ) >= parse("4.47.0")
+
+    if transformers_version_greater_4_47:
+        return
+
+    if "chunks" not in hf_result and genai_result.chunks is None:
+        return
+
+    assert len(genai_result.chunks) == len(hf_result["chunks"])
+
+    for opt_chunk, genai_chunk in zip(hf_result["chunks"], genai_result.chunks):
+        assert opt_chunk["text"] == genai_chunk.text
+        assert opt_chunk["timestamp"][0] == round(genai_chunk.start_ts, 2)
+        if opt_chunk["timestamp"][1]:
+            assert opt_chunk["timestamp"][1] == round(genai_chunk.end_ts, 2)
+        else:
+            assert opt_chunk["timestamp"][1] is None
+            assert round(genai_chunk.end_ts, 2) == -1.0
 
 
 @pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True))
@@ -182,16 +224,11 @@ def test_whisper_on_hf_dataset(model_descr, dataset_id):
 )
 @pytest.mark.precommit
 def test_smoke(model_descr, test_sample):
-    model_id, path, opt_pipe, pipe = read_whisper_model(model_descr)
-
-    expected = opt_pipe(test_sample)
-
-    genai_result = pipe.generate(test_sample)
-
-    assert genai_result.texts[0] == expected["text"]
-
-    assert "chunks" not in expected
-    assert genai_result.chunks == None
+    run_pipeline_with_ref(
+        model_id=model_descr[0],
+        tmp_path=model_descr[1],
+        sample=test_sample,
+    )
 
 
 @pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True))
@@ -259,79 +296,55 @@ def test_whisper_constructors(model_descr, test_sample):
 def test_max_new_tokens(model_descr, test_sample):
     model_id, path, opt_pipe, pipe = read_whisper_model(model_descr)
 
-    expected = opt_pipe(test_sample, max_new_tokens=10)["text"]
+    expected = opt_pipe(test_sample, max_new_tokens=10)
 
     genai_result = pipe.generate(test_sample, max_new_tokens=10)
 
-    assert genai_result.texts[0] == expected
-
-    genai_result = pipe.generate(test_sample)
-
-    assert genai_result.texts[0] != expected
+    compare_results(expected, genai_result)
 
     config = pipe.get_generation_config()
     config.max_new_tokens = 10
     genai_result = pipe.generate(test_sample, config)
 
-    assert genai_result.texts[0] == expected
+    compare_results(expected, genai_result)
 
 
 @pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True))
 @pytest.mark.parametrize(
-    "test_sample", get_samples_from_dataset(language="fr", length=3)
+    "test_samples",
+    [
+        (get_samples_from_dataset(language="fr", length=1), "fr"),
+        (get_samples_from_dataset(language="de", length=1), "de"),
+    ],
 )
 @pytest.mark.precommit
-def test_language_mode_fr(model_descr, test_sample):
-    model_id, path = model_descr
+def test_language_mode(model_descr, test_samples):
     model_id, path, opt_pipe, pipe = read_whisper_model(model_descr)
+    samples, language = test_samples
     expected = opt_pipe(
-        test_sample, max_new_tokens=30, generate_kwargs={"language": "fr"}
+        samples[0], max_new_tokens=30, generate_kwargs={"language": language}
     )
 
-    genai_result = pipe.generate(test_sample, max_new_tokens=30, language="<|fr|>")
-
-    assert genai_result.texts[0] == expected["text"]
-
-    config = pipe.get_generation_config()
-    config.max_new_tokens = 30
-    config.language = "<|fr|>"
-    genai_result = pipe.generate(test_sample, config)
-
-    assert genai_result.texts[0] == expected["text"]
-
-
-@pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True))
-@pytest.mark.parametrize(
-    "test_sample", get_samples_from_dataset(language="de", length=3)
-)
-@pytest.mark.precommit
-def test_language_mode_de(model_descr, test_sample):
-    model_id, path = model_descr
-    model_id, path, opt_pipe, pipe = read_whisper_model(model_descr)
-
-    expected = opt_pipe(
-        test_sample, max_new_tokens=30, generate_kwargs={"language": "de"}
+    genai_result = pipe.generate(
+        samples[0], max_new_tokens=30, language=f"<|{language}|>"
     )
 
-    genai_result = pipe.generate(test_sample, max_new_tokens=30, language="<|de|>")
-
-    assert genai_result.texts[0] == expected["text"]
+    compare_results(expected, genai_result)
 
     config = pipe.get_generation_config()
     config.max_new_tokens = 30
-    config.language = "<|de|>"
-    genai_result = pipe.generate(test_sample, config)
+    config.language = f"<|{language}|>"
+    genai_result = pipe.generate(samples[0], config)
 
-    assert genai_result.texts[0] == expected["text"]
+    compare_results(expected, genai_result)
 
 
 @pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True))
 @pytest.mark.parametrize(
-    "test_sample", get_samples_from_dataset(language="fr", length=3)
+    "test_sample", get_samples_from_dataset(language="fr", length=1)
 )
 @pytest.mark.precommit
 def test_task_mode(model_descr, test_sample):
-    model_id, path = model_descr
     model_id, path, opt_pipe, pipe = read_whisper_model(model_descr)
 
     expected = opt_pipe(
@@ -344,7 +357,7 @@ def test_task_mode(model_descr, test_sample):
         test_sample, max_new_tokens=30, language="<|fr|>", task="translate"
     )
 
-    assert genai_result.texts[0] == expected["text"]
+    compare_results(expected, genai_result)
 
     config = pipe.get_generation_config()
     config.max_new_tokens = 30
@@ -352,27 +365,7 @@ def test_task_mode(model_descr, test_sample):
     config.task = "translate"
     genai_result = pipe.generate(test_sample, config)
 
-    assert genai_result.texts[0] == expected["text"]
-
-    expected = opt_pipe(
-        test_sample,
-        max_new_tokens=30,
-        generate_kwargs={"language": "ru", "task": "translate"},
-    )
-
-    genai_result = pipe.generate(
-        test_sample, max_new_tokens=30, language="<|ru|>", task="translate"
-    )
-
-    assert genai_result.texts[0] == expected["text"]
-
-    config = pipe.get_generation_config()
-    config.max_new_tokens = 30
-    config.language = "<|ru|>"
-    config.task = "translate"
-    genai_result = pipe.generate(test_sample, config)
-
-    assert genai_result.texts[0] == expected["text"]
+    compare_results(expected, genai_result)
 
     # seems to be equivalent to translate task
     expected = opt_pipe(
@@ -385,7 +378,7 @@ def test_task_mode(model_descr, test_sample):
         test_sample, max_new_tokens=30, language="<|en|>", task="transcribe"
    )
 
-    assert genai_result.texts[0] == expected["text"]
+    compare_results(expected, genai_result)
 
     config = pipe.get_generation_config()
     config.max_new_tokens = 30
@@ -393,21 +386,20 @@ def test_task_mode(model_descr, test_sample):
     config.task = "transcribe"
     genai_result = pipe.generate(test_sample, config)
 
genai_result.texts[0] == expected["text"] + compare_results(expected, genai_result) @pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True)) @pytest.mark.parametrize( "test_sample", [ - *get_samples_from_dataset(language="fr", length=2), - *get_samples_from_dataset(language="de", length=2), - *get_samples_from_dataset(language="es", length=2), + *get_samples_from_dataset(language="fr", length=1), + *get_samples_from_dataset(language="de", length=1), + *get_samples_from_dataset(language="es", length=1), ], ) @pytest.mark.precommit def test_language_autodetect(model_descr, test_sample): - model_id, path = model_descr model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) input_features = opt_pipe.feature_extractor(test_sample) @@ -415,188 +407,84 @@ def test_language_autodetect(model_descr, test_sample): # ensure detected language us not english assert language_id != pipe.get_generation_config().lang_to_id["<|en|>"] - expected = opt_pipe( - test_sample, - max_new_tokens=30, + run_pipeline_with_ref( + model_id=model_descr[0], + tmp_path=model_descr[1], + sample=test_sample, + generation_config=ov_genai.WhisperGenerationConfig(max_new_tokens=30), ) - genai_result = pipe.generate(test_sample, max_new_tokens=30) - - assert genai_result.texts[0] == expected["text"] - @pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True)) -@pytest.mark.parametrize( - "test_sample", - [ - *get_samples_from_dataset(language="en", length=10, long_form=True), - ], -) +@pytest.mark.parametrize("test_sample", get_samples_from_dataset(length=1)) @pytest.mark.precommit def test_return_timestamps_short_form(model_descr, test_sample): - model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) - # long form audio not supported yet - test_sample = test_sample[: 16000 * 30] - - expected = opt_pipe( - test_sample, - return_timestamps=True, - ) - - genai_result = pipe.generate( - test_sample.tolist(), - return_timestamps=True, + run_pipeline_with_ref( + model_id=model_descr[0], + tmp_path=model_descr[1], + sample=test_sample, + generation_config=ov_genai.WhisperGenerationConfig(return_timestamps=True), ) - assert genai_result.texts[0] == expected["text"] - - assert len(genai_result.chunks) == len(expected["chunks"]) - - for opt_chunk, genai_chunk in zip(expected["chunks"], genai_result.chunks): - assert opt_chunk["text"] == genai_chunk.text - assert opt_chunk["timestamp"][0] == round(genai_chunk.start_ts, 2) - assert opt_chunk["timestamp"][1] == round(genai_chunk.end_ts, 2) - @pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True)) -@pytest.mark.parametrize( - "test_sample", - [ - *get_samples_from_dataset(language="en", length=10, long_form=True), - ], -) +@pytest.mark.parametrize("test_sample", get_samples_from_dataset(length=1)) @pytest.mark.precommit def test_return_timestamps_max_new_tokens_short_form(model_descr, test_sample): - model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) - # long form audio not supported yet - test_sample = test_sample[: 16000 * 30] - - expected = opt_pipe( - test_sample, - return_timestamps=True, - max_new_tokens=15, - generate_kwargs={"language": "en"}, - ) - - genai_result = pipe.generate( - test_sample.tolist(), - max_new_tokens=15, - return_timestamps=True, - language="<|en|>", + run_pipeline_with_ref( + model_id=model_descr[0], + tmp_path=model_descr[1], + sample=test_sample, + generation_config=ov_genai.WhisperGenerationConfig( + return_timestamps=True, language="en", max_new_tokens=30 + 
+        ),
     )
 
-    assert genai_result.texts[0] == expected["text"]
-
-    assert len(genai_result.chunks) == len(expected["chunks"])
-    for opt_chunk, genai_chunk in zip(expected["chunks"], genai_result.chunks):
-        assert opt_chunk["text"] == genai_chunk.text
-        assert opt_chunk["timestamp"][0] == round(genai_chunk.start_ts, 2)
-        if opt_chunk["timestamp"][1]:
-            assert opt_chunk["timestamp"][1] == round(genai_chunk.end_ts, 2)
-        else:
-            assert opt_chunk["timestamp"][1] == None
-            assert round(genai_chunk.end_ts, 2) == -1.0
-
-@pytest.mark.parametrize("model_descr", get_whisper_models_list(multilingual=True))
+@pytest.mark.parametrize("model_descr", get_whisper_models_list())
 @pytest.mark.parametrize(
-    "test_sample",
-    [
-        *get_samples_from_dataset(language="en", length=10, long_form=True),
-        *get_samples_from_dataset(language="fr", length=10, long_form=True),
-    ],
+    "test_sample", get_samples_from_dataset(length=10, long_form=True)
 )
 @pytest.mark.precommit
-def test_longform_audio_return_timestamps_multilingual(model_descr, test_sample):
-    model_id, path, opt_pipe, pipe = read_whisper_model(model_descr)
-
-    expected = opt_pipe(
-        test_sample,
-        return_timestamps=True,
-    )
+def test_longform_audio(model_descr, test_sample):
+    _, _, hf_pipe, genai_pipe = read_whisper_model(model_descr)
 
     streamer_result = []
 
-    genai_result = pipe.generate(
+    genai_result = run_genai(
+        genai_pipe,
         test_sample,
-        return_timestamps=True,
+        config=ov_genai.WhisperGenerationConfig(return_timestamps=True),
         streamer=lambda x: streamer_result.append(x),
     )
 
-    assert genai_result.texts[0] == expected["text"]
-    assert "".join(streamer_result) == expected["text"]
-
-    assert len(genai_result.chunks) == len(expected["chunks"])
-
-    for opt_chunk, genai_chunk in zip(expected["chunks"], genai_result.chunks):
-        assert opt_chunk["text"] == genai_chunk.text
-        assert opt_chunk["timestamp"][0] == round(genai_chunk.start_ts, 2)
-        if opt_chunk["timestamp"][1]:
-            assert opt_chunk["timestamp"][1] == round(genai_chunk.end_ts, 2)
-        else:
-            assert opt_chunk["timestamp"][1] == None
-            assert round(genai_chunk.end_ts, 2) == -1.0
-
-
-@pytest.mark.parametrize("model_descr", get_whisper_models_list(en_only=True))
-@pytest.mark.parametrize(
-    "test_sample",
-    [
-        *get_samples_from_dataset(language="en", length=10, long_form=True),
-    ],
-)
-@pytest.mark.precommit
-def test_longform_audio_return_timestamps_en(model_descr, test_sample):
-    model_id, path, opt_pipe, pipe = read_whisper_model(model_descr)
-
-    expected = opt_pipe(
-        test_sample,
-        return_timestamps=True,
-    )
-
-    streamer_result = []
-
-    genai_result = pipe.generate(
+    hf_result = run_huggingface(
+        hf_pipe,
         test_sample,
-        return_timestamps=True,
-        streamer=lambda x: streamer_result.append(x),
+        config=ov_genai.WhisperGenerationConfig(return_timestamps=True),
     )
 
-    assert genai_result.texts[0] == expected["text"]
-    assert "".join(streamer_result) == expected["text"]
-
-    assert len(genai_result.chunks) == len(expected["chunks"])
+    compare_results(hf_result, genai_result)
 
-    for opt_chunk, genai_chunk in zip(expected["chunks"], genai_result.chunks):
-        assert opt_chunk["text"] == genai_chunk.text
-        assert opt_chunk["timestamp"][0] == round(genai_chunk.start_ts, 2)
-        if opt_chunk["timestamp"][1]:
-            assert opt_chunk["timestamp"][1] == round(genai_chunk.end_ts, 2)
-        else:
-            assert opt_chunk["timestamp"][1] == None
-            assert round(genai_chunk.end_ts, 2) == -1.0
+    assert "".join(streamer_result) == hf_result["text"]
 
 
-@pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True))
-@pytest.mark.parametrize(
-    "test_sample",
-    [
-        *get_samples_from_dataset(language="en", length=3, long_form=True),
-        *get_samples_from_dataset(language="sp", length=3, long_form=True),
-    ],
-)
+@pytest.mark.parametrize("model_descr", get_whisper_models_list())
 @pytest.mark.precommit
-def test_longform_audio(model_descr, test_sample):
-    model_id, path, opt_pipe, pipe = read_whisper_model(model_descr)
-
-    expected = opt_pipe(test_sample, return_timestamps=True)
-
-    genai_result = pipe.generate(test_sample)
+def test_shortform(model_descr):
+    samples = []
+    ds = datasets.load_dataset(
+        "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation"
+    )
 
-    assert genai_result.texts[0] == expected["text"]
+    for ds_row in ds:
+        samples.append(ds_row["audio"]["array"])
 
-    assert genai_result.chunks == None
+    run_pipeline_with_ref(
+        model_id=model_descr[0],
+        tmp_path=model_descr[1],
+        sample=samples,
+    )
 
 
 @pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True))