diff --git a/.github/labeler.yml b/.github/labeler.yml index f7015233a0..bf54ba756f 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -54,6 +54,9 @@ - 'src/cpp/include/openvino/genai/whisper_pipeline.hpp' - 'src/cpp/src/whisper/**/*' - 'src/cpp/src/whisper_generation_config.cpp' +- 'src/cpp/src/whisper_pipeline_base.hpp' +- 'src/cpp/src/whisper_pipeline_static.cpp' +- 'src/cpp/src/whisper_pipeline_static.hpp' - 'src/cpp/src/whisper_pipeline.cpp' - 'src/python/py_whisper_pipeline.cpp' - 'tests/python_tests/test_whisper_generate_api.py' diff --git a/tests/python_tests/test_whisper_pipeline_static.py b/tests/python_tests/test_whisper_pipeline_static.py index 2702b38eca..15470bed35 100644 --- a/tests/python_tests/test_whisper_pipeline_static.py +++ b/tests/python_tests/test_whisper_pipeline_static.py @@ -3,13 +3,15 @@ from ov_genai_test_utils import get_whisper_models_list from test_whisper_generate_api import get_samples_from_dataset -from transformers import WhisperProcessor, pipeline, AutoTokenizer +from transformers import WhisperProcessor, AutoTokenizer from optimum.intel.openvino import OVModelForSpeechSeq2Seq import openvino_genai as ov_genai import openvino_tokenizers import openvino import pytest +# This test suite is designed specifically to validate the functionality +# and robustness of the WhisperStaticPipeline on NPUW:CPU. config = {"NPU_USE_NPUW" : "YES", "NPUW_DEVICES" : "CPU", "NPUW_ONLINE_PIPELINE" : "NONE"} @@ -47,11 +49,23 @@ def load_and_save_whisper_model(params, **tokenizer_kwargs): opt_model.save_pretrained(path) processor.save_pretrained(path) +def get_results_cpu_npu(model_path, audio_sample, **config_kwargs): + cpu_pipe = ov_genai.WhisperPipeline(model_path, "CPU") + expected = cpu_pipe.generate(audio_sample, **config_kwargs) + + npu_pipe = ov_genai.WhisperPipeline(model_path, "NPU", **config) + actual_out = npu_pipe.generate(audio_sample, **config_kwargs) + + return expected, actual_out + def compare_results_with_assert(expected, actual_out): - if expected.texts[0] != actual_out.texts[0]: - print(f'expected: {expected.texts[0]}\n') - print(f'actual_out: {actual_out.texts[0]}') - assert expected.texts[0] == actual_out.texts[0] + assert len(expected.texts) == len(actual_out.texts) + + for i in range(0, len(expected.texts)): + if expected.texts[i] != actual_out.texts[i]: + print(f'expected: {expected.texts[i]}\n') + print(f'actual_out: {actual_out.texts[i]}') + assert expected.texts[i] == actual_out.texts[i] @pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True)) @@ -61,12 +75,7 @@ def test_static_whisper_generation_compare_with_cpu(model_descr, test_sample): model_id, model_path = model_descr load_and_save_whisper_model(model_descr) - cpu_pipe = ov_genai.WhisperPipeline(model_path, "CPU") - expected = cpu_pipe.generate(test_sample) - # expected = None - - npu_pipe = ov_genai.WhisperPipeline(model_path, "NPU", **config) - actual_out = npu_pipe.generate(test_sample) + expected, actual_out = get_results_cpu_npu(model_path, test_sample) compare_results_with_assert(expected, actual_out) @@ -74,20 +83,16 @@ def test_static_whisper_generation_compare_with_cpu(model_descr, test_sample): @pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True)) @pytest.mark.parametrize("test_sample", [ -# *get_samples_from_dataset(language="fr", length=2), # 1/2 failed + *get_samples_from_dataset(language="fr", length=1), *get_samples_from_dataset(language="de", length=2), -# *get_samples_from_dataset(language="es", length=2), # 1/2 failed + # *get_samples_from_dataset(language="es", length=2), # mismatch CPU/NPU pipelines ],) @pytest.mark.precommit def test_static_whisper_autodetect(model_descr, test_sample): model_id, model_path = model_descr load_and_save_whisper_model(model_descr) - cpu_pipe = ov_genai.WhisperPipeline(model_path, "CPU") - expected = cpu_pipe.generate(test_sample) - - npu_pipe = ov_genai.WhisperPipeline(model_path, "NPU", **config) - actual_out = npu_pipe.generate(test_sample) + expected, actual_out = get_results_cpu_npu(model_path, test_sample) compare_results_with_assert(expected, actual_out) @@ -101,11 +106,7 @@ def test_static_whisper_language_de(model_descr, test_sample): model_id, model_path = model_descr load_and_save_whisper_model(model_descr) - cpu_pipe = ov_genai.WhisperPipeline(model_path, "CPU") - expected = cpu_pipe.generate(test_sample, max_new_tokens=30, language="<|de|>") - - npu_pipe = ov_genai.WhisperPipeline(model_path, "NPU", **config) - actual_out = npu_pipe.generate(test_sample, max_new_tokens=30, language="<|de|>") + expected, actual_out = get_results_cpu_npu(model_path, test_sample, max_new_tokens=30, language="<|de|>") compare_results_with_assert(expected, actual_out) @@ -119,11 +120,7 @@ def test_static_whisper_language_fr(model_descr, test_sample): model_id, model_path = model_descr load_and_save_whisper_model(model_descr) - cpu_pipe = ov_genai.WhisperPipeline(model_path, "CPU") - expected = cpu_pipe.generate(test_sample, max_new_tokens=30, language="<|fr|>") - - npu_pipe = ov_genai.WhisperPipeline(model_path, "NPU", **config) - actual_out = npu_pipe.generate(test_sample, max_new_tokens=30, language="<|fr|>") + expected, actual_out = get_results_cpu_npu(model_path, test_sample, max_new_tokens=30, language="<|fr|>") compare_results_with_assert(expected, actual_out) @@ -137,10 +134,19 @@ def test_static_whisper_language_ru(model_descr, test_sample): model_id, model_path = model_descr load_and_save_whisper_model(model_descr) - cpu_pipe = ov_genai.WhisperPipeline(model_path, "CPU") - expected = cpu_pipe.generate(test_sample, max_new_tokens=30, language="<|ru|>") + expected, actual_out = get_results_cpu_npu(model_path, test_sample, max_new_tokens=30, language="<|ru|>") - npu_pipe = ov_genai.WhisperPipeline(model_path, "NPU", **config) - actual_out = npu_pipe.generate(test_sample, max_new_tokens=30, language="<|ru|>") + compare_results_with_assert(expected, actual_out) + + +@pytest.mark.skip(reason="Mismatches in output") +@pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True)) +@pytest.mark.parametrize("test_sample", get_samples_from_dataset(language="en", length=1, long_form=True)) +@pytest.mark.precommit +def test_static_whisper_generation_long(model_descr, test_sample): + model_id, model_path = model_descr + load_and_save_whisper_model(model_descr) + + expected, actual_out = get_results_cpu_npu(model_path, test_sample) compare_results_with_assert(expected, actual_out)