From 2c6d67e039a22e32bc43b53533c3f5b27929eea6 Mon Sep 17 00:00:00 2001
From: Alexander Suvorov
Date: Thu, 9 Jan 2025 06:21:51 +0100
Subject: [PATCH] Whisper pipeline: refactor tests, disable `return_timestamps` check (#1496)

Ticket: 160055

---------

Co-authored-by: Ilya Lavrenov
---
 .github/workflows/windows.yml               |   6 +
 samples/export-requirements.txt             |   2 +-
 tests/python_tests/requirements.txt         |   2 +-
 tests/python_tests/test_whisper_pipeline.py | 434 ++++++++------------
 4 files changed, 169 insertions(+), 275 deletions(-)

diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
index 95a713d7a1..8f43af44ae 100644
--- a/.github/workflows/windows.yml
+++ b/.github/workflows/windows.yml
@@ -310,6 +310,12 @@ jobs:
         . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
         python -m pip install . --verbose --find-links ${env:OV_INSTALL_DIR}/wheels
         python -m pip install ./tools/who_what_benchmark --find-links ${env:OV_INSTALL_DIR}/wheels
+
+        # Installs a pinned optimum-intel commit that brings in transformers 4.46.3,
+        # which enables the return_timestamps tests.
+        # This check is enabled on Windows only. Ticket: 160205.
+        python -m pip install git+https://github.com/huggingface/optimum-intel.git@753f84db6e0966580eb9eaa74a808213be730631
+
         python -m pytest -v ./tests/python_tests/test_whisper_pipeline.py -k "not test_smoke"
 
   genai_python_lib_vlm:
diff --git a/samples/export-requirements.txt b/samples/export-requirements.txt
index 2f71891b7b..af38558656 100644
--- a/samples/export-requirements.txt
+++ b/samples/export-requirements.txt
@@ -2,7 +2,7 @@
 --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release
 --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
 openvino-tokenizers~=2025.0.0.0.dev
-optimum-intel @ git+https://github.com/huggingface/optimum-intel.git@753f84db6e0966580eb9eaa74a808213be730631
+optimum-intel @ git+https://github.com/huggingface/optimum-intel.git
 numpy<2.0.0; sys_platform == 'darwin'
 einops==0.8.0 # For Qwen
 transformers_stream_generator==0.0.5 # For Qwen
diff --git a/tests/python_tests/requirements.txt b/tests/python_tests/requirements.txt
index e23eaacc21..c851c71ee5 100644
--- a/tests/python_tests/requirements.txt
+++ b/tests/python_tests/requirements.txt
@@ -1,6 +1,6 @@
 --extra-index-url https://download.pytorch.org/whl/cpu
 diffusers==0.32.1
-optimum-intel @ git+https://github.com/huggingface/optimum-intel.git@753f84db6e0966580eb9eaa74a808213be730631
+optimum-intel @ git+https://github.com/huggingface/optimum-intel.git
 numpy<2.0.0; platform_system == "Darwin" and platform_machine == "x86_64"
 onnx==1.17.0
 pytest
diff --git a/tests/python_tests/test_whisper_pipeline.py b/tests/python_tests/test_whisper_pipeline.py
index aa78666e32..c046d1ae2c 100644
--- a/tests/python_tests/test_whisper_pipeline.py
+++ b/tests/python_tests/test_whisper_pipeline.py
@@ -11,11 +11,13 @@ from optimum.intel.openvino import OVModelForSpeechSeq2Seq
 import gc
 import json
-import time
 import typing
 import numpy as np
 import os
 import pathlib
+import importlib.metadata as metadata
+from packaging.version import parse
+
 
 @pytest.fixture(scope="class", autouse=True)
 def run_gc_after_test():
@@ -27,36 +29,29 @@ def run_gc_after_test():
     gc.collect()
 
 
-def get_whisper_models_list(tiny_only=False, multilingual=False, en_only=False):
-    precommit_models = [
+def get_whisper_models_list(tiny_only=False):
+    model_ids = [
         "openai/whisper-tiny",
-        "openai/whisper-tiny.en",
         "distil-whisper/distil-small.en",
     ]
 
-    if multilingual:
-        precommit_models = ["openai/whisper-tiny"]
-    if en_only:
precommit_models = ["openai/whisper-tiny.en", "distil-whisper/distil-small.en"] - if tiny_only: - precommit_models = ["openai/whisper-tiny"] - - nightly_models = [] - if pytest.run_marker == "precommit": - model_ids = precommit_models - else: - model_ids = nightly_models + if tiny_only: + model_ids = ["openai/whisper-tiny"] if pytest.selected_model_ids: - model_ids = [model_id for model_id in model_ids if model_id in pytest.selected_model_ids.split(' ')] + model_ids = [ + model_id + for model_id in model_ids + if model_id in pytest.selected_model_ids.split(" ") + ] - prefix = pathlib.Path(os.getenv('GENAI_MODELS_PATH_PREFIX', '')) - return [(model_id, prefix / model_id.split('/')[1]) for model_id in model_ids] + prefix = pathlib.Path(os.getenv("GENAI_MODELS_PATH_PREFIX", "")) + return [(model_id, prefix / model_id.split("/")[1]) for model_id in model_ids] # used whisper models are relatively small # cache them in memory to speedup tests -@functools.lru_cache(3) +@functools.lru_cache() def read_whisper_model(params, **tokenizer_kwargs): model_id, path = params @@ -90,6 +85,7 @@ def read_whisper_model(params, **tokenizer_kwargs): model_id, export=True, trust_remote_code=True, + stateful=False, compile=False, device="CPU", load_in_8bit=False, @@ -114,30 +110,39 @@ def read_whisper_model(params, **tokenizer_kwargs): ) -def compare_genai_and_opt_pipelines(opt_pipe, genai_pipe, dataset_id): - ds = datasets.load_dataset(dataset_id, "clean", split="validation") - opt_infer_time = 0 - genai_infer_time = 0 - - for ds_row in ds: - audio_sample = ds_row["audio"] +def run_huggingface( + pipeline, + sample, + config: ov_genai.WhisperGenerationConfig | None = None, +): + if not config: + config = ov_genai.WhisperGenerationConfig() + + return pipeline( + sample, + max_new_tokens=min(config.max_new_tokens, 444), + return_timestamps=config.return_timestamps, + generate_kwargs={"language": config.language, "task": config.task}, + ) - streamer_result = [] - start = time.time() - genai_result = genai_pipe.generate( - audio_sample["array"].tolist(), streamer=lambda x: streamer_result.append(x) - ) - genai_infer_time += time.time() - start +def run_genai( + pipeline: ov_genai.WhisperPipeline, + sample, + config: ov_genai.WhisperGenerationConfig | None = None, + streamer: typing.Callable[[str], bool] | None = None, +): + if not config: + config = ov_genai.WhisperGenerationConfig() - start = time.time() - result = opt_pipe(audio_sample) - opt_infer_time += time.time() - start + genai_config = pipeline.get_generation_config() - assert genai_result.texts[0] == result["text"] - assert "".join(streamer_result) == result["text"] + genai_config.max_new_tokens = config.max_new_tokens + genai_config.return_timestamps = config.return_timestamps + genai_config.task = config.task + genai_config.language = f"<|{config.language}|>" if config.language else None - print(f"Inference time\nOpt: {opt_infer_time}\nGenAI: {genai_infer_time}") + return pipeline.generate(sample, genai_config, streamer=streamer) def get_samples_from_dataset( @@ -166,13 +171,50 @@ def get_samples_from_dataset( return [x["audio"]["array"] for x in ds] -@pytest.mark.parametrize("model_descr", get_whisper_models_list()) -@pytest.mark.parametrize("dataset_id", ["hf-internal-testing/librispeech_asr_dummy"]) -@pytest.mark.precommit -def test_whisper_on_hf_dataset(model_descr, dataset_id): - model_id, path, opt_pipe, genai_pipe = read_whisper_model(model_descr) +def run_pipeline_with_ref( + model_id: str, + tmp_path: str, + sample: np.ndarray | 
+    generation_config: ov_genai.WhisperGenerationConfig | None = None,
+    streamer: typing.Callable[[str], bool] | None = None,
+):
+    _, _, hf_pipe, genai_pipe = read_whisper_model((model_id, tmp_path))
+
+    if type(sample) is np.ndarray and len(sample.shape) == 1:
+        sample = np.expand_dims(sample, 0)
+
+    for _sample in sample:
+        genai_result = run_genai(genai_pipe, _sample, generation_config, streamer)
+        hf_result = run_huggingface(hf_pipe, _sample, generation_config)
+
+        compare_results(hf_result, genai_result)
+
 
-    compare_genai_and_opt_pipelines(opt_pipe, genai_pipe, dataset_id)
+def compare_results(hf_result, genai_result):
+    assert genai_result.texts[0] == hf_result["text"]
+
+    # transformers 4.47 updated the return_timestamps implementation;
+    # remove once the genai implementation is aligned with transformers. Ticket 160205.
+    transformers_version_greater_4_47 = parse(
+        metadata.version("transformers")
+    ) >= parse("4.47.0")
+
+    if transformers_version_greater_4_47:
+        return
+
+    if "chunks" not in hf_result and genai_result.chunks is None:
+        return
+
+    assert len(genai_result.chunks) == len(hf_result["chunks"])
+
+    for opt_chunk, genai_chunk in zip(hf_result["chunks"], genai_result.chunks):
+        assert opt_chunk["text"] == genai_chunk.text
+        assert opt_chunk["timestamp"][0] == round(genai_chunk.start_ts, 2)
+        if opt_chunk["timestamp"][1]:
+            assert opt_chunk["timestamp"][1] == round(genai_chunk.end_ts, 2)
+        else:
+            assert opt_chunk["timestamp"][1] is None
+            assert round(genai_chunk.end_ts, 2) == -1.0
 
 
 @pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True))
@@ -182,16 +224,11 @@ def test_whisper_on_hf_dataset(model_descr, dataset_id):
 )
 @pytest.mark.precommit
 def test_smoke(model_descr, test_sample):
-    model_id, path, opt_pipe, pipe = read_whisper_model(model_descr)
-
-    expected = opt_pipe(test_sample)
-
-    genai_result = pipe.generate(test_sample)
-
-    assert genai_result.texts[0] == expected["text"]
-
-    assert "chunks" not in expected
-    assert genai_result.chunks == None
+    run_pipeline_with_ref(
+        model_id=model_descr[0],
+        tmp_path=model_descr[1],
+        sample=test_sample,
+    )
 
 
 @pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True))
@@ -259,79 +296,55 @@ def test_whisper_constructors(model_descr, test_sample):
 def test_max_new_tokens(model_descr, test_sample):
     model_id, path, opt_pipe, pipe = read_whisper_model(model_descr)
 
-    expected = opt_pipe(test_sample, max_new_tokens=10)["text"]
+    expected = opt_pipe(test_sample, max_new_tokens=10)
 
     genai_result = pipe.generate(test_sample, max_new_tokens=10)
 
-    assert genai_result.texts[0] == expected
-
-    genai_result = pipe.generate(test_sample)
-
-    assert genai_result.texts[0] != expected
+    compare_results(expected, genai_result)
 
     config = pipe.get_generation_config()
     config.max_new_tokens = 10
     genai_result = pipe.generate(test_sample, config)
 
-    assert genai_result.texts[0] == expected
+    compare_results(expected, genai_result)
 
 
 @pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True))
 @pytest.mark.parametrize(
-    "test_sample", get_samples_from_dataset(language="fr", length=3)
+    "test_samples",
+    [
+        (get_samples_from_dataset(language="fr", length=1), "fr"),
+        (get_samples_from_dataset(language="de", length=1), "de"),
+    ],
 )
 @pytest.mark.precommit
-def test_language_mode_fr(model_descr, test_sample):
-    model_id, path = model_descr
+def test_language_mode(model_descr, test_samples):
     model_id, path, opt_pipe, pipe = read_whisper_model(model_descr)
+    samples, language = test_samples
     expected = opt_pipe(
-        test_sample, max_new_tokens=30, generate_kwargs={"language": "fr"}
+        samples[0], max_new_tokens=30, generate_kwargs={"language": language}
     )
 
-    genai_result = pipe.generate(test_sample, max_new_tokens=30, language="<|fr|>")
-
-    assert genai_result.texts[0] == expected["text"]
-
-    config = pipe.get_generation_config()
-    config.max_new_tokens = 30
-    config.language = "<|fr|>"
-    genai_result = pipe.generate(test_sample, config)
-
-    assert genai_result.texts[0] == expected["text"]
-
-
-@pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True))
-@pytest.mark.parametrize(
-    "test_sample", get_samples_from_dataset(language="de", length=3)
-)
-@pytest.mark.precommit
-def test_language_mode_de(model_descr, test_sample):
-    model_id, path = model_descr
-    model_id, path, opt_pipe, pipe = read_whisper_model(model_descr)
-
-    expected = opt_pipe(
-        test_sample, max_new_tokens=30, generate_kwargs={"language": "de"}
+    genai_result = pipe.generate(
+        samples[0], max_new_tokens=30, language=f"<|{language}|>"
     )
 
-    genai_result = pipe.generate(test_sample, max_new_tokens=30, language="<|de|>")
-
-    assert genai_result.texts[0] == expected["text"]
+    compare_results(expected, genai_result)
 
     config = pipe.get_generation_config()
     config.max_new_tokens = 30
-    config.language = "<|de|>"
-    genai_result = pipe.generate(test_sample, config)
+    config.language = f"<|{language}|>"
+    genai_result = pipe.generate(samples[0], config)
 
-    assert genai_result.texts[0] == expected["text"]
+    compare_results(expected, genai_result)
 
 
 @pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True))
 @pytest.mark.parametrize(
-    "test_sample", get_samples_from_dataset(language="fr", length=3)
+    "test_sample", get_samples_from_dataset(language="fr", length=1)
 )
 @pytest.mark.precommit
 def test_task_mode(model_descr, test_sample):
-    model_id, path = model_descr
     model_id, path, opt_pipe, pipe = read_whisper_model(model_descr)
 
     expected = opt_pipe(
@@ -344,7 +357,7 @@ def test_task_mode(model_descr, test_sample):
         test_sample, max_new_tokens=30, language="<|fr|>", task="translate"
     )
 
-    assert genai_result.texts[0] == expected["text"]
+    compare_results(expected, genai_result)
 
     config = pipe.get_generation_config()
     config.max_new_tokens = 30
@@ -352,27 +365,7 @@ def test_task_mode(model_descr, test_sample):
     config.task = "translate"
     genai_result = pipe.generate(test_sample, config)
 
-    assert genai_result.texts[0] == expected["text"]
-
-    expected = opt_pipe(
-        test_sample,
-        max_new_tokens=30,
-        generate_kwargs={"language": "ru", "task": "translate"},
-    )
-
-    genai_result = pipe.generate(
-        test_sample, max_new_tokens=30, language="<|ru|>", task="translate"
-    )
-
-    assert genai_result.texts[0] == expected["text"]
-
-    config = pipe.get_generation_config()
-    config.max_new_tokens = 30
-    config.language = "<|ru|>"
-    config.task = "translate"
-    genai_result = pipe.generate(test_sample, config)
-
-    assert genai_result.texts[0] == expected["text"]
+    compare_results(expected, genai_result)
 
     # seems to be equivalent to translate task
     expected = opt_pipe(
@@ -385,7 +378,7 @@ def test_task_mode(model_descr, test_sample):
         test_sample, max_new_tokens=30, language="<|en|>", task="transcribe"
    )
 
-    assert genai_result.texts[0] == expected["text"]
+    compare_results(expected, genai_result)
 
     config = pipe.get_generation_config()
     config.max_new_tokens = 30
@@ -393,21 +386,20 @@ def test_task_mode(model_descr, test_sample):
     config.task = "transcribe"
     genai_result = pipe.generate(test_sample, config)
 
genai_result.texts[0] == expected["text"] + compare_results(expected, genai_result) @pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True)) @pytest.mark.parametrize( "test_sample", [ - *get_samples_from_dataset(language="fr", length=2), - *get_samples_from_dataset(language="de", length=2), - *get_samples_from_dataset(language="es", length=2), + *get_samples_from_dataset(language="fr", length=1), + *get_samples_from_dataset(language="de", length=1), + *get_samples_from_dataset(language="es", length=1), ], ) @pytest.mark.precommit def test_language_autodetect(model_descr, test_sample): - model_id, path = model_descr model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) input_features = opt_pipe.feature_extractor(test_sample) @@ -415,188 +407,84 @@ def test_language_autodetect(model_descr, test_sample): # ensure detected language us not english assert language_id != pipe.get_generation_config().lang_to_id["<|en|>"] - expected = opt_pipe( - test_sample, - max_new_tokens=30, + run_pipeline_with_ref( + model_id=model_descr[0], + tmp_path=model_descr[1], + sample=test_sample, + generation_config=ov_genai.WhisperGenerationConfig(max_new_tokens=30), ) - genai_result = pipe.generate(test_sample, max_new_tokens=30) - - assert genai_result.texts[0] == expected["text"] - @pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True)) -@pytest.mark.parametrize( - "test_sample", - [ - *get_samples_from_dataset(language="en", length=10, long_form=True), - ], -) +@pytest.mark.parametrize("test_sample", get_samples_from_dataset(length=1)) @pytest.mark.precommit def test_return_timestamps_short_form(model_descr, test_sample): - model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) - # long form audio not supported yet - test_sample = test_sample[: 16000 * 30] - - expected = opt_pipe( - test_sample, - return_timestamps=True, - ) - - genai_result = pipe.generate( - test_sample.tolist(), - return_timestamps=True, + run_pipeline_with_ref( + model_id=model_descr[0], + tmp_path=model_descr[1], + sample=test_sample, + generation_config=ov_genai.WhisperGenerationConfig(return_timestamps=True), ) - assert genai_result.texts[0] == expected["text"] - - assert len(genai_result.chunks) == len(expected["chunks"]) - - for opt_chunk, genai_chunk in zip(expected["chunks"], genai_result.chunks): - assert opt_chunk["text"] == genai_chunk.text - assert opt_chunk["timestamp"][0] == round(genai_chunk.start_ts, 2) - assert opt_chunk["timestamp"][1] == round(genai_chunk.end_ts, 2) - @pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True)) -@pytest.mark.parametrize( - "test_sample", - [ - *get_samples_from_dataset(language="en", length=10, long_form=True), - ], -) +@pytest.mark.parametrize("test_sample", get_samples_from_dataset(length=1)) @pytest.mark.precommit def test_return_timestamps_max_new_tokens_short_form(model_descr, test_sample): - model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) - # long form audio not supported yet - test_sample = test_sample[: 16000 * 30] - - expected = opt_pipe( - test_sample, - return_timestamps=True, - max_new_tokens=15, - generate_kwargs={"language": "en"}, - ) - - genai_result = pipe.generate( - test_sample.tolist(), - max_new_tokens=15, - return_timestamps=True, - language="<|en|>", + run_pipeline_with_ref( + model_id=model_descr[0], + tmp_path=model_descr[1], + sample=test_sample, + generation_config=ov_genai.WhisperGenerationConfig( + return_timestamps=True, language="en", max_new_tokens=30 + 
+        ),
     )
 
-    assert genai_result.texts[0] == expected["text"]
-
-    assert len(genai_result.chunks) == len(expected["chunks"])
-    for opt_chunk, genai_chunk in zip(expected["chunks"], genai_result.chunks):
-        assert opt_chunk["text"] == genai_chunk.text
-        assert opt_chunk["timestamp"][0] == round(genai_chunk.start_ts, 2)
-        if opt_chunk["timestamp"][1]:
-            assert opt_chunk["timestamp"][1] == round(genai_chunk.end_ts, 2)
-        else:
-            assert opt_chunk["timestamp"][1] == None
-            assert round(genai_chunk.end_ts, 2) == -1.0
-
-@pytest.mark.parametrize("model_descr", get_whisper_models_list(multilingual=True))
+@pytest.mark.parametrize("model_descr", get_whisper_models_list())
 @pytest.mark.parametrize(
-    "test_sample",
-    [
-        *get_samples_from_dataset(language="en", length=10, long_form=True),
-        *get_samples_from_dataset(language="fr", length=10, long_form=True),
-    ],
+    "test_sample", get_samples_from_dataset(length=10, long_form=True)
 )
 @pytest.mark.precommit
-def test_longform_audio_return_timestamps_multilingual(model_descr, test_sample):
-    model_id, path, opt_pipe, pipe = read_whisper_model(model_descr)
-
-    expected = opt_pipe(
-        test_sample,
-        return_timestamps=True,
-    )
+def test_longform_audio(model_descr, test_sample):
+    _, _, hf_pipe, genai_pipe = read_whisper_model(model_descr)
 
     streamer_result = []
 
-    genai_result = pipe.generate(
+    genai_result = run_genai(
+        genai_pipe,
         test_sample,
-        return_timestamps=True,
+        config=ov_genai.WhisperGenerationConfig(return_timestamps=True),
         streamer=lambda x: streamer_result.append(x),
     )
 
-    assert genai_result.texts[0] == expected["text"]
-    assert "".join(streamer_result) == expected["text"]
-
-    assert len(genai_result.chunks) == len(expected["chunks"])
-
-    for opt_chunk, genai_chunk in zip(expected["chunks"], genai_result.chunks):
-        assert opt_chunk["text"] == genai_chunk.text
-        assert opt_chunk["timestamp"][0] == round(genai_chunk.start_ts, 2)
-        if opt_chunk["timestamp"][1]:
-            assert opt_chunk["timestamp"][1] == round(genai_chunk.end_ts, 2)
-        else:
-            assert opt_chunk["timestamp"][1] == None
-            assert round(genai_chunk.end_ts, 2) == -1.0
-
-
-@pytest.mark.parametrize("model_descr", get_whisper_models_list(en_only=True))
-@pytest.mark.parametrize(
-    "test_sample",
-    [
-        *get_samples_from_dataset(language="en", length=10, long_form=True),
-    ],
-)
-@pytest.mark.precommit
-def test_longform_audio_return_timestamps_en(model_descr, test_sample):
-    model_id, path, opt_pipe, pipe = read_whisper_model(model_descr)
-
-    expected = opt_pipe(
-        test_sample,
-        return_timestamps=True,
-    )
-
-    streamer_result = []
-
-    genai_result = pipe.generate(
+    hf_result = run_huggingface(
+        hf_pipe,
         test_sample,
-        return_timestamps=True,
-        streamer=lambda x: streamer_result.append(x),
+        config=ov_genai.WhisperGenerationConfig(return_timestamps=True),
     )
 
-    assert genai_result.texts[0] == expected["text"]
-    assert "".join(streamer_result) == expected["text"]
-
-    assert len(genai_result.chunks) == len(expected["chunks"])
+    compare_results(hf_result, genai_result)
 
-    for opt_chunk, genai_chunk in zip(expected["chunks"], genai_result.chunks):
-        assert opt_chunk["text"] == genai_chunk.text
-        assert opt_chunk["timestamp"][0] == round(genai_chunk.start_ts, 2)
-        if opt_chunk["timestamp"][1]:
-            assert opt_chunk["timestamp"][1] == round(genai_chunk.end_ts, 2)
-        else:
-            assert opt_chunk["timestamp"][1] == None
-            assert round(genai_chunk.end_ts, 2) == -1.0
+    assert "".join(streamer_result) == hf_result["text"]
 
 
-@pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True))
-@pytest.mark.parametrize(
-    "test_sample",
-    [
-        *get_samples_from_dataset(language="en", length=3, long_form=True),
-        *get_samples_from_dataset(language="sp", length=3, long_form=True),
-    ],
-)
+@pytest.mark.parametrize("model_descr", get_whisper_models_list())
 @pytest.mark.precommit
-def test_longform_audio(model_descr, test_sample):
-    model_id, path, opt_pipe, pipe = read_whisper_model(model_descr)
-
-    expected = opt_pipe(test_sample, return_timestamps=True)
-
-    genai_result = pipe.generate(test_sample)
+def test_shortform(model_descr):
+    samples = []
+    ds = datasets.load_dataset(
+        "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation"
+    )
 
-    assert genai_result.texts[0] == expected["text"]
+    for ds_row in ds:
+        samples.append(ds_row["audio"]["array"])
 
-    assert genai_result.chunks == None
+    run_pipeline_with_ref(
+        model_id=model_descr[0],
+        tmp_path=model_descr[1],
+        sample=samples,
+    )
 
 
 @pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True))