diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index ed847a7e3696b..32eed1a771718 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -338,7 +338,10 @@ steps:
   - tests/models/decoder_only/vision_language
   commands:
     - pytest -v -s models/decoder_only/audio_language
-    - pytest -v -s models/decoder_only/vision_language
+    # HACK - run phi3v tests separately to sidestep this transformers bug
+    # https://github.com/huggingface/transformers/issues/34307
+    - pytest -v -s models/decoder_only/vision_language/test_phi3v.py
+    - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language

- label: Other Models Test # 6min
  #mirror_hardwares: [amd]
@@ -413,7 +416,7 @@ steps:
     # Avoid importing model tests that cause CUDA reinitialization error
     - pytest models/encoder_decoder/language/test_bart.py -v -s -m distributed_2_gpus
     - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
-    - pytest models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
+    - pytest models/decoder_only/vision_language/test_models.py -v -s -m distributed_2_gpus
     - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
     - pip install -e ./plugins/vllm_add_dummy_model
     - pytest -v -s distributed/test_distributed_oot.py
diff --git a/tests/conftest.py b/tests/conftest.py
index 2fce2d772c6ed..bdc6ffb148602 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -259,8 +259,7 @@ def __init__(
         is_sentence_transformer: bool = False,
         skip_tokenizer_init: bool = False,
         auto_cls: Type[_BaseAutoModelClass] = AutoModelForCausalLM,
-        postprocess_inputs: Callable[[BatchEncoding],
-                                     BatchEncoding] = identity,
+        postprocess_inputs: Callable[..., BatchEncoding] = identity,
     ) -> None:
         torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]

@@ -303,6 +302,7 @@ def __init__(
         if skip_tokenizer_init:
             self.tokenizer = self.processor.tokenizer

+        self.dtype = dtype
         self.postprocess_inputs = postprocess_inputs

     def get_inputs(
@@ -337,7 +337,7 @@ def get_inputs(
                 processor_kwargs["sampling_rate"] = sr

             inputs = self.processor(**processor_kwargs)
-            inputs = self.postprocess_inputs(inputs)
+            inputs = self.postprocess_inputs(inputs, dtype=self.dtype)

             all_inputs.append(inputs)
diff --git a/tests/engine/test_short_mm_context.py b/tests/engine/test_short_mm_context.py
new file mode 100644
index 0000000000000..a6ba7a131c506
--- /dev/null
+++ b/tests/engine/test_short_mm_context.py
@@ -0,0 +1,29 @@
+import pytest
+
+from ..conftest import IMAGE_ASSETS
+
+HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
+    "stop_sign":
+    "USER: <image>\nWhat's the content of the image?\nASSISTANT:",
+    "cherry_blossom":
+    "USER: <image>\nWhat is the season?\nASSISTANT:",
+})
+
+models = ["llava-hf/llava-1.5-7b-hf"]
+
+
+@pytest.mark.parametrize("model", models)
+def test_context_length_too_short(vllm_runner, image_assets, model):
+    images = [asset.pil_image for asset in image_assets]
+
+    with pytest.raises(ValueError, match="too long to fit into the model"):
+        vllm_model = vllm_runner(
+            model,
+            max_model_len=128,  # LLaVA has a feature size of 576
+            enforce_eager=True,
+        )
+
+        with vllm_model:
+            vllm_model.generate_greedy([HF_IMAGE_PROMPTS[0]],
+                                       max_tokens=1,
+                                       images=[images[0]])
diff --git a/tests/models/decoder_only/audio_language/test_ultravox.py b/tests/models/decoder_only/audio_language/test_ultravox.py
index bfffd34d1142c..ad6c2d854d1f0 100644
--- a/tests/models/decoder_only/audio_language/test_ultravox.py
+++ b/tests/models/decoder_only/audio_language/test_ultravox.py
@@ -92,7 +92,7 @@ def run_test(
         for vllm_prompt, _, audio in prompts_and_audios
     ]

-    def process(hf_inputs: BatchEncoding):
+    def process(hf_inputs: BatchEncoding, **kwargs):
         hf_inputs["audio_values"] = hf_inputs["audio_values"] \
             .to(torch_dtype)  # type: ignore
         return hf_inputs
diff --git a/tests/models/decoder_only/language/test_qwen.py b/tests/models/decoder_only/language/test_qwen.py
new file mode 100644
index 0000000000000..128fe65afbb84
--- /dev/null
+++ b/tests/models/decoder_only/language/test_qwen.py
@@ -0,0 +1,34 @@
+"""Ensure that a text-only Qwen model can be run without throwing an error.
+We explicitly test this because Qwen is implemented as a multimodal model and
+supports a visual encoder for models like Qwen-VL.
+"""
+from typing import List, Type
+
+import pytest
+
+from ....conftest import VllmRunner
+
+models = [
+    "Qwen/Qwen-7B-Chat"  # Has no visual encoder
+]
+
+
+@pytest.mark.parametrize("model", models)
+@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize("max_tokens", [32])
+@pytest.mark.parametrize("num_logprobs", [5])
+def test_text_only_qwen_model_can_be_loaded_and_run(
+    vllm_runner: Type[VllmRunner],
+    example_prompts: List[str],
+    model: str,
+    *,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+):
+    with vllm_runner(model, dtype=dtype) as vllm_model:
+        vllm_model.generate_greedy_logprobs(
+            example_prompts,
+            max_tokens,
+            num_logprobs=num_logprobs,
+        )
diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/__init__.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_llava_next.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_llava_next.py
new file mode 100644
index 0000000000000..c2d3fda6994f6
--- /dev/null
+++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_llava_next.py
@@ -0,0 +1,68 @@
+import pytest
+
+from vllm.inputs import InputContext
+
+from ....utils import build_model_context
+
+
+@pytest.fixture()
+def get_max_llava_next_image_tokens():
+    from vllm.model_executor.models.llava_next import (
+        get_max_llava_next_image_tokens)
+    return get_max_llava_next_image_tokens
+
+
+@pytest.fixture()
+def dummy_data_for_llava_next():
+    from vllm.model_executor.models.llava_next import dummy_data_for_llava_next
+    return dummy_data_for_llava_next
+
+
+@pytest.mark.parametrize("gridpoints,expected_max_tokens", [
+    ([[336, 336]], 1176),
+    ([[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]], 2928),
+])
+def test_get_max_llava_next_image_tokens(gridpoints, expected_max_tokens,
+                                         get_max_llava_next_image_tokens):
+    ctx = build_model_context(model_name="llava-hf/llava-v1.6-mistral-7b-hf")
+
+    # Update the config image_grid_pinpoints
+    # and calculate the resulting max tokens
+    ctx.model_config.hf_config.image_grid_pinpoints = gridpoints
+
+    actual_max_tokens = get_max_llava_next_image_tokens(
+        InputContext(ctx.model_config))
+
+    assert expected_max_tokens == actual_max_tokens
+
+
+@pytest.mark.parametrize(
+    "gridpoints,expected_size",
+    [
+        # One point; it has to be the largest
+        ([[336, 336]], (336, 336)),
+        # Default for most llava next models; the 2x2 tile is the largest
+        ([[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]],
+         (672, 672)),
+        # If two rectangular gridpoints are the same, the more vertical
+        # one has the higher feature count due to newline features
+        ([[336, 672], [672, 336]], (672, 336))
+    ])
+def test_dummy_data_for_llava_next_feature_size(dummy_data_for_llava_next,
+                                                gridpoints, expected_size):
+    ctx = build_model_context(model_name="llava-hf/llava-v1.6-mistral-7b-hf")
+
+    # Update the config image_grid_pinpoints
+    ctx.model_config.hf_config.image_grid_pinpoints = gridpoints
+    seq_len = 5000  # bigger than the max feature size for any image
+
+    seq_data, mm_data = dummy_data_for_llava_next(
+        ctx,
+        seq_len=seq_len,
+        mm_counts={"image": 1},
+    )
+
+    # The dummy data dims should match the gridpoint with the biggest feat size
+    assert mm_data["image"].height == expected_size[0]
+    assert mm_data["image"].width == expected_size[1]
+    assert len(seq_data.get_token_ids()) >= seq_len
diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py
new file mode 100644
index 0000000000000..d6a7b34fdde9f
--- /dev/null
+++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py
@@ -0,0 +1,181 @@
+"""Tests for phi3v's multimodal preprocessing kwargs."""
+from typing import Optional
+
+import pytest
+import torch
+from transformers import AutoImageProcessor, AutoTokenizer
+
+from vllm.inputs import InputContext, token_inputs
+from vllm.model_executor.models.phi3v import _IMAGE_TOKEN_ID
+from vllm.multimodal import MultiModalRegistry
+
+from .....conftest import _ImageAssets
+from ....utils import build_model_context
+
+models = ["microsoft/Phi-3.5-vision-instruct"]
+
+
+# Wrap lazy imports to avoid initializing CUDA during test collection
+@pytest.fixture()
+def input_processor_for_phi3v():
+    from vllm.model_executor.models.phi3v import input_processor_for_phi3v
+    return input_processor_for_phi3v
+
+
+@pytest.fixture()
+def dummy_data_for_phi3v():
+    from vllm.model_executor.models.phi3v import dummy_data_for_phi3v
+    return dummy_data_for_phi3v
+
+
+@pytest.fixture()
+def get_max_phi3v_image_tokens():
+    from vllm.model_executor.models.phi3v import get_max_phi3v_image_tokens
+    return get_max_phi3v_image_tokens
+
+
+@pytest.mark.parametrize("model", models)
+@pytest.mark.parametrize("num_crops", [4, 16, None])
+def test_input_mapper_override(model: str, image_assets: _ImageAssets,
+                               num_crops: Optional[int]):
+    """Ensure that the [default] input mapper handles num_crops properly."""
+    # We pass the processor kwargs here since for this model, we fall back to
+    # the default mapper; this will fall back to the HF mapper and forward
+    # mm_processor_kwargs to it.
+    mm_processor_kwargs = {
+        "num_crops": num_crops
+    } if num_crops is not None else {}
+    ctx = build_model_context(
+        model_name=model,
+        tokenizer_name=model,
+        trust_remote_code=True,
+        mm_processor_kwargs=mm_processor_kwargs,
+    )
+
+    hf_processor = AutoImageProcessor.from_pretrained(model,
+                                                      trust_remote_code=True,
+                                                      **mm_processor_kwargs)
+
+    mm_registry = MultiModalRegistry()
+    mm_registry.init_mm_limits_per_prompt(ctx.model_config)
+
+    image = image_assets[0].pil_image
+    hf_result = hf_processor.preprocess(
+        image,
+        return_tensors="pt",
+    )
+
+    vllm_result = mm_registry.map_input(
+        ctx.model_config,
+        {"image": image},
+    )
+
+    assert torch.all(hf_result["image_sizes"] == vllm_result["image_sizes"])
+    assert torch.all(
+        hf_result["num_img_tokens"] == vllm_result["num_img_tokens"])
+
+    # For pixel values, the second axis should be the num_crops + 1
+    # for the rescaled original image. The default value in VLLM falls
+    # back to the HF config, which is why we compare to the processor num_crops
+    assert torch.all(hf_result["pixel_values"] == vllm_result["pixel_values"])
+    assert vllm_result["pixel_values"].shape[1] == hf_processor.num_crops + 1
+
+
+@pytest.mark.parametrize("model", models)
+@pytest.mark.parametrize("num_crops,expected_max_tokens", [
+    (4, 781),
+    (16, 2653),
+])
+def test_max_tokens_override(get_max_phi3v_image_tokens, model: str,
+                             num_crops: int, expected_max_tokens: int):
+    """Ensure get_max_phi3v_image_tokens handles num_crops properly."""
+    # NOTE: mm_processor_kwargs on the context in this test is unused, since
+    # this is testing the mapper directly. In practice, the processor kwargs
+    # are wrapped in a closure when calling the max tokens func. We explicitly
+    # do NOT use the mm_processor_kwargs in the model context here to ensure
+    # that the max image tokens implementation is referencing a mix of the
+    # kwargs to the function and the original mm_processor_kwargs in case
+    # values are somehow updated and end up in a bad state.
+    ctx = build_model_context(
+        model_name=model,
+        tokenizer_name=model,
+        trust_remote_code=True,
+        mm_processor_kwargs=None,
+    )
+
+    actual_max_tokens = get_max_phi3v_image_tokens(
+        InputContext(ctx.model_config),
+        num_crops=num_crops,
+    )
+
+    assert expected_max_tokens == actual_max_tokens
+
+
+@pytest.mark.parametrize("model", models)
+@pytest.mark.parametrize("num_crops,toks_per_img,num_imgs", [
+    (4, 781, 1),
+    (4, 781, 2),
+    (16, 2653, 1),
+    (16, 2653, 2),
+])
+def test_dummy_data_override(dummy_data_for_phi3v, model: str, num_crops: int,
+                             toks_per_img: int, num_imgs: int):
+    """Ensure dummy_data_for_phi3v handles num_crops properly."""
+    # Same as the previous test - don't initialize mm_processor_kwargs
+    # in this test and assume that the kwargs will be correctly expanded by
+    # the partial when calling the dummy data func.
+    ctx = build_model_context(
+        model_name=model,
+        tokenizer_name=model,
+        trust_remote_code=True,
+        mm_processor_kwargs=None,
+    )
+
+    sequence_data, _, = dummy_data_for_phi3v(
+        ctx=ctx,
+        seq_len=8192,  # Should be bigger than num_imgs * toks_per_img
+        mm_counts={"image": num_imgs},
+        num_crops=num_crops,
+    )
+    # Ensure we have the right number of placeholders per num_crops size
+    img_tok_count = sequence_data.get_token_ids().count(_IMAGE_TOKEN_ID)
+    assert img_tok_count == toks_per_img * num_imgs
+
+
+@pytest.mark.parametrize("model", models)
+@pytest.mark.parametrize("num_crops,expected_toks_per_img,num_imgs", [
+    (4, 757, 1),
+    (4, 757, 2),
+    (16, 1921, 1),
+    (16, 1921, 2),
+])
+def test_input_processor_override(input_processor_for_phi3v,
+                                  image_assets: _ImageAssets, model: str,
+                                  num_crops: int, expected_toks_per_img: int,
+                                  num_imgs: int):
+    """Ensure input_processor_for_phi3v handles num_crops properly."""
+    # Same as the previous test - don't initialize mm_processor_kwargs
+    # in this test and assume that the kwargs will be correctly expanded by
+    # the partial when calling the custom input processor.
+    ctx = build_model_context(
+        model_name=model,
+        tokenizer_name=model,
+        trust_remote_code=True,
+    )
+    tokenizer = AutoTokenizer.from_pretrained(model)
+    # Build the image str / prompt based on the number of images we pass
+    img_str = "".join([f"<|image_{idx}|>\n" for idx in range(1, num_imgs + 1)])
+    prompt = f"<|user|>\n{img_str}<|end|>\n<|assistant|>\n"
+    images = [image_assets[0].pil_image] * num_imgs
+
+    inputs = token_inputs(prompt_token_ids=tokenizer.encode(prompt),
+                          prompt=prompt,
+                          multi_modal_data={"image": images})
+
+    processed_inputs = input_processor_for_phi3v(ctx,
+                                                 inputs,
+                                                 num_crops=num_crops)
+
+    # Ensure we have the right number of placeholders per num_crops size
+    img_tok_count = processed_inputs["prompt_token_ids"].count(_IMAGE_TOKEN_ID)
+    assert img_tok_count == expected_toks_per_img * num_imgs
diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py
new file mode 100644
index 0000000000000..a01651b171d60
--- /dev/null
+++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py
@@ -0,0 +1,144 @@
+"""Tests for Qwen's multimodal preprocessing kwargs."""
+from typing import Dict, List, Union
+
+import pytest
+import torch
+from PIL.Image import Image
+
+from vllm.inputs import InputContext, token_inputs
+from vllm.multimodal.base import MultiModalInputs
+from vllm.multimodal.utils import cached_get_tokenizer
+
+from .....conftest import IMAGE_ASSETS
+from ....utils import build_model_context
+
+### Multimodal preprocessing tests
+SAMPLE_IMAGE = IMAGE_ASSETS[0].pil_image
+# These values are specific to Qwen-VL/Chat; we can get these from the model
+# config also, but they are hardcoded here to keep the parameterize/fixtures
+# easy to read.
+IMG_START_ID = 151857
+IMG_END_ID = 151858
+IMG_PAD_ID = 151859
+TOKS_PER_IMG = 256
+VIS_ENC_DIM = 4096
+IMG_SIZE = 448
+
+
+@pytest.fixture()
+def input_mapper_for_qwen():
+    # Lazy import to avoid initializing CUDA during test collection
+    from vllm.model_executor.models.qwen import input_mapper_for_qwen
+    return input_mapper_for_qwen
+
+
+@pytest.fixture()
+def input_processor_for_qwen():
+    # Lazy import to avoid initializing CUDA during test collection
+    from vllm.model_executor.models.qwen import input_processor_for_qwen
+    return input_processor_for_qwen
+
+
+@pytest.fixture()
+def qwen_vl_context() -> InputContext:
+    """Get an InputContext for Qwen-VL."""
+    return build_model_context(model_name="Qwen/Qwen-VL",
+                               trust_remote_code=True)
+
+
+# Happy path tests for single/multi-image scenarios for the multimodal
+# input processor and mapper, respectively
+@pytest.mark.parametrize("num_images", [1, 2])
+def test_input_processor_valid_mm_data(input_processor_for_qwen,
+                                       qwen_vl_context: InputContext,
+                                       num_images: int):
+    """Happy cases for image inputs to Qwen's multimodal input processor."""
+    prompt = "".join(
+        [f"Picture {num}: <img></img>\n" for num in range(1, num_images + 1)])
+    inputs = token_inputs(
+        prompt=prompt,
+        # When processing multimodal data for a multimodal model, the qwen
+        # input processor will overwrite the provided prompt_token_ids with
+        # the image prompts
+        prompt_token_ids=[],
+        multi_modal_data={"image": torch.rand(num_images, TOKS_PER_IMG, 4096)},
+    )
+    proc_inputs = input_processor_for_qwen(qwen_vl_context, inputs)
+    assert isinstance(proc_inputs, dict)
+
+    # Each image should have one start / stop and a fixed context of 256
+    proc_tokens = proc_inputs["prompt_token_ids"]
+    assert proc_tokens.count(IMG_START_ID) == num_images
+    assert proc_tokens.count(IMG_END_ID) == num_images
+    assert proc_tokens.count(IMG_PAD_ID) == num_images * TOKS_PER_IMG
+
+
+@pytest.mark.parametrize(
+    "img_data,expected_shape",
+    [
+        # single / multi-image
+        (SAMPLE_IMAGE, (1, 3, IMG_SIZE, IMG_SIZE)),
+        (2 * [SAMPLE_IMAGE], (2, 3, IMG_SIZE, IMG_SIZE)),
+        # single / multi-image embeddings
+        (torch.rand(
+            (TOKS_PER_IMG, VIS_ENC_DIM)), (1, TOKS_PER_IMG, VIS_ENC_DIM)),
+        (torch.rand(
+            (1, TOKS_PER_IMG, VIS_ENC_DIM)), (1, TOKS_PER_IMG, VIS_ENC_DIM)),
+        (torch.rand(
+            (2, TOKS_PER_IMG, VIS_ENC_DIM)), (2, TOKS_PER_IMG, VIS_ENC_DIM)),
+    ])
+def test_input_mapper_valid_mm_data(input_mapper_for_qwen,
+                                    qwen_vl_context: InputContext,
+                                    img_data: Union[torch.Tensor, List[Image],
+                                                    Image],
+                                    expected_shape: List[int]):
+    """Happy cases for image inputs to Qwen's multimodal input mapper."""
+    mapped_img_data = input_mapper_for_qwen(qwen_vl_context, img_data)
+    # Ensure that we get the appropriately shaped pixel_values
+    # for images and image embeddings, respectively.
+    assert isinstance(mapped_img_data, MultiModalInputs)
+    assert "pixel_values" in mapped_img_data
+    assert mapped_img_data["pixel_values"].shape == expected_shape
+
+
+# Sad path tests for the multimodal input processor and mapper, respectively
+@pytest.mark.parametrize("mm_data", [
+    {
+        "image": torch.rand((5))
+    },
+    {
+        "image": torch.rand((5, 5, 5, 5, 5))
+    },
+])
+def test_input_processor_invalid_mm_data(input_processor_for_qwen,
+                                         qwen_vl_context: InputContext,
+                                         mm_data: Dict[str, torch.Tensor]):
+    """Test sad cases validated in Qwen's multimodal input processor."""
+    tokenizer = cached_get_tokenizer(qwen_vl_context.model_config.tokenizer,
+                                     trust_remote_code=True)
+    prompt = "Picture 1: <img></img>\n"
+    prompt_token_ids = tokenizer.encode(prompt)
+    inputs = token_inputs(prompt=prompt,
+                          prompt_token_ids=prompt_token_ids,
+                          multi_modal_data=mm_data)
+    # Should fail since we have too many or too few dimensions for embeddings
+    with pytest.raises(ValueError):
+        input_processor_for_qwen(qwen_vl_context, inputs)
+
+
+@pytest.mark.parametrize(
+    "img_data",
+    [
+        # Wrong context length
+        torch.rand((1, TOKS_PER_IMG + 10, VIS_ENC_DIM)),
+        # Wrong visual encoder output size
+        torch.rand((1, TOKS_PER_IMG, VIS_ENC_DIM + 10)),
+    ])
+def test_input_mapper_invalid_mm_data(
+    input_mapper_for_qwen,
+    qwen_vl_context: InputContext,
+    img_data: Union[torch.Tensor, List[Image], Image],
+):
+    """Sad cases validated in Qwen VL's multimodal input mapper."""
+    with pytest.raises(ValueError):
+        input_mapper_for_qwen(qwen_vl_context, img_data)
diff --git a/tests/models/decoder_only/vision_language/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py
similarity index 98%
rename from tests/models/decoder_only/vision_language/test_qwen2_vl.py
rename to tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py
index d3de5fb26d4b8..5c90e7f7a267c 100644
--- a/tests/models/decoder_only/vision_language/test_qwen2_vl.py
+++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py
@@ -8,8 +8,8 @@
 from vllm.inputs import InputContext, token_inputs
 from vllm.multimodal import MultiModalRegistry

-from ....conftest import _ImageAssets
-from ...utils import build_model_context
+from .....conftest import _ImageAssets
+from ....utils import build_model_context

 MODEL = "Qwen/Qwen2-VL-2B-Instruct"
 MIN_PIXELS = "min_pixels"
diff --git a/tests/models/decoder_only/vision_language/test_blip2.py b/tests/models/decoder_only/vision_language/test_blip2.py
deleted file mode 100644
index e1e32b96d89ac..0000000000000
--- a/tests/models/decoder_only/vision_language/test_blip2.py
+++ /dev/null
@@ -1,101 +0,0 @@
-from typing import List, Optional, Tuple
-
-import pytest
-from transformers import AutoModelForVision2Seq, AutoTokenizer
-
-from vllm.multimodal.utils import rescale_image_size
-from vllm.sequence import SampleLogprobs
-
-from ....conftest import IMAGE_ASSETS
-from ...utils import check_logprobs_close
-
-HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
-    "stop_sign":
-    "Question: What's the content of the image? Answer:",
-    "cherry_blossom":
-    "Question: What is the season?
Answer:", -}) - - -def vllm_to_hf_output(vllm_output: Tuple[List[int], str, - Optional[SampleLogprobs]], - model: str): - """Sanitize vllm output to be comparable with hf output.""" - _, output_str, out_logprobs = vllm_output - - hf_output_str = output_str + "\n" - - tokenizer = AutoTokenizer.from_pretrained(model) - hf_output_ids = tokenizer.encode(hf_output_str) - assert hf_output_ids[0] == tokenizer.bos_token_id - hf_output_ids = hf_output_ids[1:] - - return hf_output_ids, hf_output_str, out_logprobs - - -@pytest.mark.parametrize("model", ["Salesforce/blip2-opt-2.7b"]) -@pytest.mark.parametrize( - "size_factors", - [ - # No image - [], - # Single-scale - [1.0], - # Single-scale, batched - [1.0, 1.0, 1.0], - # Multi-scale - [0.25, 0.5, 1.0], - ], -) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [128]) -@pytest.mark.parametrize("num_logprobs", [5]) -def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, - dtype: str, max_tokens: int, num_logprobs: int) -> None: - """Inference result should be the same between hf and vllm. - - All the image fixtures for the test are from IMAGE_ASSETS. - For huggingface runner, we provide the PIL images as input. - For vllm runner, we provide MultiModalData objects and corresponding - MultiModalConfig as input. - Note, the text input is also adjusted to abide by vllm contract. - The text output is sanitized to be able to compare with hf. - """ - images = [asset.pil_image for asset in image_assets] - - inputs_per_image = [( - [prompt for _ in size_factors], - [rescale_image_size(image, factor) for factor in size_factors], - ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] - - # max_model_len should be greater than image_feature_size - with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model: - vllm_outputs_per_image = [ - vllm_model.generate_greedy_logprobs(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images) - for prompts, images in inputs_per_image - ] - - with hf_runner(model, dtype=dtype, - auto_cls=AutoModelForVision2Seq) as hf_model: - hf_outputs_per_image = [ - hf_model.generate_greedy_logprobs_limit(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images) - for prompts, images in inputs_per_image - ] - - for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, - vllm_outputs_per_image): - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=[ - vllm_to_hf_output(vllm_output, model) - for vllm_output in vllm_outputs - ], - name_0="hf", - name_1="vllm", - ) diff --git a/tests/models/decoder_only/vision_language/test_broadcast.py b/tests/models/decoder_only/vision_language/test_broadcast.py deleted file mode 100644 index 38c4a95de16f4..0000000000000 --- a/tests/models/decoder_only/vision_language/test_broadcast.py +++ /dev/null @@ -1,46 +0,0 @@ -import pytest -import transformers - -from ....utils import multi_gpu_test - - -@multi_gpu_test(num_gpus=2) -@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"]) -@pytest.mark.parametrize("model", [ - "llava-hf/llava-1.5-7b-hf", - "llava-hf/llava-v1.6-mistral-7b-hf", - "facebook/chameleon-7b", -]) -def test_models(hf_runner, vllm_runner, image_assets, - distributed_executor_backend, model) -> None: - - dtype = "half" - max_tokens = 5 - num_logprobs = 5 - tensor_parallel_size = 2 - - if model.startswith("llava-hf/llava-1.5"): - from .test_llava import models, run_test - elif model.startswith("llava-hf/llava-v1.6"): - from .test_llava_next import models, run_test # type: 
ignore[no-redef] - elif model.startswith("facebook/chameleon"): - if transformers.__version__.startswith("4.46"): - pytest.skip("Model broken in HF, " - "see huggingface/transformers#34379") - from .test_chameleon import models, run_test # type: ignore[no-redef] - else: - raise NotImplementedError(f"Unsupported model: {model}") - - run_test( - hf_runner, - vllm_runner, - image_assets, - model=models[0], - # So that LLaVA-NeXT processor may return nested list - size_factors=[0.25, 0.5, 1.0], - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - ) diff --git a/tests/models/decoder_only/vision_language/test_chameleon.py b/tests/models/decoder_only/vision_language/test_chameleon.py deleted file mode 100644 index 4bd678b9f21c4..0000000000000 --- a/tests/models/decoder_only/vision_language/test_chameleon.py +++ /dev/null @@ -1,130 +0,0 @@ -from typing import List, Optional, Type - -import pytest -import transformers -from transformers import AutoModelForVision2Seq, BatchEncoding - -from vllm.multimodal.utils import rescale_image_size -from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE - -from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets -from ...utils import check_outputs_equal - -HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ - "stop_sign": - "USER: \nWhat's the content of the image?\nASSISTANT:", - "cherry_blossom": - "USER: \nWhat is the season?\nASSISTANT:", -}) - -models = ["facebook/chameleon-7b"] - - -def run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - image_assets: _ImageAssets, - model: str, - *, - size_factors: List[float], - dtype: str, - max_tokens: int, - num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - """Inference result should be the same between hf and vllm. - - All the image fixtures for the test are from IMAGE_ASSETS. - For huggingface runner, we provide the PIL images as input. - For vllm runner, we provide MultiModalDataDict objects - and corresponding vision language config as input. - Note, the text input is also adjusted to abide by vllm contract. - The text output is sanitized to be able to compare with hf. 
- """ - torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype] - images = [asset.pil_image for asset in image_assets] - - inputs_per_image = [( - [prompt for _ in size_factors], - [rescale_image_size(image, factor) for factor in size_factors], - ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] - - with vllm_runner(model, - max_model_len=4096, - dtype=dtype, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - enforce_eager=True) as vllm_model: - - vllm_outputs_per_image = [ - vllm_model.generate_greedy_logprobs(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images) - for prompts, images in inputs_per_image - ] - - def process(hf_inputs: BatchEncoding): - hf_inputs["pixel_values"] = hf_inputs["pixel_values"] \ - .to(torch_dtype) # type: ignore - return hf_inputs - - with hf_runner(model, - dtype=dtype, - postprocess_inputs=process, - auto_cls=AutoModelForVision2Seq) as hf_model: - hf_outputs_per_image = [ - hf_model.generate_greedy_logprobs_limit(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images) - for prompts, images in inputs_per_image - ] - - for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, - vllm_outputs_per_image): - # HF Logprobs include image tokens, unlike vLLM, so we don't directly - # compare them - check_outputs_equal( - outputs_0_lst=[outputs[:2] for outputs in hf_outputs], - outputs_1_lst=[outputs[:2] for outputs in vllm_outputs], - name_0="hf", - name_1="vllm", - ) - - -@pytest.mark.skipif( - transformers.__version__.startswith("4.46.0"), - reason="Model broken in HF, see huggingface/transformers#34379", -) -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize( - "size_factors", - [ - # No image - [], - # Single-scale - [1.0], - # Single-scale, batched - [1.0, 1.0, 1.0], - # Multi-scale - [0.25, 0.5, 1.0], - ], -) -@pytest.mark.parametrize("dtype", ["bfloat16"]) -@pytest.mark.parametrize("max_tokens", [8]) -@pytest.mark.parametrize("num_logprobs", [5]) -def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, - dtype, max_tokens, num_logprobs) -> None: - run_test( - hf_runner, - vllm_runner, - image_assets, - model, - size_factors=size_factors, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=1, - ) diff --git a/tests/models/decoder_only/vision_language/test_fuyu.py b/tests/models/decoder_only/vision_language/test_fuyu.py deleted file mode 100644 index 1affcd10ee72d..0000000000000 --- a/tests/models/decoder_only/vision_language/test_fuyu.py +++ /dev/null @@ -1,139 +0,0 @@ -from typing import List, Optional, Tuple, Type - -import pytest - -from vllm.multimodal.utils import rescale_image_size -from vllm.platforms import current_platform -from vllm.sequence import SampleLogprobs - -from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets -from ...utils import check_logprobs_close - -HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ - "stop_sign": - "What's the content of the image?\n", - "cherry_blossom": - "What is the season?\n", -}) - -models = ["adept/fuyu-8b"] - - -def vllm_to_hf_output(vllm_output: Tuple[List[int], str, - Optional[SampleLogprobs]]): - """Sanitize vllm output to be comparable with hf output.""" - output_ids, output_str, out_logprobs = vllm_output - - hf_output_str = output_str.lstrip() + "|ENDOFTEXT|" - - return output_ids, hf_output_str, out_logprobs - - -def run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - image_assets: _ImageAssets, - model: str, - *, - 
size_factors: List[float], - dtype: str, - max_tokens: int, - num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - """Inference result should be the same between hf and vllm. - - All the image fixtures for the test are from IMAGE_ASSETS. - For huggingface runner, we provide the PIL images as input. - For vllm runner, we provide MultiModalDataDict objects - and corresponding MultiModalConfig as input. - Note, the text input is also adjusted to abide by vllm contract. - The text output is sanitized to be able to compare with hf. - """ - images = [asset.pil_image for asset in image_assets] - - inputs_per_image = [( - [prompt for _ in size_factors], - [rescale_image_size(image, factor) for factor in size_factors], - ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] - - # NOTE: take care of the order. run vLLM first, and then run HF. - # vLLM needs a fresh new process without cuda initialization. - # if we run HF first, the cuda initialization will be done and it - # will hurt multiprocessing backend with fork method (the default method). - - # max_model_len should be greater than image_feature_size - with vllm_runner(model, - max_model_len=2048, - max_num_seqs=2, - dtype=dtype, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - enforce_eager=True) as vllm_model: - vllm_outputs_per_image = [ - vllm_model.generate_greedy_logprobs(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images) - for prompts, images in inputs_per_image - ] - - with hf_runner(model, dtype=dtype) as hf_model: - eos_token_id = hf_model.processor.tokenizer.eos_token_id - hf_outputs_per_image = [ - hf_model.generate_greedy_logprobs_limit(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images, - eos_token_id=eos_token_id) - for prompts, images in inputs_per_image - ] - - for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, - vllm_outputs_per_image): - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=[ - vllm_to_hf_output(vllm_output) for vllm_output in vllm_outputs - ], - name_0="hf", - name_1="vllm", - ) - - -target_dtype = "half" -if current_platform.is_cpu(): - target_dtype = "bfloat16" - - -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize( - "size_factors", - [ - # No image - [], - # Single-scale - [0.25], - # Single-scale, batched - [0.25, 0.25, 0.25], - # Multi-scale - [0.25, 0.2, 0.15], - ], -) -@pytest.mark.parametrize("dtype", [target_dtype]) -@pytest.mark.parametrize("max_tokens", [128]) -@pytest.mark.parametrize("num_logprobs", [10]) -def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, - dtype: str, max_tokens: int, num_logprobs: int) -> None: - run_test( - hf_runner, - vllm_runner, - image_assets, - model, - size_factors=size_factors, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=1, - ) diff --git a/tests/models/decoder_only/vision_language/test_glm4.py b/tests/models/decoder_only/vision_language/test_glm4.py deleted file mode 100644 index 47922a57f680b..0000000000000 --- a/tests/models/decoder_only/vision_language/test_glm4.py +++ /dev/null @@ -1,133 +0,0 @@ -from typing import List, Optional, Tuple, Type - -import pytest - -from vllm.multimodal.utils import rescale_image_size -from vllm.transformers_utils.tokenizer import patch_padding_side - -from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner -from ....utils import large_gpu_test -from 
...utils import check_logprobs_close - -HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ - "stop_sign": - "What's the content of the image?", - "cherry_blossom": - "What is the season?", -}) - -models = ["THUDM/glm-4v-9b"] -target_dtype = "bfloat16" - - -def run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - inputs: List[Tuple[List[str], PromptImageInput]], - model: str, - *, - dtype: str, - max_tokens: int, - num_logprobs: int, - mm_limit: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - # max_model_len should be greater than image_feature_size - with vllm_runner(model, - max_model_len=2048, - max_num_seqs=2, - dtype=dtype, - limit_mm_per_prompt={"image": mm_limit}, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - enforce_eager=True) as vllm_model: - stop_token_ids = [151329, 151336, 151338] - vllm_outputs_per_image = [ - vllm_model.generate_greedy_logprobs(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images, - stop_token_ids=stop_token_ids) - for prompts, images in inputs - ] - - with hf_runner(model, dtype=dtype) as hf_model: - hf_processor = hf_model.processor - patch_padding_side(hf_processor) - - def processor(*args, text="", images=None, **kwargs): - if images is None: - return hf_processor(*args, **kwargs) - - return hf_processor.apply_chat_template( - [{ - "role": "user", - "image": images, - "content": text - }], - add_generation_prompt=True, - tokenize=True, - return_dict=True, - **kwargs, - ) - - hf_model.processor = processor - hf_model.model.get_output_embeddings = lambda: \ - hf_model.model.transformer.output_layer - hf_outputs_per_image = [ - hf_model.generate_greedy_logprobs_limit( - prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images, - ) for prompts, images in inputs - ] - - for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, - vllm_outputs_per_image): - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) - - -@large_gpu_test(min_gb=48) -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize( - "size_factors", - [ - # No image - [], - # Single-scale - [1.0], - # Single-scale, batched - [1.0, 1.0, 1.0], - # Multi-scale - [0.25, 0.5, 1.0], - ], -) -@pytest.mark.parametrize("dtype", [target_dtype]) -@pytest.mark.parametrize("max_tokens", [128]) -@pytest.mark.parametrize("num_logprobs", [5]) -def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, - dtype: str, max_tokens: int, num_logprobs: int) -> None: - images = [asset.pil_image for asset in image_assets] - - inputs_per_image = [( - [prompt for _ in size_factors], - [rescale_image_size(image, factor) for factor in size_factors], - ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] - run_test( - hf_runner, - vllm_runner, - inputs_per_image, - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - mm_limit=1, - tensor_parallel_size=1, - ) diff --git a/tests/models/decoder_only/vision_language/test_internvl.py b/tests/models/decoder_only/vision_language/test_internvl.py index fc842ec4a6171..2fd1ac4bb08f7 100644 --- a/tests/models/decoder_only/vision_language/test_internvl.py +++ b/tests/models/decoder_only/vision_language/test_internvl.py @@ -1,15 +1,11 @@ -import types -from typing import List, Optional, Tuple, Type, Union +from typing import List, Optional, Tuple, Type import pytest import torch -from PIL.Image import Image -from transformers 
import AutoConfig from vllm.multimodal.utils import rescale_image_size -from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner, - _ImageAssets) +from ....conftest import IMAGE_ASSETS, VllmRunner, _ImageAssets from ...utils import check_logprobs_close HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ @@ -18,171 +14,6 @@ "cherry_blossom": "<|im_start|>User\n\nWhat is the season?<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501 }) -HF_MULTIIMAGE_IMAGE_PROMPT = "<|im_start|>User\nImage-1: \nImage-2: \nDescribe the two images in short.<|im_end|>\n<|im_start|>Assistant\n" # noqa: E501 - -models = [ - "OpenGVLab/InternVL2-1B", - "OpenGVLab/InternVL2-2B", - # NOTE: Mono-InternVL-2B doesn't work with fp16, - # it will result NaN during inference. - # See: https://huggingface.co/OpenGVLab/Mono-InternVL-2B/discussions/9 - "OpenGVLab/Mono-InternVL-2B", - # Broken due to outdated implementation of Phi-3 - # See: https://huggingface.co/OpenGVLab/InternVL2-4B/discussions/3 - # "OpenGVLab/InternVL2-4B", -] -target_dtype = "bfloat16" - - -# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py -def generate( - self, - pixel_values: torch.FloatTensor, - input_ids: torch.FloatTensor, - attention_mask: Optional[torch.LongTensor] = None, - **generate_kwargs, -) -> torch.LongTensor: - """Generate method for InternVL2 model without fixed use_cache.""" - assert self.img_context_token_id is not None - vit_embeds = self.extract_feature(pixel_values) - input_embeds = self.language_model.get_input_embeddings()(input_ids) - B, N, C = input_embeds.shape - input_embeds = input_embeds.reshape(B * N, C) - - input_ids = input_ids.reshape(B * N) - selected = (input_ids == self.img_context_token_id) - assert selected.sum() != 0 - input_embeds[selected] = vit_embeds.reshape(-1, C).to(input_embeds.device) - - input_embeds = input_embeds.reshape(B, N, C) - - forward_kwargs = dict( - inputs_embeds=input_embeds, - attention_mask=attention_mask, - ) - if getattr(self, "use_visual_token_mask", False): - visual_token_mask = selected.reshape(B, N, 1).to(input_embeds.dtype) - forward_kwargs["visual_token_mask"] = visual_token_mask - outputs = self.language_model.generate( - **forward_kwargs, - **generate_kwargs, - ) - - return outputs - - -def run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - inputs: List[Tuple[List[str], PromptImageInput]], - model: str, - *, - dtype: str, - max_tokens: int, - num_logprobs: int, - mm_limit: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - """Inference result should be the same between hf and vllm. - - All the image fixtures for the test are from IMAGE_ASSETS. - For huggingface runner, we provide the PIL images as input. - For vllm runner, we provide MultiModalDataDict objects - and corresponding MultiModalConfig as input. - Note, the text input is also adjusted to abide by vllm contract. - The text output is sanitized to be able to compare with hf. - """ - - # NOTE: take care of the order. run vLLM first, and then run HF. - # vLLM needs a fresh new process without cuda initialization. - # if we run HF first, the cuda initialization will be done and it - # will hurt multiprocessing backend with fork method (the default method). 
- - class InternVLProcessor: - """A simple processor for InternVL2 which misses a processor.""" - - def __init__(self, hf_runner: HfRunner): - self.num_image_token = hf_runner.model.num_image_token - self.tokenizer = hf_runner.tokenizer - self.dtype = hf_runner.model.dtype - - self.config = AutoConfig.from_pretrained(hf_runner.model_name, - trust_remote_code=True) - self.vision_config = self.config.vision_config - self.use_thumbnail = self.config.use_thumbnail - self.min_num = self.config.min_dynamic_patch - self.max_num = self.config.max_dynamic_patch - self.image_size = self.vision_config.image_size - - def __call__(self, text: str, images: Union[Image, List[Image]], - **kwargs): - from vllm.model_executor.models.internvl import ( - IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values) - images = [images] if isinstance(images, Image) else images - pixel_values = [ - image_to_pixel_values(image, self.image_size, self.min_num, - self.max_num, - self.use_thumbnail).to(self.dtype) - for image in images - ] - num_patches_list = [ - pixel_value.shape[0] for pixel_value in pixel_values - ] - pixel_values = torch.cat(pixel_values, dim=0) - for num_patches in num_patches_list: - context_tokens = IMG_CONTEXT * self.num_image_token \ - * num_patches - image_tokens = IMG_START + context_tokens + IMG_END - text = text.replace('', image_tokens, 1) - prompt = self.tokenizer(text, return_tensors="pt") - prompt.update({"pixel_values": pixel_values}) - return prompt - - # max_model_len should be greater than image_feature_size - with vllm_runner(model, - max_model_len=4096, - dtype=dtype, - limit_mm_per_prompt={"image": mm_limit}, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - enforce_eager=True) as vllm_model: - vllm_outputs_per_image = [ - vllm_model.generate_greedy_logprobs(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images) - for prompts, images in inputs - ] - - with hf_runner(model, dtype=dtype) as hf_model: - img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids( - "") - hf_model.model.img_context_token_id = img_context_token_id - hf_model.processor = InternVLProcessor(hf_model) - hf_model.model.get_output_embeddings = lambda: \ - hf_model.model.language_model.get_output_embeddings() - hf_model.model.generate = types.MethodType(generate, hf_model.model) - eos_token_id = hf_model.tokenizer.eos_token_id - hf_outputs_per_image = [ - hf_model.generate_greedy_logprobs_limit(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=hf_images, - eos_token_id=eos_token_id) - for prompts, hf_images in inputs - ] - - for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, - vllm_outputs_per_image): - # TODO: Check whether using original CLIPVisionModel can improve - # consistency against HF - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) def run_awq_test( @@ -253,123 +84,6 @@ def run_awq_test( ) -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize( - "size_factors", - [ - # No image - [], - # Single-scale - [1.0], - # Single-scale, batched - [1.0, 1.0, 1.0], - # Multi-scale - [0.25, 0.5, 1.0], - ], -) -@pytest.mark.parametrize("dtype", [target_dtype]) -@pytest.mark.parametrize("max_tokens", [128]) -@pytest.mark.parametrize("num_logprobs", [5]) -@torch.inference_mode() -def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, - dtype: str, max_tokens: int, num_logprobs: int) -> None: - images = [asset.pil_image 
for asset in image_assets] - - inputs_per_image = [( - [prompt for _ in size_factors], - [rescale_image_size(image, factor) for factor in size_factors], - ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] - - run_test( - hf_runner, - vllm_runner, - inputs_per_image, - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - mm_limit=1, - tensor_parallel_size=1, - ) - - -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize( - "size_factors", - [ - # No image - [], - # Single-scale - [1.0], - # Single-scale, batched - [1.0, 1.0, 1.0], - # Multi-scale - [0.5, 0.75, 1.0], - ], -) -@pytest.mark.parametrize("dtype", [target_dtype]) -@pytest.mark.parametrize("max_tokens", [128]) -@pytest.mark.parametrize("num_logprobs", [5]) -@torch.inference_mode() -def test_multi_images_models(hf_runner, vllm_runner, image_assets, model, - size_factors, dtype: str, max_tokens: int, - num_logprobs: int) -> None: - images = [asset.pil_image for asset in image_assets] - - inputs_per_case = [ - ([HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors], - [[rescale_image_size(image, factor) for image in images] - for factor in size_factors]) - ] - - run_test( - hf_runner, - vllm_runner, - inputs_per_case, - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - mm_limit=2, - tensor_parallel_size=1, - ) - - -@pytest.mark.parametrize("model", ["OpenGVLab/InternVL2-2B"]) -@pytest.mark.parametrize("size_factors", [[0.5, 1.0]]) -@pytest.mark.parametrize("dtype", [target_dtype]) -@pytest.mark.parametrize("max_tokens", [128]) -@pytest.mark.parametrize("num_logprobs", [5]) -@torch.inference_mode() -def test_different_num_patches(hf_runner, vllm_runner, image_assets, model, - size_factors, dtype: str, max_tokens: int, - num_logprobs: int) -> None: - images = [asset.pil_image.resize((896, 896)) for asset in image_assets] - - inputs_batching = [( - [prompt for _ in size_factors], - [rescale_image_size(image, factor) for factor in size_factors], - ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] - - inputs_multi_images = [ - ([HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors], - [[rescale_image_size(image, factor) for image in images] - for factor in size_factors]) - ] - for inputs in [inputs_batching, inputs_multi_images]: - run_test( - hf_runner, - vllm_runner, - inputs, - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - mm_limit=2, - tensor_parallel_size=1, - ) - - @pytest.mark.parametrize( "models", [("OpenGVLab/InternVL2-2B", "OpenGVLab/InternVL2-2B-AWQ")]) @pytest.mark.parametrize( diff --git a/tests/models/decoder_only/vision_language/test_llava.py b/tests/models/decoder_only/vision_language/test_llava.py deleted file mode 100644 index fd28a9367b4b2..0000000000000 --- a/tests/models/decoder_only/vision_language/test_llava.py +++ /dev/null @@ -1,313 +0,0 @@ -from typing import List, Optional, Tuple, Type, overload - -import pytest -from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer, - BatchEncoding) - -from vllm.multimodal.utils import rescale_image_size -from vllm.sequence import SampleLogprobs -from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE - -from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner, - _ImageAssets) -from ...utils import check_logprobs_close - -_LIMIT_IMAGE_PER_PROMPT = 4 - -HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ - "stop_sign": - "USER: \nWhat's the content of the image?\nASSISTANT:", - "cherry_blossom": - "USER: \nWhat is the season?\nASSISTANT:", -}) - -models = 
[ - "llava-hf/llava-1.5-7b-hf", - # TODO: Get this model to produce meaningful output in vLLM - # "TIGER-Lab/Mantis-8B-siglip-llama3", -] - - -def vllm_to_hf_output(vllm_output: Tuple[List[int], str, - Optional[SampleLogprobs]], - model: str): - """Sanitize vllm output to be comparable with hf output.""" - output_ids, output_str, out_logprobs = vllm_output - - config = AutoConfig.from_pretrained(model) - image_token_id = config.image_token_index - - tokenizer = AutoTokenizer.from_pretrained(model) - eos_token_id = tokenizer.eos_token_id - - hf_output_ids = [ - token_id for idx, token_id in enumerate(output_ids) - if token_id != image_token_id or output_ids[idx - 1] != image_token_id - ] - - assert output_str[0] == " " - hf_output_str = output_str[1:] - if hf_output_ids[-1] == eos_token_id: - hf_output_str = hf_output_str + tokenizer.decode(eos_token_id) - - return hf_output_ids, hf_output_str, out_logprobs - - -@overload -def run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - image_assets: _ImageAssets, - model: str, - *, - size_factors: List[float], - dtype: str, - max_tokens: int, - num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - ... - - -@overload -def run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - image_assets: _ImageAssets, - model: str, - *, - sizes: List[Tuple[int, int]], - dtype: str, - max_tokens: int, - num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - ... - - -def run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - image_assets: _ImageAssets, - model: str, - *, - size_factors: Optional[List[float]] = None, - sizes: Optional[List[Tuple[int, int]]] = None, - dtype: str, - max_tokens: int, - num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - images = [asset.pil_image for asset in image_assets] - - if size_factors is not None: - inputs_per_image = [( - [prompt for _ in size_factors], - [rescale_image_size(image, factor) for factor in size_factors], - ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] - elif sizes is not None: - inputs_per_image = [( - [prompt for _ in sizes], - [image.resize(size) for size in sizes], - ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] - else: - raise ValueError("You must provide either `size_factors` or `sizes`") - - _run_test(hf_runner, - vllm_runner, - inputs_per_image, - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend) - - -def _run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - inputs: List[Tuple[List[str], PromptImageInput]], - model: str, - *, - dtype: str, - max_tokens: int, - num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - """Inference result should be the same between hf and vllm. - - All the image fixtures for the test are from IMAGE_ASSETS. - For huggingface runner, we provide the PIL images as input. - For vllm runner, we provide MultiModalDataDict objects - and corresponding MultiModalConfig as input. - Note, the text input is also adjusted to abide by vllm contract. - The text output is sanitized to be able to compare with hf. 
- """ - # NOTE: For local use; this isn't tested in CI yet (see TODO above) - if model.startswith("TIGER-Lab/Mantis"): - from mantis.models.mllava import MLlavaProcessor - - torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype] - mantis_processor = MLlavaProcessor.from_pretrained( - model, torch_dtype=torch_dtype) - assert isinstance(mantis_processor, MLlavaProcessor) - else: - mantis_processor = None - - # NOTE: take care of the order. run vLLM first, and then run HF. - # vLLM needs a fresh new process without cuda initialization. - # if we run HF first, the cuda initialization will be done and it - # will hurt multiprocessing backend with fork method (the default method). - - # max_model_len should be greater than image_feature_size - with vllm_runner(model, - dtype=dtype, - max_model_len=4096, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - enforce_eager=True, - limit_mm_per_prompt={"image": _LIMIT_IMAGE_PER_PROMPT - }) as vllm_model: - vllm_outputs_per_image = [ - vllm_model.generate_greedy_logprobs(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images) - for prompts, images in inputs - ] - - if mantis_processor is not None: - - def process(hf_inputs: BatchEncoding): - hf_inputs["pixel_values"] = hf_inputs["pixel_values"] \ - .to(torch_dtype) # type: ignore - return hf_inputs - else: - - def process(hf_inputs: BatchEncoding): - return hf_inputs - - with hf_runner(model, - dtype=dtype, - postprocess_inputs=process, - auto_cls=AutoModelForVision2Seq) as hf_model: - hf_outputs_per_image = [ - hf_model.generate_greedy_logprobs_limit(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images) - for prompts, images in inputs - ] - - for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, - vllm_outputs_per_image): - # TODO: Check whether using original CLIPVisionModel can improve - # consistency against HF - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=[ - vllm_to_hf_output(vllm_output, model) - for vllm_output in vllm_outputs - ], - name_0="hf", - name_1="vllm", - ) - - -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize( - "size_factors", - [ - # No image - [], - # Single-scale - [1.0], - # Single-scale, batched - [1.0, 1.0, 1.0], - # Multi-scale - [0.25, 0.5, 1.0], - ], -) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [128]) -@pytest.mark.parametrize("num_logprobs", [5]) -def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, - dtype, max_tokens, num_logprobs) -> None: - run_test( - hf_runner, - vllm_runner, - image_assets, - model, - size_factors=size_factors, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=1, - ) - - -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [128]) -@pytest.mark.parametrize("num_logprobs", [5]) -def test_models_multiple_image_inputs(hf_runner, vllm_runner, image_assets, - model, dtype, max_tokens, - num_logprobs) -> None: - stop_sign = image_assets[0].pil_image - cherry_blossom = image_assets[1].pil_image - - inputs = [( - [ - "USER: \nDescribe 2 images.\nASSISTANT:", - "USER: \nDescribe 2 images.\nASSISTANT:", - "USER: \nDescribe 4 images.\nASSISTANT:", # noqa: E501 - "USER: \nWhat is the season?\nASSISTANT:", - ], - [ - [stop_sign, cherry_blossom], - # Images with different sizes and aspect-ratios - [ - rescale_image_size(stop_sign, 0.1), - stop_sign, - ], - [ - 
stop_sign, - rescale_image_size(stop_sign, 0.25), - cherry_blossom.resize((183, 488)), - cherry_blossom.resize((488, 183)) - ], - cherry_blossom, - ])] - - _run_test( - hf_runner, - vllm_runner, - inputs, - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=1, - ) - - -@pytest.mark.parametrize("model", models) -def test_context_length_too_short(vllm_runner, image_assets, model): - images = [asset.pil_image for asset in image_assets] - - with pytest.raises(ValueError, match="too long to fit into the model"): - vllm_model = vllm_runner( - model, - max_model_len=128, # LLaVA has a feature size of 576 - enforce_eager=True, - ) - - with vllm_model: - vllm_model.generate_greedy([HF_IMAGE_PROMPTS[0]], - max_tokens=1, - images=[images[0]]) diff --git a/tests/models/decoder_only/vision_language/test_llava_image_embeds.py b/tests/models/decoder_only/vision_language/test_llava_image_embeds.py deleted file mode 100644 index 66414032509ed..0000000000000 --- a/tests/models/decoder_only/vision_language/test_llava_image_embeds.py +++ /dev/null @@ -1,158 +0,0 @@ -from typing import List, Optional, Tuple, Type - -import pytest -from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer - -from vllm.sequence import SampleLogprobs - -from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets -from ...utils import check_logprobs_close - -HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ - "stop_sign": - "USER: \nWhat's the content of the image?\nASSISTANT:", - "cherry_blossom": - "USER: \nWhat is the season?\nASSISTANT:", -}) - -models = [ - "llava-hf/llava-1.5-7b-hf", -] - - -def vllm_to_hf_output(vllm_output: Tuple[List[int], str, - Optional[SampleLogprobs]], - model: str): - """Sanitize vllm output to be comparable with hf output.""" - output_ids, output_str, out_logprobs = vllm_output - - config = AutoConfig.from_pretrained(model) - image_token_id = config.image_token_index - - tokenizer = AutoTokenizer.from_pretrained(model) - eos_token_id = tokenizer.eos_token_id - - hf_output_ids = [ - token_id for idx, token_id in enumerate(output_ids) - if token_id != image_token_id or output_ids[idx - 1] != image_token_id - ] - - assert output_str[0] == " " - hf_output_str = output_str[1:] - if hf_output_ids[-1] == eos_token_id: - hf_output_str = hf_output_str + tokenizer.decode(eos_token_id) - - return hf_output_ids, hf_output_str, out_logprobs - - -def run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - image_assets: _ImageAssets, - model: str, - *, - size_factors: List[float], - dtype: str, - max_tokens: int, - num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - """Inference result should be the same between hf and vllm. - - All the image fixtures for the test are from IMAGE_ASSETS. - For huggingface runner, we provide the PIL images as input. - For vllm runner, we provide MultiModalDataDict objects - and corresponding vision language config as input. - Note, the text input is also adjusted to abide by vllm contract. - The text output is sanitized to be able to compare with hf. 
-    """
-
-    # vLLM to load from image embeddings
-    vllm_images = [asset.image_embeds for asset in image_assets]
-
-    # transformers to load from PIL images
-    hf_images = [asset.pil_image for asset in image_assets]
-
-    vllm_inputs_per_image = [(
-        [prompt for _ in size_factors],
-        [image for _ in size_factors],
-    ) for image, prompt in zip(vllm_images, HF_IMAGE_PROMPTS)]
-
-    hf_inputs_per_image = [(
-        [prompt for _ in size_factors],
-        [image for _ in size_factors],
-    ) for image, prompt in zip(hf_images, HF_IMAGE_PROMPTS)]
-
-    # NOTE: take care of the order. run vLLM first, and then run HF.
-    # vLLM needs a fresh new process without cuda initialization.
-    # if we run HF first, the cuda initialization will be done and it
-    # will hurt multiprocessing backend with fork method (the default method).
-
-    # max_model_len should be greater than image_feature_size
-    with vllm_runner(model,
-                     dtype=dtype,
-                     tensor_parallel_size=tensor_parallel_size,
-                     distributed_executor_backend=distributed_executor_backend,
-                     enforce_eager=True) as vllm_model:
-        vllm_outputs_per_image = [
-            vllm_model.generate_greedy_logprobs(prompts,
-                                                max_tokens,
-                                                num_logprobs=num_logprobs,
-                                                images=images)
-            for prompts, images in vllm_inputs_per_image
-        ]
-
-    with hf_runner(model, dtype=dtype,
-                   auto_cls=AutoModelForVision2Seq) as hf_model:
-        hf_outputs_per_image = [
-            hf_model.generate_greedy_logprobs_limit(prompts,
-                                                    max_tokens,
-                                                    num_logprobs=num_logprobs,
-                                                    images=images)
-            for prompts, images in hf_inputs_per_image
-        ]
-
-    for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
-                                        vllm_outputs_per_image):
-        # TODO: Check whether using original CLIPVisionModel can improve
-        # consistency against HF
-        check_logprobs_close(
-            outputs_0_lst=hf_outputs,
-            outputs_1_lst=[
-                vllm_to_hf_output(vllm_output, model)
-                for vllm_output in vllm_outputs
-            ],
-            name_0="hf",
-            name_1="vllm",
-        )
-
-
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize(
-    "size_factors",
-    [
-        # No image
-        [],
-        # Single-scale
-        [1.0],
-        # Single-scale, batched
-        [1.0, 1.0, 1.0],
-    ],
-)
-@pytest.mark.parametrize("dtype", ["half"])
-@pytest.mark.parametrize("max_tokens", [128])
-@pytest.mark.parametrize("num_logprobs", [5])
-def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
-                dtype: str, max_tokens: int, num_logprobs: int) -> None:
-    run_test(
-        hf_runner,
-        vllm_runner,
-        image_assets,
-        model,
-        size_factors=size_factors,
-        dtype=dtype,
-        max_tokens=max_tokens,
-        num_logprobs=num_logprobs,
-        tensor_parallel_size=1,
-    )
diff --git a/tests/models/decoder_only/vision_language/test_llava_next.py b/tests/models/decoder_only/vision_language/test_llava_next.py
deleted file mode 100644
index aa9b297c5dd4e..0000000000000
--- a/tests/models/decoder_only/vision_language/test_llava_next.py
+++ /dev/null
@@ -1,347 +0,0 @@
-from typing import List, Optional, Tuple, Type, overload
-
-import pytest
-from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer
-
-from vllm.inputs import InputContext
-from vllm.multimodal.utils import rescale_image_size
-from vllm.sequence import SampleLogprobs
-
-from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
-                          _ImageAssets)
-from ...utils import build_model_context, check_logprobs_close
-
-_LIMIT_IMAGE_PER_PROMPT = 4
-
-HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
-    "stop_sign":
-    "[INST] <image>\nWhat's the content of the image? [/INST]",
-    "cherry_blossom":
-    "[INST] <image>\nWhat is the season? [/INST]",
-})
-
-models = ["llava-hf/llava-v1.6-mistral-7b-hf"]
-
-
-@pytest.fixture()
-def get_max_llava_next_image_tokens():
-    from vllm.model_executor.models.llava_next import (
-        get_max_llava_next_image_tokens)
-    return get_max_llava_next_image_tokens
-
-
-@pytest.fixture()
-def dummy_data_for_llava_next():
-    from vllm.model_executor.models.llava_next import dummy_data_for_llava_next
-    return dummy_data_for_llava_next
-
-
-def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
-                                         Optional[SampleLogprobs]],
-                      model: str):
-    """Sanitize vllm output to be comparable with hf output."""
-    output_ids, output_str, out_logprobs = vllm_output
-
-    config = AutoConfig.from_pretrained(model)
-    image_token_id = config.image_token_index
-
-    tokenizer = AutoTokenizer.from_pretrained(model)
-    eos_token_id = tokenizer.eos_token_id
-
-    hf_output_ids = [
-        token_id for idx, token_id in enumerate(output_ids)
-        if token_id != image_token_id or output_ids[idx - 1] != image_token_id
-    ]
-
-    assert output_str[0] == " "
-    hf_output_str = output_str[1:]
-    if hf_output_ids[-1] == eos_token_id:
-        hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)
-
-    return hf_output_ids, hf_output_str, out_logprobs
-
-
-@overload
-def run_test(
-    hf_runner: Type[HfRunner],
-    vllm_runner: Type[VllmRunner],
-    image_assets: _ImageAssets,
-    model: str,
-    *,
-    size_factors: List[float],
-    dtype: str,
-    max_tokens: int,
-    num_logprobs: int,
-    tensor_parallel_size: int,
-    distributed_executor_backend: Optional[str] = None,
-):
-    ...
-
-
-@overload
-def run_test(
-    hf_runner: Type[HfRunner],
-    vllm_runner: Type[VllmRunner],
-    image_assets: _ImageAssets,
-    model: str,
-    *,
-    sizes: List[Tuple[int, int]],
-    dtype: str,
-    max_tokens: int,
-    num_logprobs: int,
-    tensor_parallel_size: int,
-    distributed_executor_backend: Optional[str] = None,
-):
-    ...
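The vllm_to_hf_output helper in the deleted file above collapses runs of consecutive image placeholder ids so that vLLM's expanded prompt portion can be compared token by token with the HF output. A minimal, self-contained sketch of that collapsing step follows (not part of this patch); the token ids here are made up for illustration, whereas the real tests read the placeholder id from config.image_token_index.

# Hypothetical ids for illustration only; the deleted tests above obtain
# image_token_id from AutoConfig (config.image_token_index).
image_token_id = 32000
output_ids = [1, 32000, 32000, 32000, 450, 4259, 13]

# Keep a token unless it is an image token immediately preceded by another
# image token, i.e. reduce each run of image tokens to a single entry.
hf_output_ids = [
    token_id for idx, token_id in enumerate(output_ids)
    if token_id != image_token_id or output_ids[idx - 1] != image_token_id
]

assert hf_output_ids == [1, 32000, 450, 4259, 13]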
-
-
-def run_test(
-    hf_runner: Type[HfRunner],
-    vllm_runner: Type[VllmRunner],
-    image_assets: _ImageAssets,
-    model: str,
-    *,
-    size_factors: Optional[List[float]] = None,
-    sizes: Optional[List[Tuple[int, int]]] = None,
-    dtype: str,
-    max_tokens: int,
-    num_logprobs: int,
-    tensor_parallel_size: int,
-    distributed_executor_backend: Optional[str] = None,
-):
-    images = [asset.pil_image for asset in image_assets]
-
-    if size_factors is not None:
-        inputs_per_image = [(
-            [prompt for _ in size_factors],
-            [rescale_image_size(image, factor) for factor in size_factors],
-        ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
-    elif sizes is not None:
-        inputs_per_image = [(
-            [prompt for _ in sizes],
-            [image.resize(size) for size in sizes],
-        ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
-    else:
-        raise ValueError("You must provide either `size_factors` or `sizes`")
-
-    _run_test(hf_runner,
-              vllm_runner,
-              inputs_per_image,
-              model,
-              dtype=dtype,
-              max_tokens=max_tokens,
-              num_logprobs=num_logprobs,
-              tensor_parallel_size=tensor_parallel_size,
-              distributed_executor_backend=distributed_executor_backend)
-
-
-def _run_test(
-    hf_runner: Type[HfRunner],
-    vllm_runner: Type[VllmRunner],
-    inputs: List[Tuple[List[str], PromptImageInput]],
-    model: str,
-    dtype: str,
-    max_tokens: int,
-    num_logprobs: int,
-    tensor_parallel_size: int,
-    distributed_executor_backend: Optional[str] = None,
-):
-    # max_model_len should be greater than image_feature_size
-    with vllm_runner(model,
-                     dtype=dtype,
-                     max_model_len=10240,
-                     tensor_parallel_size=tensor_parallel_size,
-                     distributed_executor_backend=distributed_executor_backend,
-                     enforce_eager=True,
-                     limit_mm_per_prompt={"image": _LIMIT_IMAGE_PER_PROMPT
-                                          }) as vllm_model:
-        vllm_outputs_per_image = [
-            vllm_model.generate_greedy_logprobs(prompts,
-                                                max_tokens,
-                                                num_logprobs=num_logprobs,
-                                                images=images)
-            for prompts, images in inputs
-        ]
-
-    with hf_runner(model, dtype=dtype,
-                   auto_cls=AutoModelForVision2Seq) as hf_model:
-        hf_outputs_per_image = [
-            hf_model.generate_greedy_logprobs_limit(prompts,
-                                                    max_tokens,
-                                                    num_logprobs=num_logprobs,
-                                                    images=images)
-            for prompts, images in inputs
-        ]
-
-    for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
-                                        vllm_outputs_per_image):
-        # TODO: Check whether using original CLIPVisionModel can improve
-        # consistency against HF
-        check_logprobs_close(
-            outputs_0_lst=hf_outputs,
-            outputs_1_lst=[
-                vllm_to_hf_output(vllm_output, model)
-                for vllm_output in vllm_outputs
-            ],
-            name_0="hf",
-            name_1="vllm",
-        )
-
-
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize(
-    "size_factors",
-    [
-        # No image
-        [],
-        # Single-scale
-        [1.0],
-        # Single-scale, batched
-        [1.0, 1.0, 1.0],
-        # Multi-scale
-        [0.25, 0.5, 1.0],
-    ],
-)
-@pytest.mark.parametrize("dtype", ["half"])
-@pytest.mark.parametrize("max_tokens", [128])
-@pytest.mark.parametrize("num_logprobs", [5])
-def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
-                dtype, max_tokens, num_logprobs) -> None:
-    """Inference result should be the same between hf and vllm.
-
-    All the image fixtures for the test are from IMAGE_ASSETS.
-    For huggingface runner, we provide the PIL images as input.
-    For vllm runner, we provide MultiModalDataDict objects
-    and corresponding MultiModalConfig as input.
-    Note, the text input is also adjusted to abide by vllm contract.
-    The text output is sanitized to be able to compare with hf.
-    """
-    run_test(
-        hf_runner,
-        vllm_runner,
-        image_assets,
-        model,
-        size_factors=size_factors,
-        dtype=dtype,
-        max_tokens=max_tokens,
-        num_logprobs=num_logprobs,
-        tensor_parallel_size=1,
-    )
-
-
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize(
-    "sizes",
-    [[(1669, 2560), (2560, 1669), (183, 488), (488, 183)]],
-)
-@pytest.mark.parametrize("dtype", ["half"])
-@pytest.mark.parametrize("max_tokens", [128])
-@pytest.mark.parametrize("num_logprobs", [5])
-def test_models_fixed_sizes(hf_runner, vllm_runner, image_assets, model, sizes,
-                            dtype, max_tokens, num_logprobs) -> None:
-    run_test(
-        hf_runner,
-        vllm_runner,
-        image_assets,
-        model,
-        sizes=sizes,
-        dtype=dtype,
-        max_tokens=max_tokens,
-        num_logprobs=num_logprobs,
-        tensor_parallel_size=1,
-    )
-
-
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize("dtype", ["half"])
-@pytest.mark.parametrize("max_tokens", [128])
-@pytest.mark.parametrize("num_logprobs", [5])
-def test_models_multiple_image_inputs(hf_runner, vllm_runner, image_assets,
-                                      model, dtype, max_tokens,
-                                      num_logprobs) -> None:
-    stop_sign = image_assets[0].pil_image
-    cherry_blossom = image_assets[1].pil_image
-
-    inputs = [(
-        [
-            "[INST] <image><image>\nDescribe 2 images. [/INST]",
-            "[INST] <image><image>\nDescribe 2 images. [/INST]",
-            "[INST] <image><image><image><image>\nDescribe 4 images. [/INST]",
-            "[INST] <image>\nWhat is the season? [/INST]"
-        ],
-        [
-            [stop_sign, cherry_blossom],
-            # Images with different sizes and aspect-ratios
-            [
-                rescale_image_size(stop_sign, 0.1),
-                stop_sign,
-            ],
-            [
-                stop_sign,
-                rescale_image_size(stop_sign, 0.25),
-                cherry_blossom.resize((183, 488)),
-                cherry_blossom.resize((488, 183))
-            ],
-            cherry_blossom,
-        ])]
-
-    _run_test(
-        hf_runner,
-        vllm_runner,
-        inputs,
-        model,
-        dtype=dtype,
-        max_tokens=max_tokens,
-        num_logprobs=num_logprobs,
-        tensor_parallel_size=1,
-    )
-
-
-@pytest.mark.parametrize("gridpoints,expected_max_tokens", [
-    ([[336, 336]], 1176),
-    ([[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]], 2928),
-])
-def test_get_max_llava_next_image_tokens(gridpoints, expected_max_tokens,
-                                         get_max_llava_next_image_tokens):
-    ctx = build_model_context(model_name="llava-hf/llava-v1.6-mistral-7b-hf")
-
-    # Update the config image_grid_pinpoints
-    # and calculate the resulting max tokens
-    ctx.model_config.hf_config.image_grid_pinpoints = gridpoints
-
-    actual_max_tokens = get_max_llava_next_image_tokens(
-        InputContext(ctx.model_config))
-
-    assert expected_max_tokens == actual_max_tokens
-
-
-@pytest.mark.parametrize(
-    "gridpoints,expected_size",
-    [
-        # One point; it has to be the largest
-        ([[336, 336]], (336, 336)),
-        # Default for most llava next models; the 2x2 tile is the largest
-        ([[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]],
-         (672, 672)),
-        # If two rectangular gridpoints are the same, the more vertical
-        # one has the higher feature count due to newline features
-        ([[336, 672], [672, 336]], (672, 336))
-    ])
-def test_dummy_data_for_llava_next_feature_size(dummy_data_for_llava_next,
-                                                gridpoints, expected_size):
-    ctx = build_model_context(model_name="llava-hf/llava-v1.6-mistral-7b-hf")
-
-    # Update the config image_grid_pinpoints
-    ctx.model_config.hf_config.image_grid_pinpoints = gridpoints
-    seq_len = 5000  # bigger than the max feature size for any image
-
-    seq_data, mm_data = dummy_data_for_llava_next(
-        ctx,
-        seq_len=seq_len,
-        mm_counts={"image": 1},
-    )
-
-    # The dummy data dims should match the gridpoint with the biggest feat size
-    assert mm_data["image"].height == expected_size[0]
-    assert mm_data["image"].width == expected_size[1]
-    assert len(seq_data.get_token_ids()) >= seq_len
diff --git a/tests/models/decoder_only/vision_language/test_llava_next_video.py b/tests/models/decoder_only/vision_language/test_llava_next_video.py
deleted file mode 100644
index 7b7b23c783e2a..0000000000000
--- a/tests/models/decoder_only/vision_language/test_llava_next_video.py
+++ /dev/null
@@ -1,226 +0,0 @@
-from typing import List, Optional, Tuple, Type, overload
-
-import pytest
-from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer
-
-from vllm.multimodal.utils import (rescale_video_size, resize_video,
-                                   sample_frames_from_video)
-from vllm.sequence import SampleLogprobs
-
-from ....conftest import VIDEO_ASSETS, HfRunner, VllmRunner, _VideoAssets
-from ...utils import check_logprobs_close
-
-_PREFACE = (
-    "A chat between a curious human and an artificial intelligence assistant. "
-    "The assistant gives helpful, detailed, and polite answers to the human's "
-    "questions.")
-
-HF_VIDEO_PROMPTS = VIDEO_ASSETS.prompts({
-    "sample_demo_1":
-    f"{_PREFACE}USER: