[CI/Build] Split up models tests #10069

Merged · 13 commits · Nov 9, 2024
24 changes: 14 additions & 10 deletions .buildkite/test-pipeline.yaml
@@ -305,7 +305,7 @@ steps:

##### models test #####

-- label: Basic Models Test # 3min
+- label: Basic Models Test # 10min
source_file_dependencies:
- vllm/
- tests/models
@@ -314,23 +314,24 @@
- pytest -v -s models/test_oot_registration.py # it needs a clean process
- pytest -v -s models/*.py --ignore=models/test_oot_registration.py

-- label: Decoder-only Language Models Test (Standard) # 35min
+- label: Decoder-only Language Models Test (Standard) # 18min
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
- tests/models/decoder_only/language
commands:
-- pytest -v -s models/decoder_only/language/test_models.py
+- pytest -v -s models/decoder_only/language -m core_model
+- pytest -v -s models/decoder_only/language -m quant_model

-- label: Decoder-only Language Models Test (Extended) # 1h20min
+- label: Decoder-only Language Models Test (Extended) # 46min
nightly: true
source_file_dependencies:
- vllm/
- tests/models/decoder_only/language
commands:
-- pytest -v -s models/decoder_only/language --ignore=models/decoder_only/language/test_models.py
+- pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'

-- label: Decoder-only Multi-Modal Models Test (Standard) # 26min
+- label: Decoder-only Multi-Modal Models Test (Standard) # 22min
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
@@ -339,21 +340,24 @@
commands:
- pytest -v -s models/decoder_only/audio_language -m core_model
- pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m core_model
+# No tests under this group for now
+# - pytest -v -s models/decoder_only/audio_language -m quant_model
+- pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m quant_model

-- label: Decoder-only Multi-Modal Models Test (Extended)
+- label: Decoder-only Multi-Modal Models Test (Extended) # 1h10m
nightly: true
source_file_dependencies:
- vllm/
- tests/models/decoder_only/audio_language
- tests/models/decoder_only/vision_language
commands:
-- pytest -v -s models/decoder_only/audio_language -m 'not core_model'
+- pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
# HACK - run phi3v tests separately to sidestep this transformers bug
# https://github.com/huggingface/transformers/issues/34307
- pytest -v -s models/decoder_only/vision_language/test_phi3v.py
-- pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model'
+- pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model'

-- label: Other Models Test # 6min
+- label: Other Models Test # 20min
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
1 change: 1 addition & 0 deletions pyproject.toml
@@ -95,6 +95,7 @@ markers = [
"skip_global_cleanup",
"core_model: enable this model test in each PR instead of only nightly",
"cpu_model: enable this model test in CPU tests",
+"quant_model: run this model test under Quantized category",
"distributed_2_gpus: run this test only in distributed tests for 2 GPUs",
"skip_v1: do not run this test with v1",
]
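
The CI split above hinges on these markers. As a rough sketch of the mechanics, assuming only standard pytest marker selection (the test names below are hypothetical, not from this PR):

import pytest

@pytest.mark.core_model
def test_fast_path():
    # selected by the per-PR Standard jobs via `pytest -m core_model`
    ...

@pytest.mark.quant_model
def test_quantized_path():
    # selected by the per-PR Standard jobs via `pytest -m quant_model`
    ...

def test_unmarked():
    # only matched by `pytest -m 'not core_model and not quant_model'`,
    # i.e. the nightly Extended jobs
    ...

Registering `quant_model` under `markers` also keeps pytest from flagging it as an unknown mark.
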
1 change: 1 addition & 0 deletions tests/models/decoder_only/language/test_aqlm.py
@@ -38,6 +38,7 @@
]


+@pytest.mark.quant_model
@pytest.mark.skipif(not is_quant_method_supported("aqlm"),
reason="AQLM is not supported on this GPU type.")
@pytest.mark.parametrize("model", ["ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"])
1 change: 1 addition & 0 deletions tests/models/decoder_only/language/test_fp8.py
@@ -15,6 +15,7 @@
os.environ["TOKENIZERS_PARALLELISM"] = "true"


+@pytest.mark.quant_model
@pytest.mark.skipif(not is_quant_method_supported("fp8"),
reason="fp8 is not supported on this GPU type.")
@pytest.mark.parametrize(
35 changes: 16 additions & 19 deletions tests/models/decoder_only/language/test_gguf.py
@@ -17,26 +17,21 @@

MAX_MODEL_LEN = 1024

-# FIXME: Move this to confest
-MODELS = [
-("meta-llama/Llama-3.2-1B-Instruct",
-hf_hub_download("bartowski/Llama-3.2-1B-Instruct-GGUF",
-filename="Llama-3.2-1B-Instruct-Q4_K_M.gguf")),
-("meta-llama/Llama-3.2-1B-Instruct",
-hf_hub_download("bartowski/Llama-3.2-1B-Instruct-GGUF",
-filename="Llama-3.2-1B-Instruct-IQ4_XS.gguf")),
-("Qwen/Qwen2-1.5B-Instruct",
-hf_hub_download("Qwen/Qwen2-1.5B-Instruct-GGUF",
-filename="qwen2-1_5b-instruct-q4_k_m.gguf")),
-("Qwen/Qwen2-1.5B-Instruct",
-hf_hub_download("legraphista/Qwen2-1.5B-Instruct-IMat-GGUF",
-filename="Qwen2-1.5B-Instruct.IQ4_XS.gguf")),
-]


@pytest.mark.skipif(not is_quant_method_supported("gguf"),
reason="gguf is not supported on this GPU type.")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize(("original_model", "gguf_id", "gguf_path"), [
("meta-llama/Llama-3.2-1B-Instruct",
"bartowski/Llama-3.2-1B-Instruct-GGUF",
"Llama-3.2-1B-Instruct-Q4_K_M.gguf"),
("meta-llama/Llama-3.2-1B-Instruct",
"bartowski/Llama-3.2-1B-Instruct-GGUF",
"Llama-3.2-1B-Instruct-IQ4_XS.gguf"),
("Qwen/Qwen2-1.5B-Instruct", "Qwen/Qwen2-1.5B-Instruct-GGUF",
"qwen2-1_5b-instruct-q4_k_m.gguf"),
("Qwen/Qwen2-1.5B-Instruct", "legraphista/Qwen2-1.5B-Instruct-IMat-GGUF",
"Qwen2-1.5B-Instruct.IQ4_XS.gguf"),
])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
@@ -45,7 +40,9 @@ def test_models(
num_gpus_available,
vllm_runner,
example_prompts,
-model,
+original_model,
+gguf_id,
+gguf_path,
dtype: str,
max_tokens: int,
num_logprobs: int,
@@ -54,7 +51,7 @@
if num_gpus_available < tp_size:
pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")

-original_model, gguf_model = model
+gguf_model = hf_hub_download(gguf_id, filename=gguf_path)

tokenizer = AutoTokenizer.from_pretrained(original_model)
messages = [[{
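
Beyond the marker split, note the refactor here: `hf_hub_download` no longer runs at import time for every entry in a module-level MODELS list. The parameters are now plain strings, and the download happens inside the test body, so deselected or skipped tests never touch the Hub. A minimal sketch of the pattern (illustrative test name; repo and file names taken from the diff):

import pytest
from huggingface_hub import hf_hub_download

@pytest.mark.parametrize(("gguf_id", "gguf_path"), [
    ("bartowski/Llama-3.2-1B-Instruct-GGUF",
     "Llama-3.2-1B-Instruct-Q4_K_M.gguf"),
])
def test_lazy_download(gguf_id: str, gguf_path: str):
    # the file is fetched only if this test is actually selected to run
    local_path = hf_hub_download(gguf_id, filename=gguf_path)
    assert local_path.endswith(".gguf")
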
1 change: 1 addition & 0 deletions tests/models/decoder_only/language/test_gptq_marlin.py
@@ -33,6 +33,7 @@
]


+@pytest.mark.quant_model
@pytest.mark.flaky(reruns=3)
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
reason="gptq_marlin is not supported on this GPU type.")
@@ -38,6 +38,7 @@ class ModelPair:
]


+@pytest.mark.quant_model
@pytest.mark.flaky(reruns=2)
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin_24"),
reason="Marlin24 is not supported on this GPU type.")
3 changes: 2 additions & 1 deletion tests/models/decoder_only/language/test_granite.py
@@ -7,7 +7,9 @@
from ...utils import check_logprobs_close

MODELS = [
+# TODO(sang): Sliding window should be tested separately.
"ibm/PowerLM-3b",
+"ibm/PowerMoE-3b",
]


@@ -24,7 +26,6 @@ def test_models(
max_tokens: int,
num_logprobs: int,
) -> None:
-# TODO(sang): Sliding window should be tested separately.
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs)
39 changes: 0 additions & 39 deletions tests/models/decoder_only/language/test_granitemoe.py

This file was deleted.
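
With test_granitemoe.py deleted, the MoE checkpoint now rides through the same parametrized harness as the dense model in test_granite.py above. A sketch of the resulting shape, assuming the shared HF-vs-vLLM comparison body (elided here):

import pytest

MODELS = [
    "ibm/PowerLM-3b",
    "ibm/PowerMoE-3b",  # previously covered by the deleted test_granitemoe.py
]

@pytest.mark.parametrize("model", MODELS)
def test_models(model: str) -> None:
    ...  # shared greedy-logprob comparison between HF and vLLM
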

1 change: 1 addition & 0 deletions tests/models/decoder_only/language/test_modelopt.py
@@ -39,6 +39,7 @@
@pytest.mark.skip(
reason=
"Prevent unstable test based on golden strings from breaking the build.")
+@pytest.mark.quant_model
@pytest.mark.skipif(not is_quant_method_supported("fp8"),
reason="fp8 is not supported on this GPU type.")
@pytest.mark.parametrize("model_name", MODELS)
4 changes: 1 addition & 3 deletions tests/models/decoder_only/language/test_models.py
@@ -1,8 +1,5 @@
"""Compare the outputs of HF and vLLM when using greedy sampling.

-This test only tests small models. Big models such as 7B should be tested from
-test_big_models.py because it could use a larger instance to run tests.
-
Run `pytest tests/models/test_models.py`.
"""
import pytest
@@ -35,6 +32,7 @@
target_dtype = "half"


+@pytest.mark.core_model
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [32])
@@ -56,11 +56,13 @@ def test_dummy_data_for_llava_next_feature_size(dummy_data_for_llava_next,
ctx.model_config.hf_config.image_grid_pinpoints = gridpoints
seq_len = 5000 # bigger than the max feature size for any image

-seq_data, mm_data = dummy_data_for_llava_next(
+dummy_data = dummy_data_for_llava_next(
ctx,
seq_len=seq_len,
mm_counts={"image": 1},
)
+seq_data = dummy_data.seq_data
+mm_data = dummy_data.multi_modal_data

# The dummy data dims should match the gridpoint with the biggest feat size
assert mm_data["image"].height == expected_size[0]
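
This and the next two files adapt to the same interface change: the dummy-data factories now return a single object rather than a `(seq_data, mm_data)` tuple. A rough stand-in for the container, assuming a simple named tuple (the real vLLM type may carry more fields):

from typing import Any, NamedTuple, Optional

class DummyData(NamedTuple):
    # assumed shape; mirrors the two attributes the tests read
    seq_data: Any
    multi_modal_data: Optional[dict] = None

def consume(dummy_data: DummyData) -> None:
    seq_data = dummy_data.seq_data  # was: seq_data, mm_data = factory(...)
    mm_data = dummy_data.multi_modal_data
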
@@ -131,12 +131,13 @@ def test_dummy_data_override(dummy_data_for_phi3v, model: str, num_crops: int,
mm_processor_kwargs=None,
)

-sequence_data, _, = dummy_data_for_phi3v(
+dummy_data = dummy_data_for_phi3v(
ctx=ctx,
seq_len=8192, # Should be bigger than num_imgs * toks_per_img
mm_counts={"image": num_imgs},
num_crops=num_crops,
)
+sequence_data = dummy_data.seq_data
# Ensure we have the right number of placeholders per num_crops size
img_tok_count = sequence_data.get_token_ids().count(_IMAGE_TOKEN_ID)
assert img_tok_count == toks_per_img * num_imgs
@@ -86,10 +86,17 @@ def test_qwen2_vl_dummy_data(dummy_data_for_qwen2_vl,

# NOTE: video value is required, but isn't actually used
# when making the dummy data except for error handling currently
-seq_data, mm_data = dummy_data_for_qwen2_vl(qwen2_vl_context, seq_len, {
-"image": 1,
-"video": 0
-}, **mm_processor_kwargs)
+dummy_data = dummy_data_for_qwen2_vl(
+ctx=qwen2_vl_context,
+seq_len=seq_len,
+mm_counts={
+"image": 1,
+"video": 0
+},
+**mm_processor_kwargs,
+)
+seq_data = dummy_data.seq_data
+mm_data = dummy_data.multi_modal_data

# Ensure we have the right number of placeholders for min/max pixel values
assert seq_data.get_token_ids().count(image_token_id) == token_count
@@ -1,4 +1,4 @@
-from typing import List, Optional, Tuple, Type
+from typing import List, Optional, Type

import pytest
import torch
@@ -19,7 +19,8 @@
def run_awq_test(
vllm_runner: Type[VllmRunner],
image_assets: _ImageAssets,
-models: Tuple[str, str],
+source_model: str,
+quant_model: str,
*,
size_factors: List[float],
dtype: str,
@@ -28,8 +29,6 @@ def run_awq_test(
tensor_parallel_size: int,
distributed_executor_backend: Optional[str] = None,
):
-source_model, quant_model = models
-
images = [asset.pil_image for asset in image_assets]

inputs_per_image = [(
@@ -84,8 +83,11 @@ def run_awq_test(
)


+@pytest.mark.quant_model
@pytest.mark.parametrize(
-"models", [("OpenGVLab/InternVL2-2B", "OpenGVLab/InternVL2-2B-AWQ")])
+("source_model", "quant_model"),
+[("OpenGVLab/InternVL2-2B", "OpenGVLab/InternVL2-2B-AWQ")],
+)
@pytest.mark.parametrize(
"size_factors",
[
@@ -103,12 +105,13 @@ def run_awq_test(
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@torch.inference_mode()
-def test_awq_models(vllm_runner, image_assets, models, size_factors,
-dtype: str, max_tokens: int, num_logprobs: int) -> None:
+def test_awq_models(vllm_runner, image_assets, source_model, quant_model,
+size_factors, dtype, max_tokens, num_logprobs) -> None:
run_awq_test(
vllm_runner,
image_assets,
-models,
+source_model,
+quant_model,
size_factors=size_factors,
dtype=dtype,
max_tokens=max_tokens,
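
Flattening the old `models` tuple into separate `source_model` and `quant_model` parameters drops the unpacking step inside `run_awq_test` and names each value in the generated test id. A small sketch of the resulting call shape (illustrative test body):

import pytest

@pytest.mark.parametrize(
    ("source_model", "quant_model"),
    [("OpenGVLab/InternVL2-2B", "OpenGVLab/InternVL2-2B-AWQ")],
)
def test_awq_pairing(source_model: str, quant_model: str):
    # no `source_model, quant_model = models` unpacking needed
    assert quant_model.startswith(source_model)
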