Skip to content

Commit

Permalink
[Model] Support Qwen2 embeddings and use tags to select model tests (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
DarkLight1337 authored Nov 15, 2024
1 parent 2885ba0 commit b40cf64
Show file tree
Hide file tree
Showing 19 changed files with 252 additions and 178 deletions.
6 changes: 3 additions & 3 deletions .buildkite/run-cpu-test-ppc64le.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,9 @@ function cpu_tests() {
decord einops librosa peft Pillow sentence-transformers soundfile \
transformers_stream_generator matplotlib datamodel_code_generator
pip install torchvision --index-url https://download.pytorch.org/whl/cpu
pytest -v -s tests/models/embedding/language
pytest -v -s tests/models/encoder_decoder/language
pytest -v -s tests/models/decoder_only/language/test_models.py
pytest -v -s tests/models/decoder_only/language -m cpu_model
pytest -v -s tests/models/embedding/language -m cpu_model
pytest -v -s tests/models/encoder_decoder/language -m cpu_model
pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"

Expand Down
6 changes: 3 additions & 3 deletions .buildkite/run-cpu-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,9 @@ function cpu_tests() {
decord einops librosa peft Pillow sentence-transformers soundfile \
transformers_stream_generator matplotlib datamodel_code_generator
pip install torchvision --index-url https://download.pytorch.org/whl/cpu
pytest -v -s tests/models/embedding/language
pytest -v -s tests/models/encoder_decoder/language
pytest -v -s tests/models/decoder_only/language/test_models.py
pytest -v -s tests/models/decoder_only/language -m cpu_model
pytest -v -s tests/models/embedding/language -m cpu_model
pytest -v -s tests/models/encoder_decoder/language -m cpu_model
pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"

Expand Down
48 changes: 23 additions & 25 deletions .buildkite/test-pipeline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -323,62 +323,60 @@ steps:
- pytest -v -s models/test_registry.py
- pytest -v -s models/test_initialization.py

- label: Decoder-only Language Models Test (Standard) # 18min
- label: Language Models Test (Standard) # 42min
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
- tests/models/decoder_only/language
- tests/models/embedding/language
- tests/models/encoder_decoder/language
commands:
- pytest -v -s models/decoder_only/language -m core_model
- pytest -v -s models/decoder_only/language -m quant_model
- pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
- pytest -v -s models/embedding/language -m core_model
- pytest -v -s models/embedding/vision_language -m core_model

- label: Decoder-only Language Models Test (Extended) # 46min
- label: Language Models Test (Extended) # 50min
nightly: true
source_file_dependencies:
- vllm/
- tests/models/decoder_only/language
- tests/models/embedding/language
- tests/models/encoder_decoder/language
commands:
- pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
- pytest -v -s models/embedding/language -m 'not core_model'
- pytest -v -s models/embedding/vision_language -m 'not core_model'

- label: Decoder-only Multi-Modal Models Test (Standard) # 22min
- label: Multi-Modal Models Test (Standard) # 26min
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
- tests/models/decoder_only/audio_language
- tests/models/decoder_only/vision_language
- tests/models/embedding/vision_language
- tests/models/encoder_decoder/vision_language
commands:
- pytest -v -s models/decoder_only/audio_language -m core_model
- pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m core_model
# No tests under this group for now
# - pytest -v -s models/decoder_only/audio_language -m quant_model
- pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m quant_model
- pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
- pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
- pytest -v -s models/encoder_decoder/language -m core_model
- pytest -v -s models/encoder_decoder/vision_language -m core_model

- label: Decoder-only Multi-Modal Models Test (Extended) # 1h10m
- label: Multi-Modal Models Test (Extended) # 1h15m
nightly: true
source_file_dependencies:
- vllm/
- tests/models/decoder_only/audio_language
- tests/models/decoder_only/vision_language
- tests/models/embedding/vision_language
- tests/models/encoder_decoder/vision_language
commands:
- pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
# HACK - run phi3v tests separately to sidestep this transformers bug
# https://github.com/huggingface/transformers/issues/34307
- pytest -v -s models/decoder_only/vision_language/test_phi3v.py
- pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model'

- label: Other Models Test # 20min
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
- tests/models/embedding/language
- tests/models/embedding/vision_language
- tests/models/encoder_decoder/language
- tests/models/encoder_decoder/vision_language
commands:
- pytest -v -s models/embedding/language
- pytest -v -s models/embedding/vision_language
- pytest -v -s models/encoder_decoder/language
- pytest -v -s models/encoder_decoder/vision_language
- pytest -v -s models/encoder_decoder/language -m 'not core_model'
- pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'

# This test is used only in PR development phase to test individual models and should never run on main
- label: Custom Models Test
Expand Down
13 changes: 9 additions & 4 deletions docs/source/models/supported_models.rst
Original file line number Diff line number Diff line change
Expand Up @@ -330,11 +330,16 @@ Text Embedding
- :code:`BAAI/bge-multilingual-gemma2`, etc.
-
- ✅︎
* - :code:`MistralModel`
- Mistral-based
* - :code:`LlamaModel`, :code:`LlamaForCausalLM`, :code:`MistralModel`, etc.
- Llama-based
- :code:`intfloat/e5-mistral-7b-instruct`, etc.
- ✅︎
- ✅︎
* - :code:`Qwen2Model`, :code:`Qwen2ForCausalLM`
- Qwen2-based
- :code:`ssmits/Qwen2-7B-Instruct-embed-base`, :code:`Alibaba-NLP/gte-Qwen2-1.5B-instruct`, etc.
- ✅︎
- ✅︎

.. important::
Some model architectures support both generation and embedding tasks.
Expand All @@ -355,7 +360,7 @@ Reward Modeling
* - :code:`Qwen2ForRewardModel`
- Qwen2-based
- :code:`Qwen/Qwen2.5-Math-RM-72B`, etc.
-
- ✅︎
- ✅︎

.. note::
Expand All @@ -376,7 +381,7 @@ Classification
* - :code:`Qwen2ForSequenceClassification`
- Qwen2-based
- :code:`jason9693/Qwen2.5-1.5B-apeach`, etc.
-
- ✅︎
- ✅︎

.. note::
Expand Down
18 changes: 4 additions & 14 deletions tests/models/decoder_only/language/test_jamba.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,10 @@ def test_models(

with vllm_runner(model, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
# This test is for verifying whether the model's extra_repr
# can be printed correctly.
print(vllm_model.model.llm_engine.model_executor.driver_worker.
model_runner.model)

for i in range(len(example_prompts)):
hf_output_ids, hf_output_str = hf_outputs[i]
Expand Down Expand Up @@ -293,17 +297,3 @@ def test_jamba_distributed_produces_identical_generation(
name_0="vllm_tp_1",
name_1="vllm_tp_2",
)


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
def test_model_print(
vllm_runner,
model: str,
dtype: str,
) -> None:
with vllm_runner(model, dtype=dtype) as vllm_model:
# This test is for verifying whether the model's extra_repr
# can be printed correctly.
print(vllm_model.model.llm_engine.model_executor.driver_worker.
model_runner.model)
18 changes: 4 additions & 14 deletions tests/models/decoder_only/language/test_mamba.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,10 @@ def test_models(

with vllm_runner(model, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
# This test is for verifying whether the model's extra_repr
# can be printed correctly.
print(vllm_model.model.llm_engine.model_executor.driver_worker.
model_runner.model)

for i in range(len(example_prompts)):
hf_output_ids, hf_output_str = hf_outputs[i]
Expand Down Expand Up @@ -279,17 +283,3 @@ def test_state_cleanup(
except ValueError:
pytest.fail("Mamba inner state wasn't cleaned up between states, "
"could be related to finished_requests_ids")


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
def test_model_print(
vllm_runner,
model: str,
dtype: str,
) -> None:
with vllm_runner(model, dtype=dtype) as vllm_model:
# This test is for verifying whether the model's extra_repr
# can be printed correctly.
print(vllm_model.model.llm_engine.model_executor.driver_worker.
model_runner.model)
71 changes: 43 additions & 28 deletions tests/models/decoder_only/language/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,37 +4,52 @@
"""
import pytest

from vllm.platforms import current_platform

from ...utils import check_logprobs_close

MODELS = [
"facebook/opt-125m", # opt
"openai-community/gpt2", # gpt2
# "Milos/slovak-gpt-j-405M", # gptj
# "bigcode/tiny_starcoder_py", # gpt_bigcode
# "EleutherAI/pythia-70m", # gpt_neox
"bigscience/bloom-560m", # bloom - testing alibi slopes
"microsoft/phi-2", # phi
# "stabilityai/stablelm-3b-4e1t", # stablelm
# "bigcode/starcoder2-3b", # starcoder2
"google/gemma-1.1-2b-it", # gemma
"Qwen/Qwen2.5-0.5B-Instruct", # qwen2
"meta-llama/Llama-3.2-1B-Instruct", # llama
]

if not current_platform.is_cpu():
MODELS += [
# fused_moe is not supported on CPU
"openbmb/MiniCPM3-4B",
]

target_dtype = "half"


@pytest.mark.core_model
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize(
"model",
[
pytest.param(
"bigscience/bloom-560m", # bloom - testing alibi slopes
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
),
pytest.param(
"openai-community/gpt2", # gpt2
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
),
pytest.param("Milos/slovak-gpt-j-405M"), # gptj
pytest.param("bigcode/tiny_starcoder_py"), # gpt_bigcode
pytest.param("EleutherAI/pythia-70m"), # gpt_neox
pytest.param(
"google/gemma-1.1-2b-it", # gemma
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
),
pytest.param(
"meta-llama/Llama-3.2-1B-Instruct", # llama
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
),
pytest.param(
"openbmb/MiniCPM3-4B",
# fused_moe not supported on CPU
marks=[pytest.mark.core_model],
),
pytest.param(
"facebook/opt-125m", # opt
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
),
pytest.param(
"microsoft/phi-2", # phi
marks=[pytest.mark.core_model],
),
pytest.param(
"Qwen/Qwen2.5-0.5B-Instruct", # qwen2
marks=[pytest.mark.core_model],
),
pytest.param("stabilityai/stablelm-3b-4e1t"), # stablelm
pytest.param("bigcode/starcoder2-3b"), # starcoder2
])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(
Expand Down
30 changes: 11 additions & 19 deletions tests/models/embedding/language/test_cls_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,14 @@
import torch
from transformers import AutoModelForSequenceClassification

CLASSIFICATION_MODELS = ["jason9693/Qwen2.5-1.5B-apeach"]


@pytest.mark.parametrize("model", CLASSIFICATION_MODELS)
@pytest.mark.parametrize(
"model",
[
pytest.param("jason9693/Qwen2.5-1.5B-apeach",
marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
],
)
@pytest.mark.parametrize("dtype", ["float"])
def test_classification_models(
hf_runner,
Expand All @@ -23,31 +27,19 @@ def test_classification_models(
) -> None:
with vllm_runner(model, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.classify(example_prompts)
# This test is for verifying whether the model's extra_repr
# can be printed correctly.
print(vllm_model.model.llm_engine.model_executor.driver_worker.
model_runner.model)

with hf_runner(model,
dtype=dtype,
auto_cls=AutoModelForSequenceClassification) as hf_model:
hf_outputs = hf_model.classify(example_prompts)

print(hf_outputs, vllm_outputs)

# check logits difference
for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
hf_output = torch.tensor(hf_output)
vllm_output = torch.tensor(vllm_output)

assert torch.allclose(hf_output, vllm_output, 1e-3)


@pytest.mark.parametrize("model", CLASSIFICATION_MODELS)
@pytest.mark.parametrize("dtype", ["float"])
def test_classification_model_print(
vllm_runner,
model: str,
dtype: str,
) -> None:
with vllm_runner(model, dtype=dtype) as vllm_model:
# This test is for verifying whether the model's extra_repr
# can be printed correctly.
print(vllm_model.model.llm_engine.model_executor.driver_worker.
model_runner.model)
Loading

0 comments on commit b40cf64

Please sign in to comment.