From 74fc2d77aec13304550bb52b459bd8c6da756d39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=A7=91=E8=8B=B1?= Date: Wed, 30 Oct 2024 01:32:56 +0800 Subject: [PATCH 01/88] [Misc] Add metrics for request queue time, forward time, and execute time (#9659) --- vllm/config.py | 7 ----- vllm/engine/llm_engine.py | 15 +++++++++ vllm/engine/metrics.py | 60 +++++++++++++++++++++++++++++++----- vllm/engine/metrics_types.py | 3 ++ 4 files changed, 70 insertions(+), 15 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 99a82c8f1b40b..3814e41aeb92d 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1892,13 +1892,6 @@ def __post_init__(self): "'otlp_traces_endpoint'. Ensure OpenTelemetry packages are " f"installed. Original error:\n{otel_import_error_traceback}") - if ((self.collect_model_forward_time - or self.collect_model_execute_time) - and self.otlp_traces_endpoint is None): - raise ValueError( - "collect_model_forward_time or collect_model_execute_time " - "requires --otlp-traces-endpoint to be set.") - @dataclass(frozen=True) class EngineConfig: diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index ede77f04b1db9..60575210c9386 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1645,6 +1645,9 @@ def _get_stats(self, # Request stats # Latency time_e2e_requests: List[float] = [] + time_in_queue_requests: List[float] = [] + model_forward_time_requests: List[float] = [] + model_execute_time_requests: List[float] = [] # Metadata num_prompt_tokens_requests: List[int] = [] num_generation_tokens_requests: List[int] = [] @@ -1738,6 +1741,15 @@ def _get_stats(self, # Latency timings time_e2e_requests.append(now - seq_group.metrics.arrival_time) + if seq_group.metrics.time_in_queue is not None: + time_in_queue_requests.append( + seq_group.metrics.time_in_queue) + if seq_group.metrics.model_forward_time is not None: + model_forward_time_requests.append( + seq_group.metrics.model_forward_time) + if seq_group.metrics.model_execute_time is not None: + model_execute_time_requests.append( + seq_group.metrics.model_execute_time * 1000) # Metadata num_prompt_tokens_requests.append( len(seq_group.prompt_token_ids)) @@ -1795,6 +1807,9 @@ def _get_stats(self, # Request stats # Latency time_e2e_requests=time_e2e_requests, + time_in_queue_requests=time_in_queue_requests, + model_forward_time_requests=model_forward_time_requests, + model_execute_time_requests=model_execute_time_requests, # Metadata num_prompt_tokens_requests=num_prompt_tokens_requests, num_generation_tokens_requests=num_generation_tokens_requests, diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index a46625eff1e4a..0f5615ff14db1 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -133,7 +133,31 @@ def __init__(self, labelnames: List[str], max_model_len: int): name="vllm:e2e_request_latency_seconds", documentation="Histogram of end to end request latency in seconds.", labelnames=labelnames, - buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0]) + buckets=[ + 0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, + 40.0, 50.0, 60.0 + ]) + self.histogram_time_in_queue_request = self._histogram_cls( + name="vllm:time_in_queue_requests", + documentation= + "Histogram of time the request spent in the queue in seconds.", + labelnames=labelnames, + buckets=[ + 0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, + 40.0, 50.0, 60.0 + ]) + self.histogram_model_forward_time_request = self._histogram_cls( + 
name="vllm:model_forward_time_milliseconds", + documentation= + "Histogram of time spent in the model forward pass in ms.", + labelnames=labelnames, + buckets=build_1_2_3_5_8_buckets(3000)) + self.histogram_model_execute_time_request = self._histogram_cls( + name="vllm:model_execute_time_milliseconds", + documentation= + "Histogram of time spent in the model execute function in ms.", + labelnames=labelnames, + buckets=build_1_2_3_5_8_buckets(3000)) # Metadata self.histogram_num_prompt_tokens_request = self._histogram_cls( name="vllm:request_prompt_tokens", @@ -299,16 +323,12 @@ def _unregister_vllm_metrics(self) -> None: pass -def build_1_2_5_buckets(max_value: int) -> List[int]: +def build_buckets(mantissa_lst: List[int], max_value: int) -> List[int]: """ - Builds a list of buckets with increasing powers of 10 multiplied by - mantissa values (1, 2, 5) until the value exceeds the specified maximum. + Builds a list of buckets with increasing powers of 10 multiplied by + mantissa values until the value exceeds the specified maximum. - Example: - >>> build_1_2_5_buckets(100) - [1, 2, 5, 10, 20, 50, 100] """ - mantissa_lst = [1, 2, 5] exponent = 0 buckets: List[int] = [] while True: @@ -321,6 +341,24 @@ def build_1_2_5_buckets(max_value: int) -> List[int]: exponent += 1 +def build_1_2_5_buckets(max_value: int) -> List[int]: + """ + Example: + >>> build_1_2_5_buckets(100) + [1, 2, 5, 10, 20, 50, 100] + """ + return build_buckets([1, 2, 5], max_value) + + +def build_1_2_3_5_8_buckets(max_value: int) -> List[int]: + """ + Example: + >>> build_1_2_3_5_8_buckets(100) + [1, 2, 3, 5, 8, 10, 20, 30, 50, 80, 100] + """ + return build_buckets([1, 2, 3, 5, 8], max_value) + + def local_interval_elapsed(now: float, last_log: float, local_interval: float) -> bool: elapsed_time = now - last_log @@ -486,6 +524,12 @@ def _log_prometheus(self, stats: Stats) -> None: # Latency self._log_histogram(self.metrics.histogram_e2e_time_request, stats.time_e2e_requests) + self._log_histogram(self.metrics.histogram_time_in_queue_request, + stats.time_in_queue_requests) + self._log_histogram(self.metrics.histogram_model_forward_time_request, + stats.model_forward_time_requests) + self._log_histogram(self.metrics.histogram_model_execute_time_request, + stats.model_execute_time_requests) # Metadata finished_reason_counter = CollectionsCounter( stats.finished_reason_requests) diff --git a/vllm/engine/metrics_types.py b/vllm/engine/metrics_types.py index e9a5bd3b586be..510dd04bb3e55 100644 --- a/vllm/engine/metrics_types.py +++ b/vllm/engine/metrics_types.py @@ -46,6 +46,9 @@ class Stats: # Request stats (should have _requests suffix) # Latency time_e2e_requests: List[float] + time_in_queue_requests: List[float] + model_forward_time_requests: List[float] + model_execute_time_requests: List[float] # Metadata num_prompt_tokens_requests: List[int] num_generation_tokens_requests: List[int] From 08600ddc685558d8504eb94bbbf382230f6de386 Mon Sep 17 00:00:00 2001 From: tastelikefeet <58414341+tastelikefeet@users.noreply.github.com> Date: Wed, 30 Oct 2024 01:36:59 +0800 Subject: [PATCH 02/88] Fix the log to correct guide user to install modelscope (#9793) Signed-off-by: yuze.zyz --- vllm/transformers_utils/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/transformers_utils/__init__.py b/vllm/transformers_utils/__init__.py index 74ca396276c3f..eeec029fc051a 100644 --- a/vllm/transformers_utils/__init__.py +++ b/vllm/transformers_utils/__init__.py @@ -9,7 +9,7 @@ if version.parse(modelscope.__version__) 
<= version.parse('1.18.0'): raise ImportError( 'Using vLLM with ModelScope needs modelscope>=1.18.1, please ' - 'install by `pip install modelscope>=1.18.1`') + 'install by `pip install modelscope -U`') from modelscope.utils.hf_util import patch_hub From 0f43387157010bf84da05c68fc5ff366b3252f01 Mon Sep 17 00:00:00 2001 From: Sven Seeberg Date: Tue, 29 Oct 2024 18:37:59 +0100 Subject: [PATCH 03/88] [Bugfix] Use host argument to bind to interface (#9798) --- vllm/entrypoints/openai/api_server.py | 2 +- vllm/entrypoints/openai/cli_args.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index ae44b26a6c55a..afa370a1cb40b 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -541,7 +541,7 @@ async def run_server(args, **uvicorn_kwargs) -> None: # This avoids race conditions with ray. # see https://github.com/vllm-project/vllm/issues/8204 sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - sock.bind(("", args.port)) + sock.bind((args.host, args.port)) def signal_handler(*_) -> None: # Interrupt server on sigterm while initializing diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index a089985ac9758..f4dd9df9587ce 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -77,7 +77,7 @@ def __call__( def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: parser.add_argument("--host", type=nullable_str, - default=None, + default="0.0.0.0", help="host name") parser.add_argument("--port", type=int, default=8000, help="port number") parser.add_argument( From 0ce7798f44c586e11c65d59725724eb805086e93 Mon Sep 17 00:00:00 2001 From: yannicks1 <43552841+yannicks1@users.noreply.github.com> Date: Tue, 29 Oct 2024 18:39:20 +0100 Subject: [PATCH 04/88] [Misc]: Typo fix: Renaming classes (casualLM -> causalLM) (#9801) Signed-off-by: Yannick Schnider --- vllm/model_executor/model_loader/neuron.py | 4 ++-- vllm/model_executor/model_loader/openvino.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/model_loader/neuron.py b/vllm/model_executor/model_loader/neuron.py index a9f1e6e88d792..a90fbd648def9 100644 --- a/vllm/model_executor/model_loader/neuron.py +++ b/vllm/model_executor/model_loader/neuron.py @@ -37,7 +37,7 @@ } -class NeuronCasualLM(nn.Module): +class NeuronCausalLM(nn.Module): def __init__(self, config: PretrainedConfig, @@ -184,7 +184,7 @@ def get_neuron_model(model_config: ModelConfig, scheduler_config: SchedulerConfig) -> nn.Module: # Create a model instance. - model = NeuronCasualLM( + model = NeuronCausalLM( model_config.hf_config, _is_neuron_on_device_sampling_disabled(model_config)) diff --git a/vllm/model_executor/model_loader/openvino.py b/vllm/model_executor/model_loader/openvino.py index 8ada2210d0d51..573f2a04895d9 100644 --- a/vllm/model_executor/model_loader/openvino.py +++ b/vllm/model_executor/model_loader/openvino.py @@ -95,7 +95,7 @@ def _require_model_export(model_id, revision=None, subfolder=None): return True -class OpenVINOCasualLM(nn.Module): +class OpenVINOCausalLM(nn.Module): def __init__( self, @@ -199,5 +199,5 @@ def get_model( "be added in the future. 
If this is important to you, " "please open an issue on github.") - return OpenVINOCasualLM(ov_core, model_config, device_config, + return OpenVINOCausalLM(ov_core, model_config, device_config, kv_cache_dtype) From ac3d748dba446b9a8417fe3005345c12989d8de0 Mon Sep 17 00:00:00 2001 From: Junichi Sato Date: Wed, 30 Oct 2024 02:40:35 +0900 Subject: [PATCH 05/88] [Model] Add LlamaEmbeddingModel as an embedding Implementation of LlamaModel (#9806) --- vllm/model_executor/models/registry.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 32b9341ae0b93..30dfff31f7e48 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -95,6 +95,7 @@ # [Text-only] "BertModel": ("bert", "BertEmbeddingModel"), "Gemma2Model": ("gemma2", "Gemma2EmbeddingModel"), + "LlamaModel": ("llama", "LlamaEmbeddingModel"), "MistralModel": ("llama", "LlamaEmbeddingModel"), "Qwen2ForRewardModel": ("qwen2_rm", "Qwen2ForRewardModel"), "Qwen2ForSequenceClassification": ( From ab6f981671c4e5035575f5e5ef6172f4df52e121 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 29 Oct 2024 14:12:43 -0400 Subject: [PATCH 06/88] [CI][Bugfix] Skip chameleon for transformers 4.46.1 (#9808) --- tests/models/decoder_only/vision_language/test_broadcast.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/decoder_only/vision_language/test_broadcast.py b/tests/models/decoder_only/vision_language/test_broadcast.py index fd7af4a8b0b29..38c4a95de16f4 100644 --- a/tests/models/decoder_only/vision_language/test_broadcast.py +++ b/tests/models/decoder_only/vision_language/test_broadcast.py @@ -24,7 +24,7 @@ def test_models(hf_runner, vllm_runner, image_assets, elif model.startswith("llava-hf/llava-v1.6"): from .test_llava_next import models, run_test # type: ignore[no-redef] elif model.startswith("facebook/chameleon"): - if transformers.__version__.startswith("4.46.0"): + if transformers.__version__.startswith("4.46"): pytest.skip("Model broken in HF, " "see huggingface/transformers#34379") from .test_chameleon import models, run_test # type: ignore[no-redef] From 7585ec996f7ec88735627cb2ab13949226f9bfce Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Tue, 29 Oct 2024 15:24:42 -0400 Subject: [PATCH 07/88] [CI/Build] mergify: fix rules for ci/build label (#9804) Signed-off-by: Russell Bryant --- .github/mergify.yml | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/.github/mergify.yml b/.github/mergify.yml index 2a3dee7c662d1..1ce5039a061b2 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -13,13 +13,14 @@ pull_request_rules: - name: label-ci-build description: Automatically apply ci/build label conditions: - - files~=^\.github/ - - files~=\.buildkite/ - - files~=^cmake/ - - files=CMakeLists.txt - - files~=^Dockerfile - - files~=^requirements.*\.txt - - files=setup.py + - or: + - files~=^\.github/ + - files~=\.buildkite/ + - files~=^cmake/ + - files=CMakeLists.txt + - files~=^Dockerfile + - files~=^requirements.*\.txt + - files=setup.py actions: label: add: From 0ad216f5750742115c686723bf38698372d483fd Mon Sep 17 00:00:00 2001 From: Kunjan Date: Tue, 29 Oct 2024 12:52:19 -0700 Subject: [PATCH 08/88] [MISC] Set label value to timestamp over 0, to keep track of recent history (#9777) Signed-off-by: Kunjan Patel --- vllm/engine/metrics.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/vllm/engine/metrics.py 
b/vllm/engine/metrics.py index 0f5615ff14db1..9ed30e1e99857 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -1,3 +1,4 @@ +import time from typing import TYPE_CHECKING from typing import Counter as CollectionsCounter from typing import Dict, List, Optional, Type, Union, cast @@ -253,6 +254,10 @@ def labels(self, **labels): def set(self, value: Union[int, float]): return self._gauge.set(value) + def set_to_current_time(self): + # ray metrics doesn't have set_to_current time, https://docs.ray.io/en/latest/_modules/ray/util/metrics.html + return self._gauge.set(time.time()) + class _RayCounterWrapper: """Wraps around ray.util.metrics.Counter to provide same API as @@ -479,7 +484,7 @@ def _log_histogram(self, histogram, data: Union[List[int], histogram.labels(**self.labels).observe(datum) def _log_gauge_string(self, gauge, data: Dict[str, str]) -> None: - gauge.labels(**data).set(1) + gauge.labels(**data).set_to_current_time() def _log_prometheus(self, stats: Stats) -> None: # System state data From 67bdf8e523e4020a559b6d74981936c8156243f9 Mon Sep 17 00:00:00 2001 From: Joe Runde Date: Tue, 29 Oct 2024 16:13:20 -0500 Subject: [PATCH 09/88] [Bugfix][Frontend] Guard against bad token ids (#9634) Signed-off-by: Joe Runde --- .../entrypoints/llm/test_prompt_validation.py | 8 +++- tests/entrypoints/openai/test_completion.py | 18 ++++----- .../openai/test_prompt_validation.py | 15 +++++++ vllm/engine/async_llm_engine.py | 15 +++++-- vllm/engine/llm_engine.py | 40 +++++++++++++++++-- vllm/transformers_utils/tokenizer.py | 5 +++ vllm/transformers_utils/tokenizers/mistral.py | 5 +++ 7 files changed, 89 insertions(+), 17 deletions(-) diff --git a/tests/entrypoints/llm/test_prompt_validation.py b/tests/entrypoints/llm/test_prompt_validation.py index 565dfa01346cc..675a980ab3f3f 100644 --- a/tests/entrypoints/llm/test_prompt_validation.py +++ b/tests/entrypoints/llm/test_prompt_validation.py @@ -4,6 +4,12 @@ def test_empty_prompt(): - llm = LLM(model="gpt2") + llm = LLM(model="gpt2", enforce_eager=True) with pytest.raises(ValueError, match='Prompt cannot be empty'): llm.generate([""]) + + +def test_out_of_vocab_token(): + llm = LLM(model="gpt2", enforce_eager=True) + with pytest.raises(ValueError, match='out of vocabulary'): + llm.generate({"prompt_token_ids": [999999]}) diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py index f03bdb045f640..c81cfdbbe5cff 100644 --- a/tests/entrypoints/openai/test_completion.py +++ b/tests/entrypoints/openai/test_completion.py @@ -157,15 +157,15 @@ async def test_added_lora_tokens(client: openai.AsyncOpenAI): @pytest.mark.asyncio async def test_added_lora_tokens_base_model(client: openai.AsyncOpenAI): # test using token IDs - completion = await client.completions.create( - model=MODEL_NAME, - prompt=[0, 0, 32000, 32001, 32002], - echo=True, - max_tokens=5, - temperature=0.0, - ) - # Added tokens should not appear in tokenized prompt - assert "vllm" not in completion.choices[0].text + with pytest.raises(openai.BadRequestError, match="out of vocabulary"): + # Added tokens should be rejected by the base model + await client.completions.create( + model=MODEL_NAME, + prompt=[0, 0, 32000, 32001, 32002], + echo=True, + max_tokens=5, + temperature=0.0, + ) @pytest.mark.asyncio diff --git a/tests/entrypoints/openai/test_prompt_validation.py b/tests/entrypoints/openai/test_prompt_validation.py index 0a573a0066d32..58075f7023821 100644 --- a/tests/entrypoints/openai/test_prompt_validation.py +++ 
b/tests/entrypoints/openai/test_prompt_validation.py @@ -20,3 +20,18 @@ async def test_empty_prompt(): prompt="", max_tokens=5, temperature=0.0) + + +@pytest.mark.asyncio +async def test_out_of_vocab_token_ids(): + model_name = "gpt2" + server_args = ["--enforce-eager"] + with RemoteOpenAIServer(model_name, server_args) as remote_server: + client = remote_server.get_async_client() + + with pytest.raises(openai.BadRequestError, + match=re.compile('.*out of vocabulary.*')): + await client.completions.create(model=model_name, + prompt=[999999], + max_tokens=5, + temperature=0.0) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index e9848a14cbe17..5198467a6ac40 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -412,6 +412,12 @@ async def stop_remote_worker_execution_loop_async(self) -> None: """Stop the remote worker execution loop.""" await self.model_executor.stop_remote_worker_execution_loop_async() + async def get_tokenizer_async(self, + lora_request: Optional[LoRARequest] = None + ) -> AnyTokenizer: + return await ( + self.get_tokenizer_group().get_lora_tokenizer_async(lora_request)) + @overload # DEPRECATED async def add_request_async( self, @@ -472,6 +478,10 @@ async def add_request_async( if arrival_time is None: arrival_time = time.time() + if self.tokenizer is not None: + tokenizer = await self.get_tokenizer_async(lora_request) + self._validate_token_prompt(prompt, tokenizer=tokenizer) + preprocessed_inputs = await self.input_preprocessor.preprocess_async( prompt, request_id=request_id, @@ -488,7 +498,7 @@ async def add_request_async( # implementation in the LLMEngine params = await build_guided_decoding_logits_processor_async( sampling_params=params, - tokenizer=self.get_tokenizer(lora_request), + tokenizer=await self.get_tokenizer_async(lora_request), default_guided_backend=self.decoding_config. guided_decoding_backend) @@ -715,8 +725,7 @@ async def get_tokenizer( self, lora_request: Optional[LoRARequest] = None, ) -> AnyTokenizer: - return await (self.engine.get_tokenizer_group(). - get_lora_tokenizer_async(lora_request)) + return await self.engine.get_tokenizer_async(lora_request) def start_background_loop(self) -> None: """Start the background loop.""" diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 60575210c9386..fde768ed5165e 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -10,7 +10,7 @@ from typing import Set, Type, Union, cast, overload import torch -from typing_extensions import TypeVar +from typing_extensions import TypeIs, TypeVar import vllm.envs as envs from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig, @@ -32,7 +32,8 @@ from vllm.executor.gpu_executor import GPUExecutor from vllm.executor.ray_utils import initialize_ray_cluster from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, - EncoderDecoderInputs, InputRegistry, PromptType) + EncoderDecoderInputs, InputRegistry, PromptType, + TokensPrompt) from vllm.inputs.preprocess import InputPreprocessor from vllm.logger import init_logger from vllm.logits_process import get_bad_words_logits_processors @@ -667,7 +668,7 @@ def _add_processed_request( ) return None - self._validate_model_inputs(processed_inputs) + self._validate_model_inputs(processed_inputs, lora_request) # Create the sequences. 
block_size = self.cache_config.block_size seq_id = next(self.seq_counter) @@ -829,6 +830,11 @@ def add_request( if arrival_time is None: arrival_time = time.time() + if self.tokenizer is not None: + self._validate_token_prompt( + prompt, + tokenizer=self.get_tokenizer(lora_request=lora_request)) + preprocessed_inputs = self.input_preprocessor.preprocess( prompt, request_id=request_id, @@ -855,6 +861,31 @@ def add_request( priority=priority, ) + def _validate_token_prompt(self, prompt: PromptType, + tokenizer: AnyTokenizer): + # Guard against out-of-vocab tokens. + # For some tokenizers, tokenizer.decode will happily return empty text + # for token ids that are out of vocab, and we don't detect token ids + # that are greater than the max token id before running the model. + # However, these token ids will later crash a cuda kernel at runtime + # with an index out of bounds error. This will crash the entire engine. + # This needs to happen before multimodal input pre-processing, which + # may add dummy tokens that aren't part of the tokenizer's + # vocabulary. + if self._is_token_prompt(prompt): + prompt_ids = prompt["prompt_token_ids"] + if len(prompt_ids) == 0: + # Empty prompt check is handled later + return + max_input_id = max(prompt_ids) + if max_input_id > tokenizer.max_token_id: + raise ValueError( + "Token id {} is out of vocabulary".format(max_input_id)) + + @staticmethod + def _is_token_prompt(prompt: PromptType) -> TypeIs[TokensPrompt]: + return isinstance(prompt, dict) and "prompt_token_ids" in prompt + def _create_sequence_group_with_sampling( self, request_id: str, @@ -1942,7 +1973,8 @@ def is_encoder_decoder_model(self): return self.input_preprocessor.is_encoder_decoder_model() def _validate_model_inputs(self, inputs: Union[DecoderOnlyInputs, - EncoderDecoderInputs]): + EncoderDecoderInputs], + lora_request: Optional[LoRARequest]): if self.model_config.is_multimodal_model: # For encoder-decoder multimodal models, the max_prompt_len # restricts the decoder prompt length diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py index 94af2388d79db..54f9f895fe541 100644 --- a/vllm/transformers_utils/tokenizer.py +++ b/vllm/transformers_utils/tokenizer.py @@ -35,6 +35,7 @@ def get_cached_tokenizer(tokenizer: AnyTokenizer) -> AnyTokenizer: tokenizer.all_special_tokens_extended) tokenizer_all_special_tokens = set(tokenizer.all_special_tokens) tokenizer_len = len(tokenizer) + max_token_id = max(tokenizer.get_vocab().values()) class CachedTokenizer(tokenizer.__class__): # type: ignore @@ -50,6 +51,10 @@ def all_special_tokens(self): def all_special_tokens_extended(self): return tokenizer_all_special_tokens_extended + @property + def max_token_id(self): + return max_token_id + def __len__(self): return tokenizer_len diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py index 23ea657ffb0a9..80e21c2d32ecc 100644 --- a/vllm/transformers_utils/tokenizers/mistral.py +++ b/vllm/transformers_utils/tokenizers/mistral.py @@ -85,6 +85,7 @@ def __init__(self, tokenizer: PublicMistralTokenizer) -> None: raise TypeError(f"Unsupported tokenizer: {type(tokenizer_)}") self.tokenizer = tokenizer_ + self._max_token_id = max(self._vocab.values()) @classmethod def from_pretrained(cls, @@ -158,6 +159,10 @@ def is_fast(self) -> bool: def vocab_size(self) -> int: return len(self._vocab) + @property + def max_token_id(self) -> int: + return self._max_token_id + def __len__(self) -> int: return self.vocab_size From 
882a1ad0deb9fd26283db611e78e122ac19fb72f Mon Sep 17 00:00:00 2001 From: Will Eaton Date: Tue, 29 Oct 2024 18:07:37 -0400 Subject: [PATCH 10/88] [Model] tool calling support for ibm-granite/granite-20b-functioncalling (#8339) Signed-off-by: Max de Bayser Co-authored-by: Max de Bayser Co-authored-by: Maximilien de Bayser --- .../serving/openai_compatible_server.md | 21 +- .../tool_chat_template_granite_20b_fc.jinja | 130 +++++++++ tests/tool_use/utils.py | 12 + .../openai/tool_parsers/__init__.py | 7 +- .../granite_20b_fc_tool_parser.py | 251 ++++++++++++++++++ .../openai/tool_parsers/llama_tool_parser.py | 27 +- vllm/entrypoints/openai/tool_parsers/utils.py | 36 ++- 7 files changed, 456 insertions(+), 28 deletions(-) create mode 100644 examples/tool_chat_template_granite_20b_fc.jinja create mode 100644 vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index 413c87ab28755..a1f93a9a28578 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -185,7 +185,9 @@ from HuggingFace; and you can find an example of this in a `tokenizer_config.jso If your favorite tool-calling model is not supported, please feel free to contribute a parser & tool use chat template! + #### Hermes Models (`hermes`) + All Nous Research Hermes-series models newer than Hermes 2 Pro should be supported. * `NousResearch/Hermes-2-Pro-*` * `NousResearch/Hermes-2-Theta-*` @@ -197,7 +199,9 @@ step in their creation_. Flags: `--tool-call-parser hermes` + #### Mistral Models (`mistral`) + Supported models: * `mistralai/Mistral-7B-Instruct-v0.3` (confirmed) * Additional mistral function-calling models are compatible as well. @@ -216,7 +220,9 @@ when tools are provided, that results in much better reliability when working wi Recommended flags: `--tool-call-parser mistral --chat-template examples/tool_chat_template_mistral_parallel.jinja` + #### Llama Models (`llama3_json`) + Supported models: * `meta-llama/Meta-Llama-3.1-8B-Instruct` * `meta-llama/Meta-Llama-3.1-70B-Instruct` @@ -236,7 +242,9 @@ it works better with vLLM. Recommended flags: `--tool-call-parser llama3_json --chat-template examples/tool_chat_template_llama3_json.jinja` + #### InternLM Models (`internlm`) + Supported models: * `internlm/internlm2_5-7b-chat` (confirmed) * Additional internlm2.5 function-calling models are compatible as well @@ -246,6 +254,7 @@ Known issues: Recommended flags: `--tool-call-parser internlm --chat-template examples/tool_chat_template_internlm2_tool.jinja` + #### Jamba Models (`jamba`) AI21's Jamba-1.5 models are supported. * `ai21labs/AI21-Jamba-1.5-Mini` @@ -255,6 +264,16 @@ AI21's Jamba-1.5 models are supported. Flags: `--tool-call-parser jamba` +#### IBM Granite (`granite-20b-fc`) + +Supported models: +* `ibm-granite/granite-20b-functioncalling` + +Flags: `--tool-call-parser granite-20b-fc --chat-template examples/tool_chat_template_granite_20b_fc.jinja` + +The example chat template deviates slightly from the original on Huggingface, which is not vLLM compatible. It blends function description elements from the Hermes template and follows the same system prompt as "Response Generation" mode from [the paper](https://arxiv.org/abs/2407.00121). Parallel function calls are supported. + + ### How to write a tool parser plugin A tool parser plugin is a Python file containing one or more ToolParser implementations. 
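As a rough illustration (not part of the original patch), a minimal plugin could follow the same shape as the parsers added in this series: subclass `ToolParser`, register it with `ToolParserManager`, and return an `ExtractedToolCallInformation` from `extract_tool_calls`. The `<TOOL>` marker and the single-JSON-object output format below are made-up assumptions for the sketch, and the streaming path is deliberately simplified; a real model's output convention would differ. It would be loaded with the `--tool-parser-plugin` / `--tool-call-parser example` flags shown below.

```python
# Hypothetical example_tool_parser.py -- an illustrative sketch, not shipped with vLLM.
import json
from typing import Sequence, Union

from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                              DeltaMessage,
                                              ExtractedToolCallInformation,
                                              FunctionCall, ToolCall)
from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
    ToolParser, ToolParserManager)
from vllm.transformers_utils.tokenizer import AnyTokenizer


@ToolParserManager.register_module("example")
class ExampleToolParser(ToolParser):
    """Parses outputs of the assumed form:
    <some text> <TOOL> {"name": ..., "arguments": {...}}"""

    def __init__(self, tokenizer: AnyTokenizer):
        super().__init__(tokenizer)
        self.bot_token = "<TOOL>"  # assumed model-specific marker

    def extract_tool_calls(
            self, model_output: str,
            request: ChatCompletionRequest) -> ExtractedToolCallInformation:
        if self.bot_token not in model_output:
            # No tool call: return the text unchanged as normal content.
            return ExtractedToolCallInformation(tools_called=False,
                                                tool_calls=[],
                                                content=model_output)
        # Split off the (assumed) single JSON tool call after the marker.
        content, _, raw_call = model_output.partition(self.bot_token)
        call = json.loads(raw_call)
        tool_calls = [
            ToolCall(type="function",
                     function=FunctionCall(
                         name=call["name"],
                         arguments=json.dumps(call["arguments"])))
        ]
        return ExtractedToolCallInformation(tools_called=True,
                                            tool_calls=tool_calls,
                                            content=content or None)

    def extract_tool_calls_streaming(
            self, previous_text: str, current_text: str, delta_text: str,
            previous_token_ids: Sequence[int],
            current_token_ids: Sequence[int],
            delta_token_ids: Sequence[int],
            request: ChatCompletionRequest) -> Union[DeltaMessage, None]:
        # Simplified: stream everything as plain content; a real parser
        # would incrementally emit DeltaToolCall objects instead.
        return DeltaMessage(content=delta_text)
```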
You can write a ToolParser similar to the `Hermes2ProToolParser` in vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py. @@ -312,5 +331,5 @@ Then you can use this plugin in the command line like this. --tool-parser-plugin --tool-call-parser example \ --chat-template \ -``` +``` diff --git a/examples/tool_chat_template_granite_20b_fc.jinja b/examples/tool_chat_template_granite_20b_fc.jinja new file mode 100644 index 0000000000000..cb52188ec72d9 --- /dev/null +++ b/examples/tool_chat_template_granite_20b_fc.jinja @@ -0,0 +1,130 @@ +{%- macro json_to_python_type(json_spec) %} + {%- set basic_type_map = { + "string": "str", + "number": "float", + "integer": "int", + "boolean": "bool" +} %} + + {%- if basic_type_map[json_spec.type] is defined %} + {{- basic_type_map[json_spec.type] }} + {%- elif json_spec.type == "array" %} + {{- "list[" + json_to_python_type(json_spec|items) + "]" }} + {%- elif json_spec.type == "object" %} + {%- if json_spec.additionalProperties is defined %} + {{- "dict[str, " + json_to_python_type(json_spec.additionalProperties) + ']' }} + {%- else %} + {{- "dict" }} + {%- endif %} + {%- elif json_spec.type is iterable %} + {{- "Union[" }} + {%- for t in json_spec.type %} + {{- json_to_python_type({"type": t}) }} + {%- if not loop.last %} + {{- "," }} + {%- endif %} + {%- endfor %} + {{- "]" }} + {%- else %} + {{- "Any" }} + {%- endif %} +{%- endmacro %} + +{%- if not full_function_description is defined %} + {%- set full_function_description = false %} +{%- endif %} + +{%- macro full_description(tool) %} + {{- tool.name + '(' }} + {%- if tool.parameters is defined %} + {%- for param_name, param_fields in tool.parameters.properties|items %} + {{- param_name + ": " + json_to_python_type(param_fields) }} + {%- if not loop.last %} + {{- ", " }} + {%- endif %} + {%- endfor %} + {%- endif %} + {{- ")" }} + {%- if tool.return is defined %} + {{- " -> " + json_to_python_type(tool.return) }} + {%- endif %} + {{- " - " + tool.description + "\n\n" }} + {%- if tool.parameters is defined %} + {%- for param_name, param_fields in tool.parameters.properties|items %} + {%- if loop.first %} + {{- " Args:\n" }} + {%- endif %} + {{- " " + param_name + "(" + json_to_python_type(param_fields) + "): " + param_fields.description|trim }} + {%- endfor %} + {%- endif %} + {%- if tool.return is defined and tool.return.description is defined %} + {{- "\n Returns:\n " + tool.return.description }} + {%- endif %} + {{- '"' }} +{%- endmacro %} + +{%- macro simple_description(tool) %} + {{- tool.description }} +{%- endmacro %} + +{%- macro function_description(tool) %} + {%- if full_function_description %} + {{- full_description(tool) }} + {%- else %} + {{- simple_description(tool) }} + {%- endif %} +{%- endmacro %} + +{%- if messages[0]["role"] == "system" %} + {%- set sys_prompt = messages[0]["content"] %} + {%- set loop_messages = messages[1:] %} +{%- else %} + {%- set loop_messages = messages %} + {% set sys_prompt = 'You are a helpful assistant with access to the following function calls. Your task is to understand the given conversation with function calls and responses and generate natural language response as the ASSISTANT to continue the conversation. You may use the following function calls to understand how to respond to the user query.' 
%} +{%- endif %} + +{{ 'SYSTEM: ' + sys_prompt }} +{% if tools is iterable and tools | length > 0 %} +<|function_call_library|> + {%- for tool in tools %} + {%- if tool.function is defined %} + {%- set tool = tool.function %} + {%- endif %} + {{- '{"name": "' + tool.name + '", ' }} + {{- '"description": "' + function_description(tool) }} + {{- ', "parameters": ' }} + {%- if not tool.parameters is defined or tool.parameters.properties | length == 0 %} + {{- "{}" }} + {%- else %} + {{- tool.parameters|tojson }} + {%- endif %} + {{- "}" }} + {%- if not loop.last %} + {{- "\n" }} + {%- endif %} + {%- endfor %} +If none of the functions are relevant or the given question lacks the parameters required by the function, please output \" {\"name\": \"no_function\", \"arguments\": {}}\". +{%- endif %} + + + +{% for message in messages %} + {% if message['role'] == 'user' %} + {{- '\nUSER: ' + message['content'] }} + {% elif message['role'] == 'assistant' and message.tool_calls is defined %} + {{- '\nASSISTANT:' }} + {% for tc in message.tool_calls %} + {{- ' ' + {'name': tc.function.name, 'arguments': tc.function.arguments}|tojson }} + {% endfor %} + {{- '<|endoftext|>' }} + {% elif message['role'] == 'assistant' %} + {{- '\nASSISTANT: ' + message['content'] + ' <|endoftext|>' }} + {% elif message['role'] == 'tool' %} + {{- ' ' + message['content'] }} + {%- else %} + {{- raise_exception("Unexpected combination of role and message content") }} + {% endif %} + {% if loop.last and add_generation_prompt %} + {{- '\nASSISTANT: ' }} + {% endif %} +{% endfor %} diff --git a/tests/tool_use/utils.py b/tests/tool_use/utils.py index ce36515a2381c..d9ee0b1d54b0a 100644 --- a/tests/tool_use/utils.py +++ b/tests/tool_use/utils.py @@ -88,6 +88,18 @@ def ensure_system_prompt(messages: List[Dict[str, Any]], "without calling a tool. DO NOT CALL A TOOL THAT IS IRRELEVANT " "to the user's question - just respond to it normally." 
}, + ## FIXME: temporary disabled due to lack of hardware specification + ## for individual runs + #"granite20b": { + # "model": + # "ibm-granite/granite-20b-functioncalling", + # "arguments": [ + # "--tool-call-parser", "granite-20b-fc", "--chat-template", + # str(VLLM_PATH / "examples/tool_chat_template_granite_20b_fc.jinja") + # ], + # "supports_parallel": + # False, + #}, "internlm": { "model": "internlm/internlm2_5-7b-chat", diff --git a/vllm/entrypoints/openai/tool_parsers/__init__.py b/vllm/entrypoints/openai/tool_parsers/__init__.py index 0e88bb21ca75f..1b299ce655570 100644 --- a/vllm/entrypoints/openai/tool_parsers/__init__.py +++ b/vllm/entrypoints/openai/tool_parsers/__init__.py @@ -1,4 +1,5 @@ from .abstract_tool_parser import ToolParser, ToolParserManager +from .granite_20b_fc_tool_parser import Granite20bFCToolParser from .hermes_tool_parser import Hermes2ProToolParser from .internlm2_tool_parser import Internlm2ToolParser from .jamba_tool_parser import JambaToolParser @@ -6,7 +7,7 @@ from .mistral_tool_parser import MistralToolParser __all__ = [ - "ToolParser", "ToolParserManager", "Hermes2ProToolParser", - "MistralToolParser", "Internlm2ToolParser", "Llama3JsonToolParser", - "JambaToolParser" + "ToolParser", "ToolParserManager", "Granite20bFCToolParser", + "Hermes2ProToolParser", "MistralToolParser", "Internlm2ToolParser", + "Llama3JsonToolParser", "JambaToolParser" ] diff --git a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py new file mode 100644 index 0000000000000..94db8f379e33a --- /dev/null +++ b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py @@ -0,0 +1,251 @@ +import json +import re +from json import JSONDecoder +from typing import Dict, Sequence, Union + +import partial_json_parser +from partial_json_parser.core.options import Allow + +from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, + DeltaFunctionCall, DeltaMessage, + DeltaToolCall, + ExtractedToolCallInformation, + FunctionCall, ToolCall) +from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( + ToolParser, ToolParserManager) +from vllm.entrypoints.openai.tool_parsers.utils import (consume_space, + find_common_prefix, + is_complete_json, + partial_json_loads) +from vllm.logger import init_logger +from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.utils import random_uuid + +logger = init_logger(__name__) + + +@ToolParserManager.register_module("granite-20b-fc") +class Granite20bFCToolParser(ToolParser): + """ + Tool call parser for the granite-20b-functioncalling model intended + for use with the examples/tool_chat_template_granite20b_fc.jinja + template. 
+ + Used when --enable-auto-tool-choice --tool-call-parser granite-20-fc + are all set + """ + + def __init__(self, tokenizer: AnyTokenizer): + super().__init__(tokenizer) + + self.bot_token = "" + self.tool_start_token = self.bot_token + self.tool_call_regex = re.compile(r"\s*") + + def extract_tool_calls( + self, model_output: str, + request: ChatCompletionRequest) -> ExtractedToolCallInformation: + if self.tool_start_token not in model_output: + return ExtractedToolCallInformation(tools_called=False, + tool_calls=[], + content=model_output) + + dec = JSONDecoder() + try: + matches = list(self.tool_call_regex.finditer(model_output)) + logger.debug("Found %d tool call matches", len(matches)) + + raw_function_calls = [] + + for i, match in enumerate(matches): + # position after the tag + start_of_json = match.end() + # end_index == the start of the next function call + # (if exists) + next_function_call_start = (matches[i + 1].start() + if i + 1 < len(matches) else None) + + raw_function_calls.append( + dec.raw_decode( + model_output[start_of_json:next_function_call_start]) + [0]) + + logger.debug("Extracted %d tool calls", len(raw_function_calls)) + tool_calls = [ + ToolCall( + type="function", + function=FunctionCall( + name=function_call["name"], + # function call args are JSON but as a string + arguments=json.dumps(function_call["arguments"]), + ), + ) for function_call in raw_function_calls + ] + + content = model_output[:model_output.find(self.bot_token)] + return ExtractedToolCallInformation( + tools_called=True, + tool_calls=tool_calls, + content=content if content else None, + ) + + except Exception as e: + logger.error("Error in extracting tool call from response %s", e) + return ExtractedToolCallInformation(tools_called=False, + tool_calls=[], + content=model_output) + + def extract_tool_calls_streaming( + self, + previous_text: str, + current_text: str, + delta_text: str, + previous_token_ids: Sequence[int], + current_token_ids: Sequence[int], + delta_token_ids: Sequence[int], + request: ChatCompletionRequest, + ) -> Union[DeltaMessage, None]: + + if len(current_text) < len( + self.bot_token) and self.bot_token.startswith(current_text): + return None + + if not current_text.startswith(self.bot_token): + return DeltaMessage(content=delta_text) + + # bit mask flags for partial JSON parsing. If the name hasn't been + # sent yet, don't allow sending + # an incomplete string since OpenAI only ever (as far as I have + # seen) allows sending the entire tool/ function name at once. + flags = Allow.ALL if self.current_tool_name_sent \ + else Allow.ALL & ~Allow.STR + try: + tool_call_arr = [] + is_complete = [] + try: + start_idx = len(self.bot_token) + start_idx = consume_space(start_idx, current_text) + + while start_idx < len(current_text): + (obj, + end_idx) = partial_json_loads(current_text[start_idx:], + flags) + is_complete.append( + is_complete_json(current_text[start_idx:start_idx + + end_idx])) + start_idx += end_idx + start_idx = consume_space(start_idx, current_text) + start_idx += len(self.bot_token) + start_idx = consume_space(start_idx, current_text) + tool_call_arr.append(obj) + except partial_json_parser.core.exceptions.MalformedJSON: + logger.debug('not enough tokens to parse into JSON yet') + return None + + # select as the current tool call the one we're on the state at + current_tool_call: Dict = tool_call_arr[self.current_tool_id] \ + if len(tool_call_arr) > 0 else {} + + # case -- if no tokens have been streamed for the tool, e.g. 
+ # only the array brackets, stream nothing + if len(tool_call_arr) == 0: + return None + + # case: we are starting a new tool in the array + # -> array has > 0 length AND length has moved past cursor + elif (len(tool_call_arr) > 0 + and len(tool_call_arr) > self.current_tool_id + 1): + + # if we're moving on to a new call, first make sure we + # haven't missed anything in the previous one that was + # auto-generated due to JSON completions, but wasn't + # streamed to the client yet. + if self.current_tool_id >= 0: + cur_arguments = current_tool_call.get("arguments") + if cur_arguments: + cur_args_json = json.dumps(cur_arguments) + sent = len( + self.streamed_args_for_tool[self.current_tool_id]) + argument_diff = cur_args_json[sent:] + + logger.debug("got arguments diff: %s", argument_diff) + delta = DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.current_tool_id, + function=DeltaFunctionCall( + arguments=argument_diff). + model_dump(exclude_none=True)) + ]) + self.streamed_args_for_tool[ + self.current_tool_id] += argument_diff + else: + delta = None + else: + delta = None + # re-set stuff pertaining to progress in the current tool + self.current_tool_id = len(tool_call_arr) - 1 + self.current_tool_name_sent = False + self.streamed_args_for_tool.append("") + logger.debug("starting on new tool %d", self.current_tool_id) + return delta + + # if the current tool name hasn't been sent, send if available + # - otherwise send nothing + elif not self.current_tool_name_sent: + function_name = current_tool_call.get("name") + if function_name: + + delta = DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.current_tool_id, + type="function", + id=f"chatcmpl-tool-{random_uuid()}", + function=DeltaFunctionCall( + name=function_name).model_dump( + exclude_none=True)) + ]) + self.current_tool_name_sent = True + else: + delta = None + + # now we know we're on the same tool call and we're streaming + # arguments + else: + cur_arguments = current_tool_call.get("arguments") + delta = None + + if cur_arguments: + sent = len( + self.streamed_args_for_tool[self.current_tool_id]) + cur_args_json = json.dumps(cur_arguments) + prev_arguments = self.prev_tool_call_arr[ + self.current_tool_id].get("arguments") + + argument_diff = None + if is_complete[self.current_tool_id]: + argument_diff = cur_args_json[sent:] + elif prev_arguments: + prev_args_json = json.dumps(prev_arguments) + if cur_args_json != prev_args_json: + + prefix = find_common_prefix( + prev_args_json, cur_args_json) + argument_diff = prefix[sent:] + + if argument_diff is not None: + delta = DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.current_tool_id, + function=DeltaFunctionCall( + arguments=argument_diff). 
+ model_dump(exclude_none=True)) + ]) + self.streamed_args_for_tool[ + self.current_tool_id] += argument_diff + + self.prev_tool_call_arr = tool_call_arr + return delta + + except Exception as e: + logger.error("Error trying to handle streaming tool call: %s", e) + logger.debug( + "Skipping chunk as a result of tool streaming extraction " + "error") + return None diff --git a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py index 1b836a687a1c3..a5f44d69e5fd2 100644 --- a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py @@ -1,6 +1,6 @@ import json import re -from json import JSONDecodeError, JSONDecoder +from json import JSONDecoder from typing import Dict, List, Sequence, Union import partial_json_parser @@ -14,34 +14,15 @@ FunctionCall, ToolCall) from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser, ToolParserManager) -from vllm.entrypoints.openai.tool_parsers.utils import find_common_prefix +from vllm.entrypoints.openai.tool_parsers.utils import (find_common_prefix, + is_complete_json, + partial_json_loads) from vllm.logger import init_logger from vllm.utils import random_uuid logger = init_logger(__name__) -# partial_json_parser doesn't support extra data and -# JSONDecorder.raw_decode doesn't support partial JSON -def partial_json_loads(input_str, flags): - try: - return (partial_json_parser.loads(input_str, flags), len(input_str)) - except JSONDecodeError as e: - if "Extra data" in e.msg: - dec = JSONDecoder() - return dec.raw_decode(input_str) - else: - raise - - -def is_complete_json(input_str): - try: - json.loads(input_str) - return True - except JSONDecodeError: - return False - - @ToolParserManager.register_module("llama3_json") class Llama3JsonToolParser(ToolParser): """ diff --git a/vllm/entrypoints/openai/tool_parsers/utils.py b/vllm/entrypoints/openai/tool_parsers/utils.py index db7fc5259fc4e..5e4eb23bfaf43 100644 --- a/vllm/entrypoints/openai/tool_parsers/utils.py +++ b/vllm/entrypoints/openai/tool_parsers/utils.py @@ -1,3 +1,11 @@ +import json +from json import JSONDecodeError, JSONDecoder +from typing import Any, List, Tuple + +import partial_json_parser +from partial_json_parser.core.options import Allow + + def find_common_prefix(s1: str, s2: str) -> str: """ Finds a common prefix that is shared between two strings, if there is one. @@ -72,7 +80,7 @@ def extract_intermediate_diff(curr: str, old: str) -> str: return diff -def find_all_indices(string, substring): +def find_all_indices(string: str, substring: str) -> List[int]: """ Find all (starting) indices of a substring in a given string. 
Useful for tool call extraction @@ -85,3 +93,29 @@ def find_all_indices(string, substring): break indices.append(index) return indices + + +# partial_json_parser doesn't support extra data and +# JSONDecorder.raw_decode doesn't support partial JSON +def partial_json_loads(input_str: str, flags: Allow) -> Tuple[Any, int]: + try: + return (partial_json_parser.loads(input_str, flags), len(input_str)) + except JSONDecodeError as e: + if "Extra data" in e.msg: + dec = JSONDecoder() + return dec.raw_decode(input_str) + raise + + +def is_complete_json(input_str: str) -> bool: + try: + json.loads(input_str) + return True + except JSONDecodeError: + return False + + +def consume_space(i: int, s: str) -> int: + while i < len(s) and s[i].isspace(): + i += 1 + return i From 8d7724104aef4381cf268de094360f27ff68f4ab Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Tue, 29 Oct 2024 15:19:02 -0700 Subject: [PATCH 11/88] [Docs] Add notes about Snowflake Meetup (#9814) Signed-off-by: simon-mo --- README.md | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 0836d872358fb..8c8d6eb291cea 100644 --- a/README.md +++ b/README.md @@ -13,9 +13,19 @@ Easy, fast, and cheap LLM serving for everyone | Documentation | Blog | Paper | Discord | Twitter/X | Developer Slack |

+--- + +**vLLM x Snowflake Meetup (Wednesday, November 13th, 5:30-8PM PT) at Snowflake HQ, San Mateo** + +We are excited to announce the last in-person vLLM meetup of the year! +Join the vLLM developers and engineers from Snowflake AI Research to chat about the latest LLM inference optimizations and your 2025 vLLM wishlist! +Register [here](https://lu.ma/h0qvrajz) and be a part of the event! + +--- + *Latest News* 🔥 -- [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there! +- [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there! - [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://raysummit.anyscale.com/flow/anyscale/raysummit2024/landing/page/sessioncatalog?tab.day=20241001&search.sessiontracks=1719251906298001uzJ2) from other vLLM contributors and users! - [2024/09] We hosted [the sixth vLLM meetup](https://lu.ma/87q3nvnh) with NVIDIA! Please find the meetup slides [here](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing). - [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing). @@ -42,7 +52,7 @@ vLLM is fast with: - Speculative decoding - Chunked prefill -**Performance benchmark**: We include a performance benchmark at the end of [our blog post](https://blog.vllm.ai/2024/09/05/perf-update.html). It compares the performance of vLLM against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [SGLang](https://github.com/sgl-project/sglang) and [LMDeploy](https://github.com/InternLM/lmdeploy)). The implementation is under [nightly-benchmarks folder](.buildkite/nightly-benchmarks/) and you can [reproduce](https://github.com/vllm-project/vllm/issues/8176) this benchmark using our one-click runnable script. +**Performance benchmark**: We include a performance benchmark at the end of [our blog post](https://blog.vllm.ai/2024/09/05/perf-update.html). It compares the performance of vLLM against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [SGLang](https://github.com/sgl-project/sglang) and [LMDeploy](https://github.com/InternLM/lmdeploy)). The implementation is under [nightly-benchmarks folder](.buildkite/nightly-benchmarks/) and you can [reproduce](https://github.com/vllm-project/vllm/issues/8176) this benchmark using our one-click runnable script.
vLLM is flexible and easy to use with: From bc73e9821cb4f90a88c04e7d550f132d8911266b Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 29 Oct 2024 19:02:59 -0400 Subject: [PATCH 12/88] [Bugfix] Fix prefix strings for quantized VLMs (#9772) --- vllm/model_executor/model_loader/loader.py | 11 +++- vllm/model_executor/models/blip2.py | 5 +- vllm/model_executor/models/gemma.py | 58 +++++++++++++------ vllm/model_executor/models/internlm2.py | 56 ++++++++++++------ vllm/model_executor/models/internlm2_ve.py | 16 +++-- vllm/model_executor/models/internvl.py | 5 +- vllm/model_executor/models/llama.py | 7 ++- vllm/model_executor/models/llava.py | 20 +++++-- vllm/model_executor/models/llava_next.py | 10 +++- .../model_executor/models/llava_next_video.py | 10 +++- vllm/model_executor/models/llava_onevision.py | 10 +++- vllm/model_executor/models/minicpmv.py | 34 ++++++++--- vllm/model_executor/models/opt.py | 34 ++++++++--- vllm/model_executor/models/paligemma.py | 7 ++- vllm/model_executor/models/phi3v.py | 19 ++++-- vllm/model_executor/models/pixtral.py | 5 +- vllm/model_executor/models/qwen2.py | 50 +++++++++++----- vllm/model_executor/models/qwen2_vl.py | 8 ++- vllm/model_executor/models/ultravox.py | 5 +- vllm/model_executor/models/utils.py | 15 +++++ 20 files changed, 288 insertions(+), 97 deletions(-) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 3cfee13b9fa6e..3ae8a51859f70 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -147,15 +147,20 @@ def _get_model_initialization_kwargs( return extra_kwargs -def build_model(model_class: Type[nn.Module], hf_config: PretrainedConfig, +def build_model(model_class: Type[nn.Module], + hf_config: PretrainedConfig, cache_config: Optional[CacheConfig], - quant_config: Optional[QuantizationConfig], *, + quant_config: Optional[QuantizationConfig], + *, lora_config: Optional[LoRAConfig], multimodal_config: Optional[MultiModalConfig], - scheduler_config: Optional[SchedulerConfig]) -> nn.Module: + scheduler_config: Optional[SchedulerConfig], + prefix: Optional[str] = None) -> nn.Module: extra_kwargs = _get_model_initialization_kwargs(model_class, lora_config, multimodal_config, scheduler_config) + if prefix: + extra_kwargs["prefix"] = prefix return model_class(config=hf_config, cache_config=cache_config, diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index cd2013e91514d..c3b3cc8a4ddb6 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -507,7 +507,10 @@ def __init__(self, ) self.language_model = init_vllm_registered_model( - config.text_config, cache_config, quant_config) + config.text_config, + cache_config, + quant_config, + prefix="language_model") self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index 436bd45d53f35..57b2b43c82f89 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -43,7 +43,8 @@ from .interfaces import SupportsLoRA, SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) logger = init_logger(__name__) @@ -83,16 +84,23 @@ def __init__( hidden_act: Optional[str] = None, hidden_activation: Optional[str] = None, quant_config: 
Optional[QuantizationConfig] = None, + prefix: str = "", ) -> None: super().__init__() self.gate_up_proj = MergedColumnParallelLinear( - hidden_size, [intermediate_size] * 2, + hidden_size, + [intermediate_size] * 2, bias=False, - quant_config=quant_config) - self.down_proj = RowParallelLinear(intermediate_size, - hidden_size, - bias=False, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.gate_up_proj", + ) + self.down_proj = RowParallelLinear( + intermediate_size, + hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.down_proj", + ) self.act_fn = _get_gemma_act_fn(hidden_act, hidden_activation) def forward(self, x): @@ -104,15 +112,18 @@ def forward(self, x): class GemmaAttention(nn.Module): - def __init__(self, - hidden_size: int, - num_heads: int, - num_kv_heads: int, - head_dim: int, - max_position_embeddings: int = 8192, - rope_theta: float = 10000, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None) -> None: + def __init__( + self, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + head_dim: int, + max_position_embeddings: int = 8192, + rope_theta: float = 10000, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: super().__init__() self.hidden_size = hidden_size tp_size = get_tensor_model_parallel_world_size() @@ -142,12 +153,14 @@ def __init__(self, self.total_num_kv_heads, bias=False, quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", ) self.o_proj = RowParallelLinear( self.total_num_heads * self.head_dim, hidden_size, bias=False, quant_config=quant_config, + prefix=f"{prefix}.o_proj", ) self.rotary_emb = get_rope( @@ -186,6 +199,7 @@ def __init__( config: GemmaConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ) -> None: super().__init__() self.hidden_size = config.hidden_size @@ -198,6 +212,7 @@ def __init__( rope_theta=config.rope_theta, cache_config=cache_config, quant_config=quant_config, + prefix=f"{prefix}.self_attn", ) self.mlp = GemmaMLP( hidden_size=self.hidden_size, @@ -205,6 +220,7 @@ def __init__( hidden_act=config.hidden_act, hidden_activation=getattr(config, "hidden_activation", None), quant_config=quant_config, + prefix=f"{prefix}.mlp", ) self.input_layernorm = GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -259,8 +275,8 @@ def __init__( ) self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, - lambda prefix: GemmaDecoderLayer(config, cache_config, quant_config - ), + lambda prefix: GemmaDecoderLayer( + config, cache_config, quant_config, prefix=prefix), prefix=f"{prefix}.layers") self.norm = GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -366,6 +382,7 @@ def __init__( cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, lora_config: Optional[LoRAConfig] = None, + prefix: str = "", ) -> None: super().__init__() @@ -375,7 +392,10 @@ def __init__( self.lora_config = lora_config self.quant_config = quant_config - self.model = GemmaModel(config, cache_config, quant_config) + self.model = GemmaModel(config, + cache_config, + quant_config, + prefix=maybe_prefix(prefix, "model")) self.logits_processor = LogitsProcessor(config.vocab_size) self.sampler = Sampler() self.make_empty_intermediate_tensors = ( diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index 
9a77e48626ca5..313d98b649b48 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -30,7 +30,8 @@ from .interfaces import SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) class InternLM2MLP(nn.Module): @@ -41,16 +42,23 @@ def __init__( intermediate_size: int, hidden_act: str, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ) -> None: super().__init__() self.gate_up_proj = MergedColumnParallelLinear( - hidden_size, [intermediate_size] * 2, + hidden_size, + [intermediate_size] * 2, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.gate_up_proj", + ) + self.w2 = RowParallelLinear( + intermediate_size, + hidden_size, bias=False, - quant_config=quant_config) - self.w2 = RowParallelLinear(intermediate_size, - hidden_size, - bias=False, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.w2", + ) if hidden_act != "silu": raise ValueError(f"Unsupported activation: {hidden_act}. " "Only silu is supported for now.") @@ -75,6 +83,7 @@ def __init__( max_position_embeddings: int = 8192, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ) -> None: super().__init__() self.hidden_size = hidden_size @@ -108,12 +117,14 @@ def __init__( self.total_num_kv_heads, bias=False, quant_config=quant_config, + prefix=f"{prefix}.wqkv", ) self.wo = RowParallelLinear( self.total_num_heads * self.head_dim, hidden_size, bias=False, quant_config=quant_config, + prefix=f"{prefix}.wo", ) self.rotary_emb = get_rope( @@ -123,12 +134,15 @@ def __init__( base=rope_theta, rope_scaling=rope_scaling, ) - self.attn = Attention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads, - cache_config=cache_config, - quant_config=quant_config) + self.attn = Attention( + self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.attn", + ) def split_qkv(self, qkv: torch.Tensor): seq_len = qkv.shape[0] @@ -176,6 +190,7 @@ def __init__( config: PretrainedConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ) -> None: super().__init__() self.hidden_size = config.hidden_size @@ -192,12 +207,14 @@ def __init__( max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, + prefix=f"{prefix}.attention", ) self.feed_forward = InternLM2MLP( hidden_size=self.hidden_size, intermediate_size=config.intermediate_size, hidden_act=config.hidden_act, quant_config=quant_config, + prefix=f"{prefix}.feed_forward", ) self.attention_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -251,8 +268,8 @@ def __init__( ) self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, - lambda prefix: InternLMDecoderLayer(config, cache_config, - quant_config), + lambda prefix: InternLMDecoderLayer( + config, cache_config, quant_config, prefix=prefix), prefix=f"{prefix}.layers") self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.make_empty_intermediate_tensors = ( @@ -306,14 +323,19 @@ def __init__( config: PretrainedConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ) -> None: super().__init__() 
self.config = config self.quant_config = quant_config - self.model = InternLM2Model(config, cache_config, quant_config) + self.model = InternLM2Model(config, + cache_config, + quant_config, + prefix=maybe_prefix(prefix, "model")) self.output = ParallelLMHead(config.vocab_size, config.hidden_size, - quant_config=quant_config) + quant_config=quant_config, + prefix=maybe_prefix(prefix, "output")) if self.config.tie_word_embeddings: self.output.weight = self.model.tok_embeddings.weight self.logits_processor = LogitsProcessor(config.vocab_size) diff --git a/vllm/model_executor/models/internlm2_ve.py b/vllm/model_executor/models/internlm2_ve.py index 6effd70b75da3..edd867e4b6457 100644 --- a/vllm/model_executor/models/internlm2_ve.py +++ b/vllm/model_executor/models/internlm2_ve.py @@ -15,7 +15,7 @@ InternLM2MLP, InternLM2Model) from vllm.sequence import IntermediateTensors -from .utils import make_layers +from .utils import make_layers, maybe_prefix class InternLM2VEDecoderLayer(nn.Module): @@ -25,6 +25,7 @@ def __init__( config: PretrainedConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ) -> None: super().__init__() self.hidden_size = config.hidden_size @@ -41,18 +42,21 @@ def __init__( max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, + prefix=f"{prefix}.attention", ) self.feed_forward = InternLM2MLP( hidden_size=self.hidden_size, intermediate_size=config.intermediate_size, hidden_act=config.hidden_act, quant_config=quant_config, + prefix=f"{prefix}.feed_forward", ) self.feed_forward_ve = InternLM2MLP( hidden_size=self.hidden_size, intermediate_size=config.intermediate_size, hidden_act=config.hidden_act, quant_config=quant_config, + prefix=f"{prefix}.feed_forward_ve", ) self.attention_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -111,8 +115,8 @@ def __init__( super().__init__(config, cache_config, quant_config) self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, - lambda prefix: InternLM2VEDecoderLayer(config, cache_config, - quant_config), + lambda prefix: InternLM2VEDecoderLayer( + config, cache_config, quant_config, prefix=prefix), prefix=f"{prefix}.layers") def forward( @@ -161,6 +165,10 @@ def __init__( config: PretrainedConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ) -> None: super().__init__(config, cache_config, quant_config) - self.model = InternLM2VEModel(config, cache_config, quant_config) + self.model = InternLM2VEModel(config, + cache_config, + quant_config, + prefix=maybe_prefix(prefix, "model")) diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 3ae37d9fe5d85..1c1fde5b30983 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -439,7 +439,10 @@ def __init__(self, ) self.language_model = init_vllm_registered_model( - config.text_config, cache_config, quant_config) + config.text_config, + cache_config, + quant_config, + prefix="language_model") self.mlp1 = self._init_mlp1(config) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index b0ca1fe006239..98c53bdaae811 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -55,7 +55,8 @@ from .interfaces import SupportsLoRA, SupportsPP from .utils import (AutoWeightsLoader, PPMissingLayer, is_pp_missing_parameter, - 
make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) class LlamaMLP(nn.Module): @@ -500,6 +501,7 @@ def __init__( cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, lora_config: Optional[LoRAConfig] = None, + prefix: str = "", ) -> None: super().__init__() @@ -510,7 +512,7 @@ def __init__( cache_config, quant_config, lora_config=lora_config, - prefix="model") + prefix=maybe_prefix(prefix, "model")) if get_pp_group().is_last_rank: self.unpadded_vocab_size = config.vocab_size if lora_config: @@ -526,6 +528,7 @@ def __init__( if not lora_config else lora_config.lora_vocab_padding_size), quant_config=quant_config, + prefix=maybe_prefix(prefix, "lm_head"), ) if config.tie_word_embeddings: self.lm_head = self.lm_head.tie_weights( diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index b005d83c17f90..eda99c029881f 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -210,6 +210,7 @@ def init_vision_tower_for_llava( quant_config: Optional[QuantizationConfig], *, require_post_norm: Optional[bool] = None, + prefix: str = "", ): vision_config = hf_config.vision_config @@ -224,23 +225,26 @@ def init_vision_tower_for_llava( if isinstance(vision_config, CLIPVisionConfig): return CLIPVisionModel( vision_config, - quant_config, + quant_config=quant_config, num_hidden_layers_override=num_hidden_layers, require_post_norm=require_post_norm, + prefix=prefix, ) elif isinstance(vision_config, SiglipVisionConfig): return SiglipVisionModel( vision_config, - quant_config, + quant_config=quant_config, num_hidden_layers_override=num_hidden_layers, require_post_norm=require_post_norm, + prefix=prefix, ) elif isinstance(vision_config, PixtralVisionConfig): return PixtralHFVisionModel( vision_config, - quant_config, + quant_config=quant_config, num_hidden_layers_override=num_hidden_layers, require_post_norm=require_post_norm, + prefix=prefix, ) msg = f"Unsupported vision config: {type(vision_config)}" @@ -274,14 +278,20 @@ def __init__(self, # TODO: Optionally initializes this for supporting embeddings. self.vision_tower = init_vision_tower_for_llava( - config, quant_config, require_post_norm=False) + config, + quant_config, + require_post_norm=False, + prefix="vision_tower") self.multi_modal_projector = LlavaMultiModalProjector( vision_hidden_size=config.vision_config.hidden_size, text_hidden_size=config.text_config.hidden_size, projector_hidden_act=config.projector_hidden_act) self.language_model = init_vllm_registered_model( - config.text_config, cache_config, quant_config) + config.text_config, + cache_config, + quant_config, + prefix="language_model") self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index 2a582deeaa2c9..f85129b206919 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -293,7 +293,10 @@ def __init__(self, # TODO: Optionally initializes this for supporting embeddings. 
self.vision_tower = init_vision_tower_for_llava( - config, quant_config, require_post_norm=False) + config, + quant_config, + require_post_norm=False, + prefix="vision_tower") self.image_newline = nn.Parameter( torch.empty(config.text_config.hidden_size)) self.multi_modal_projector = LlavaMultiModalProjector( @@ -302,7 +305,10 @@ def __init__(self, projector_hidden_act=config.projector_hidden_act) self.language_model = init_vllm_registered_model( - config.text_config, cache_config, quant_config) + config.text_config, + cache_config, + quant_config, + prefix="language_model") # The same model class supports both language generation and embedding # because the architecture name is the same diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py index 43eec43d56643..b8051d5fc6ae2 100644 --- a/vllm/model_executor/models/llava_next_video.py +++ b/vllm/model_executor/models/llava_next_video.py @@ -257,14 +257,20 @@ def __init__(self, # Initialize the vision tower only up to the required feature layer self.vision_tower = init_vision_tower_for_llava( - config, quant_config, require_post_norm=False) + config, + quant_config, + require_post_norm=False, + prefix="vision_tower") self.vision_resampler = LlavaNextVideoPooler(config) self.multi_modal_projector = LlavaNextMultiModalProjector( vision_hidden_size=config.vision_config.hidden_size, text_hidden_size=config.text_config.hidden_size, projector_hidden_act=config.projector_hidden_act) self.language_model = init_vllm_registered_model( - config.text_config, cache_config, quant_config) + config.text_config, + cache_config, + quant_config, + prefix="language_model") self.make_empty_intermediate_tensors = ( self.language_model.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index 9606b126141df..a0cf208a65f36 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -415,10 +415,16 @@ def __init__(self, # Initialize the vision tower only up to the required feature layer self.vision_tower = init_vision_tower_for_llava( - config, quant_config, require_post_norm=False) + config, + quant_config, + require_post_norm=False, + prefix="vision_tower") self.multi_modal_projector = LlavaOnevisionMultiModalProjector(config) self.language_model = init_vllm_registered_model( - config.text_config, cache_config, quant_config) + config.text_config, + cache_config, + quant_config, + prefix="language_model") self.image_newline = nn.Parameter( torch.empty(config.text_config.hidden_size)) diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 2ec51dc4647f5..a270282d87bc8 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -394,8 +394,11 @@ def __init__( self.multimodal_config = multimodal_config self.version = get_version_by_config(self.config) - self.llm = self.init_llm(config, cache_config, quant_config) - self.vpm = self.init_vision_module(config, quant_config) + self.llm = self.init_llm(config, + cache_config, + quant_config, + prefix="llm") + self.vpm = self.init_vision_module(config, quant_config, prefix="vpm") param_dtype = torch.get_default_dtype() self.vpm.to(dtype=param_dtype) self.vision_dim = (self.vpm.embed_dim if self.version == (2, 0) else @@ -403,9 +406,11 @@ def __init__( self.embed_dim = self.config.hidden_size self.resampler = self.init_resampler(self.embed_dim, 
self.vision_dim) self.resampler.to(device="cuda", dtype=param_dtype) + # TODO: why is there _KEYS_TO_MODIFY_MAPPING? lm_head should be in llm self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, - quant_config=quant_config) + quant_config=quant_config, + prefix="llm.lm_head") self.logits_processor = LogitsProcessor(config.vocab_size) self.sampler = Sampler() @@ -644,6 +649,7 @@ def init_llm( config: PretrainedConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ) -> nn.Module: raise NotImplementedError @@ -651,6 +657,7 @@ def init_vision_module( self, config: PretrainedConfig, quant_config: Optional[QuantizationConfig], + prefix: str = "", ) -> nn.Module: raise NotImplementedError @@ -690,17 +697,20 @@ def init_llm( config: PretrainedConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ) -> nn.Module: return LLMWrapper(MiniCPMModel(config, cache_config=cache_config, - quant_config=quant_config), + quant_config=quant_config, + prefix=prefix), name="model") def init_vision_module( self, config: PretrainedConfig, quant_config: Optional[QuantizationConfig], + prefix: str = "", ) -> nn.Module: # TODO :refactor this vision model try: @@ -819,19 +829,23 @@ def init_llm( config: PretrainedConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ) -> nn.Module: return LLMWrapper(LlamaModel(config, cache_config=cache_config, - quant_config=quant_config), + quant_config=quant_config, + prefix=prefix), name="model") def init_vision_module( self, config: PretrainedConfig, quant_config: Optional[QuantizationConfig], + prefix: str = "", ) -> nn.Module: model = Idefics2VisionTransformer(config.vision_config, - quant_config=quant_config) + quant_config=quant_config, + prefix=prefix) if self.config.drop_vision_last_layer: model.encoder.layers = model.encoder.layers[:-1] return model @@ -935,20 +949,24 @@ def init_llm( config: PretrainedConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ) -> nn.Module: return LLMWrapper(Qwen2Model(config, cache_config=cache_config, - quant_config=quant_config), + quant_config=quant_config, + prefix=prefix), name="model") def init_vision_module( self, config: PretrainedConfig, quant_config: Optional[QuantizationConfig], + prefix: str = "", ) -> nn.Module: model = Idefics2VisionTransformer(config.vision_config, - quant_config=quant_config) + quant_config=quant_config, + prefix=prefix) if self.config.drop_vision_last_layer: model.encoder.layers = model.encoder.layers[:-1] return model diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index 37c3fa919124e..10cca8b56268a 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -43,7 +43,8 @@ from .interfaces import SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) class OPTLearnedPositionalEmbedding(nn.Embedding): @@ -68,6 +69,7 @@ def __init__( bias: bool = True, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ) -> None: super().__init__() self.embed_dim = embed_dim @@ -85,18 +87,21 @@ def __init__( total_num_heads, bias=bias, quant_config=quant_config, + 
prefix=f"{prefix}.qkv_proj", ) self.out_proj = RowParallelLinear( embed_dim, embed_dim, bias=bias, quant_config=quant_config, + prefix=f"{prefix}.out_proj", ) self.attn = Attention(self.num_heads, self.head_dim, scale=self.scaling, cache_config=cache_config, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.attn") def forward( self, @@ -118,6 +123,7 @@ def __init__( config: OPTConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() self.config = config @@ -128,6 +134,7 @@ def __init__( bias=config.enable_bias, cache_config=cache_config, quant_config=quant_config, + prefix=f"{prefix}.self_attn", ) self.do_layer_norm_before = config.do_layer_norm_before @@ -139,6 +146,7 @@ def __init__( config.ffn_dim, bias=config.enable_bias, quant_config=quant_config, + prefix=f"{prefix}.fc1", ) self.activation_fn = get_act_fn(config.activation_function, quant_config, config.ffn_dim) @@ -147,6 +155,7 @@ def __init__( self.embed_dim, bias=config.enable_bias, quant_config=quant_config, + prefix=f"{prefix}.fc2", ) self.final_layer_norm = nn.LayerNorm( self.embed_dim, @@ -214,7 +223,8 @@ def __init__( self.project_out = ReplicatedLinear(config.hidden_size, config.word_embed_proj_dim, bias=False, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.project_out") else: self.project_out = None @@ -222,7 +232,8 @@ def __init__( self.project_in = ReplicatedLinear(config.word_embed_proj_dim, config.hidden_size, bias=False, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.project_in") else: self.project_in = None @@ -239,7 +250,8 @@ def __init__( self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, - lambda prefix: OPTDecoderLayer(config, cache_config, quant_config), + lambda prefix: OPTDecoderLayer( + config, cache_config, quant_config, prefix=prefix), prefix=f"{prefix}.layers") def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: @@ -288,9 +300,13 @@ def __init__( config: OPTConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() - self.decoder = OPTDecoder(config, cache_config, quant_config) + self.decoder = OPTDecoder(config, + cache_config, + quant_config, + prefix=f"{prefix}.decoder") self.make_empty_intermediate_tensors = ( make_empty_intermediate_tensors_factory(["hidden_states"], config.hidden_size)) @@ -335,11 +351,15 @@ def __init__( config: OPTConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() self.config = config self.quant_config = quant_config - self.model = OPTModel(config, cache_config, quant_config) + self.model = OPTModel(config, + cache_config, + quant_config, + prefix=maybe_prefix(prefix, "model")) if self.config.tie_word_embeddings: self.lm_head = self.model.decoder.embed_tokens else: diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index 7a62a098a4525..8e29c6079b994 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -143,14 +143,17 @@ def __init__(self, self.multimodal_config = multimodal_config self.vision_tower = SiglipVisionModel(config.vision_config, - quant_config) + quant_config, + prefix="vision_tower") self.multi_modal_projector = PaliGemmaMultiModalProjector( 
vision_hidden_size=config.vision_config.hidden_size, projection_dim=config.vision_config.projection_dim) self.quant_config = quant_config self.language_model = GemmaForCausalLM(config.text_config, - cache_config, quant_config) + cache_config, + quant_config, + prefix="language_model") logit_scale = getattr(config, "logit_scale", 1.0) self.language_model.logits_processor.scale *= logit_scale diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 855a9b17585a4..0962d3d3847c9 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -71,7 +71,8 @@ def _init_img_processor(hf_config: PretrainedConfig, - quant_config: Optional[QuantizationConfig]): + quant_config: Optional[QuantizationConfig], + prefix: str = "") -> CLIPVisionModel: clip_config = CLIP_VIT_LARGE_PATCH14_336_CONFIG layer_idx = hf_config.img_processor.get('layer_idx', -2) @@ -86,6 +87,7 @@ def _init_img_processor(hf_config: PretrainedConfig, clip_config, quant_config, num_hidden_layers_override=num_hidden_layers, + prefix=prefix, ) return img_processor @@ -152,15 +154,18 @@ def get_img_features(self, class Phi3HDImageEmbedding(Phi3ImageEmbeddingBase): """Phi3 Image embedding with HD transform.""" - def __init__(self, config: PretrainedConfig, - quant_config: Optional[QuantizationConfig]) -> None: + def __init__(self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig], + prefix: str = "") -> None: super().__init__() # n_embed or hidden_size hidden_size = config.n_embd if hasattr( config, 'n_embd') else config.hidden_size - self.img_processor = _init_img_processor(config, quant_config) + self.img_processor = _init_img_processor( + config, quant_config, prefix=f"{prefix}.img_processor") image_dim_out = config.img_processor['image_dim_out'] self.num_img_tokens = config.img_processor['num_img_tokens'] @@ -537,11 +542,15 @@ def __init__(self, config.hidden_size, org_num_embeddings=config.vocab_size, quant_config=quant_config, + prefix="model.embed_tokens", ) # TODO: Optionally initializes this for supporting input embeddings. 
- self.vision_embed_tokens = Phi3HDImageEmbedding(config, quant_config) + self.vision_embed_tokens = Phi3HDImageEmbedding( + config, quant_config, prefix="model.vision_embed_tokens") + # The prefix is empty intentionally because default prefix of + # LlamaForCausalLM is "model" self.language_model = LlamaForCausalLM(config, cache_config, quant_config) diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index a9dbb3823743a..6b53bf5660096 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -164,7 +164,10 @@ def __init__(self, # init MistralForCausalLM self.language_model = init_vllm_registered_model( - config.text_config, cache_config, quant_config) + config.text_config, + cache_config, + quant_config, + prefix="language_model") self.vision_encoder = VisionTransformer(self.vision_args) self.vision_language_adapter = VisionLanguageAdapter( diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 23eb1482ffef1..db1029345a8ac 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -49,7 +49,8 @@ from .interfaces import SupportsLoRA, SupportsPP from .utils import (AutoWeightsLoader, PPMissingLayer, is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) class Qwen2MLP(nn.Module): @@ -60,16 +61,23 @@ def __init__( intermediate_size: int, hidden_act: str, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ) -> None: super().__init__() self.gate_up_proj = MergedColumnParallelLinear( - hidden_size, [intermediate_size] * 2, + hidden_size, + [intermediate_size] * 2, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.gate_up_proj", + ) + self.down_proj = RowParallelLinear( + intermediate_size, + hidden_size, bias=False, - quant_config=quant_config) - self.down_proj = RowParallelLinear(intermediate_size, - hidden_size, - bias=False, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.down_proj", + ) if hidden_act != "silu": raise ValueError(f"Unsupported activation: {hidden_act}. 
" "Only silu is supported for now.") @@ -92,7 +100,8 @@ def __init__(self, rope_theta: float = 10000, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, - rope_scaling: Optional[Tuple] = None) -> None: + rope_scaling: Optional[Tuple] = None, + prefix: str = "") -> None: super().__init__() self.hidden_size = hidden_size tp_size = get_tensor_model_parallel_world_size() @@ -122,12 +131,14 @@ def __init__(self, self.total_num_kv_heads, bias=True, quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", ) self.o_proj = RowParallelLinear( self.total_num_heads * self.head_dim, hidden_size, bias=False, quant_config=quant_config, + prefix=f"{prefix}.o_proj", ) self.rotary_emb = get_rope( @@ -142,7 +153,8 @@ def __init__(self, self.scaling, num_kv_heads=self.num_kv_heads, cache_config=cache_config, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.attn") def forward( self, @@ -166,6 +178,7 @@ def __init__( config: Qwen2Config, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ) -> None: super().__init__() self.hidden_size = config.hidden_size @@ -180,12 +193,15 @@ def __init__( rope_theta=rope_theta, cache_config=cache_config, quant_config=quant_config, - rope_scaling=rope_scaling) + rope_scaling=rope_scaling, + prefix=f"{prefix}.self_attn", + ) self.mlp = Qwen2MLP( hidden_size=self.hidden_size, intermediate_size=config.intermediate_size, hidden_act=config.hidden_act, quant_config=quant_config, + prefix=f"{prefix}.mlp", ) self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -241,6 +257,7 @@ def __init__( config.vocab_size, config.hidden_size, quant_config=quant_config, + prefix=f"{prefix}.embed_tokens", ) else: self.embed_tokens = PPMissingLayer() @@ -249,7 +266,8 @@ def __init__( config.num_hidden_layers, lambda prefix: Qwen2DecoderLayer(config=config, cache_config=cache_config, - quant_config=quant_config), + quant_config=quant_config, + prefix=f"{prefix}.layers"), prefix=f"{prefix}.layers", ) @@ -393,6 +411,7 @@ def __init__( cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, lora_config: Optional[LoRAConfig] = None, + prefix: str = "", ) -> None: # TODO (@robertgshaw2): see if this can be moved out if (cache_config.sliding_window is not None @@ -412,14 +431,19 @@ def __init__( self.lora_config = lora_config self.quant_config = quant_config - self.model = Qwen2Model(config, cache_config, quant_config) + self.model = Qwen2Model(config, + cache_config, + quant_config, + prefix=maybe_prefix(prefix, "model")) if config.tie_word_embeddings: self.lm_head = self.model.embed_tokens else: self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, - quant_config=quant_config) + quant_config=quant_config, + prefix=maybe_prefix( + prefix, "lm_head")) self.logits_processor = LogitsProcessor(config.vocab_size) self.sampler = Sampler() diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 4e60fe70b25f1..633d66b4af31a 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -938,7 +938,10 @@ def __init__(self, quant_config=None, ) - self.model = Qwen2Model(config, cache_config, quant_config) + self.model = Qwen2Model(config, + cache_config, + quant_config, + prefix="model") if get_pp_group().is_last_rank: if config.tie_word_embeddings: @@ -946,7 +949,8 @@ def __init__(self, else: self.lm_head = 
ParallelLMHead(config.vocab_size, config.hidden_size, - quant_config=quant_config) + quant_config=quant_config, + prefix="lm_head") else: self.lm_head = PPMissingLayer() diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 5f33b872beecb..f08e4aa355086 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -357,7 +357,10 @@ def __init__(self, )) self.multi_modal_projector = UltravoxProjector(config) self.language_model = init_vllm_registered_model( - config.text_config, cache_config, quant_config) + config.text_config, + cache_config, + quant_config, + prefix="language_model") if config.text_model_id is not None: self.secondary_weights.append( DefaultModelLoader.Source(model_or_path=config.text_model_id, diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 6995f5805c5e1..0aecb5d151a45 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -242,6 +242,7 @@ def init_vllm_registered_model( lora_config: Optional[LoRAConfig] = None, multimodal_config: Optional[MultiModalConfig] = None, scheduler_config: Optional[SchedulerConfig] = None, + prefix: str = "", ) -> nn.Module: """ Helper function to initialize an inner model registered to vLLM, @@ -257,6 +258,7 @@ def init_vllm_registered_model( lora_config=lora_config, multimodal_config=multimodal_config, scheduler_config=scheduler_config, + prefix=prefix, ) @@ -610,3 +612,16 @@ def get_vit_attn_backend() -> _Backend: else: selected_backend = _Backend.XFORMERS return selected_backend + + +def maybe_prefix(prefix: str, name: str) -> str: + """Add a prefix to a name if the prefix is non-empty. + + Args: + prefix: The prefix to add. If empty, no prefix will be added. + name: The name to potentially prefix. + + Returns: + The string "prefix.name" if prefix was non-empty, otherwise just "name". 
+ """ + return name if not prefix else f"{prefix}.{name}" From 1ab6f6b4ad5c4aac6ee72e51b7f6712098f9ccff Mon Sep 17 00:00:00 2001 From: youkaichao Date: Tue, 29 Oct 2024 17:06:24 -0700 Subject: [PATCH 13/88] [core][distributed] fix custom allreduce in pytorch 2.5 (#9815) Signed-off-by: youkaichao --- .../device_communicators/custom_all_reduce.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/vllm/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py index 7de5b05a0b053..c3632aee6d11a 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce.py +++ b/vllm/distributed/device_communicators/custom_all_reduce.py @@ -191,8 +191,20 @@ def capture(self): def _get_ipc_meta(self, inp: torch.Tensor): data = inp.untyped_storage()._share_cuda_() + handle = data[1] + # https://github.com/pytorch/pytorch/pull/130890 changes + # the binary format of the ipc handle + # it starts from pytorch 2.5 + if len(handle) > 64: + assert len(handle) == 66 + # only support SHAREABLE_HANDLE_VERSION = 1 + assert int(handle[0]) == 1 + # only support SHAREABLE_CUDA_MALLOC = 'c' + assert handle[1] == ord("c") + handle = handle[2:] + # TODO: support expandable segment shard_data = ( - data[1], # ipc handle to base ptr + handle, # ipc handle to base ptr data[3], # offset of base ptr ) return self._gather_ipc_meta(shard_data) From 64cb1cdc3f3a6c0ca976d68b19d454122c720e6d Mon Sep 17 00:00:00 2001 From: Lily Liu Date: Tue, 29 Oct 2024 17:28:43 -0700 Subject: [PATCH 14/88] Update README.md (#9819) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8c8d6eb291cea..b75bfc5c699a7 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ Easy, fast, and cheap LLM serving for everyone --- -**vLLM x Snowfkale Meetup (Wednesday, November 13th, 5:30-8PM PT) at Snowfkale HQ, San Mateo** +**vLLM x Snowflake Meetup (Wednesday, November 13th, 5:30-8PM PT) at Snowflake HQ, San Mateo** We are excited to announce the last in-person vLLM meetup of the year! Join the vLLM developers and engineers from Snowflake AI Research to chat about the latest LLM inference optimizations and your 2025 vLLM wishlist! From 226688bd6114749633132b9ed074c59d50904830 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 29 Oct 2024 22:49:44 -0400 Subject: [PATCH 15/88] [Bugfix][VLM] Make apply_fp8_linear work with >2D input (#9812) --- .../layers/quantization/utils/w8a8_utils.py | 33 +++++++++++-------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index 1879d2855d93d..445117ac99a34 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -96,21 +96,26 @@ def apply_fp8_linear( # If dynamic, layer.input_scale is None and x_scale computed from x. # If static, layer.input_scale is scalar and x_scale is input_scale. 
+ # View input as 2D matrix for fp8 methods + input_2d = input.view(-1, input.shape[-1]) + output_shape = [*input.shape[:-1], weight.shape[1]] + # cutlass_scaled_mm supports per tensor/channel W and per tensor/token A if cutlass_fp8_supported: qinput, x_scale = ops.scaled_fp8_quant( - input, + input_2d, input_scale, scale_ub=input_scale_ub, use_per_token_if_dynamic=use_per_token_if_dynamic) # Fused GEMM_DQ - return ops.cutlass_scaled_mm(qinput, - weight, - out_dtype=input.dtype, - scale_a=x_scale, - scale_b=weight_scale, - bias=bias) + output = ops.cutlass_scaled_mm(qinput, + weight, + out_dtype=input.dtype, + scale_a=x_scale, + scale_b=weight_scale, + bias=bias) + return output.view(*output_shape) # torch.scaled_mm supports per tensor weights + activations only # so fallback to naive if per channel or per token @@ -119,7 +124,7 @@ def apply_fp8_linear( # for matrices with batch dimension > 16. # This could change in the future. qinput, x_scale = ops.scaled_fp8_quant( - input, + input_2d, input_scale, num_token_padding=17, use_per_token_if_dynamic=use_per_token_if_dynamic) @@ -138,8 +143,10 @@ def apply_fp8_linear( # A fix for discrepancy in scaled_mm which returns tuple # for torch < 2.5 and a single value in torch >= 2.5 if type(output) is tuple and len(output) == 2: - return torch.narrow(output[0], 0, 0, input.shape[0]) - return torch.narrow(output, 0, 0, input.shape[0]) + output = output[0] + + return torch.narrow(output, 0, 0, + input_2d.shape[0]).view(*output_shape) else: # Fallback for channelwise case, where we use unfused DQ @@ -176,15 +183,15 @@ def apply_fp8_linear( if type(output) is tuple and len(output) == 2: output = output[0] # Unpad (undo num_token_padding) - output = torch.narrow(output, 0, 0, input.shape[0]) - x_scale = torch.narrow(x_scale, 0, 0, input.shape[0]) + output = torch.narrow(output, 0, 0, input_2d.shape[0]) + x_scale = torch.narrow(x_scale, 0, 0, input_2d.shape[0]) # DQ # C = sw * sx * (X * W) + bias output = output * x_scale * weight_scale.t() if bias is not None: output = output + bias - return output.to(dtype=input.dtype) + return output.to(dtype=input.dtype).view(*output_shape) def apply_int8_linear( From 62fac4b9aab3c05124d83fcd71db5732774b17d8 Mon Sep 17 00:00:00 2001 From: "Kevin H. 
Luu" Date: Tue, 29 Oct 2024 17:34:55 -1000 Subject: [PATCH 16/88] [ci/build] Pin CI dependencies version with pip-compile (#9810) Signed-off-by: kevin --- Dockerfile.rocm | 2 + requirements-build.txt | 18 +- requirements-test.in | 37 +++ requirements-test.txt | 593 ++++++++++++++++++++++++++++++++++++++--- 4 files changed, 608 insertions(+), 42 deletions(-) create mode 100644 requirements-test.in diff --git a/Dockerfile.rocm b/Dockerfile.rocm index d35889f053e27..562117a313020 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -121,6 +121,8 @@ ARG GIT_REPO_CHECK=0 RUN --mount=type=bind,source=.git,target=.git \ if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi +RUN python3 -m pip install --upgrade pip + # Package upgrades for useful functionality or to avoid dependency issues RUN --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install --upgrade numba scipy huggingface-hub[cli] pytest-shard diff --git a/requirements-build.txt b/requirements-build.txt index ea2b688bb3108..7b16d9778c1a6 100644 --- a/requirements-build.txt +++ b/requirements-build.txt @@ -1,9 +1,9 @@ -# Should be mirrored in pyproject.toml -cmake>=3.26 -ninja -packaging -setuptools>=61 -setuptools-scm>=8 -torch==2.5.0 -wheel -jinja2 +# Should be mirrored in pyproject.toml +cmake>=3.26 +ninja +packaging +setuptools>=61 +setuptools-scm>=8 +torch==2.5.0 +wheel +jinja2 diff --git a/requirements-test.in b/requirements-test.in new file mode 100644 index 0000000000000..3881f2566b556 --- /dev/null +++ b/requirements-test.in @@ -0,0 +1,37 @@ +# testing +pytest +tensorizer>=2.9.0 +pytest-forked +pytest-asyncio +pytest-rerunfailures +pytest-shard + +# testing utils +awscli +einops # required for MPT, qwen-vl and Mamba +httpx +librosa # required for audio tests +opencv-python # required for video tests +peft +requests +ray[adag]==2.35 +sentence-transformers # required for embedding +soundfile # required for audio test +timm # required for internvl test +torch==2.5.0 +transformers_stream_generator # required for qwen-vl test +matplotlib # required for qwen-vl test +datamodel_code_generator # required for minicpm3 test +lm-eval[api]==0.4.4 # required for model evaluation test + +# TODO: Add this after fully implementing llava(mantis) +# git+https://github.com/TIGER-AI-Lab/Mantis.git # required for llava(mantis) test + +# Benchmarking +aiohttp + +# quantization +bitsandbytes>=0.44.0 +buildkite-test-collector==0.1.8 + +numpy < 2.0.0 diff --git a/requirements-test.txt b/requirements-test.txt index 9787fa2a4a486..c474c2ec34b22 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,34 +1,561 @@ -# testing -pytest -tensorizer>=2.9.0 -pytest-forked -pytest-asyncio -pytest-rerunfailures -pytest-shard - -# testing utils -awscli -einops # required for MPT, qwen-vl and Mamba -httpx -librosa # required for audio tests -opencv-python # required for video tests -peft -requests -ray[adag]==2.35 -sentence-transformers # required for embedding -soundfile # required for audio test -timm # required for internvl test -transformers_stream_generator # required for qwen-vl test -matplotlib # required for qwen-vl test -datamodel_code_generator # required for minicpm3 test -lm-eval[api]==0.4.4 # required for model evaluation test - -# TODO: Add this after fully implementing llava(mantis) -# git+https://github.com/TIGER-AI-Lab/Mantis.git # required for llava(mantis) test - -# Benchmarking -aiohttp - -# quantization -bitsandbytes>=0.44.0 +# +# This file is autogenerated by pip-compile with Python 3.12 +# by the following 
command: +# +# pip-compile --output-file=requirements-test.txt requirements-test.in +# +absl-py==2.1.0 + # via rouge-score +accelerate==1.0.1 + # via + # lm-eval + # peft +aiohappyeyeballs==2.4.3 + # via aiohttp +aiohttp==3.10.10 + # via + # -r requirements-test.in + # datasets + # fsspec + # lm-eval +aiosignal==1.3.1 + # via + # aiohttp + # ray +annotated-types==0.7.0 + # via pydantic +anyio==4.6.2.post1 + # via httpx +argcomplete==3.5.1 + # via datamodel-code-generator +attrs==24.2.0 + # via + # aiohttp + # jsonlines + # jsonschema + # referencing +audioread==3.0.1 + # via librosa +awscli==1.35.16 + # via -r requirements-test.in +bitsandbytes==0.44.1 + # via -r requirements-test.in +black==24.10.0 + # via datamodel-code-generator +boto3==1.35.50 + # via tensorizer +botocore==1.35.50 + # via + # awscli + # boto3 + # s3transfer buildkite-test-collector==0.1.8 + # via -r requirements-test.in +certifi==2024.8.30 + # via + # httpcore + # httpx + # requests +cffi==1.17.1 + # via soundfile +chardet==5.2.0 + # via mbstrdecoder +charset-normalizer==3.4.0 + # via requests +click==8.1.7 + # via + # black + # nltk + # ray +colorama==0.4.6 + # via + # awscli + # sacrebleu + # tqdm-multiprocess +contourpy==1.3.0 + # via matplotlib +cupy-cuda12x==13.3.0 + # via ray +cycler==0.12.1 + # via matplotlib +datamodel-code-generator==0.26.2 + # via -r requirements-test.in +dataproperty==1.0.1 + # via + # pytablewriter + # tabledata +datasets==3.0.2 + # via + # evaluate + # lm-eval +decorator==5.1.1 + # via librosa +dill==0.3.8 + # via + # datasets + # evaluate + # lm-eval + # multiprocess +dnspython==2.7.0 + # via email-validator +docutils==0.16 + # via awscli +einops==0.8.0 + # via -r requirements-test.in +email-validator==2.2.0 + # via pydantic +evaluate==0.4.3 + # via lm-eval +fastrlock==0.8.2 + # via cupy-cuda12x +filelock==3.16.1 + # via + # datasets + # huggingface-hub + # ray + # torch + # transformers + # triton +fonttools==4.54.1 + # via matplotlib +frozenlist==1.5.0 + # via + # aiohttp + # aiosignal + # ray +fsspec[http]==2024.9.0 + # via + # datasets + # evaluate + # huggingface-hub + # torch +genson==1.3.0 + # via datamodel-code-generator +h11==0.14.0 + # via httpcore +hiredis==3.0.0 + # via tensorizer +httpcore==1.0.6 + # via httpx +httpx==0.27.2 + # via -r requirements-test.in +huggingface-hub==0.26.2 + # via + # accelerate + # datasets + # evaluate + # peft + # sentence-transformers + # timm + # tokenizers + # transformers +idna==3.10 + # via + # anyio + # email-validator + # httpx + # requests + # yarl +inflect==5.6.2 + # via datamodel-code-generator +iniconfig==2.0.0 + # via pytest +isort==5.13.2 + # via datamodel-code-generator +jinja2==3.1.4 + # via + # datamodel-code-generator + # torch +jmespath==1.0.1 + # via + # boto3 + # botocore +joblib==1.4.2 + # via + # librosa + # nltk + # scikit-learn +jsonlines==4.0.0 + # via lm-eval +jsonschema==4.23.0 + # via ray +jsonschema-specifications==2024.10.1 + # via jsonschema +kiwisolver==1.4.7 + # via matplotlib +lazy-loader==0.4 + # via librosa +libnacl==2.1.0 + # via tensorizer +librosa==0.10.2.post1 + # via -r requirements-test.in +llvmlite==0.43.0 + # via numba +lm-eval[api]==0.4.4 + # via -r requirements-test.in +lxml==5.3.0 + # via sacrebleu +markupsafe==3.0.2 + # via jinja2 +matplotlib==3.9.2 + # via -r requirements-test.in +mbstrdecoder==1.1.3 + # via + # dataproperty + # pytablewriter + # typepy +more-itertools==10.5.0 + # via lm-eval +mpmath==1.3.0 + # via sympy +msgpack==1.1.0 + # via + # librosa + # ray +multidict==6.1.0 + # via + # 
aiohttp + # yarl +multiprocess==0.70.16 + # via + # datasets + # evaluate +mypy-extensions==1.0.0 + # via black +networkx==3.2.1 + # via torch +nltk==3.9.1 + # via rouge-score +numba==0.60.0 + # via librosa +numexpr==2.10.1 + # via lm-eval +numpy==1.26.4 + # via + # -r requirements-test.in + # accelerate + # bitsandbytes + # contourpy + # cupy-cuda12x + # datasets + # evaluate + # librosa + # matplotlib + # numba + # numexpr + # opencv-python + # pandas + # peft + # rouge-score + # sacrebleu + # scikit-learn + # scipy + # soxr + # tensorizer + # torchvision + # transformers +nvidia-cublas-cu12==12.4.5.8 + # via + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.4.127 + # via torch +nvidia-cuda-nvrtc-cu12==12.4.127 + # via torch +nvidia-cuda-runtime-cu12==12.4.127 + # via torch +nvidia-cudnn-cu12==9.1.0.70 + # via torch +nvidia-cufft-cu12==11.2.1.3 + # via torch +nvidia-curand-cu12==10.3.5.147 + # via torch +nvidia-cusolver-cu12==11.6.1.9 + # via torch +nvidia-cusparse-cu12==12.3.1.170 + # via + # nvidia-cusolver-cu12 + # torch +nvidia-nccl-cu12==2.21.5 + # via torch +nvidia-nvjitlink-cu12==12.4.127 + # via + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 + # torch +nvidia-nvtx-cu12==12.4.127 + # via torch +opencv-python==4.10.0.84 + # via -r requirements-test.in +packaging==24.1 + # via + # accelerate + # black + # datamodel-code-generator + # datasets + # evaluate + # huggingface-hub + # lazy-loader + # matplotlib + # peft + # pooch + # pytest + # pytest-rerunfailures + # ray + # transformers + # typepy +pandas==2.2.3 + # via + # datasets + # evaluate +pathspec==0.12.1 + # via black +pathvalidate==3.2.1 + # via pytablewriter +peft==0.13.2 + # via + # -r requirements-test.in + # lm-eval +pillow==11.0.0 + # via + # matplotlib + # sentence-transformers + # torchvision +platformdirs==4.3.6 + # via + # black + # pooch +pluggy==1.5.0 + # via pytest +pooch==1.8.2 + # via librosa +portalocker==2.10.1 + # via sacrebleu +propcache==0.2.0 + # via yarl +protobuf==5.28.3 + # via + # ray + # tensorizer +psutil==6.1.0 + # via + # accelerate + # peft + # tensorizer +py==1.11.0 + # via pytest-forked +pyarrow==18.0.0 + # via datasets +pyasn1==0.6.1 + # via rsa +pybind11==2.13.6 + # via lm-eval +pycparser==2.22 + # via cffi +pydantic[email]==2.9.2 + # via datamodel-code-generator +pydantic-core==2.23.4 + # via pydantic +pyparsing==3.2.0 + # via matplotlib +pytablewriter==1.2.0 + # via lm-eval +pytest==8.3.3 + # via + # -r requirements-test.in + # buildkite-test-collector + # pytest-asyncio + # pytest-forked + # pytest-rerunfailures + # pytest-shard +pytest-asyncio==0.24.0 + # via -r requirements-test.in +pytest-forked==1.6.0 + # via -r requirements-test.in +pytest-rerunfailures==14.0 + # via -r requirements-test.in +pytest-shard==0.1.2 + # via -r requirements-test.in +python-dateutil==2.9.0.post0 + # via + # botocore + # matplotlib + # pandas + # typepy +pytz==2024.2 + # via + # pandas + # typepy +pyyaml==6.0.2 + # via + # accelerate + # awscli + # datamodel-code-generator + # datasets + # huggingface-hub + # peft + # ray + # timm + # transformers +ray[adag]==2.35.0 + # via -r requirements-test.in +redis==5.2.0 + # via tensorizer +referencing==0.35.1 + # via + # jsonschema + # jsonschema-specifications +regex==2024.9.11 + # via + # nltk + # sacrebleu + # tiktoken + # transformers +requests==2.32.3 + # via + # -r requirements-test.in + # buildkite-test-collector + # datasets + # evaluate + # huggingface-hub + # lm-eval + # pooch + # ray + # tiktoken + # transformers 
+rouge-score==0.1.2 + # via lm-eval +rpds-py==0.20.0 + # via + # jsonschema + # referencing +rsa==4.7.2 + # via awscli +s3transfer==0.10.3 + # via + # awscli + # boto3 +sacrebleu==2.4.3 + # via lm-eval +safetensors==0.4.5 + # via + # accelerate + # peft + # timm + # transformers +scikit-learn==1.5.2 + # via + # librosa + # lm-eval + # sentence-transformers +scipy==1.13.1 + # via + # librosa + # scikit-learn + # sentence-transformers +sentence-transformers==3.2.1 + # via -r requirements-test.in +six==1.16.0 + # via + # python-dateutil + # rouge-score +sniffio==1.3.1 + # via + # anyio + # httpx +soundfile==0.12.1 + # via + # -r requirements-test.in + # librosa +soxr==0.5.0.post1 + # via librosa +sqlitedict==2.1.0 + # via lm-eval +sympy==1.13.1 + # via torch +tabledata==1.3.3 + # via pytablewriter +tabulate==0.9.0 + # via sacrebleu +tcolorpy==0.1.6 + # via pytablewriter +tenacity==9.0.0 + # via lm-eval +tensorizer==2.9.0 + # via -r requirements-test.in +threadpoolctl==3.5.0 + # via scikit-learn +tiktoken==0.8.0 + # via lm-eval +timm==1.0.11 + # via -r requirements-test.in +tokenizers==0.20.1 + # via transformers +torch==2.5.0 + # via + # -r requirements-test.in + # accelerate + # bitsandbytes + # lm-eval + # peft + # sentence-transformers + # tensorizer + # timm + # torchvision +torchvision==0.20.0 + # via timm +tqdm==4.66.6 + # via + # datasets + # evaluate + # huggingface-hub + # lm-eval + # nltk + # peft + # sentence-transformers + # tqdm-multiprocess + # transformers +tqdm-multiprocess==0.0.11 + # via lm-eval +transformers==4.45.2 + # via + # lm-eval + # peft + # sentence-transformers + # transformers-stream-generator +transformers-stream-generator==0.0.5 + # via -r requirements-test.in +triton==3.1.0 + # via torch +typepy[datetime]==1.3.2 + # via + # dataproperty + # pytablewriter + # tabledata +typing-extensions==4.12.2 + # via + # huggingface-hub + # librosa + # pydantic + # pydantic-core + # torch +tzdata==2024.2 + # via pandas +urllib3==1.26.20 + # via + # botocore + # requests +word2number==1.1 + # via lm-eval +xxhash==3.5.0 + # via + # datasets + # evaluate +yarl==1.17.0 + # via aiohttp +zstandard==0.23.0 + # via lm-eval + +# The following packages are considered to be unsafe in a requirements file: +# setuptools From 04a3ae0acae3d522299ec90b5730f876daa845e6 Mon Sep 17 00:00:00 2001 From: Yan Ma Date: Wed, 30 Oct 2024 12:34:45 +0800 Subject: [PATCH 17/88] [Bugfix] Fix multi nodes TP+PP for XPU (#8884) Signed-off-by: YiSheng5 Signed-off-by: yan ma Co-authored-by: YiSheng5 --- .../getting_started/xpu-installation.rst | 18 +++++++++++++++ requirements-xpu.txt | 2 +- vllm/distributed/parallel_state.py | 22 +++++++++++++++++++ vllm/executor/xpu_executor.py | 12 +++++++++- vllm/platforms/__init__.py | 3 +++ vllm/platforms/xpu.py | 4 ++++ vllm/worker/xpu_worker.py | 13 ++++------- 7 files changed, 63 insertions(+), 11 deletions(-) diff --git a/docs/source/getting_started/xpu-installation.rst b/docs/source/getting_started/xpu-installation.rst index 151ebb5f1811f..b1868acbc84b0 100644 --- a/docs/source/getting_started/xpu-installation.rst +++ b/docs/source/getting_started/xpu-installation.rst @@ -60,3 +60,21 @@ Build from source - FP16 is the default data type in the current XPU backend. The BF16 data type will be supported in the future. + +Distributed inference and serving +--------------------------------- + +XPU platform supports tensor-parallel inference/serving and also supports pipeline parallel as a beta feature for online serving. We requires Ray as the distributed runtime backend. 
For example, a reference execution looks like the following:
+
+.. code-block:: console
+
+    $ python -m vllm.entrypoints.openai.api_server \
+    $ --model=facebook/opt-13b \
+    $ --dtype=bfloat16 \
+    $ --device=xpu \
+    $ --max_model_len=1024 \
+    $ --distributed-executor-backend=ray \
+    $ --pipeline-parallel-size=2 \
+    $ -tp=8
+
+By default, a Ray instance will be launched automatically if no existing one is detected in the system, with ``num-gpus`` equal to ``parallel_config.world_size``. We recommend properly starting a Ray cluster before execution, referring to the helper `script `_.
diff --git a/requirements-xpu.txt b/requirements-xpu.txt
index ce83a178c618f..eb76a33dab5c2 100644
--- a/requirements-xpu.txt
+++ b/requirements-xpu.txt
@@ -13,4 +13,4 @@ torch == 2.3.1+cxx11.abi
 intel-extension-for-pytorch == 2.3.110+xpu
 oneccl_bind_pt == 2.3.100+xpu
 
-triton-xpu == 3.0.0b2
+triton-xpu == 3.0.0b1
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index ec39856b6f67c..b04bbc478534c 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -431,6 +431,28 @@ def gather(self,
         if dim < 0:
             # Convert negative dim to positive.
             dim += input_.dim()
+        # For xpu path, gather doesn't work properly together with ray
+        # cluster so we use all_gather instead for now.
+        if current_platform.is_xpu():
+            input_size = input_.size()
+            # Allocate output tensor.
+            output_tensor = torch.empty((world_size, ) + input_size,
+                                        dtype=input_.dtype,
+                                        device=input_.device)
+            # All-gather.
+            torch.distributed.all_gather_into_tensor(output_tensor,
+                                                     input_,
+                                                     group=self.device_group)
+            if self.rank_in_group == dst:
+                # Reshape
+                output_tensor = output_tensor.movedim(0, dim)
+                output_tensor = output_tensor.reshape(input_size[:dim] +
+                                                      (world_size *
+                                                       input_size[dim], ) +
+                                                      input_size[dim + 1:])
+            else:
+                output_tensor = None
+            return output_tensor
         # Allocate output tensor.
         if self.rank_in_group == dst:
             gather_list = [torch.empty_like(input_) for _ in range(world_size)]
diff --git a/vllm/executor/xpu_executor.py b/vllm/executor/xpu_executor.py
index bada56068507a..5f78993ddc4b4 100644
--- a/vllm/executor/xpu_executor.py
+++ b/vllm/executor/xpu_executor.py
@@ -44,7 +44,7 @@ def __init__(
         self.cache_config = cache_config
         self.load_config = load_config
         self.lora_config = lora_config
-        self.parallel_config = parallel_config
+        self.parallel_config = _verify_and_get_parallel_config(parallel_config)
         self.scheduler_config = scheduler_config
         self.device_config = device_config
         self.prompt_adapter_config = prompt_adapter_config
@@ -94,3 +94,13 @@ def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig:
                 "mode.")
         config.enforce_eager = True
     return config
+
+
+def _verify_and_get_parallel_config(config: ParallelConfig) -> ParallelConfig:
+    if (config.distributed_executor_backend is not None
+            and config.distributed_executor_backend != "ray"):
+        logger.warning(
+            "%s is not supported on XPU, fallback to ray distributed executor "
+            "backend.", config.distributed_executor_backend)
+        config.distributed_executor_backend = "ray"
+    return config
diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py
index 7e9f8b1297b80..524150920b854 100644
--- a/vllm/platforms/__init__.py
+++ b/vllm/platforms/__init__.py
@@ -45,6 +45,9 @@
 is_xpu = False
 
 try:
+    # installed IPEX if the machine has XPUs.
+ import intel_extension_for_pytorch # noqa: F401 + import oneccl_bindings_for_pytorch # noqa: F401 import torch if hasattr(torch, 'xpu') and torch.xpu.is_available(): is_xpu = True diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index d00e0dca84fff..106e8eddf458f 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -20,3 +20,7 @@ def get_device_name(device_id: int = 0) -> str: def get_device_total_memory(cls, device_id: int = 0) -> int: device_props = torch.xpu.get_device_properties(device_id) return device_props.total_memory + + @staticmethod + def inference_mode(): + return torch.no_grad() diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py index 917866f2d985b..c1d836bb0d318 100644 --- a/vllm/worker/xpu_worker.py +++ b/vllm/worker/xpu_worker.py @@ -14,7 +14,6 @@ SpeculativeConfig) from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment) -from vllm.distributed.parallel_state import get_pp_group from vllm.logger import init_logger from vllm.model_executor import set_random_seed from vllm.platforms import current_platform @@ -183,11 +182,10 @@ def init_worker_distributed_environment(self) -> None: # use sockets as default Level zero IPC exchange backend. By # default oneccl will use `drmfd` as mechanism which need extra # dependency (libdrm and drm headers) on your system. - ENV_CCL_ZE_IPC_EXCHANGE = os.getenv("CCL_ZE_IPC_EXCHANGE", - "sockets") + ENV_CCL_ATL_TRANSPORT = os.getenv("CCL_ATL_TRANSPORT", "ofi") ENV_LOCAL_WORLD_SIZE = os.getenv("LOCAL_WORLD_SIZE", str(parallel_config.world_size)) - os.environ['CCL_ZE_IPC_EXCHANGE'] = ENV_CCL_ZE_IPC_EXCHANGE + os.environ["CCL_ATL_TRANSPORT"] = ENV_CCL_ATL_TRANSPORT os.environ["LOCAL_WORLD_SIZE"] = ENV_LOCAL_WORLD_SIZE os.environ["LOCAL_RANK"] = str(self.local_rank) init_distributed_environment( @@ -200,8 +198,5 @@ def init_worker_distributed_environment(self) -> None: ensure_model_parallel_initialized( parallel_config.tensor_parallel_size, parallel_config.pipeline_parallel_size) - - if parallel_config.pipeline_parallel_size > 1: - # torch-ccl xpu need a collective API warm up - # before calling send/recv API - get_pp_group().all_reduce(torch.zeros(1).xpu()) + # global all_reduce needed for overall oneccl warm up + torch.distributed.all_reduce(torch.zeros(1).xpu()) From 7b0365efef35bb03aa94e0085199d20750409363 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Wed, 30 Oct 2024 01:22:23 -0400 Subject: [PATCH 18/88] [Doc] Add the DCO to CONTRIBUTING.md (#9803) Signed-off-by: Russell Bryant Co-authored-by: Michael Goin Co-authored-by: Cyrus Leung --- CONTRIBUTING.md | 12 +++++++++++- DCO | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) create mode 100644 DCO diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 5f79356bd32f7..b39fd75b5fb70 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -11,12 +11,14 @@ We also believe in the power of community support; thus, answering queries, offe Finally, one of the most impactful ways to support us is by raising awareness about vLLM. Talk about it in your blog posts and highlight how it's driving your incredible projects. Express your support on social media if you're using vLLM, or simply offer your appreciation by starring our repository! +## License + +See [LICENSE](LICENSE). ## Developing Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation. 
Check out the [building from source](https://docs.vllm.ai/en/latest/getting_started/installation.html#build-from-source) documentation for details. - ## Testing ```bash @@ -33,6 +35,14 @@ pytest tests/ ## Contribution Guidelines +### DCO and Signed-off-by + +When contributing changes to this project, you must agree to the [DCO](DCO). +Commits must include a `Signed-off-by:` header which certifies agreement with +the terms of the [DCO](DCO). + +Using `-s` with `git commit` will automatically add this header. + ### Issues If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible. diff --git a/DCO b/DCO new file mode 100644 index 0000000000000..49b8cb0549267 --- /dev/null +++ b/DCO @@ -0,0 +1,34 @@ +Developer Certificate of Origin +Version 1.1 + +Copyright (C) 2004, 2006 The Linux Foundation and its contributors. + +Everyone is permitted to copy and distribute verbatim copies of this +license document, but changing it is not allowed. + + +Developer's Certificate of Origin 1.1 + +By making a contribution to this project, I certify that: + +(a) The contribution was created in whole or in part by me and I + have the right to submit it under the open source license + indicated in the file; or + +(b) The contribution is based upon previous work that, to the best + of my knowledge, is covered under an appropriate open source + license and I have the right under that license to submit that + work with modifications, whether created in whole or in part + by me, under the same open source license (unless I am + permitted to submit under a different license), as indicated + in the file; or + +(c) The contribution was provided directly to me by some other + person who certified (a), (b) or (c) and I have not modified + it. + +(d) I understand and agree that this project and the contribution + are public and that a record of the contribution (including all + personal information I submit with it, including my sign-off) is + maintained indefinitely and may be redistributed consistent with + this project or the open source license(s) involved. 
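For reference, the `Signed-off-by:` requirement described in the CONTRIBUTING.md change above can be satisfied with standard git commands; the commit message below is only a placeholder, not part of any patch in this series:

```bash
# Create a new commit with a Signed-off-by: trailer taken from
# the configured user.name and user.email.
git commit -s -m "Fix typo in docs"

# Add a sign-off to the most recent commit without changing its message.
git commit --amend -s --no-edit
```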
From ff5ed6e1bcbd112a26f8eb43b6bfdbc5ec73726e Mon Sep 17 00:00:00 2001 From: youkaichao Date: Tue, 29 Oct 2024 23:03:49 -0700 Subject: [PATCH 19/88] [torch.compile] rework compile control with piecewise cudagraph (#9715) Signed-off-by: youkaichao --- .buildkite/test-pipeline.yaml | 3 + tests/compile/piecewise/__init__.py | 0 .../piecewise_compilation_config.json | 4 + tests/compile/piecewise/test_simple.py | 96 +++++ tests/compile/piecewise/test_toy_llama.py | 334 +++++++++++++++ tests/compile/test_full_graph.py | 2 +- tests/compile/utils.py | 18 +- vllm/compilation/backends.py | 384 ++++++++++++++---- vllm/compilation/config.py | 154 +++++++ vllm/compilation/counter.py | 30 ++ vllm/compilation/decorators.py | 10 +- vllm/compilation/levels.py | 3 +- vllm/envs.py | 5 + vllm/model_executor/custom_op.py | 4 +- vllm/platforms/tpu.py | 2 +- vllm/plugins/__init__.py | 15 +- vllm/utils.py | 25 ++ 17 files changed, 983 insertions(+), 106 deletions(-) create mode 100644 tests/compile/piecewise/__init__.py create mode 100644 tests/compile/piecewise/piecewise_compilation_config.json create mode 100644 tests/compile/piecewise/test_simple.py create mode 100644 tests/compile/piecewise/test_toy_llama.py create mode 100644 vllm/compilation/config.py create mode 100644 vllm/compilation/counter.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 8c98aa36ac0ff..ed847a7e3696b 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -229,6 +229,9 @@ steps: - tests/compile commands: - pytest -v -s compile/test_basic_correctness.py + # these tests need to be separated, cannot combine + - pytest -v -s compile/piecewise/test_simple.py + - pytest -v -s compile/piecewise/test_toy_llama.py - label: "PyTorch Fullgraph Test" # 18min source_file_dependencies: diff --git a/tests/compile/piecewise/__init__.py b/tests/compile/piecewise/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/compile/piecewise/piecewise_compilation_config.json b/tests/compile/piecewise/piecewise_compilation_config.json new file mode 100644 index 0000000000000..03d077b76f627 --- /dev/null +++ b/tests/compile/piecewise/piecewise_compilation_config.json @@ -0,0 +1,4 @@ +{ + "use_cudagraph": true, + "non_cudagraph_ops": ["silly.attention"] +} \ No newline at end of file diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py new file mode 100644 index 0000000000000..a34d33efba1d8 --- /dev/null +++ b/tests/compile/piecewise/test_simple.py @@ -0,0 +1,96 @@ +""" +Test the piecewise compilation with a simple model so that we +can exactly calculate the expected output and side effects. 
+""" +import os + +import torch +from torch import nn + +from vllm.compilation.compile_context import set_compile_context +from vllm.compilation.counter import compilation_counter +from vllm.compilation.decorators import support_torch_compile +from vllm.compilation.levels import CompilationLevel + +os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(CompilationLevel.PIECEWISE) + +global_counter = 0 + + +@torch.library.custom_op("silly::attention", mutates_args=["out"]) +def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, + out: torch.Tensor) -> None: + global global_counter + global_counter += 1 + print(f"{global_counter=}") + out.copy_(q) + out[0] += 1 + + +@silly_attention.register_fake +def _(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, + out: torch.Tensor) -> None: + return + + +@support_torch_compile +class SillyModel(nn.Module): + + def __init__(self) -> None: + super().__init__() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + Overall effect: + x += 1 + x[0] += 2 + global_counter += 2 + """ + x = x + 1 + x = x + 2 + out = torch.empty_like(x) + torch.ops.silly.attention(x, x, x, out) + x = out + x = x - 2 + x = x - 1 + out = torch.empty_like(x) + torch.ops.silly.attention(x, x, x, out) + x = out + x = x + 1 + return x + + +def test_simple_piecewise_compile(): + + model = SillyModel() + + directory = os.path.dirname(__file__) + config = os.path.join(directory, "piecewise_compilation_config.json") + os.environ["VLLM_TORCH_COMPILE_CONFIG"] = config + + input_buffer = torch.randn(100).cuda() + + with compilation_counter.expect( + num_graphs_seen=1, # one graph for the model + num_piecewise_graphs_seen=5, # 2 * num_layers + 1 + num_piecewise_capturable_graphs_seen=3, # 1 + num_layers + num_inductor_compilations=3, # num_piecewise_capturable_graphs_seen + num_cudagraph_caputured= + 6, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + ): + + with set_compile_context([1, 2]): + model(input_buffer) + + model(input_buffer[:2]) + model(input_buffer[:1]) + + input_buffer[:2].zero_() + global global_counter + global_counter = 0 + output = model(input_buffer[:2]) + assert global_counter == 2 + assert torch.allclose(output.cpu(), torch.tensor([3., 1.])) + + # clean up to avoid side effects for other tests + del os.environ["VLLM_TORCH_COMPILE_CONFIG"] diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py new file mode 100644 index 0000000000000..db6a983d70feb --- /dev/null +++ b/tests/compile/piecewise/test_toy_llama.py @@ -0,0 +1,334 @@ +""" +Test the piecewise compilation with a simple model, comparing the output +with and without the piecewise compilation. 
+""" +import os +from dataclasses import dataclass +from typing import Optional, Tuple + +import torch +from torch import nn + +from vllm.compilation.compile_context import set_compile_context +from vllm.compilation.config import CompilationConfig +from vllm.compilation.counter import compilation_counter +from vllm.compilation.decorators import support_torch_compile +from vllm.compilation.levels import CompilationLevel +from vllm.plugins import set_compilation_config + + +@torch.library.custom_op("silly::attention", mutates_args=["out"]) +def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, + out: torch.Tensor) -> None: + out.copy_(q) + out += k + out += v + + +@silly_attention.register_fake +def _(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, + out: torch.Tensor) -> None: + return + + +@dataclass +class LlamaConfig: + hidden_size: int = 128 + mlp_size: int = 256 + vocab_size: int = 128 + num_layers: int = 2 + + +class LlamaMLP(nn.Module): + + def __init__(self, config: LlamaConfig) -> None: + super().__init__() + self.gate_up_projection = nn.Linear( + in_features=config.hidden_size, + out_features=config.mlp_size * 2, + bias=False, + ) + self.down_projection = nn.Linear( + in_features=config.mlp_size, + out_features=config.hidden_size, + bias=False, + ) + + self.gate_up_projection.weight.data.fill_(0.0) + self.down_projection.weight.data.fill_(0.0) + + def forward(self, x): + x = self.gate_up_projection(x) + x = x[:, :x.size(1) // 2] * torch.nn.functional.relu( + x[:, x.size(1) // 2:]) + x = self.down_projection(x) + return x + + +class LlamaAttention(nn.Module): + + def __init__(self, config: LlamaConfig) -> None: + super().__init__() + self.qkv_projection = nn.Linear( + in_features=config.hidden_size, + out_features=config.hidden_size * 3, + ) + + self.output_projection = nn.Linear( + in_features=config.hidden_size, + out_features=config.hidden_size, + ) + + self.qkv_projection.weight.data.fill_(0.0) + self.output_projection.weight.data.fill_(0.0) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + ) -> torch.Tensor: + qkv = self.qkv_projection(hidden_states) + hidden_size = qkv.size(-1) // 3 + q, k, v = qkv.split([hidden_size, hidden_size, hidden_size], dim=-1) + + q = q + positions.unsqueeze(1) + k = k + positions.unsqueeze(1) + + attn_output = torch.empty_like(q) + torch.ops.silly.attention(q, k, v, attn_output) + + output = self.output_projection(attn_output) + return output + + +class LlamaDecoderLayer(nn.Module): + + def __init__(self, config: LlamaConfig) -> None: + super().__init__() + self.self_attention = LlamaAttention(config) + self.mlp = LlamaMLP(config) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + residual: Optional[torch.Tensor], + ) -> Tuple[torch.Tensor, torch.Tensor]: + if residual is None: + residual = hidden_states + hidden_states = hidden_states / 2 + else: + hidden_states = hidden_states + residual + residual = hidden_states + hidden_states = hidden_states / 2 + + hidden_states = self.self_attention(positions=positions, + hidden_states=hidden_states) + + hidden_states = hidden_states + residual + residual = hidden_states + hidden_states = hidden_states / 2 + hidden_states = self.mlp(hidden_states) + + return hidden_states, residual + + +class LlamaModel(nn.Module): + + def __init__(self, config: LlamaConfig) -> None: + super().__init__() + self.embedding_tokens = nn.Embedding( + num_embeddings=config.vocab_size, + embedding_dim=config.hidden_size, + ) + self.layers = 
nn.ModuleList( + [LlamaDecoderLayer(config) for _ in range(config.num_layers)]) + + self.embedding_tokens.weight.data.fill_(0.0) + + def forward( + self, + input_ids: Optional[torch.Tensor], + positions: torch.Tensor, + ) -> torch.Tensor: + hidden_states = self.embedding_tokens(input_ids) + residual = None + for layer in self.layers: + hidden_states, residual = layer(positions, hidden_states, residual) + return hidden_states + + +@torch.inference_mode +def run_model(llama_config, + use_compile: bool, + split_attn: bool = False) -> torch.Tensor: + + if use_compile: + os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str( + CompilationLevel.PIECEWISE) + + if split_attn: + set_compilation_config( + CompilationConfig( + use_cudagraph=True, + non_cudagraph_ops=["silly.attention"], + )) + else: + set_compilation_config(CompilationConfig(use_cudagraph=True, )) + else: + os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str( + CompilationLevel.NO_COMPILATION) + set_compilation_config(None) + + cls = LlamaModel + if use_compile: + cls = support_torch_compile(LlamaModel) + model = cls(llama_config).eval().cuda() + + B = 16 # max batch size + input_ids = torch.randint(0, llama_config.vocab_size, (B, )).cuda() + positions = torch.arange(B).cuda() + + with set_compile_context([1, 2]): + model(input_ids, positions) + model(input_ids[:2], positions[:2]) + model(input_ids[:1], positions[:1]) + + input_ids[:2].zero_() + output = model(input_ids[:2], positions[:2]) + + # manual cleanup + del os.environ["VLLM_TORCH_COMPILE_LEVEL"] + set_compilation_config(None) + + return output.cpu() + + +def test_toy_llama(): + # compare output with and without piecewise compilation + + llama_config = LlamaConfig(hidden_size=128, + mlp_size=256, + vocab_size=128, + num_layers=2) + + outputs = [] + with compilation_counter.expect( + num_graphs_seen=0, + num_piecewise_graphs_seen=0, + num_piecewise_capturable_graphs_seen=0, + num_inductor_compilations=0, + num_cudagraph_caputured=0, + ): + outputs.append(run_model(llama_config, use_compile=False)) + with compilation_counter.expect( + num_graphs_seen=1, # one graph for the model + num_piecewise_graphs_seen=1, + num_piecewise_capturable_graphs_seen=1, + num_inductor_compilations=1, # num_piecewise_capturable_graphs_seen + num_cudagraph_caputured= + 2, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + ): + outputs.append(run_model(llama_config, use_compile=True)) + + with compilation_counter.expect( + num_graphs_seen=1, # one graph for the model + num_piecewise_graphs_seen=2 * llama_config.num_layers + + 1, # 2 * num_layers + 1 + num_piecewise_capturable_graphs_seen=1 + + llama_config.num_layers, # 1 + num_layers + num_inductor_compilations=1 + + llama_config.num_layers, # num_piecewise_capturable_graphs_seen + num_cudagraph_caputured=2 * + (1 + llama_config.num_layers + ), # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + ): + outputs.append( + run_model(llama_config, use_compile=True, split_attn=True)) + + for i in range(1, len(outputs)): + assert torch.allclose(outputs[0], outputs[i]) + + +@torch.inference_mode +def benchmark(): + os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(CompilationLevel.PIECEWISE) + from triton.testing import do_bench + cls = support_torch_compile(LlamaModel) + + # similar to llama 3.1-8B + llama_config = LlamaConfig(hidden_size=4096, + mlp_size=14336, + vocab_size=128 * 1024, + num_layers=32) + + # a tiny model to measure the overhead + # of piecewise cudagraph + llama_config = LlamaConfig(hidden_size=40, + mlp_size=80, + vocab_size=128, + 
num_layers=2) + + cudagraph_sizes = [1, 2, 4] + [i * 8 for i in range(1, 33)] + + eager_time = {} + full_cudagraph_time = {} + piecewise_cudagraph_time = {} + + pool = torch.cuda.graph_pool_handle() + + for piecewise in [False, True]: + if piecewise: + set_compilation_config( + CompilationConfig( + use_cudagraph=True, + non_cudagraph_ops=["silly.attention"], + )) + else: + set_compilation_config(None) + + model = cls(llama_config).eval().cuda().to(torch.bfloat16) + + B = 256 # max batch size + input_ids = torch.randint(0, llama_config.vocab_size, (B, )).cuda() + positions = torch.arange(B).cuda().to(torch.bfloat16) + + graphs = {} + + with set_compile_context(cudagraph_sizes): + model(input_ids, positions) + for b in cudagraph_sizes[::-1]: + if not piecewise: + graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(graph, pool=pool): + output = model(input_ids[:b], positions[:b]) + graphs[b] = (graph, output) + else: + output = model(input_ids[:b], positions[:b]) + graphs[b] = (model, output) + for b in cudagraph_sizes: + if piecewise: + # noqa is for `Function definition does not bind loop variable` + # it will be problematic if we save the created lambda function + # and use it later, because it will look up the name `b` in the + # enclosing scope, and the value of `b` will always be 256. + # it is fine here, because we only use the lambda function once. + runtime = do_bench(lambda: graphs[b][0] # noqa + (input_ids[:b], positions[:b])) # noqa + piecewise_cudagraph_time[b] = runtime + else: + runtime = do_bench(lambda: graphs[b][0].replay()) # noqa + eager_runtime = do_bench( + lambda: model(input_ids[:b], positions[:b])) # noqa + full_cudagraph_time[b] = runtime + eager_time[b] = eager_runtime + + # print in tabular format + print("batch size\teager mode\tfull cudagraph\tpiecewise cudagraph") + for b in cudagraph_sizes: + print((f"{b}\t{eager_time[b]:.3f}\t{full_cudagraph_time[b]:.3f}" + f"\t{piecewise_cudagraph_time[b]:.3f}")) + + +if __name__ == "__main__": + benchmark() diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py index f28f9145bb442..f00334934cb46 100644 --- a/tests/compile/test_full_graph.py +++ b/tests/compile/test_full_graph.py @@ -9,7 +9,7 @@ @pytest.mark.parametrize("model_info", TEST_MODELS) @pytest.mark.parametrize( "optimization_level", - [CompilationLevel.DYNAMO_ONCE, CompilationLevel.INDUCTOR]) + [CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE]) @fork_new_process_for_each_test def test_full_graph(model_info, optimization_level): model = model_info[0] diff --git a/tests/compile/utils.py b/tests/compile/utils.py index 64fc08e80de3b..95cad19126df6 100644 --- a/tests/compile/utils.py +++ b/tests/compile/utils.py @@ -9,17 +9,19 @@ TEST_MODELS = [ ("facebook/opt-125m", {}), - ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", { - "dtype": torch.float16, - "quantization": "compressed-tensors" - }), + # TODO: add fake implementation for compressed-tensors + # ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", { + # "dtype": torch.float16, + # "quantization": "compressed-tensors" + # }), ("neuralmagic/Meta-Llama-3-8B-Instruct-FP8", { "dtype": torch.float16, "quantization": "fp8" }), - ("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples", { - "quantization": "compressed-tensors" - }), + # TODO: add fake implementation for compressed-tensors + # ("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples", { + # "quantization": "compressed-tensors" + # }), ("meta-llama/Meta-Llama-3-8B", 
{}), ] @@ -73,7 +75,7 @@ def check_full_graph_support(model, # much memory. quantization = model_kwargs.get("quantization") if ((quantization == "fp8" or model == "meta-llama/Meta-Llama-3-8B") - and optimization_level >= CompilationLevel.INDUCTOR): + and optimization_level >= CompilationLevel.PIECEWISE): return prompts = [ diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 6d9832e2c39c0..10cf49e19eccc 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -1,13 +1,16 @@ import copy +import dataclasses import operator -from typing import Callable, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union import torch import torch.fx as fx from vllm.logger import init_logger +from vllm.utils import weak_ref_tensors -from .compile_context import get_compile_context +from .config import CompilationConfig +from .counter import compilation_counter from .levels import CompilationLevel logger = init_logger(__name__) @@ -157,113 +160,326 @@ def fix_functionalization(graph: fx.Graph): # print(graph.python_code(root_module="self", verbose=True).src, file=f) -def wrap_inductor(graph, example_inputs, additional_inductor_config): +def wrap_inductor(graph, + example_inputs, + additional_inductor_config, + do_logging=False, + runtime_shape: Optional[int] = None, + use_inductor: bool = True): + if not use_inductor: + return graph + + compilation_counter.num_inductor_compilations += 1 + + if do_logging: + if runtime_shape is None: + logger.info("Compiling a graph for general shape") + else: + logger.info("Compiling a graph for shape %s", runtime_shape) + from torch._inductor import config current_config = config.shallow_copy_dict() from torch._inductor.compile_fx import compile_fx if additional_inductor_config is not None: current_config.update(additional_inductor_config) - if current_config['post_grad_custom_post_pass'] is not None: - logger.warning( - "post_grad_custom_post_pass is already set in the config. " - "Overwriting it with the fix_functionalization") - current_config['post_grad_custom_post_pass'] = fix_functionalization + + # inductor can inplace modify the graph, so we need to copy it + # see https://github.com/pytorch/pytorch/issues/138980 + graph = copy.deepcopy(graph) return compile_fx(graph, example_inputs, config_patches=current_config) -def vllm_backend( +@dataclasses.dataclass +class SplitItem: + submod_name: str + is_splitting_graph: bool + graph: fx.GraphModule + + +def split_graph(graph: fx.GraphModule, + ops: List[str]) -> Tuple[fx.GraphModule, List[SplitItem]]: + # split graph by ops + subgraph_id = 0 + node_to_subgraph_id = {} + split_op_graphs = [] + for node in graph.graph.nodes: + if node.op in ("output", "placeholder"): + continue + if node.op == 'call_function' and str(node.target) in ops: + subgraph_id += 1 + node_to_subgraph_id[node] = subgraph_id + split_op_graphs.append(subgraph_id) + subgraph_id += 1 + else: + node_to_subgraph_id[node] = subgraph_id + + # `keep_original_order` is important! 
+ # otherwise pytorch might reorder the nodes and + # the semantics of the graph will change when we + # have mutations in the graph + split_gm = torch.fx.passes.split_module.split_module( graph, - example_inputs, - additional_inductor_config: Optional[Dict] = None) -> Callable: - - context = get_compile_context() - context = copy.deepcopy(context) if context is not None else [] - sizes_to_specialize: List[int] = context + None, + lambda node: node_to_subgraph_id[node], + keep_original_order=True) - # flags for all the seen shapes, whether we need to specialize - runtime_shapes_to_compile_flags: Dict[Tuple[int, ...], bool] = {} + outputs = [] - # if we need to specialize, the compiled graph for that shape - runtime_shapes_to_compiled_graph: Dict[Tuple[int, ...], Callable] = {} + # sort the names to make sure the order is deterministic + names = [name for (name, module) in split_gm.named_modules()] + names.sort() - # this is the first compilation, we will compile a graph with - # dynamic shape, as the caller will mark first dimension as dynamic - logger.info("Compiling a graph for general shapes") - graph_for_symbolic_shape = wrap_inductor(graph, example_inputs, - additional_inductor_config) + for name in names: + if "." in name or name == "": + # recursive child module or the root module + continue - # TODO: Dynamo does not pass all dynamic shapes. - # Need to investigate why. It works now because all the dynamic - # shapes have the same value, and either of them can be used. - sym_shape_indices = [ - i for i, x in enumerate(example_inputs) if isinstance(x, torch.SymInt) - ] + module = getattr(split_gm, name) - first_run = True + graph_id = int(name.replace("submod_", "")) + outputs.append(SplitItem(name, graph_id in split_op_graphs, module)) - # this is the function we return to Dynamo to run finally - def compiled_graph_wrapper(*args): + return split_gm, outputs - runtime_shapes: Tuple[int, - ...] = tuple(args[i] for i in sym_shape_indices) - nonlocal first_run - nonlocal runtime_shapes_to_compile_flags - nonlocal runtime_shapes_to_compiled_graph +class VllmBackend: + """The compilation backend for `torch.compile` with VLLM. + It is used for compilation level of `CompilationLevel.PIECEWISE`, + where we customize the compilation. - if first_run: - # the first compilation is for profiling, we directly run it - first_run = False - return graph_for_symbolic_shape(*args) - - if runtime_shapes not in runtime_shapes_to_compile_flags: - # we haven't seen this shape before - # query if we need to specialize for this shape - # we only specialize for the first dimension. - # TODO: investigate if any model needs to specialize - # beyond the first dimension - runtime_shapes_to_compile_flags[runtime_shapes] = runtime_shapes[ - 0] in sizes_to_specialize - - if not runtime_shapes_to_compile_flags[runtime_shapes]: - # we don't need to specialize for this shape - return graph_for_symbolic_shape(*args) + The major work of this backend is to split the graph into + piecewise graphs, and pass them to the piecewise backend. 
+ """ - if runtime_shapes not in runtime_shapes_to_compiled_graph: - # we need to specialize for this shape, and we haven't compiled - # compile the graph for this shape - logger.info("Compiling a graph for shapes %s", runtime_shapes) - runtime_shapes_to_compiled_graph[runtime_shapes] = wrap_inductor( - graph, args, additional_inductor_config) + compilation_configs: CompilationConfig + graph_pool: Any + _called: bool = False + # the graph we compiled + graph: fx.GraphModule + # the stiching graph module for all the piecewise graphs + split_gm: fx.GraphModule + piecewise_graphs: List[SplitItem] + returned_callable: Callable + + def __init__(self, ): + # every instance of VllmBackend has its own graph pool + self.graph_pool = torch.cuda.graph_pool_handle() + + # `torch.compile` is JIT compiled, so we don't need to + # do anything here + + def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: + + compilation_counter.num_graphs_seen += 1 + + # we control the compilation process, each instance can only be + # called once + assert not self._called, "VllmBackend can only be called once" + + self.graph = graph + # config is read now, because only here can + # we get the sizes to capture for cudagraph + # from compilation context + self.compilation_configs = CompilationConfig.select_and_init_config() + + self.split_gm, self.piecewise_graphs = split_graph( + graph, self.compilation_configs.non_cudagraph_ops) + + returned_callable: Callable # type: ignore + + if len(self.piecewise_graphs) == 0: + compilation_counter.num_piecewise_graphs_seen += 1 + compilation_counter.num_piecewise_capturable_graphs_seen += 1 + returned_callable = PiecewiseBackend(graph, + self.compilation_configs, + self.graph_pool, + is_first_graph=True) + else: + from torch._dynamo.utils import lazy_format_graph_code + logger.debug( + "%s", lazy_format_graph_code("stiching module", self.split_gm)) + + is_first_graph = True + + for item in self.piecewise_graphs: + compilation_counter.num_piecewise_graphs_seen += 1 + compilation_counter.num_piecewise_capturable_graphs_seen += not item.is_splitting_graph # noqa + if not item.is_splitting_graph: + # cannot setattr to a module, so we need to set + # the attribute in the __dict__ + self.split_gm.__dict__[ + item.submod_name] = PiecewiseBackend( + item.graph, self.compilation_configs, + self.graph_pool, is_first_graph) + is_first_graph = False + returned_callable = self.split_gm + + self.returned_callable = returned_callable + # trigger the first compilation + # code borrowed from https://github.com/pytorch/pytorch/blob/4e3e08b71171fa34172b2362ff668553fac75f27/torch/_dynamo/backends/distributed.py#L206 # noqa + # to turn the inputs into fake tensors + import torch._guards + from torch._guards import detect_fake_mode + fake_mode = detect_fake_mode(example_inputs) + fake_args = [] + for arg in example_inputs: + if isinstance(arg, torch.Tensor) and not isinstance( + arg, torch._subclasses.FakeTensor): + fake_args.append( + torch._dynamo.utils.to_fake_tensor(arg, fake_mode)) + else: + fake_args.append(arg) + self.returned_callable(*fake_args) + + self._called = True + + return self.returned_callable + + +@dataclasses.dataclass +class ConcreteSizeEntry: + runtime_shape: int + need_to_compile: bool # the size is in compile_sizes + use_cudagraph: bool # the size is in capture_sizes + + compiled: bool = False + runnable: Callable = None # type: ignore + num_finished_warmup: int = 0 + cudagraph: Optional[torch.cuda.CUDAGraph] = None + output: Optional[Any] = None + + +class 
PiecewiseBackend: + + def __init__(self, + graph: fx.GraphModule, + compilation_configs: CompilationConfig, + graph_pool: Any, + is_first_graph: bool = False): + """ + The backend for piecewise compilation. + It mainly handles the compilation and cudagraph capturing. + + We will compile `self.graph` once for the general shape, + and then compile for different shapes specified in + `compilation_configs.compile_sizes`. + + Independently, we will capture cudagraph for different shapes. + + If a shape needs both compilation and cudagraph, we will + compile it first, and then capture cudagraph. + """ + self.graph = graph + self.compilation_configs = compilation_configs + self.graph_pool = graph_pool + self.is_first_graph = is_first_graph + + self.compile_sizes: Set[int] = set( + self.compilation_configs.compile_sizes) + self.capture_sizes: Set[int] = set( + self.compilation_configs.capture_sizes + ) if self.compilation_configs.use_cudagraph else set() + + self.compile_finished = False + self.first_run_finished = False + + self.compiled_graph_for_general_shape: Callable = None # type: ignore + + self.sym_shape_indices: List[int] = [] + + # the entries for different shapes that we need to either + # compile or capture cudagraph + self.concrete_size_entries: Dict[int, ConcreteSizeEntry] = {} + for shape in self.compile_sizes.union(self.capture_sizes): + self.concrete_size_entries[shape] = ConcreteSizeEntry( + runtime_shape=shape, + need_to_compile=shape in self.compile_sizes, + use_cudagraph=shape in self.capture_sizes, + ) + + def __call__(self, *args) -> Any: + + if not self.compile_finished: + self.compile_finished = True + + # this is the first compilation, we will compile a graph with + # dynamic shape, as the caller will mark first dimension as dynamic + + self.sym_shape_indices = [ + i for i, x in enumerate(args) if isinstance(x, torch.SymInt) + ] + + self.compiled_graph_for_general_shape = wrap_inductor( + self.graph, + args, + self.compilation_configs.inductor_compile_config, + runtime_shape=None, + do_logging=self.is_first_graph, + use_inductor=self.compilation_configs.use_inductor) + + return self.graph(*args) + + if not self.first_run_finished: + self.first_run_finished = True + return self.compiled_graph_for_general_shape(*args) + + runtime_shape = args[self.sym_shape_indices[0]] + if runtime_shape not in self.concrete_size_entries: + # we don't need to do anything for this shape + return self.compiled_graph_for_general_shape(*args) + + entry = self.concrete_size_entries[runtime_shape] - return runtime_shapes_to_compiled_graph[runtime_shapes](*args) + if entry.runnable is None: + entry.runnable = self.compiled_graph_for_general_shape - return compiled_graph_wrapper + if entry.need_to_compile and not entry.compiled: + entry.compiled = True + # args are real arguments + entry.runnable = wrap_inductor( + self.graph, + args, + self.compilation_configs.inductor_compile_config, + runtime_shape=runtime_shape, + do_logging=self.is_first_graph, + use_inductor=self.compilation_configs.use_inductor) + + if not entry.use_cudagraph: + return entry.runnable(*args) + + if entry.cudagraph is None: + if entry.num_finished_warmup < self.compilation_configs.cudagraph_num_of_warmups: # noqa + entry.num_finished_warmup += 1 + if self.is_first_graph: + logger.debug( + "Warming up %s/%s for shape %s", + entry.num_finished_warmup, + self.compilation_configs.cudagraph_num_of_warmups, + runtime_shape) + return entry.runnable(*args) + + if self.is_first_graph: + logger.info("Capturing a cudagraph for shape %s", 
+ runtime_shape) + + cudagraph = torch.cuda.CUDAGraph() + with torch.cuda.graph(cudagraph, pool=self.graph_pool): + entry.output = weak_ref_tensors(entry.runnable(*args)) + + compilation_counter.num_cudagraph_caputured += 1 + + entry.cudagraph = cudagraph + return entry.output + + entry.cudagraph.replay() + return entry.output def select_default_backend(level: int) -> Union[str, Callable]: if level in [CompilationLevel.DYNAMO_AS_IS, CompilationLevel.DYNAMO_ONCE]: backend_str = "eager" return backend_str - assert level in [ - CompilationLevel.INDUCTOR, CompilationLevel.INDUCTOR_MAX_AUTOTUNE - ], f"Invalid level {level}" - - from vllm.compilation.backends import vllm_backend - from vllm.plugins import get_inductor_additional_configs - additional_configs = get_inductor_additional_configs() - - if level == CompilationLevel.INDUCTOR_MAX_AUTOTUNE: - if "max_autotune" in additional_configs and not additional_configs[ - "max_autotune"]: - logger.warning( - "max_autotune is disabled, but is overridden by level %s", - CompilationLevel.INDUCTOR_MAX_AUTOTUNE) - additional_configs['max_autotune'] = True - - from functools import partial - backend = partial(vllm_backend, - additional_inductor_config=additional_configs) - - return backend + assert level == CompilationLevel.PIECEWISE + + return VllmBackend() diff --git a/vllm/compilation/config.py b/vllm/compilation/config.py new file mode 100644 index 0000000000000..514f2b93ef64f --- /dev/null +++ b/vllm/compilation/config.py @@ -0,0 +1,154 @@ +import copy +from typing import Any, Dict, List, Optional + +from pydantic import BaseModel, Field, PrivateAttr + +import vllm.envs as envs +from vllm.logger import init_logger + +from .compile_context import get_compile_context + +logger = init_logger(__name__) + + +class CompilationConfig(BaseModel): + """ + Configuration for compilation. + It has two parts: + - CudaGraph capture: + - use_cudagraph: whether to use cudagraph inside compilation. + - False: cudagraph inside compilation is not used. + - True: cudagraph inside compilation is used. It requires + that all input buffers have fixed addresses. + Note that this is orthogonal to the cudagraph capture out + side of compilation. + TODO: move outside cudagraph logic into compilation. + torch.compile will handle cudagraph capture logic in the future. + - cudagraph_capture_sizes: sizes to capture cudagraph. + - None: capture sizes are inferred from compilation context. + - List[int]: capture sizes are specified. + - cudagraph_num_of_warmups: number of warmup runs for cudagraph. + It means the first several runs will be treated as warmup runs. + Only after that, the execution will be recorded, and the recorded + cudagraph will be used for subsequent runs. + - Inductor compilation: + - use_inductor: whether to use inductor compilation. + - False: inductor compilation is not used. graph runs in eager. + - True: inductor compilation is used. one graph for symbolic shape + is compiled. In addition, compile for different sizes specified + in inductor_compile_sizes, using configurations + in inductor_compile_config. + - inductor_compile_sizes: sizes to compile for inductor. + - inductor_specialize_for_cudagraph_no_more_than: an optional integer + to specialize inductor for cudagraph sizes no more than the + specified size. It is useful when we want to specialize inductor + with a subset of cudagraph sizes. + - inductor_compile_config: additional configurations for inductor. + - None: use default configurations. + - inductor_passes: additional passes for inductor. 
It is a dictionary + from pass name to pass function qualified name. We use function + name because the config uses json format. If we pass the config + from Python, functions can also be passed directly via Python object + constructor, e.g. `CompilationConfig(inductor_passes={"a": func})` + + Why we have different sizes for cudagraph and inductor: + - cudagraph: a cudagraph captured for a specific size can only be used + for the same size. We need to capture all the sizes we want to use. + - inductor: a graph compiled by inductor for a general shape can be used + for different sizes. Inductor can also compile for specific sizes, + where it can have more information to optimize the graph with fully + static shapes. However, we find the general shape compilation is + sufficient for most cases. It might be beneficial to compile for + certain small batchsizes, where inductor is good at optimizing. + """ + use_inductor: bool = True + inductor_specialize_for_cudagraph_no_more_than: Optional[int] = None + inductor_compile_sizes: Optional[List[int]] = Field(default_factory=dict) + inductor_compile_config: Dict = Field(default_factory=dict) + inductor_passes: Dict[str, str] = Field(default_factory=dict) + + use_cudagraph: bool = False + non_cudagraph_ops: List[str] = Field(default_factory=list) + cudagraph_num_of_warmups: int = 0 + cudagraph_capture_sizes: Optional[List[int]] = None + + # not configurable, computed after init + compile_sizes: List[int] = PrivateAttr + capture_sizes: List[int] = PrivateAttr + + def model_post_init(self, __context: Any) -> None: + for k, v in self.inductor_passes.items(): + if not isinstance(v, str): + assert callable(v), ( + f"pass {k} should be a function or a qualified name") + self.inductor_passes[k] = v + continue + + # resolve function from qualified name + names = v.split(".") + module = ".".join(names[:-1]) + func_name = names[-1] + func = __import__(module).__dict__[func_name] + self.inductor_compile_config[k] = func + + from vllm.compilation.backends import fix_functionalization + from vllm.utils import combine_fx_passes + if "post_grad_custom_post_pass" in self.inductor_compile_config: + self.inductor_compile_config[ + "post_grad_custom_post_pass"] = combine_fx_passes( + fix_functionalization, + self.inductor_compile_config["post_grad_custom_post_pass"], + ) + else: + self.inductor_compile_config[ + "post_grad_custom_post_pass"] = fix_functionalization + + def init_during_runtime(self): + """To complete the initialization of config, + we need to know the compile context, which is only available + during the first run of the model. 
+ """ + context = get_compile_context() + context = copy.deepcopy(context) if context is not None else [] + sizes_to_specialize: List[int] = context + if self.cudagraph_capture_sizes is None: + self.capture_sizes = sizes_to_specialize + else: + self.capture_sizes = self.cudagraph_capture_sizes + logger.info(("cudagraph sizes specified by model runner" + " %s is overridden by config %s"), + sizes_to_specialize, self.cudagraph_capture_sizes) + if self.inductor_specialize_for_cudagraph_no_more_than is not None: + assert self.inductor_compile_sizes is None, ( + "inductor_compile_sizes should be None when " + "inductor_specialize_for_cudagraph_no_more_than is not None") + self.compile_sizes = [ + x for x in self.capture_sizes + if x <= self.inductor_specialize_for_cudagraph_no_more_than + ] + else: + assert self.inductor_compile_sizes is not None, ( + "inductor_compile_sizes should not be None when " + "inductor_specialize_for_cudagraph_no_more_than is None") + self.compile_sizes = self.inductor_compile_sizes + + @staticmethod + def select_and_init_config() -> "CompilationConfig": + """The order of selecting config is: + 1. Use the config specified in environment variable. + 2. Use the config specified in plugins. + 3. Use the default config. + """ + config_path = envs.VLLM_TORCH_COMPILE_CONFIG + if config_path is not None: + with open(config_path) as json_file: + config = CompilationConfig.model_validate_json( + json_file.read()) + else: + from vllm.plugins import get_compilation_config + predefined_config = get_compilation_config() + config = predefined_config if predefined_config is not None else ( + CompilationConfig()) + + config.init_during_runtime() + return config diff --git a/vllm/compilation/counter.py b/vllm/compilation/counter.py new file mode 100644 index 0000000000000..100a49aba74ac --- /dev/null +++ b/vllm/compilation/counter.py @@ -0,0 +1,30 @@ +import copy +import dataclasses +from contextlib import contextmanager + + +@dataclasses.dataclass +class CompilationCounter: + num_graphs_seen: int = 0 + # including the splitting ops + num_piecewise_graphs_seen: int = 0 + # not including the splitting ops + num_piecewise_capturable_graphs_seen: int = 0 + num_inductor_compilations: int = 0 + num_cudagraph_caputured: int = 0 + + def clone(self) -> "CompilationCounter": + return copy.deepcopy(self) + + @contextmanager + def expect(self, **kwargs): + old = self.clone() + yield + for k, v in kwargs.items(): + assert getattr(self, k) - getattr(old, k) == v, ( + f"{k} not as expected, before it is {getattr(old, k)}" + f", after it is {getattr(self, k)}, " + f"expected diff is {v}") + + +compilation_counter = CompilationCounter() diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index 0449f9354d0a2..3053e57e0b63b 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -121,7 +121,10 @@ def _support_torch_compile(cls: type, # take care of method resolution order # make sure super().__init__ is called on the base class # other than TorchCompileWrapperWithCustomDispatcher - cls.__bases__ = cls.__bases__ + (TorchCompileWrapperWithCustomDispatcher, ) + if TorchCompileWrapperWithCustomDispatcher not in cls.__bases__: + # support decorating multiple times + cls.__bases__ = cls.__bases__ + ( + TorchCompileWrapperWithCustomDispatcher, ) old_init = cls.__init__ # type: ignore @@ -160,6 +163,11 @@ def __call__(self, *args, **kwargs): # compiled function and let torch.compile handle the dispatching, # with the overhead of guard evaluation and 
recompilation. if len(self.compiled_codes) < 1 or not self.use_custom_dispatcher: + # it seems Dynamo reuse the compilation across instances, + # while we need to make sure the compiled code is not reused. + # we need to control all the compilation of the model. + torch._dynamo.eval_frame.remove_from_cache( + self.original_code_object) return self.compiled_callable(*args, **kwargs) # usually, capturing the model once is enough, and then we can diff --git a/vllm/compilation/levels.py b/vllm/compilation/levels.py index 162bf5ae64997..19a3a2b526870 100644 --- a/vllm/compilation/levels.py +++ b/vllm/compilation/levels.py @@ -5,5 +5,4 @@ class CompilationLevel: NO_COMPILATION = 0 DYNAMO_AS_IS = 1 DYNAMO_ONCE = 2 - INDUCTOR = 3 - INDUCTOR_MAX_AUTOTUNE = 4 + PIECEWISE = 3 diff --git a/vllm/envs.py b/vllm/envs.py index ae6825f280073..b4a263d1e086e 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -209,6 +209,11 @@ def get_default_config_root(): os.environ.get("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1") != "0"), "VLLM_TORCH_COMPILE_LEVEL": lambda: int(os.environ.get("VLLM_TORCH_COMPILE_LEVEL", "0")), + + # Path to the config file for torch compile + "VLLM_TORCH_COMPILE_CONFIG": + lambda: os.environ.get("VLLM_TORCH_COMPILE_CONFIG", None), + # Fine-grained control over which custom ops to enable/disable. # Use 'all' to enable all, 'none' to disable all. # Also specify a list of custom op names to enable (prefixed with a '+'), diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py index 83910339f3c9f..764f4e9c99df8 100644 --- a/vllm/model_executor/custom_op.py +++ b/vllm/model_executor/custom_op.py @@ -100,7 +100,7 @@ def enabled(cls) -> bool: return (CustomOp.default_on() or enabled) and not disabled - # On by default if VLLM_TORCH_COMPILE_LEVEL < CompilationLevel.INDUCTOR + # On by default if VLLM_TORCH_COMPILE_LEVEL < CompilationLevel.PIECEWISE # Specifying 'all' or 'none' in VLLM_CUSTOM_OPS takes precedence. @staticmethod @lru_cache() @@ -108,7 +108,7 @@ def default_on() -> bool: count_none = envs.VLLM_CUSTOM_OPS.count("none") count_all = envs.VLLM_CUSTOM_OPS.count("all") assert count_none + count_all <= 1, "Can only specify 'none' or 'all'" - return envs.VLLM_TORCH_COMPILE_LEVEL < CompilationLevel.INDUCTOR and \ + return envs.VLLM_TORCH_COMPILE_LEVEL < CompilationLevel.PIECEWISE and \ not count_none > 0 or count_all > 0 # Dictionary of all custom ops (classes, indexed by registered name). diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index 8ba973b28263f..8d0ce47df4040 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -11,7 +11,7 @@ if "VLLM_TORCH_COMPILE_LEVEL" not in os.environ: os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(CompilationLevel.DYNAMO_ONCE) -assert envs.VLLM_TORCH_COMPILE_LEVEL < CompilationLevel.INDUCTOR,\ +assert envs.VLLM_TORCH_COMPILE_LEVEL < CompilationLevel.PIECEWISE,\ "TPU does not support Inductor." 
set_torch_compile_backend("openxla") diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py index 211fedbc6e2ec..4338cbc37f6c1 100644 --- a/vllm/plugins/__init__.py +++ b/vllm/plugins/__init__.py @@ -1,7 +1,8 @@ import logging -from typing import Callable, Dict, Optional, Union +from typing import Callable, Optional, Union import vllm.envs as envs +from vllm.compilation.config import CompilationConfig logger = logging.getLogger(__name__) @@ -44,13 +45,13 @@ def get_torch_compile_backend() -> Optional[Union[Callable, str]]: return _torch_compile_backend -_inductor_additional_configs: Dict = {} +_compilation_config: Optional[CompilationConfig] = None -def set_inductor_additional_configs(configs: Dict): - global _inductor_additional_configs - _inductor_additional_configs = configs +def set_compilation_config(config: Optional[CompilationConfig]): + global _compilation_config + _compilation_config = config -def get_inductor_additional_configs() -> Dict: - return _inductor_additional_configs +def get_compilation_config() -> Optional[CompilationConfig]: + return _compilation_config diff --git a/vllm/utils.py b/vllm/utils.py index fea318ebcdf41..90c4b84757810 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -1479,6 +1479,15 @@ def __len__(self): return len(self._factory) +def combine_fx_passes(passes: List[Callable]) -> Callable: + + def combined_fx(graph) -> None: + for fx in passes: + fx(graph) + + return combined_fx + + def weak_ref_tensor(tensor: torch.Tensor) -> torch.Tensor: """ Create a weak reference to a tensor. @@ -1486,3 +1495,19 @@ def weak_ref_tensor(tensor: torch.Tensor) -> torch.Tensor: but will not keep the original tensor alive. """ return torch.ops._C.weak_ref_tensor(tensor) + + +def weak_ref_tensors( + tensors: Union[torch.Tensor, List[torch.Tensor], Tuple[torch.Tensor]] +) -> Union[torch.Tensor, List[torch.Tensor], Tuple[torch.Tensor]]: + """ + Convenience function to create weak references to tensors, + for single tensor, list of tensors or tuple of tensors. + """ + if isinstance(tensors, torch.Tensor): + return weak_ref_tensor(tensors) + if isinstance(tensors, list): + return [weak_ref_tensor(t) for t in tensors] + if isinstance(tensors, tuple): + return tuple(weak_ref_tensor(t) for t in tensors) + raise ValueError("Invalid type for tensors") From 6aa6020f9bd4c1e414c10f7bd3a7c2555f1950b2 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Wed, 30 Oct 2024 14:05:43 +0800 Subject: [PATCH 20/88] [Misc] Specify minimum pynvml version (#9827) Signed-off-by: Jee Jee Li --- requirements-cuda.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-cuda.txt b/requirements-cuda.txt index 92fa303d687a2..282ab11838bf4 100644 --- a/requirements-cuda.txt +++ b/requirements-cuda.txt @@ -3,7 +3,7 @@ # Dependencies for NVIDIA GPUs ray >= 2.9 -nvidia-ml-py # for pynvml package +nvidia-ml-py >= 12.560.30 # for pynvml package torch == 2.5.0 # These must be updated alongside torch torchvision == 0.20 # Required for phi3v processor. 
See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version From 211fe91aa88730c04df439298d8103a587302493 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 30 Oct 2024 02:41:38 -0700 Subject: [PATCH 21/88] [TPU] Correctly profile peak memory usage & Upgrade PyTorch XLA (#9438) --- Dockerfile.tpu | 2 +- docs/source/getting_started/tpu-installation.rst | 4 ++-- vllm/worker/tpu_worker.py | 15 ++++++++------- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/Dockerfile.tpu b/Dockerfile.tpu index bdfab3f61910f..dd8f9ad4714a9 100644 --- a/Dockerfile.tpu +++ b/Dockerfile.tpu @@ -1,4 +1,4 @@ -ARG NIGHTLY_DATE="20240828" +ARG NIGHTLY_DATE="20241017" ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE" FROM $BASE_IMAGE diff --git a/docs/source/getting_started/tpu-installation.rst b/docs/source/getting_started/tpu-installation.rst index 217028839e347..edba209986f6a 100644 --- a/docs/source/getting_started/tpu-installation.rst +++ b/docs/source/getting_started/tpu-installation.rst @@ -56,8 +56,8 @@ First, install the dependencies: $ pip uninstall torch torch-xla -y $ # Install PyTorch and PyTorch XLA. - $ export DATE="20240828" - $ export TORCH_VERSION="2.5.0" + $ export DATE="20241017" + $ export TORCH_VERSION="2.6.0" $ pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-${TORCH_VERSION}.dev${DATE}-cp310-cp310-linux_x86_64.whl $ pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-${TORCH_VERSION}.dev${DATE}-cp310-cp310-linux_x86_64.whl diff --git a/vllm/worker/tpu_worker.py b/vllm/worker/tpu_worker.py index fe819b9f4b3a8..de6f7ab0072fd 100644 --- a/vllm/worker/tpu_worker.py +++ b/vllm/worker/tpu_worker.py @@ -133,18 +133,19 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: # Synchronize before measuring the memory usage. xm.wait_device_ops() - dtype_btyes = get_dtype_size(self.cache_dtype) - block_size = self.cache_config.block_size - block_size_bytes = (dtype_btyes * block_size * num_layers * 2 * - head_size * num_kv_heads) - - # Calculate the TPU KV cache size based on profiling. + # Get the maximum amount of memory used by the model weights and + # intermediate activations. m = xm.get_memory_info(self.device) total_memory_size = m["bytes_limit"] + profiled = m["peak_bytes_used"] # Weights + intermediate activations. + + # Calculate the TPU KV cache size based on profiling. usable_memory_size = int(total_memory_size * self.cache_config.gpu_memory_utilization) - profiled = m["bytes_used"] # Weights + intermediate activations. tpu_kv_cache_bytes = max(usable_memory_size - profiled, 0) + dtype_btyes = get_dtype_size(self.cache_dtype) + block_size_bytes = (dtype_btyes * self.cache_config.block_size * + num_layers * 2 * head_size * num_kv_heads) num_tpu_blocks = tpu_kv_cache_bytes // block_size_bytes num_tpu_blocks = (num_tpu_blocks // 8) * 8 # Round down to 8. 
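The KV-cache sizing introduced in the tpu_worker.py hunk above reduces to a short calculation. The sketch below restates it as a standalone function so the formula is easier to follow; the function name and the example model/memory numbers are illustrative assumptions, not values taken from the patch.

```python
# Minimal sketch of the TPU KV-cache sizing arithmetic above (illustrative only).

def num_kv_cache_blocks(bytes_limit: int, peak_bytes_used: int,
                        gpu_memory_utilization: float, dtype_size: int,
                        block_size: int, num_layers: int, head_size: int,
                        num_kv_heads: int) -> int:
    # Memory budget for vLLM, minus what the profiling run already used
    # (model weights plus intermediate activations).
    usable = int(bytes_limit * gpu_memory_utilization)
    kv_cache_bytes = max(usable - peak_bytes_used, 0)

    # One block stores `block_size` tokens of keys and values (hence the 2)
    # for every layer and every KV head.
    block_bytes = (dtype_size * block_size * num_layers * 2 * head_size *
                   num_kv_heads)

    num_blocks = kv_cache_bytes // block_bytes
    return (num_blocks // 8) * 8  # round down to a multiple of 8


# Assumed example: 16 GiB limit, 9 GiB peak usage, bf16 cache (2 bytes),
# block size 16, 32 layers, head size 128, 8 KV heads.
print(num_kv_cache_blocks(16 * 1024**3, 9 * 1024**3, 0.9, 2, 16, 32, 128, 8))
```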
From cc98f1e0798cf2b5ea5bc5d0c565af2f884bf6e8 Mon Sep 17 00:00:00 2001 From: Alex Brooks Date: Wed, 30 Oct 2024 10:32:17 -0600 Subject: [PATCH 22/88] [CI/Build] VLM Test Consolidation (#9372) Signed-off-by: Alex-Brooks --- .buildkite/test-pipeline.yaml | 7 +- tests/conftest.py | 6 +- tests/engine/test_short_mm_context.py | 29 + .../audio_language/test_ultravox.py | 2 +- .../models/decoder_only/language/test_qwen.py | 34 + .../mm_processor_kwargs/__init__.py | 0 .../mm_processor_kwargs/test_llava_next.py | 68 ++ .../mm_processor_kwargs/test_phi3v.py | 181 ++++++ .../mm_processor_kwargs/test_qwen.py | 144 +++++ .../test_qwen2_vl.py | 4 +- .../vision_language/test_blip2.py | 101 --- .../vision_language/test_broadcast.py | 46 -- .../vision_language/test_chameleon.py | 130 ---- .../decoder_only/vision_language/test_fuyu.py | 139 ---- .../decoder_only/vision_language/test_glm4.py | 133 ---- .../vision_language/test_internvl.py | 290 +-------- .../vision_language/test_llava.py | 313 --------- .../test_llava_image_embeds.py | 158 ----- .../vision_language/test_llava_next.py | 347 ---------- .../vision_language/test_llava_next_video.py | 226 ------- .../vision_language/test_llava_onevision.py | 272 -------- .../vision_language/test_minicpmv.py | 199 ------ .../vision_language/test_models.py | 594 ++++++++++++++++++ .../vision_language/test_paligemma.py | 174 ----- .../vision_language/test_phi3v.py | 185 +----- .../decoder_only/vision_language/test_qwen.py | 374 ----------- .../vision_language/vlm_utils/__init__.py | 0 .../vision_language/vlm_utils/builders.py | 235 +++++++ .../vlm_utils/case_filtering.py | 157 +++++ .../vision_language/vlm_utils/core.py | 141 +++++ .../vlm_utils/custom_inputs.py | 102 +++ .../vision_language/vlm_utils/model_utils.py | 338 ++++++++++ .../vision_language/vlm_utils/runners.py | 130 ++++ .../vision_language/vlm_utils/types.py | 187 ++++++ .../vision_language/test_llava_next.py | 2 + .../vision_language/test_mllama.py | 2 +- tests/utils.py | 24 +- vllm/utils.py | 3 +- 38 files changed, 2381 insertions(+), 3096 deletions(-) create mode 100644 tests/engine/test_short_mm_context.py create mode 100644 tests/models/decoder_only/language/test_qwen.py create mode 100644 tests/models/decoder_only/vision_language/mm_processor_kwargs/__init__.py create mode 100644 tests/models/decoder_only/vision_language/mm_processor_kwargs/test_llava_next.py create mode 100644 tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py create mode 100644 tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py rename tests/models/decoder_only/vision_language/{ => mm_processor_kwargs}/test_qwen2_vl.py (98%) delete mode 100644 tests/models/decoder_only/vision_language/test_blip2.py delete mode 100644 tests/models/decoder_only/vision_language/test_broadcast.py delete mode 100644 tests/models/decoder_only/vision_language/test_chameleon.py delete mode 100644 tests/models/decoder_only/vision_language/test_fuyu.py delete mode 100644 tests/models/decoder_only/vision_language/test_glm4.py delete mode 100644 tests/models/decoder_only/vision_language/test_llava.py delete mode 100644 tests/models/decoder_only/vision_language/test_llava_image_embeds.py delete mode 100644 tests/models/decoder_only/vision_language/test_llava_next.py delete mode 100644 tests/models/decoder_only/vision_language/test_llava_next_video.py delete mode 100644 tests/models/decoder_only/vision_language/test_llava_onevision.py delete mode 100644 
tests/models/decoder_only/vision_language/test_minicpmv.py create mode 100644 tests/models/decoder_only/vision_language/test_models.py delete mode 100644 tests/models/decoder_only/vision_language/test_paligemma.py delete mode 100644 tests/models/decoder_only/vision_language/test_qwen.py create mode 100644 tests/models/decoder_only/vision_language/vlm_utils/__init__.py create mode 100644 tests/models/decoder_only/vision_language/vlm_utils/builders.py create mode 100644 tests/models/decoder_only/vision_language/vlm_utils/case_filtering.py create mode 100644 tests/models/decoder_only/vision_language/vlm_utils/core.py create mode 100644 tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py create mode 100644 tests/models/decoder_only/vision_language/vlm_utils/model_utils.py create mode 100644 tests/models/decoder_only/vision_language/vlm_utils/runners.py create mode 100644 tests/models/decoder_only/vision_language/vlm_utils/types.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index ed847a7e3696b..32eed1a771718 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -338,7 +338,10 @@ steps: - tests/models/decoder_only/vision_language commands: - pytest -v -s models/decoder_only/audio_language - - pytest -v -s models/decoder_only/vision_language + # HACK - run phi3v tests separately to sidestep this transformers bug + # https://github.com/huggingface/transformers/issues/34307 + - pytest -v -s models/decoder_only/vision_language/test_phi3v.py + - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language - label: Other Models Test # 6min #mirror_hardwares: [amd] @@ -413,7 +416,7 @@ steps: # Avoid importing model tests that cause CUDA reinitialization error - pytest models/encoder_decoder/language/test_bart.py -v -s -m distributed_2_gpus - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m distributed_2_gpus - - pytest models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus + - pytest models/decoder_only/vision_language/test_models.py -v -s -m distributed_2_gpus - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py - pip install -e ./plugins/vllm_add_dummy_model - pytest -v -s distributed/test_distributed_oot.py diff --git a/tests/conftest.py b/tests/conftest.py index 2fce2d772c6ed..bdc6ffb148602 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -259,8 +259,7 @@ def __init__( is_sentence_transformer: bool = False, skip_tokenizer_init: bool = False, auto_cls: Type[_BaseAutoModelClass] = AutoModelForCausalLM, - postprocess_inputs: Callable[[BatchEncoding], - BatchEncoding] = identity, + postprocess_inputs: Callable[..., BatchEncoding] = identity, ) -> None: torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype] @@ -303,6 +302,7 @@ def __init__( if skip_tokenizer_init: self.tokenizer = self.processor.tokenizer + self.dtype = dtype self.postprocess_inputs = postprocess_inputs def get_inputs( @@ -337,7 +337,7 @@ def get_inputs( processor_kwargs["sampling_rate"] = sr inputs = self.processor(**processor_kwargs) - inputs = self.postprocess_inputs(inputs) + inputs = self.postprocess_inputs(inputs, dtype=self.dtype) all_inputs.append(inputs) diff --git a/tests/engine/test_short_mm_context.py b/tests/engine/test_short_mm_context.py new file mode 100644 index 0000000000000..a6ba7a131c506 --- /dev/null +++ b/tests/engine/test_short_mm_context.py @@ -0,0 +1,29 @@ +import pytest + +from ..conftest import IMAGE_ASSETS + +HF_IMAGE_PROMPTS = 
IMAGE_ASSETS.prompts({ + "stop_sign": + "USER: \nWhat's the content of the image?\nASSISTANT:", + "cherry_blossom": + "USER: \nWhat is the season?\nASSISTANT:", +}) + +models = ["llava-hf/llava-1.5-7b-hf"] + + +@pytest.mark.parametrize("model", models) +def test_context_length_too_short(vllm_runner, image_assets, model): + images = [asset.pil_image for asset in image_assets] + + with pytest.raises(ValueError, match="too long to fit into the model"): + vllm_model = vllm_runner( + model, + max_model_len=128, # LLaVA has a feature size of 576 + enforce_eager=True, + ) + + with vllm_model: + vllm_model.generate_greedy([HF_IMAGE_PROMPTS[0]], + max_tokens=1, + images=[images[0]]) diff --git a/tests/models/decoder_only/audio_language/test_ultravox.py b/tests/models/decoder_only/audio_language/test_ultravox.py index bfffd34d1142c..ad6c2d854d1f0 100644 --- a/tests/models/decoder_only/audio_language/test_ultravox.py +++ b/tests/models/decoder_only/audio_language/test_ultravox.py @@ -92,7 +92,7 @@ def run_test( for vllm_prompt, _, audio in prompts_and_audios ] - def process(hf_inputs: BatchEncoding): + def process(hf_inputs: BatchEncoding, **kwargs): hf_inputs["audio_values"] = hf_inputs["audio_values"] \ .to(torch_dtype) # type: ignore return hf_inputs diff --git a/tests/models/decoder_only/language/test_qwen.py b/tests/models/decoder_only/language/test_qwen.py new file mode 100644 index 0000000000000..128fe65afbb84 --- /dev/null +++ b/tests/models/decoder_only/language/test_qwen.py @@ -0,0 +1,34 @@ +"""Ensure that a text-only Qwen model can be run without throwing an error. +We explicitly test this because Qwen is implemented as a multimodal and +supports a visual encoder for models like Qwen-VL. +""" +from typing import List, Type + +import pytest + +from ....conftest import VllmRunner + +models = [ + "Qwen/Qwen-7B-Chat" # Has no visual encoder +] + + +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize("dtype", ["bfloat16"]) +@pytest.mark.parametrize("max_tokens", [32]) +@pytest.mark.parametrize("num_logprobs", [5]) +def test_text_only_qwen_model_can_be_loaded_and_run( + vllm_runner: Type[VllmRunner], + example_prompts: List[str], + model: str, + *, + dtype: str, + max_tokens: int, + num_logprobs: int, +): + with vllm_runner(model, dtype=dtype) as vllm_model: + vllm_model.generate_greedy_logprobs( + example_prompts, + max_tokens, + num_logprobs=num_logprobs, + ) diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/__init__.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_llava_next.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_llava_next.py new file mode 100644 index 0000000000000..c2d3fda6994f6 --- /dev/null +++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_llava_next.py @@ -0,0 +1,68 @@ +import pytest + +from vllm.inputs import InputContext + +from ....utils import build_model_context + + +@pytest.fixture() +def get_max_llava_next_image_tokens(): + from vllm.model_executor.models.llava_next import ( + get_max_llava_next_image_tokens) + return get_max_llava_next_image_tokens + + +@pytest.fixture() +def dummy_data_for_llava_next(): + from vllm.model_executor.models.llava_next import dummy_data_for_llava_next + return dummy_data_for_llava_next + + +@pytest.mark.parametrize("gridpoints,expected_max_tokens", [ + ([[336, 336]], 1176), + ([[336, 672], 
[672, 336], [672, 672], [1008, 336], [336, 1008]], 2928), +]) +def test_get_max_llava_next_image_tokens(gridpoints, expected_max_tokens, + get_max_llava_next_image_tokens): + ctx = build_model_context(model_name="llava-hf/llava-v1.6-mistral-7b-hf") + + # Update the config image_grid_pinpoints + # and calculate the resulting max tokens + ctx.model_config.hf_config.image_grid_pinpoints = gridpoints + + actual_max_tokens = get_max_llava_next_image_tokens( + InputContext(ctx.model_config)) + + assert expected_max_tokens == actual_max_tokens + + +@pytest.mark.parametrize( + "gridpoints,expected_size", + [ + # One point; it has to be the largest + ([[336, 336]], (336, 336)), + # Default for most llava next models; the 2x2 tile is the largest + ([[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]], + (672, 672)), + # If two rectangular gridpoints are the same, the more vertical + # one has the higher feature count due to newline features + ([[336, 672], [672, 336]], (672, 336)) + ]) +def test_dummy_data_for_llava_next_feature_size(dummy_data_for_llava_next, + gridpoints, expected_size): + ctx = build_model_context(model_name="llava-hf/llava-v1.6-mistral-7b-hf") + + # Update the config image_grid_pinpoints + ctx.model_config.hf_config.image_grid_pinpoints = gridpoints + seq_len = 5000 # bigger than the max feature size for any image + + seq_data, mm_data = dummy_data_for_llava_next( + ctx, + seq_len=seq_len, + mm_counts={"image": 1}, + ) + + # The dummy data dims should match the gridpoint with the biggest feat size + assert mm_data["image"].height == expected_size[0] + assert mm_data["image"].width == expected_size[1] + assert len(seq_data.get_token_ids()) >= seq_len diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py new file mode 100644 index 0000000000000..d6a7b34fdde9f --- /dev/null +++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py @@ -0,0 +1,181 @@ +"""Tests for phi3v's multimodal preprocessing kwargs.""" +from typing import Optional + +import pytest +import torch +from transformers import AutoImageProcessor, AutoTokenizer + +from vllm.inputs import InputContext, token_inputs +from vllm.model_executor.models.phi3v import _IMAGE_TOKEN_ID +from vllm.multimodal import MultiModalRegistry + +from .....conftest import _ImageAssets +from ....utils import build_model_context + +models = ["microsoft/Phi-3.5-vision-instruct"] + + +# Wrap lazy imports to avoid initializing CUDA during test collection +@pytest.fixture() +def input_processor_for_phi3v(): + from vllm.model_executor.models.phi3v import input_processor_for_phi3v + return input_processor_for_phi3v + + +@pytest.fixture() +def dummy_data_for_phi3v(): + from vllm.model_executor.models.phi3v import dummy_data_for_phi3v + return dummy_data_for_phi3v + + +@pytest.fixture() +def get_max_phi3v_image_tokens(): + from vllm.model_executor.models.phi3v import get_max_phi3v_image_tokens + return get_max_phi3v_image_tokens + + +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize("num_crops", [4, 16, None]) +def test_input_mapper_override(model: str, image_assets: _ImageAssets, + num_crops: Optional[int]): + """Ensure that the [default] input mapper handles num_crops properly.""" + # We pass the processor kwargs here since for this model, we fall back to + # the default mapper; this will fall back to the HF mapper and forward + # mm_processor_kwargs to it. 
+ mm_processor_kwargs = { + "num_crops": num_crops + } if num_crops is not None else {} + ctx = build_model_context( + model_name=model, + tokenizer_name=model, + trust_remote_code=True, + mm_processor_kwargs=mm_processor_kwargs, + ) + + hf_processor = AutoImageProcessor.from_pretrained(model, + trust_remote_code=True, + **mm_processor_kwargs) + + mm_registry = MultiModalRegistry() + mm_registry.init_mm_limits_per_prompt(ctx.model_config) + + image = image_assets[0].pil_image + hf_result = hf_processor.preprocess( + image, + return_tensors="pt", + ) + + vllm_result = mm_registry.map_input( + ctx.model_config, + {"image": image}, + ) + + assert torch.all(hf_result["image_sizes"] == vllm_result["image_sizes"]) + assert torch.all( + hf_result["num_img_tokens"] == vllm_result["num_img_tokens"]) + + # For pixel values, the second axis should be the num_crops + 1 + # for the rescaled original image. The default value in VLLM falls + # back to the HF config, which is why we compare to the processor num_crops + assert torch.all(hf_result["pixel_values"] == vllm_result["pixel_values"]) + assert vllm_result["pixel_values"].shape[1] == hf_processor.num_crops + 1 + + +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize("num_crops,expected_max_tokens", [ + (4, 781), + (16, 2653), +]) +def test_max_tokens_override(get_max_phi3v_image_tokens, model: str, + num_crops: int, expected_max_tokens: int): + """Ensure get_max_phi3v_image_tokens handles num_crops properly.""" + # NOTE: mm_processor_kwargs on the context in this test is unused, since + # this is testing the mapper directly. In practice, the processor kwargs + # are wrapped in a closure when calling the max tokens func. We explicitly + # do NOT use the mm_processor_kwargs in the model context here to ensure + # that the max image tokens implementation is referencing a mix of the + # kwargs to the function and the original mm_processor_kwargs in case + # values are somehow updated and end up in a bad state. + ctx = build_model_context( + model_name=model, + tokenizer_name=model, + trust_remote_code=True, + mm_processor_kwargs=None, + ) + + actual_max_tokens = get_max_phi3v_image_tokens( + InputContext(ctx.model_config), + num_crops=num_crops, + ) + + assert expected_max_tokens == actual_max_tokens + + +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize("num_crops,toks_per_img,num_imgs", [ + (4, 781, 1), + (4, 781, 2), + (16, 2653, 1), + (16, 2653, 2), +]) +def test_dummy_data_override(dummy_data_for_phi3v, model: str, num_crops: int, + toks_per_img: int, num_imgs: int): + """Ensure dummy_data_for_phi3v handles num_crops properly.""" + # Same as the previous test - don't initialize mm_processor_kwargs + # in this test and assume that the kwargs will be correctly expanded by + # the partial when calling the dummy data func. 
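+    # num_crops is passed straight to dummy_data_for_phi3v below rather than
+    # being baked into the model context via mm_processor_kwargs.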
+ ctx = build_model_context( + model_name=model, + tokenizer_name=model, + trust_remote_code=True, + mm_processor_kwargs=None, + ) + + sequence_data, _, = dummy_data_for_phi3v( + ctx=ctx, + seq_len=8192, # Should be bigger than num_imgs * toks_per_img + mm_counts={"image": num_imgs}, + num_crops=num_crops, + ) + # Ensure we have the right number of placeholders per num_crops size + img_tok_count = sequence_data.get_token_ids().count(_IMAGE_TOKEN_ID) + assert img_tok_count == toks_per_img * num_imgs + + +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize("num_crops,expected_toks_per_img,num_imgs", [ + (4, 757, 1), + (4, 757, 2), + (16, 1921, 1), + (16, 1921, 2), +]) +def test_input_processor_override(input_processor_for_phi3v, + image_assets: _ImageAssets, model: str, + num_crops: int, expected_toks_per_img: int, + num_imgs: int): + """Ensure input_processor_for_phi3v handles num_crops properly.""" + # Same as the previous test - don't initialize mm_processor_kwargs + # in this test and assume that the kwargs will be correctly expanded by + # the partial when calling the custom input processor. + ctx = build_model_context( + model_name=model, + tokenizer_name=model, + trust_remote_code=True, + ) + tokenizer = AutoTokenizer.from_pretrained(model) + # Build the image str / prompt based on the number of images we pass + img_str = "".join([f"<|image_{idx}|>\n" for idx in range(1, num_imgs + 1)]) + prompt = f"<|user|>\n{img_str}<|end|>\n<|assistant|>\n" + images = [image_assets[0].pil_image] * num_imgs + + inputs = token_inputs(prompt_token_ids=tokenizer.encode(prompt), + prompt=prompt, + multi_modal_data={"image": images}) + + processed_inputs = input_processor_for_phi3v(ctx, + inputs, + num_crops=num_crops) + + # Ensure we have the right number of placeholders per num_crops size + img_tok_count = processed_inputs["prompt_token_ids"].count(_IMAGE_TOKEN_ID) + assert img_tok_count == expected_toks_per_img * num_imgs diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py new file mode 100644 index 0000000000000..a01651b171d60 --- /dev/null +++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py @@ -0,0 +1,144 @@ +"""Tests for Qwen's multimodal preprocessing kwargs.""" +from typing import Dict, List, Union + +import pytest +import torch +from PIL.Image import Image + +from vllm.inputs import InputContext, token_inputs +from vllm.multimodal.base import MultiModalInputs +from vllm.multimodal.utils import cached_get_tokenizer + +from .....conftest import IMAGE_ASSETS +from ....utils import build_model_context + +### Multimodal preprocessing tests +SAMPLE_IMAGE = IMAGE_ASSETS[0].pil_image +# These values are specific to Qwen-VL/Chat; we can get these from the model +# config also, but they are hardcoded here to keep the parameterize/fixtures +# easy to read. 
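+# Image start/end/pad token ids, tokens per image, visual encoder output
+# dimension, and input image size for Qwen-VL.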
+IMG_START_ID = 151857 +IMG_END_ID = 151858 +IMG_PAD_ID = 151859 +TOKS_PER_IMG = 256 +VIS_ENC_DIM = 4096 +IMG_SIZE = 448 + + +@pytest.fixture() +def input_mapper_for_qwen(): + # Lazy import to avoid initializing CUDA during test collection + from vllm.model_executor.models.qwen import input_mapper_for_qwen + return input_mapper_for_qwen + + +@pytest.fixture() +def input_processor_for_qwen(): + # Lazy import to avoid initializing CUDA during test collection + from vllm.model_executor.models.qwen import input_processor_for_qwen + return input_processor_for_qwen + + +@pytest.fixture() +def qwen_vl_context() -> InputContext: + """Get an InputContext for Qwen-VL.""" + return build_model_context(model_name="Qwen/Qwen-VL", + trust_remote_code=True) + + +# Happy path tests for single/multi-image scenarios for the multimodal +# input processor and mapper, respectively +@pytest.mark.parametrize("num_images", [1, 2]) +def test_input_processor_valid_mm_data(input_processor_for_qwen, + qwen_vl_context: InputContext, + num_images: int): + """Happy cases for image inputs to Qwen's multimodal input processor.""" + prompt = "".join( + [f"Picture {num}: \n" for num in range(1, num_images + 1)]) + inputs = token_inputs( + prompt=prompt, + # When processing multimodal data for a multimodal model, the qwen + # input processor will overwrite the provided prompt_token_ids with + # the image prompts + prompt_token_ids=[], + multi_modal_data={"image": torch.rand(num_images, TOKS_PER_IMG, 4096)}, + ) + proc_inputs = input_processor_for_qwen(qwen_vl_context, inputs) + assert isinstance(proc_inputs, dict) + + # Each image should have one start / stop and a fixed context of 256 + proc_tokens = proc_inputs["prompt_token_ids"] + assert proc_tokens.count(IMG_START_ID) == num_images + assert proc_tokens.count(IMG_END_ID) == num_images + assert proc_tokens.count(IMG_PAD_ID) == num_images * TOKS_PER_IMG + + +@pytest.mark.parametrize( + "img_data,expected_shape", + [ + # single / multi-image + (SAMPLE_IMAGE, (1, 3, IMG_SIZE, IMG_SIZE)), + (2 * [SAMPLE_IMAGE], (2, 3, IMG_SIZE, IMG_SIZE)), + # single / multi-image embeddings + (torch.rand( + (TOKS_PER_IMG, VIS_ENC_DIM)), (1, TOKS_PER_IMG, VIS_ENC_DIM)), + (torch.rand( + (1, TOKS_PER_IMG, VIS_ENC_DIM)), (1, TOKS_PER_IMG, VIS_ENC_DIM)), + (torch.rand( + (2, TOKS_PER_IMG, VIS_ENC_DIM)), (2, TOKS_PER_IMG, VIS_ENC_DIM)), + ]) +def test_input_mapper_valid_mm_data(input_mapper_for_qwen, + qwen_vl_context: InputContext, + img_data: Union[torch.Tensor, List[Image], + Image], + expected_shape: List[int]): + """Happy cases for image inputs to Qwen's multimodal input mapper.""" + mapped_img_data = input_mapper_for_qwen(qwen_vl_context, img_data) + # Ensure that we get the appropriately shaped pixel_values + # for images and image embeddings, respectively. 
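+    # Raw PIL images map to (N, 3, IMG_SIZE, IMG_SIZE) pixel_values, while
+    # pre-computed embeddings are batched to (N, TOKS_PER_IMG, VIS_ENC_DIM).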
+ assert isinstance(mapped_img_data, MultiModalInputs) + assert "pixel_values" in mapped_img_data + assert mapped_img_data["pixel_values"].shape == expected_shape + + +# Sad path tests for the multimodal input processor and mapper, respectively +@pytest.mark.parametrize("mm_data", [ + { + "image": torch.rand((5)) + }, + { + "image": torch.rand((5, 5, 5, 5, 5)) + }, +]) +def test_input_processor_invalid_mm_data(input_processor_for_qwen, + qwen_vl_context: InputContext, + mm_data: Dict[str, torch.Tensor]): + """Test sad cases validated in Qwen's multimodal input processor.""" + tokenizer = cached_get_tokenizer(qwen_vl_context.model_config.tokenizer, + trust_remote_code=True) + prompt = "Picture 1: \n" + prompt_token_ids = tokenizer.encode(prompt) + inputs = token_inputs(prompt=prompt, + prompt_token_ids=prompt_token_ids, + multi_modal_data=mm_data) + # Should fail since we have too many or too few dimensions for embeddings + with pytest.raises(ValueError): + input_processor_for_qwen(qwen_vl_context, inputs) + + +@pytest.mark.parametrize( + "img_data", + [ + # Wrong context length + torch.rand((1, TOKS_PER_IMG + 10, VIS_ENC_DIM)), + # Wrong visual encoder output size + torch.rand((1, TOKS_PER_IMG, VIS_ENC_DIM + 10)), + ]) +def test_input_mapper_invalid_mm_data( + input_mapper_for_qwen, + qwen_vl_context: InputContext, + img_data: Union[torch.Tensor, List[Image], Image], +): + """Sad cases validated in Qwen VL's multimodal input mapper.""" + with pytest.raises(ValueError): + input_mapper_for_qwen(qwen_vl_context, img_data) diff --git a/tests/models/decoder_only/vision_language/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py similarity index 98% rename from tests/models/decoder_only/vision_language/test_qwen2_vl.py rename to tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py index d3de5fb26d4b8..5c90e7f7a267c 100644 --- a/tests/models/decoder_only/vision_language/test_qwen2_vl.py +++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py @@ -8,8 +8,8 @@ from vllm.inputs import InputContext, token_inputs from vllm.multimodal import MultiModalRegistry -from ....conftest import _ImageAssets -from ...utils import build_model_context +from .....conftest import _ImageAssets +from ....utils import build_model_context MODEL = "Qwen/Qwen2-VL-2B-Instruct" MIN_PIXELS = "min_pixels" diff --git a/tests/models/decoder_only/vision_language/test_blip2.py b/tests/models/decoder_only/vision_language/test_blip2.py deleted file mode 100644 index e1e32b96d89ac..0000000000000 --- a/tests/models/decoder_only/vision_language/test_blip2.py +++ /dev/null @@ -1,101 +0,0 @@ -from typing import List, Optional, Tuple - -import pytest -from transformers import AutoModelForVision2Seq, AutoTokenizer - -from vllm.multimodal.utils import rescale_image_size -from vllm.sequence import SampleLogprobs - -from ....conftest import IMAGE_ASSETS -from ...utils import check_logprobs_close - -HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ - "stop_sign": - "Question: What's the content of the image? Answer:", - "cherry_blossom": - "Question: What is the season? 
Answer:", -}) - - -def vllm_to_hf_output(vllm_output: Tuple[List[int], str, - Optional[SampleLogprobs]], - model: str): - """Sanitize vllm output to be comparable with hf output.""" - _, output_str, out_logprobs = vllm_output - - hf_output_str = output_str + "\n" - - tokenizer = AutoTokenizer.from_pretrained(model) - hf_output_ids = tokenizer.encode(hf_output_str) - assert hf_output_ids[0] == tokenizer.bos_token_id - hf_output_ids = hf_output_ids[1:] - - return hf_output_ids, hf_output_str, out_logprobs - - -@pytest.mark.parametrize("model", ["Salesforce/blip2-opt-2.7b"]) -@pytest.mark.parametrize( - "size_factors", - [ - # No image - [], - # Single-scale - [1.0], - # Single-scale, batched - [1.0, 1.0, 1.0], - # Multi-scale - [0.25, 0.5, 1.0], - ], -) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [128]) -@pytest.mark.parametrize("num_logprobs", [5]) -def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, - dtype: str, max_tokens: int, num_logprobs: int) -> None: - """Inference result should be the same between hf and vllm. - - All the image fixtures for the test are from IMAGE_ASSETS. - For huggingface runner, we provide the PIL images as input. - For vllm runner, we provide MultiModalData objects and corresponding - MultiModalConfig as input. - Note, the text input is also adjusted to abide by vllm contract. - The text output is sanitized to be able to compare with hf. - """ - images = [asset.pil_image for asset in image_assets] - - inputs_per_image = [( - [prompt for _ in size_factors], - [rescale_image_size(image, factor) for factor in size_factors], - ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] - - # max_model_len should be greater than image_feature_size - with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model: - vllm_outputs_per_image = [ - vllm_model.generate_greedy_logprobs(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images) - for prompts, images in inputs_per_image - ] - - with hf_runner(model, dtype=dtype, - auto_cls=AutoModelForVision2Seq) as hf_model: - hf_outputs_per_image = [ - hf_model.generate_greedy_logprobs_limit(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images) - for prompts, images in inputs_per_image - ] - - for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, - vllm_outputs_per_image): - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=[ - vllm_to_hf_output(vllm_output, model) - for vllm_output in vllm_outputs - ], - name_0="hf", - name_1="vllm", - ) diff --git a/tests/models/decoder_only/vision_language/test_broadcast.py b/tests/models/decoder_only/vision_language/test_broadcast.py deleted file mode 100644 index 38c4a95de16f4..0000000000000 --- a/tests/models/decoder_only/vision_language/test_broadcast.py +++ /dev/null @@ -1,46 +0,0 @@ -import pytest -import transformers - -from ....utils import multi_gpu_test - - -@multi_gpu_test(num_gpus=2) -@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"]) -@pytest.mark.parametrize("model", [ - "llava-hf/llava-1.5-7b-hf", - "llava-hf/llava-v1.6-mistral-7b-hf", - "facebook/chameleon-7b", -]) -def test_models(hf_runner, vllm_runner, image_assets, - distributed_executor_backend, model) -> None: - - dtype = "half" - max_tokens = 5 - num_logprobs = 5 - tensor_parallel_size = 2 - - if model.startswith("llava-hf/llava-1.5"): - from .test_llava import models, run_test - elif model.startswith("llava-hf/llava-v1.6"): - from .test_llava_next import models, run_test # type: 
ignore[no-redef] - elif model.startswith("facebook/chameleon"): - if transformers.__version__.startswith("4.46"): - pytest.skip("Model broken in HF, " - "see huggingface/transformers#34379") - from .test_chameleon import models, run_test # type: ignore[no-redef] - else: - raise NotImplementedError(f"Unsupported model: {model}") - - run_test( - hf_runner, - vllm_runner, - image_assets, - model=models[0], - # So that LLaVA-NeXT processor may return nested list - size_factors=[0.25, 0.5, 1.0], - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - ) diff --git a/tests/models/decoder_only/vision_language/test_chameleon.py b/tests/models/decoder_only/vision_language/test_chameleon.py deleted file mode 100644 index 4bd678b9f21c4..0000000000000 --- a/tests/models/decoder_only/vision_language/test_chameleon.py +++ /dev/null @@ -1,130 +0,0 @@ -from typing import List, Optional, Type - -import pytest -import transformers -from transformers import AutoModelForVision2Seq, BatchEncoding - -from vllm.multimodal.utils import rescale_image_size -from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE - -from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets -from ...utils import check_outputs_equal - -HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ - "stop_sign": - "USER: \nWhat's the content of the image?\nASSISTANT:", - "cherry_blossom": - "USER: \nWhat is the season?\nASSISTANT:", -}) - -models = ["facebook/chameleon-7b"] - - -def run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - image_assets: _ImageAssets, - model: str, - *, - size_factors: List[float], - dtype: str, - max_tokens: int, - num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - """Inference result should be the same between hf and vllm. - - All the image fixtures for the test are from IMAGE_ASSETS. - For huggingface runner, we provide the PIL images as input. - For vllm runner, we provide MultiModalDataDict objects - and corresponding vision language config as input. - Note, the text input is also adjusted to abide by vllm contract. - The text output is sanitized to be able to compare with hf. 
- """ - torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype] - images = [asset.pil_image for asset in image_assets] - - inputs_per_image = [( - [prompt for _ in size_factors], - [rescale_image_size(image, factor) for factor in size_factors], - ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] - - with vllm_runner(model, - max_model_len=4096, - dtype=dtype, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - enforce_eager=True) as vllm_model: - - vllm_outputs_per_image = [ - vllm_model.generate_greedy_logprobs(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images) - for prompts, images in inputs_per_image - ] - - def process(hf_inputs: BatchEncoding): - hf_inputs["pixel_values"] = hf_inputs["pixel_values"] \ - .to(torch_dtype) # type: ignore - return hf_inputs - - with hf_runner(model, - dtype=dtype, - postprocess_inputs=process, - auto_cls=AutoModelForVision2Seq) as hf_model: - hf_outputs_per_image = [ - hf_model.generate_greedy_logprobs_limit(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images) - for prompts, images in inputs_per_image - ] - - for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, - vllm_outputs_per_image): - # HF Logprobs include image tokens, unlike vLLM, so we don't directly - # compare them - check_outputs_equal( - outputs_0_lst=[outputs[:2] for outputs in hf_outputs], - outputs_1_lst=[outputs[:2] for outputs in vllm_outputs], - name_0="hf", - name_1="vllm", - ) - - -@pytest.mark.skipif( - transformers.__version__.startswith("4.46.0"), - reason="Model broken in HF, see huggingface/transformers#34379", -) -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize( - "size_factors", - [ - # No image - [], - # Single-scale - [1.0], - # Single-scale, batched - [1.0, 1.0, 1.0], - # Multi-scale - [0.25, 0.5, 1.0], - ], -) -@pytest.mark.parametrize("dtype", ["bfloat16"]) -@pytest.mark.parametrize("max_tokens", [8]) -@pytest.mark.parametrize("num_logprobs", [5]) -def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, - dtype, max_tokens, num_logprobs) -> None: - run_test( - hf_runner, - vllm_runner, - image_assets, - model, - size_factors=size_factors, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=1, - ) diff --git a/tests/models/decoder_only/vision_language/test_fuyu.py b/tests/models/decoder_only/vision_language/test_fuyu.py deleted file mode 100644 index 1affcd10ee72d..0000000000000 --- a/tests/models/decoder_only/vision_language/test_fuyu.py +++ /dev/null @@ -1,139 +0,0 @@ -from typing import List, Optional, Tuple, Type - -import pytest - -from vllm.multimodal.utils import rescale_image_size -from vllm.platforms import current_platform -from vllm.sequence import SampleLogprobs - -from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets -from ...utils import check_logprobs_close - -HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ - "stop_sign": - "What's the content of the image?\n", - "cherry_blossom": - "What is the season?\n", -}) - -models = ["adept/fuyu-8b"] - - -def vllm_to_hf_output(vllm_output: Tuple[List[int], str, - Optional[SampleLogprobs]]): - """Sanitize vllm output to be comparable with hf output.""" - output_ids, output_str, out_logprobs = vllm_output - - hf_output_str = output_str.lstrip() + "|ENDOFTEXT|" - - return output_ids, hf_output_str, out_logprobs - - -def run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - image_assets: _ImageAssets, - model: str, - *, - 
size_factors: List[float], - dtype: str, - max_tokens: int, - num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - """Inference result should be the same between hf and vllm. - - All the image fixtures for the test are from IMAGE_ASSETS. - For huggingface runner, we provide the PIL images as input. - For vllm runner, we provide MultiModalDataDict objects - and corresponding MultiModalConfig as input. - Note, the text input is also adjusted to abide by vllm contract. - The text output is sanitized to be able to compare with hf. - """ - images = [asset.pil_image for asset in image_assets] - - inputs_per_image = [( - [prompt for _ in size_factors], - [rescale_image_size(image, factor) for factor in size_factors], - ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] - - # NOTE: take care of the order. run vLLM first, and then run HF. - # vLLM needs a fresh new process without cuda initialization. - # if we run HF first, the cuda initialization will be done and it - # will hurt multiprocessing backend with fork method (the default method). - - # max_model_len should be greater than image_feature_size - with vllm_runner(model, - max_model_len=2048, - max_num_seqs=2, - dtype=dtype, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - enforce_eager=True) as vllm_model: - vllm_outputs_per_image = [ - vllm_model.generate_greedy_logprobs(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images) - for prompts, images in inputs_per_image - ] - - with hf_runner(model, dtype=dtype) as hf_model: - eos_token_id = hf_model.processor.tokenizer.eos_token_id - hf_outputs_per_image = [ - hf_model.generate_greedy_logprobs_limit(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images, - eos_token_id=eos_token_id) - for prompts, images in inputs_per_image - ] - - for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, - vllm_outputs_per_image): - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=[ - vllm_to_hf_output(vllm_output) for vllm_output in vllm_outputs - ], - name_0="hf", - name_1="vllm", - ) - - -target_dtype = "half" -if current_platform.is_cpu(): - target_dtype = "bfloat16" - - -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize( - "size_factors", - [ - # No image - [], - # Single-scale - [0.25], - # Single-scale, batched - [0.25, 0.25, 0.25], - # Multi-scale - [0.25, 0.2, 0.15], - ], -) -@pytest.mark.parametrize("dtype", [target_dtype]) -@pytest.mark.parametrize("max_tokens", [128]) -@pytest.mark.parametrize("num_logprobs", [10]) -def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, - dtype: str, max_tokens: int, num_logprobs: int) -> None: - run_test( - hf_runner, - vllm_runner, - image_assets, - model, - size_factors=size_factors, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=1, - ) diff --git a/tests/models/decoder_only/vision_language/test_glm4.py b/tests/models/decoder_only/vision_language/test_glm4.py deleted file mode 100644 index 47922a57f680b..0000000000000 --- a/tests/models/decoder_only/vision_language/test_glm4.py +++ /dev/null @@ -1,133 +0,0 @@ -from typing import List, Optional, Tuple, Type - -import pytest - -from vllm.multimodal.utils import rescale_image_size -from vllm.transformers_utils.tokenizer import patch_padding_side - -from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner -from ....utils import large_gpu_test -from 
...utils import check_logprobs_close - -HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ - "stop_sign": - "What's the content of the image?", - "cherry_blossom": - "What is the season?", -}) - -models = ["THUDM/glm-4v-9b"] -target_dtype = "bfloat16" - - -def run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - inputs: List[Tuple[List[str], PromptImageInput]], - model: str, - *, - dtype: str, - max_tokens: int, - num_logprobs: int, - mm_limit: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - # max_model_len should be greater than image_feature_size - with vllm_runner(model, - max_model_len=2048, - max_num_seqs=2, - dtype=dtype, - limit_mm_per_prompt={"image": mm_limit}, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - enforce_eager=True) as vllm_model: - stop_token_ids = [151329, 151336, 151338] - vllm_outputs_per_image = [ - vllm_model.generate_greedy_logprobs(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images, - stop_token_ids=stop_token_ids) - for prompts, images in inputs - ] - - with hf_runner(model, dtype=dtype) as hf_model: - hf_processor = hf_model.processor - patch_padding_side(hf_processor) - - def processor(*args, text="", images=None, **kwargs): - if images is None: - return hf_processor(*args, **kwargs) - - return hf_processor.apply_chat_template( - [{ - "role": "user", - "image": images, - "content": text - }], - add_generation_prompt=True, - tokenize=True, - return_dict=True, - **kwargs, - ) - - hf_model.processor = processor - hf_model.model.get_output_embeddings = lambda: \ - hf_model.model.transformer.output_layer - hf_outputs_per_image = [ - hf_model.generate_greedy_logprobs_limit( - prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images, - ) for prompts, images in inputs - ] - - for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, - vllm_outputs_per_image): - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) - - -@large_gpu_test(min_gb=48) -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize( - "size_factors", - [ - # No image - [], - # Single-scale - [1.0], - # Single-scale, batched - [1.0, 1.0, 1.0], - # Multi-scale - [0.25, 0.5, 1.0], - ], -) -@pytest.mark.parametrize("dtype", [target_dtype]) -@pytest.mark.parametrize("max_tokens", [128]) -@pytest.mark.parametrize("num_logprobs", [5]) -def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, - dtype: str, max_tokens: int, num_logprobs: int) -> None: - images = [asset.pil_image for asset in image_assets] - - inputs_per_image = [( - [prompt for _ in size_factors], - [rescale_image_size(image, factor) for factor in size_factors], - ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] - run_test( - hf_runner, - vllm_runner, - inputs_per_image, - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - mm_limit=1, - tensor_parallel_size=1, - ) diff --git a/tests/models/decoder_only/vision_language/test_internvl.py b/tests/models/decoder_only/vision_language/test_internvl.py index fc842ec4a6171..2fd1ac4bb08f7 100644 --- a/tests/models/decoder_only/vision_language/test_internvl.py +++ b/tests/models/decoder_only/vision_language/test_internvl.py @@ -1,15 +1,11 @@ -import types -from typing import List, Optional, Tuple, Type, Union +from typing import List, Optional, Tuple, Type import pytest import torch -from PIL.Image import Image -from transformers 
import AutoConfig from vllm.multimodal.utils import rescale_image_size -from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner, - _ImageAssets) +from ....conftest import IMAGE_ASSETS, VllmRunner, _ImageAssets from ...utils import check_logprobs_close HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ @@ -18,171 +14,6 @@ "cherry_blossom": "<|im_start|>User\n\nWhat is the season?<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501 }) -HF_MULTIIMAGE_IMAGE_PROMPT = "<|im_start|>User\nImage-1: \nImage-2: \nDescribe the two images in short.<|im_end|>\n<|im_start|>Assistant\n" # noqa: E501 - -models = [ - "OpenGVLab/InternVL2-1B", - "OpenGVLab/InternVL2-2B", - # NOTE: Mono-InternVL-2B doesn't work with fp16, - # it will result NaN during inference. - # See: https://huggingface.co/OpenGVLab/Mono-InternVL-2B/discussions/9 - "OpenGVLab/Mono-InternVL-2B", - # Broken due to outdated implementation of Phi-3 - # See: https://huggingface.co/OpenGVLab/InternVL2-4B/discussions/3 - # "OpenGVLab/InternVL2-4B", -] -target_dtype = "bfloat16" - - -# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py -def generate( - self, - pixel_values: torch.FloatTensor, - input_ids: torch.FloatTensor, - attention_mask: Optional[torch.LongTensor] = None, - **generate_kwargs, -) -> torch.LongTensor: - """Generate method for InternVL2 model without fixed use_cache.""" - assert self.img_context_token_id is not None - vit_embeds = self.extract_feature(pixel_values) - input_embeds = self.language_model.get_input_embeddings()(input_ids) - B, N, C = input_embeds.shape - input_embeds = input_embeds.reshape(B * N, C) - - input_ids = input_ids.reshape(B * N) - selected = (input_ids == self.img_context_token_id) - assert selected.sum() != 0 - input_embeds[selected] = vit_embeds.reshape(-1, C).to(input_embeds.device) - - input_embeds = input_embeds.reshape(B, N, C) - - forward_kwargs = dict( - inputs_embeds=input_embeds, - attention_mask=attention_mask, - ) - if getattr(self, "use_visual_token_mask", False): - visual_token_mask = selected.reshape(B, N, 1).to(input_embeds.dtype) - forward_kwargs["visual_token_mask"] = visual_token_mask - outputs = self.language_model.generate( - **forward_kwargs, - **generate_kwargs, - ) - - return outputs - - -def run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - inputs: List[Tuple[List[str], PromptImageInput]], - model: str, - *, - dtype: str, - max_tokens: int, - num_logprobs: int, - mm_limit: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - """Inference result should be the same between hf and vllm. - - All the image fixtures for the test are from IMAGE_ASSETS. - For huggingface runner, we provide the PIL images as input. - For vllm runner, we provide MultiModalDataDict objects - and corresponding MultiModalConfig as input. - Note, the text input is also adjusted to abide by vllm contract. - The text output is sanitized to be able to compare with hf. - """ - - # NOTE: take care of the order. run vLLM first, and then run HF. - # vLLM needs a fresh new process without cuda initialization. - # if we run HF first, the cuda initialization will be done and it - # will hurt multiprocessing backend with fork method (the default method). 
- - class InternVLProcessor: - """A simple processor for InternVL2 which misses a processor.""" - - def __init__(self, hf_runner: HfRunner): - self.num_image_token = hf_runner.model.num_image_token - self.tokenizer = hf_runner.tokenizer - self.dtype = hf_runner.model.dtype - - self.config = AutoConfig.from_pretrained(hf_runner.model_name, - trust_remote_code=True) - self.vision_config = self.config.vision_config - self.use_thumbnail = self.config.use_thumbnail - self.min_num = self.config.min_dynamic_patch - self.max_num = self.config.max_dynamic_patch - self.image_size = self.vision_config.image_size - - def __call__(self, text: str, images: Union[Image, List[Image]], - **kwargs): - from vllm.model_executor.models.internvl import ( - IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values) - images = [images] if isinstance(images, Image) else images - pixel_values = [ - image_to_pixel_values(image, self.image_size, self.min_num, - self.max_num, - self.use_thumbnail).to(self.dtype) - for image in images - ] - num_patches_list = [ - pixel_value.shape[0] for pixel_value in pixel_values - ] - pixel_values = torch.cat(pixel_values, dim=0) - for num_patches in num_patches_list: - context_tokens = IMG_CONTEXT * self.num_image_token \ - * num_patches - image_tokens = IMG_START + context_tokens + IMG_END - text = text.replace('', image_tokens, 1) - prompt = self.tokenizer(text, return_tensors="pt") - prompt.update({"pixel_values": pixel_values}) - return prompt - - # max_model_len should be greater than image_feature_size - with vllm_runner(model, - max_model_len=4096, - dtype=dtype, - limit_mm_per_prompt={"image": mm_limit}, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - enforce_eager=True) as vllm_model: - vllm_outputs_per_image = [ - vllm_model.generate_greedy_logprobs(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images) - for prompts, images in inputs - ] - - with hf_runner(model, dtype=dtype) as hf_model: - img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids( - "") - hf_model.model.img_context_token_id = img_context_token_id - hf_model.processor = InternVLProcessor(hf_model) - hf_model.model.get_output_embeddings = lambda: \ - hf_model.model.language_model.get_output_embeddings() - hf_model.model.generate = types.MethodType(generate, hf_model.model) - eos_token_id = hf_model.tokenizer.eos_token_id - hf_outputs_per_image = [ - hf_model.generate_greedy_logprobs_limit(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=hf_images, - eos_token_id=eos_token_id) - for prompts, hf_images in inputs - ] - - for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, - vllm_outputs_per_image): - # TODO: Check whether using original CLIPVisionModel can improve - # consistency against HF - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) def run_awq_test( @@ -253,123 +84,6 @@ def run_awq_test( ) -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize( - "size_factors", - [ - # No image - [], - # Single-scale - [1.0], - # Single-scale, batched - [1.0, 1.0, 1.0], - # Multi-scale - [0.25, 0.5, 1.0], - ], -) -@pytest.mark.parametrize("dtype", [target_dtype]) -@pytest.mark.parametrize("max_tokens", [128]) -@pytest.mark.parametrize("num_logprobs", [5]) -@torch.inference_mode() -def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, - dtype: str, max_tokens: int, num_logprobs: int) -> None: - images = [asset.pil_image 
for asset in image_assets] - - inputs_per_image = [( - [prompt for _ in size_factors], - [rescale_image_size(image, factor) for factor in size_factors], - ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] - - run_test( - hf_runner, - vllm_runner, - inputs_per_image, - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - mm_limit=1, - tensor_parallel_size=1, - ) - - -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize( - "size_factors", - [ - # No image - [], - # Single-scale - [1.0], - # Single-scale, batched - [1.0, 1.0, 1.0], - # Multi-scale - [0.5, 0.75, 1.0], - ], -) -@pytest.mark.parametrize("dtype", [target_dtype]) -@pytest.mark.parametrize("max_tokens", [128]) -@pytest.mark.parametrize("num_logprobs", [5]) -@torch.inference_mode() -def test_multi_images_models(hf_runner, vllm_runner, image_assets, model, - size_factors, dtype: str, max_tokens: int, - num_logprobs: int) -> None: - images = [asset.pil_image for asset in image_assets] - - inputs_per_case = [ - ([HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors], - [[rescale_image_size(image, factor) for image in images] - for factor in size_factors]) - ] - - run_test( - hf_runner, - vllm_runner, - inputs_per_case, - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - mm_limit=2, - tensor_parallel_size=1, - ) - - -@pytest.mark.parametrize("model", ["OpenGVLab/InternVL2-2B"]) -@pytest.mark.parametrize("size_factors", [[0.5, 1.0]]) -@pytest.mark.parametrize("dtype", [target_dtype]) -@pytest.mark.parametrize("max_tokens", [128]) -@pytest.mark.parametrize("num_logprobs", [5]) -@torch.inference_mode() -def test_different_num_patches(hf_runner, vllm_runner, image_assets, model, - size_factors, dtype: str, max_tokens: int, - num_logprobs: int) -> None: - images = [asset.pil_image.resize((896, 896)) for asset in image_assets] - - inputs_batching = [( - [prompt for _ in size_factors], - [rescale_image_size(image, factor) for factor in size_factors], - ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] - - inputs_multi_images = [ - ([HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors], - [[rescale_image_size(image, factor) for image in images] - for factor in size_factors]) - ] - for inputs in [inputs_batching, inputs_multi_images]: - run_test( - hf_runner, - vllm_runner, - inputs, - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - mm_limit=2, - tensor_parallel_size=1, - ) - - @pytest.mark.parametrize( "models", [("OpenGVLab/InternVL2-2B", "OpenGVLab/InternVL2-2B-AWQ")]) @pytest.mark.parametrize( diff --git a/tests/models/decoder_only/vision_language/test_llava.py b/tests/models/decoder_only/vision_language/test_llava.py deleted file mode 100644 index fd28a9367b4b2..0000000000000 --- a/tests/models/decoder_only/vision_language/test_llava.py +++ /dev/null @@ -1,313 +0,0 @@ -from typing import List, Optional, Tuple, Type, overload - -import pytest -from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer, - BatchEncoding) - -from vllm.multimodal.utils import rescale_image_size -from vllm.sequence import SampleLogprobs -from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE - -from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner, - _ImageAssets) -from ...utils import check_logprobs_close - -_LIMIT_IMAGE_PER_PROMPT = 4 - -HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ - "stop_sign": - "USER: \nWhat's the content of the image?\nASSISTANT:", - "cherry_blossom": - "USER: \nWhat is the season?\nASSISTANT:", -}) - -models = 
[ - "llava-hf/llava-1.5-7b-hf", - # TODO: Get this model to produce meaningful output in vLLM - # "TIGER-Lab/Mantis-8B-siglip-llama3", -] - - -def vllm_to_hf_output(vllm_output: Tuple[List[int], str, - Optional[SampleLogprobs]], - model: str): - """Sanitize vllm output to be comparable with hf output.""" - output_ids, output_str, out_logprobs = vllm_output - - config = AutoConfig.from_pretrained(model) - image_token_id = config.image_token_index - - tokenizer = AutoTokenizer.from_pretrained(model) - eos_token_id = tokenizer.eos_token_id - - hf_output_ids = [ - token_id for idx, token_id in enumerate(output_ids) - if token_id != image_token_id or output_ids[idx - 1] != image_token_id - ] - - assert output_str[0] == " " - hf_output_str = output_str[1:] - if hf_output_ids[-1] == eos_token_id: - hf_output_str = hf_output_str + tokenizer.decode(eos_token_id) - - return hf_output_ids, hf_output_str, out_logprobs - - -@overload -def run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - image_assets: _ImageAssets, - model: str, - *, - size_factors: List[float], - dtype: str, - max_tokens: int, - num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - ... - - -@overload -def run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - image_assets: _ImageAssets, - model: str, - *, - sizes: List[Tuple[int, int]], - dtype: str, - max_tokens: int, - num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - ... - - -def run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - image_assets: _ImageAssets, - model: str, - *, - size_factors: Optional[List[float]] = None, - sizes: Optional[List[Tuple[int, int]]] = None, - dtype: str, - max_tokens: int, - num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - images = [asset.pil_image for asset in image_assets] - - if size_factors is not None: - inputs_per_image = [( - [prompt for _ in size_factors], - [rescale_image_size(image, factor) for factor in size_factors], - ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] - elif sizes is not None: - inputs_per_image = [( - [prompt for _ in sizes], - [image.resize(size) for size in sizes], - ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] - else: - raise ValueError("You must provide either `size_factors` or `sizes`") - - _run_test(hf_runner, - vllm_runner, - inputs_per_image, - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend) - - -def _run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - inputs: List[Tuple[List[str], PromptImageInput]], - model: str, - *, - dtype: str, - max_tokens: int, - num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - """Inference result should be the same between hf and vllm. - - All the image fixtures for the test are from IMAGE_ASSETS. - For huggingface runner, we provide the PIL images as input. - For vllm runner, we provide MultiModalDataDict objects - and corresponding MultiModalConfig as input. - Note, the text input is also adjusted to abide by vllm contract. - The text output is sanitized to be able to compare with hf. 
- """ - # NOTE: For local use; this isn't tested in CI yet (see TODO above) - if model.startswith("TIGER-Lab/Mantis"): - from mantis.models.mllava import MLlavaProcessor - - torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype] - mantis_processor = MLlavaProcessor.from_pretrained( - model, torch_dtype=torch_dtype) - assert isinstance(mantis_processor, MLlavaProcessor) - else: - mantis_processor = None - - # NOTE: take care of the order. run vLLM first, and then run HF. - # vLLM needs a fresh new process without cuda initialization. - # if we run HF first, the cuda initialization will be done and it - # will hurt multiprocessing backend with fork method (the default method). - - # max_model_len should be greater than image_feature_size - with vllm_runner(model, - dtype=dtype, - max_model_len=4096, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - enforce_eager=True, - limit_mm_per_prompt={"image": _LIMIT_IMAGE_PER_PROMPT - }) as vllm_model: - vllm_outputs_per_image = [ - vllm_model.generate_greedy_logprobs(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images) - for prompts, images in inputs - ] - - if mantis_processor is not None: - - def process(hf_inputs: BatchEncoding): - hf_inputs["pixel_values"] = hf_inputs["pixel_values"] \ - .to(torch_dtype) # type: ignore - return hf_inputs - else: - - def process(hf_inputs: BatchEncoding): - return hf_inputs - - with hf_runner(model, - dtype=dtype, - postprocess_inputs=process, - auto_cls=AutoModelForVision2Seq) as hf_model: - hf_outputs_per_image = [ - hf_model.generate_greedy_logprobs_limit(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images) - for prompts, images in inputs - ] - - for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, - vllm_outputs_per_image): - # TODO: Check whether using original CLIPVisionModel can improve - # consistency against HF - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=[ - vllm_to_hf_output(vllm_output, model) - for vllm_output in vllm_outputs - ], - name_0="hf", - name_1="vllm", - ) - - -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize( - "size_factors", - [ - # No image - [], - # Single-scale - [1.0], - # Single-scale, batched - [1.0, 1.0, 1.0], - # Multi-scale - [0.25, 0.5, 1.0], - ], -) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [128]) -@pytest.mark.parametrize("num_logprobs", [5]) -def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, - dtype, max_tokens, num_logprobs) -> None: - run_test( - hf_runner, - vllm_runner, - image_assets, - model, - size_factors=size_factors, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=1, - ) - - -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [128]) -@pytest.mark.parametrize("num_logprobs", [5]) -def test_models_multiple_image_inputs(hf_runner, vllm_runner, image_assets, - model, dtype, max_tokens, - num_logprobs) -> None: - stop_sign = image_assets[0].pil_image - cherry_blossom = image_assets[1].pil_image - - inputs = [( - [ - "USER: \nDescribe 2 images.\nASSISTANT:", - "USER: \nDescribe 2 images.\nASSISTANT:", - "USER: \nDescribe 4 images.\nASSISTANT:", # noqa: E501 - "USER: \nWhat is the season?\nASSISTANT:", - ], - [ - [stop_sign, cherry_blossom], - # Images with different sizes and aspect-ratios - [ - rescale_image_size(stop_sign, 0.1), - stop_sign, - ], - [ - 
stop_sign, - rescale_image_size(stop_sign, 0.25), - cherry_blossom.resize((183, 488)), - cherry_blossom.resize((488, 183)) - ], - cherry_blossom, - ])] - - _run_test( - hf_runner, - vllm_runner, - inputs, - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=1, - ) - - -@pytest.mark.parametrize("model", models) -def test_context_length_too_short(vllm_runner, image_assets, model): - images = [asset.pil_image for asset in image_assets] - - with pytest.raises(ValueError, match="too long to fit into the model"): - vllm_model = vllm_runner( - model, - max_model_len=128, # LLaVA has a feature size of 576 - enforce_eager=True, - ) - - with vllm_model: - vllm_model.generate_greedy([HF_IMAGE_PROMPTS[0]], - max_tokens=1, - images=[images[0]]) diff --git a/tests/models/decoder_only/vision_language/test_llava_image_embeds.py b/tests/models/decoder_only/vision_language/test_llava_image_embeds.py deleted file mode 100644 index 66414032509ed..0000000000000 --- a/tests/models/decoder_only/vision_language/test_llava_image_embeds.py +++ /dev/null @@ -1,158 +0,0 @@ -from typing import List, Optional, Tuple, Type - -import pytest -from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer - -from vllm.sequence import SampleLogprobs - -from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets -from ...utils import check_logprobs_close - -HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ - "stop_sign": - "USER: \nWhat's the content of the image?\nASSISTANT:", - "cherry_blossom": - "USER: \nWhat is the season?\nASSISTANT:", -}) - -models = [ - "llava-hf/llava-1.5-7b-hf", -] - - -def vllm_to_hf_output(vllm_output: Tuple[List[int], str, - Optional[SampleLogprobs]], - model: str): - """Sanitize vllm output to be comparable with hf output.""" - output_ids, output_str, out_logprobs = vllm_output - - config = AutoConfig.from_pretrained(model) - image_token_id = config.image_token_index - - tokenizer = AutoTokenizer.from_pretrained(model) - eos_token_id = tokenizer.eos_token_id - - hf_output_ids = [ - token_id for idx, token_id in enumerate(output_ids) - if token_id != image_token_id or output_ids[idx - 1] != image_token_id - ] - - assert output_str[0] == " " - hf_output_str = output_str[1:] - if hf_output_ids[-1] == eos_token_id: - hf_output_str = hf_output_str + tokenizer.decode(eos_token_id) - - return hf_output_ids, hf_output_str, out_logprobs - - -def run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - image_assets: _ImageAssets, - model: str, - *, - size_factors: List[float], - dtype: str, - max_tokens: int, - num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - """Inference result should be the same between hf and vllm. - - All the image fixtures for the test are from IMAGE_ASSETS. - For huggingface runner, we provide the PIL images as input. - For vllm runner, we provide MultiModalDataDict objects - and corresponding vision language config as input. - Note, the text input is also adjusted to abide by vllm contract. - The text output is sanitized to be able to compare with hf. 
- """ - - # vLLM to load from image embeddings - vllm_images = [asset.image_embeds for asset in image_assets] - - # transformers to load from PIL images - hf_images = [asset.pil_image for asset in image_assets] - - vllm_inputs_per_image = [( - [prompt for _ in size_factors], - [image for _ in size_factors], - ) for image, prompt in zip(vllm_images, HF_IMAGE_PROMPTS)] - - hf_inputs_per_image = [( - [prompt for _ in size_factors], - [image for _ in size_factors], - ) for image, prompt in zip(hf_images, HF_IMAGE_PROMPTS)] - - # NOTE: take care of the order. run vLLM first, and then run HF. - # vLLM needs a fresh new process without cuda initialization. - # if we run HF first, the cuda initialization will be done and it - # will hurt multiprocessing backend with fork method (the default method). - - # max_model_len should be greater than image_feature_size - with vllm_runner(model, - dtype=dtype, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - enforce_eager=True) as vllm_model: - vllm_outputs_per_image = [ - vllm_model.generate_greedy_logprobs(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images) - for prompts, images in vllm_inputs_per_image - ] - - with hf_runner(model, dtype=dtype, - auto_cls=AutoModelForVision2Seq) as hf_model: - hf_outputs_per_image = [ - hf_model.generate_greedy_logprobs_limit(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images) - for prompts, images in hf_inputs_per_image - ] - - for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, - vllm_outputs_per_image): - # TODO: Check whether using original CLIPVisionModel can improve - # consistency against HF - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=[ - vllm_to_hf_output(vllm_output, model) - for vllm_output in vllm_outputs - ], - name_0="hf", - name_1="vllm", - ) - - -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize( - "size_factors", - [ - # No image - [], - # Single-scale - [1.0], - # Single-scale, batched - [1.0, 1.0, 1.0], - ], -) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [128]) -@pytest.mark.parametrize("num_logprobs", [5]) -def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, - dtype: str, max_tokens: int, num_logprobs: int) -> None: - run_test( - hf_runner, - vllm_runner, - image_assets, - model, - size_factors=size_factors, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=1, - ) diff --git a/tests/models/decoder_only/vision_language/test_llava_next.py b/tests/models/decoder_only/vision_language/test_llava_next.py deleted file mode 100644 index aa9b297c5dd4e..0000000000000 --- a/tests/models/decoder_only/vision_language/test_llava_next.py +++ /dev/null @@ -1,347 +0,0 @@ -from typing import List, Optional, Tuple, Type, overload - -import pytest -from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer - -from vllm.inputs import InputContext -from vllm.multimodal.utils import rescale_image_size -from vllm.sequence import SampleLogprobs - -from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner, - _ImageAssets) -from ...utils import build_model_context, check_logprobs_close - -_LIMIT_IMAGE_PER_PROMPT = 4 - -HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ - "stop_sign": - "[INST] \nWhat's the content of the image? [/INST]", - "cherry_blossom": - "[INST] \nWhat is the season? 
[/INST]", -}) - -models = ["llava-hf/llava-v1.6-mistral-7b-hf"] - - -@pytest.fixture() -def get_max_llava_next_image_tokens(): - from vllm.model_executor.models.llava_next import ( - get_max_llava_next_image_tokens) - return get_max_llava_next_image_tokens - - -@pytest.fixture() -def dummy_data_for_llava_next(): - from vllm.model_executor.models.llava_next import dummy_data_for_llava_next - return dummy_data_for_llava_next - - -def vllm_to_hf_output(vllm_output: Tuple[List[int], str, - Optional[SampleLogprobs]], - model: str): - """Sanitize vllm output to be comparable with hf output.""" - output_ids, output_str, out_logprobs = vllm_output - - config = AutoConfig.from_pretrained(model) - image_token_id = config.image_token_index - - tokenizer = AutoTokenizer.from_pretrained(model) - eos_token_id = tokenizer.eos_token_id - - hf_output_ids = [ - token_id for idx, token_id in enumerate(output_ids) - if token_id != image_token_id or output_ids[idx - 1] != image_token_id - ] - - assert output_str[0] == " " - hf_output_str = output_str[1:] - if hf_output_ids[-1] == eos_token_id: - hf_output_str = hf_output_str + tokenizer.decode(eos_token_id) - - return hf_output_ids, hf_output_str, out_logprobs - - -@overload -def run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - image_assets: _ImageAssets, - model: str, - *, - size_factors: List[float], - dtype: str, - max_tokens: int, - num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - ... - - -@overload -def run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - image_assets: _ImageAssets, - model: str, - *, - sizes: List[Tuple[int, int]], - dtype: str, - max_tokens: int, - num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - ... 
- - -def run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - image_assets: _ImageAssets, - model: str, - *, - size_factors: Optional[List[float]] = None, - sizes: Optional[List[Tuple[int, int]]] = None, - dtype: str, - max_tokens: int, - num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - images = [asset.pil_image for asset in image_assets] - - if size_factors is not None: - inputs_per_image = [( - [prompt for _ in size_factors], - [rescale_image_size(image, factor) for factor in size_factors], - ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] - elif sizes is not None: - inputs_per_image = [( - [prompt for _ in sizes], - [image.resize(size) for size in sizes], - ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] - else: - raise ValueError("You must provide either `size_factors` or `sizes`") - - _run_test(hf_runner, - vllm_runner, - inputs_per_image, - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend) - - -def _run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - inputs: List[Tuple[List[str], PromptImageInput]], - model: str, - dtype: str, - max_tokens: int, - num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - # max_model_len should be greater than image_feature_size - with vllm_runner(model, - dtype=dtype, - max_model_len=10240, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - enforce_eager=True, - limit_mm_per_prompt={"image": _LIMIT_IMAGE_PER_PROMPT - }) as vllm_model: - vllm_outputs_per_image = [ - vllm_model.generate_greedy_logprobs(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images) - for prompts, images in inputs - ] - - with hf_runner(model, dtype=dtype, - auto_cls=AutoModelForVision2Seq) as hf_model: - hf_outputs_per_image = [ - hf_model.generate_greedy_logprobs_limit(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images) - for prompts, images in inputs - ] - - for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, - vllm_outputs_per_image): - # TODO: Check whether using original CLIPVisionModel can improve - # consistency against HF - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=[ - vllm_to_hf_output(vllm_output, model) - for vllm_output in vllm_outputs - ], - name_0="hf", - name_1="vllm", - ) - - -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize( - "size_factors", - [ - # No image - [], - # Single-scale - [1.0], - # Single-scale, batched - [1.0, 1.0, 1.0], - # Multi-scale - [0.25, 0.5, 1.0], - ], -) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [128]) -@pytest.mark.parametrize("num_logprobs", [5]) -def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, - dtype, max_tokens, num_logprobs) -> None: - """Inference result should be the same between hf and vllm. - - All the image fixtures for the test are from IMAGE_ASSETS. - For huggingface runner, we provide the PIL images as input. - For vllm runner, we provide MultiModalDataDict objects - and corresponding MultiModalConfig as input. - Note, the text input is also adjusted to abide by vllm contract. - The text output is sanitized to be able to compare with hf. 
- """ - run_test( - hf_runner, - vllm_runner, - image_assets, - model, - size_factors=size_factors, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=1, - ) - - -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize( - "sizes", - [[(1669, 2560), (2560, 1669), (183, 488), (488, 183)]], -) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [128]) -@pytest.mark.parametrize("num_logprobs", [5]) -def test_models_fixed_sizes(hf_runner, vllm_runner, image_assets, model, sizes, - dtype, max_tokens, num_logprobs) -> None: - run_test( - hf_runner, - vllm_runner, - image_assets, - model, - sizes=sizes, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=1, - ) - - -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [128]) -@pytest.mark.parametrize("num_logprobs", [5]) -def test_models_multiple_image_inputs(hf_runner, vllm_runner, image_assets, - model, dtype, max_tokens, - num_logprobs) -> None: - stop_sign = image_assets[0].pil_image - cherry_blossom = image_assets[1].pil_image - - inputs = [( - [ - "[INST] \nDescribe 2 images. [/INST]", - "[INST] \nDescribe 2 images. [/INST]", - "[INST] \nDescribe 4 images. [/INST]", - "[INST] \nWhat is the season? [/INST]" - ], - [ - [stop_sign, cherry_blossom], - # Images with different sizes and aspect-ratios - [ - rescale_image_size(stop_sign, 0.1), - stop_sign, - ], - [ - stop_sign, - rescale_image_size(stop_sign, 0.25), - cherry_blossom.resize((183, 488)), - cherry_blossom.resize((488, 183)) - ], - cherry_blossom, - ])] - - _run_test( - hf_runner, - vllm_runner, - inputs, - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=1, - ) - - -@pytest.mark.parametrize("gridpoints,expected_max_tokens", [ - ([[336, 336]], 1176), - ([[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]], 2928), -]) -def test_get_max_llava_next_image_tokens(gridpoints, expected_max_tokens, - get_max_llava_next_image_tokens): - ctx = build_model_context(model_name="llava-hf/llava-v1.6-mistral-7b-hf") - - # Update the config image_grid_pinpoints - # and calculate the resulting max tokens - ctx.model_config.hf_config.image_grid_pinpoints = gridpoints - - actual_max_tokens = get_max_llava_next_image_tokens( - InputContext(ctx.model_config)) - - assert expected_max_tokens == actual_max_tokens - - -@pytest.mark.parametrize( - "gridpoints,expected_size", - [ - # One point; it has to be the largest - ([[336, 336]], (336, 336)), - # Default for most llava next models; the 2x2 tile is the largest - ([[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]], - (672, 672)), - # If two rectangular gridpoints are the same, the more vertical - # one has the higher feature count due to newline features - ([[336, 672], [672, 336]], (672, 336)) - ]) -def test_dummy_data_for_llava_next_feature_size(dummy_data_for_llava_next, - gridpoints, expected_size): - ctx = build_model_context(model_name="llava-hf/llava-v1.6-mistral-7b-hf") - - # Update the config image_grid_pinpoints - ctx.model_config.hf_config.image_grid_pinpoints = gridpoints - seq_len = 5000 # bigger than the max feature size for any image - - seq_data, mm_data = dummy_data_for_llava_next( - ctx, - seq_len=seq_len, - mm_counts={"image": 1}, - ) - - # The dummy data dims should match the gridpoint with the biggest feat size - assert mm_data["image"].height == expected_size[0] - assert 
mm_data["image"].width == expected_size[1] - assert len(seq_data.get_token_ids()) >= seq_len diff --git a/tests/models/decoder_only/vision_language/test_llava_next_video.py b/tests/models/decoder_only/vision_language/test_llava_next_video.py deleted file mode 100644 index 7b7b23c783e2a..0000000000000 --- a/tests/models/decoder_only/vision_language/test_llava_next_video.py +++ /dev/null @@ -1,226 +0,0 @@ -from typing import List, Optional, Tuple, Type, overload - -import pytest -from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer - -from vllm.multimodal.utils import (rescale_video_size, resize_video, - sample_frames_from_video) -from vllm.sequence import SampleLogprobs - -from ....conftest import VIDEO_ASSETS, HfRunner, VllmRunner, _VideoAssets -from ...utils import check_logprobs_close - -_PREFACE = ( - "A chat between a curious human and an artificial intelligence assistant. " - "The assistant gives helpful, detailed, and polite answers to the human's " - "questions.") - -HF_VIDEO_PROMPTS = VIDEO_ASSETS.prompts({ - "sample_demo_1": - f"{_PREFACE}USER: