From 5c7d4e214ede6a52df17a2f4a6225a0f5f637ada Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Tue, 1 Oct 2024 12:18:03 +0800 Subject: [PATCH 1/5] fix fuyu tp inference --- vllm/model_executor/models/fuyu.py | 1 + vllm/model_executor/models/persimmon.py | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 9f4dca78d435d..0816389b701a0 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -237,6 +237,7 @@ def __init__(self, self.image_feature_size, config.hidden_size, quant_config=quant_config, + gather_output=True, ) self.language_model = PersimmonForCausalLM(config, cache_config=cache_config, diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py index ced846cbe3358..ae58d5b55cb14 100644 --- a/vllm/model_executor/models/persimmon.py +++ b/vllm/model_executor/models/persimmon.py @@ -25,11 +25,11 @@ import torch from torch import nn from transformers import PersimmonConfig -from transformers.activations import ReLUSquaredActivation from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, QKVParallelLinear, RowParallelLinear) @@ -57,7 +57,7 @@ def __init__(self, self.dense_4h_to_h = RowParallelLinear(config.intermediate_size, config.hidden_size, quant_config=quant_config) - self.act = ReLUSquaredActivation() + self.act = get_act_fn(config.hidden_act, quant_config) def forward(self, hidden_states) -> torch.Tensor: hidden_states, _ = self.dense_h_to_4h(hidden_states) @@ -96,7 +96,7 @@ def __init__(self, quant_config=quant_config, ) self.dense = RowParallelLinear( - self.num_heads * self.head_dim, + self.total_num_heads * self.head_dim, self.hidden_size, bias=True, quant_config=quant_config, From d9a9e22a1445fd790c0c9144531a3fc71b52bc7f Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Tue, 1 Oct 2024 12:46:14 +0800 Subject: [PATCH 2/5] use new transformers text_config --- vllm/model_executor/models/fuyu.py | 2 +- vllm/model_executor/models/persimmon.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 0816389b701a0..87b88da0dc05c 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -239,7 +239,7 @@ def __init__(self, quant_config=quant_config, gather_output=True, ) - self.language_model = PersimmonForCausalLM(config, + self.language_model = PersimmonForCausalLM(config.text_config, cache_config=cache_config, quant_config=quant_config) diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py index ae58d5b55cb14..59eb39e07a08c 100644 --- a/vllm/model_executor/models/persimmon.py +++ b/vllm/model_executor/models/persimmon.py @@ -213,10 +213,10 @@ def __init__(self, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None): super().__init__() - self.vocab_size = config.text_config.vocab_size + self.vocab_size = config.vocab_size self.embed_tokens = VocabParallelEmbedding( - config.text_config.vocab_size, config.hidden_size) + config.vocab_size, config.hidden_size) self.layers = nn.ModuleList([ PersimmonDecoderLayer(config, cache_config=cache_config, @@ -252,19 +252,19 @@ def forward( class PersimmonForCausalLM(nn.Module): def __init__(self, - config, + config: PersimmonConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None): super().__init__() self.config = config - self.vocab_size = config.text_config.vocab_size + self.vocab_size = config.vocab_size self.model = PersimmonModel(config, cache_config=cache_config, quant_config=quant_config) - self.lm_head = ParallelLMHead(config.text_config.vocab_size, + self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, bias=False) - self.logits_processor = LogitsProcessor(config.text_config.vocab_size) + self.logits_processor = LogitsProcessor(config.vocab_size) self.sampler = Sampler() def forward( From 9c5f144dc30779791be6324eba5befccbb732c4d Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Tue, 1 Oct 2024 12:46:30 +0800 Subject: [PATCH 3/5] code format --- vllm/model_executor/models/persimmon.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py index 59eb39e07a08c..fda0602110a0b 100644 --- a/vllm/model_executor/models/persimmon.py +++ b/vllm/model_executor/models/persimmon.py @@ -215,8 +215,8 @@ def __init__(self, super().__init__() self.vocab_size = config.vocab_size - self.embed_tokens = VocabParallelEmbedding( - config.vocab_size, config.hidden_size) + self.embed_tokens = VocabParallelEmbedding(config.vocab_size, + config.hidden_size) self.layers = nn.ModuleList([ PersimmonDecoderLayer(config, cache_config=cache_config, From 4aaffab59c63dc73a44a8ce2a95db5141274d794 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Tue, 1 Oct 2024 13:18:39 +0800 Subject: [PATCH 4/5] add fuyu to pp test with tp only --- tests/distributed/test_pipeline_parallel.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 9fd1368cc2b59..f2298d1a4c036 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -37,7 +37,9 @@ (1, 2, 1, 1, 1, "OpenGVLab/InternVL2-1B", "mp"), (1, 2, 1, 1, 1, "OpenGVLab/InternVL2-2B", "mp"), (1, 2, 1, 0, 1, "OpenGVLab/InternVL2-4B", "mp"), - (1, 2, 0, 1, 0, "Qwen/Qwen2-VL-2B-Instruct", "mp") + (1, 2, 0, 1, 0, "Qwen/Qwen2-VL-2B-Instruct", "mp"), + # TP only models + (2, 1, 0, 1, 0, "adept/fuyu-8b", "mp"), ], ) @fork_new_process_for_each_test From b6bb7c5ce3c8ef8f7d0575b8b7f3a2f57f8213a8 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Tue, 1 Oct 2024 15:26:38 +0800 Subject: [PATCH 5/5] eager and disable chunked_prefill for fuyu --- tests/distributed/test_pipeline_parallel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index f2298d1a4c036..2e8e83c3d271b 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -39,7 +39,7 @@ (1, 2, 1, 0, 1, "OpenGVLab/InternVL2-4B", "mp"), (1, 2, 0, 1, 0, "Qwen/Qwen2-VL-2B-Instruct", "mp"), # TP only models - (2, 1, 0, 1, 0, "adept/fuyu-8b", "mp"), + (2, 1, 1, 0, 0, "adept/fuyu-8b", "mp"), ], ) @fork_new_process_for_each_test