From 60b6ff74a15077fb2c213b352eef83bed93b942a Mon Sep 17 00:00:00 2001
From: Cyrus Leung
Date: Fri, 28 Jun 2024 00:08:10 +0800
Subject: [PATCH] [Model][Bugfix] Implicit model flags and reenable Phi-3-Vision (#5896)

---
 vllm/model_executor/models/baichuan.py    |  2 --
 vllm/model_executor/models/chatglm.py     |  2 --
 vllm/model_executor/models/gemma.py       |  2 --
 vllm/model_executor/models/gpt_bigcode.py |  2 --
 vllm/model_executor/models/interfaces.py  | 18 ++++++++++++++++--
 vllm/model_executor/models/llama.py       |  2 --
 vllm/model_executor/models/llava.py       |  2 --
 vllm/model_executor/models/llava_next.py  |  2 --
 vllm/model_executor/models/minicpm.py     |  2 --
 vllm/model_executor/models/mixtral.py     |  2 --
 vllm/model_executor/models/phi.py         |  2 --
 vllm/model_executor/models/phi3v.py       | 16 ++++++++++------
 vllm/model_executor/models/qwen2.py       |  2 --
 vllm/model_executor/models/xverse.py      |  2 --
 14 files changed, 26 insertions(+), 32 deletions(-)

diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py
index abaefa3cf7781..5cf5a199b7690 100644
--- a/vllm/model_executor/models/baichuan.py
+++ b/vllm/model_executor/models/baichuan.py
@@ -295,8 +295,6 @@ def forward(
 
 
 class BaiChuanBaseForCausalLM(nn.Module, SupportsLoRA):
-    supports_lora = True
-
     packed_modules_mapping = {
         "W_pack": ["W_pack"],
         "gate_up_proj": [
diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py
index bf64538ef54a3..5b5a69447e0b8 100644
--- a/vllm/model_executor/models/chatglm.py
+++ b/vllm/model_executor/models/chatglm.py
@@ -325,8 +325,6 @@ def forward(
 
 
 class ChatGLMForCausalLM(nn.Module, SupportsLoRA):
-    supports_lora = True
-
     packed_modules_mapping = {
         "query_key_value": ["query_key_value"],
         "dense_h_to_4h": ["dense_h_to_4h"]
diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py
index 9e071a155061b..ce97fc808c85e 100644
--- a/vllm/model_executor/models/gemma.py
+++ b/vllm/model_executor/models/gemma.py
@@ -291,8 +291,6 @@ def forward(
 
 
 class GemmaForCausalLM(nn.Module, SupportsLoRA):
-    supports_lora = True
-
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py
index 009d7b1498c22..17bbe4e312fc3 100644
--- a/vllm/model_executor/models/gpt_bigcode.py
+++ b/vllm/model_executor/models/gpt_bigcode.py
@@ -233,8 +233,6 @@ def forward(
 
 
 class GPTBigCodeForCausalLM(nn.Module, SupportsLoRA):
-    supports_lora = True
-
     packed_modules_mapping = {"c_attn": ["c_attn"]}
 
     supported_lora_modules = ["c_fc", "c_proj", "wte", "lm_head", "c_attn"]
diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py
index a9eb397a5a97f..cb0fc154a74d8 100644
--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -13,7 +13,14 @@
 class SupportsVision(Protocol):
     """The interface required for all vision language models (VLMs)."""
 
-    supports_vision: ClassVar[Literal[True]]
+    supports_vision: ClassVar[Literal[True]] = True
+    """
+    A flag that indicates this model supports vision inputs.
+
+    Note:
+        There is no need to redefine this flag if this class is in the
+        MRO of your model class.
+    """
 
     def __init__(self, *, vlm_config: VisionLanguageConfig) -> None:
         ...
@@ -52,7 +59,14 @@ def supports_vision(
 class SupportsLoRA(Protocol):
     """The interface required for all models that support LoRA."""
 
-    supports_lora: ClassVar[Literal[True]]
+    supports_lora: ClassVar[Literal[True]] = True
+    """
+    A flag that indicates this model supports LoRA.
+
+    Note:
+        There is no need to redefine this flag if this class is in the
+        MRO of your model class.
+    """
 
     packed_modules_mapping: ClassVar[Dict[str, List[str]]]
     supported_lora_modules: ClassVar[List[str]]
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index f4918cbfef294..54d01701f04fb 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -299,8 +299,6 @@ def forward(
 
 
 class LlamaForCausalLM(nn.Module, SupportsLoRA):
-    supports_lora = True
-
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 8e18b42b76734..125e3ddea7df3 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -88,8 +88,6 @@ class LlavaImageFeatureInputs(TypedDict):
 @MULTIMODAL_REGISTRY.register_dummy_data(get_dummy_image_data)
 class LlavaForConditionalGeneration(nn.Module, SupportsVision):
 
-    supports_vision = True
-
     def __init__(self,
                  config: LlavaConfig,
                  vlm_config: VisionLanguageConfig,
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index 5c03fb3705561..841818d8db6ff 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -108,8 +108,6 @@ def _image_pixel_processor(
 @MULTIMODAL_REGISTRY.register_dummy_data(_get_dummy_image_data)
 class LlavaNextForConditionalGeneration(nn.Module, SupportsVision):
 
-    supports_vision = True
-
     def __init__(self,
                  config: LlavaNextConfig,
                  vlm_config: VisionLanguageConfig,
diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py
index ae17309bd5223..a76ed049828e7 100644
--- a/vllm/model_executor/models/minicpm.py
+++ b/vllm/model_executor/models/minicpm.py
@@ -392,8 +392,6 @@ def forward(
 
 
 class MiniCPMForCausalLM(nn.Module, SupportsLoRA):
-    supports_lora = True
-
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py
index 0bdcb21e514fd..a662db6d28d00 100644
--- a/vllm/model_executor/models/mixtral.py
+++ b/vllm/model_executor/models/mixtral.py
@@ -475,8 +475,6 @@ def forward(
 
 
 class MixtralForCausalLM(nn.Module, SupportsLoRA):
-    supports_lora = True
-
     fall_back_to_pt_during_load = False
 
     packed_modules_mapping = {
diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py
index d288bdd9d78f5..008fceb624f75 100644
--- a/vllm/model_executor/models/phi.py
+++ b/vllm/model_executor/models/phi.py
@@ -232,8 +232,6 @@ def forward(
 
 
 class PhiForCausalLM(nn.Module, SupportsLoRA):
-    supports_lora = True
-
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 578e22beaa3d6..0bbe93241b139 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -32,12 +32,13 @@
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.clip import CLIPVisionModel
 from vllm.model_executor.models.llama import LlamaModel
-from vllm.model_executor.models.vlm_base import VisionLanguageModelBase
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.image import ImagePixelData, get_dummy_image_data
 from vllm.sequence import SamplerOutput
 
+from .interfaces import SupportsVision
+
 logger = init_logger(__name__)
 
 _KEYS_TO_MODIFY_MAPPING = {
@@ -317,18 +318,21 @@ def _image_processor(
 
 @MULTIMODAL_REGISTRY.register_image_pixel_input(_image_processor)
 @MULTIMODAL_REGISTRY.register_dummy_data(get_dummy_image_data)
-class Phi3VForCausalLM(VisionLanguageModelBase):
+class Phi3VForCausalLM(nn.Module, SupportsVision):
 
     def __init__(self,
                  config: PretrainedConfig,
-                 vision_language_config: VisionLanguageConfig,
+                 vlm_config: VisionLanguageConfig,
                  cache_config: Optional[CacheConfig] = None,
                  quant_config: Optional[QuantizationConfig] = None) -> None:
-        super().__init__(vision_language_config)
+        super().__init__()
+
+        self.config = config
+        self.vlm_config = vlm_config
 
         self.model = LlamaModel(config, cache_config, quant_config)
         self.vision_embed_tokens = Phi3HDImageEmbedding(
-            vision_language_config, config, self.model.embed_tokens)
+            vlm_config, config, self.model.embed_tokens)
         self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.sampler = Sampler()
@@ -338,7 +342,7 @@ def _parse_and_validate_image_input(
         pixel_values = kwargs.pop("pixel_values", None)
         image_sizes = kwargs.pop("image_sizes", None)
 
-        expected_input_type = self.vision_language_config.image_input_type
+        expected_input_type = self.vlm_config.image_input_type
         ImageInputType = VisionLanguageConfig.ImageInputType
 
         if expected_input_type != ImageInputType.PIXEL_VALUES:
diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index d351adcefc974..e2d725af63593 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -266,8 +266,6 @@ def forward(
 
 
 class Qwen2ForCausalLM(nn.Module, SupportsLoRA):
-    supports_lora = True
-
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
diff --git a/vllm/model_executor/models/xverse.py b/vllm/model_executor/models/xverse.py
index 639c3443bc369..b61721999ca9b 100644
--- a/vllm/model_executor/models/xverse.py
+++ b/vllm/model_executor/models/xverse.py
@@ -269,8 +269,6 @@ def forward(
 
 
 class XverseForCausalLM(nn.Module, SupportsLoRA):
-    supports_lora = True
-
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
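
A note on the mechanism introduced in vllm/model_executor/models/interfaces.py: because `supports_vision` and `supports_lora` now carry class-level defaults on the Protocol classes, any model that lists the interface in its MRO inherits the flag implicitly, which is why the per-model `supports_lora = True` / `supports_vision = True` lines are deleted above. The sketch below is a minimal, self-contained illustration of that pattern rather than vLLM code; `ToyForCausalLM` is hypothetical, and `nn.Module` is left out of its bases only to avoid the torch dependency.

from typing import ClassVar, Dict, List, Literal, Protocol


class SupportsLoRA(Protocol):
    """Simplified stand-in for the SupportsLoRA interface in this patch."""

    # Class-level default: subclasses pick this flag up through the MRO.
    supports_lora: ClassVar[Literal[True]] = True

    packed_modules_mapping: ClassVar[Dict[str, List[str]]]
    supported_lora_modules: ClassVar[List[str]]


class ToyForCausalLM(SupportsLoRA):
    # No explicit `supports_lora = True` needed; the flag is inherited.
    packed_modules_mapping = {"qkv_proj": ["q_proj", "k_proj", "v_proj"]}
    supported_lora_modules = ["qkv_proj"]


print(getattr(ToyForCausalLM, "supports_lora", False))  # prints: True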