Skip to content

Commit

Permalink
[Bugfix] Disable the post_norm layer of the vision encoder for LLaVA models (vllm-project#9653)
Browse files Browse the repository at this point in the history

Signed-off-by: Shanshan Wang <[email protected]>
  • Loading branch information
litianjian authored and cooleel committed Oct 28, 2024
1 parent b6d2293 commit c67bc12
Show file tree
Hide file tree
Showing 4 changed files with 8 additions and 4 deletions.
3 changes: 2 additions & 1 deletion vllm/model_executor/models/llava.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,8 @@ def __init__(self,
config.projector_hidden_act = "gelu"

# TODO: Optionally initializes this for supporting embeddings.
self.vision_tower = init_vision_tower_for_llava(config, quant_config)
self.vision_tower = init_vision_tower_for_llava(
config, quant_config, require_post_norm=False)
self.multi_modal_projector = LlavaMultiModalProjector(
vision_hidden_size=config.vision_config.hidden_size,
text_hidden_size=config.text_config.hidden_size,
Expand Down
3 changes: 2 additions & 1 deletion vllm/model_executor/models/llava_next.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,8 @@ def __init__(self,
self.multimodal_config = multimodal_config

# TODO: Optionally initializes this for supporting embeddings.
self.vision_tower = init_vision_tower_for_llava(config, quant_config)
self.vision_tower = init_vision_tower_for_llava(
config, quant_config, require_post_norm=False)
self.image_newline = nn.Parameter(
torch.empty(config.text_config.hidden_size))
self.multi_modal_projector = LlavaMultiModalProjector(
Expand Down
3 changes: 2 additions & 1 deletion vllm/model_executor/models/llava_next_video.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,8 @@ def __init__(self,
self.multimodal_config = multimodal_config

# Initialize the vision tower only up to the required feature layer
self.vision_tower = init_vision_tower_for_llava(config, quant_config)
self.vision_tower = init_vision_tower_for_llava(
config, quant_config, require_post_norm=False)
self.vision_resampler = LlavaNextVideoPooler(config)
self.multi_modal_projector = LlavaNextMultiModalProjector(
vision_hidden_size=config.vision_config.hidden_size,
Expand Down
3 changes: 2 additions & 1 deletion vllm/model_executor/models/llava_onevision.py
Original file line number Diff line number Diff line change
Expand Up @@ -400,7 +400,8 @@ def __init__(self,
self.multimodal_config = multimodal_config

# Initialize the vision tower only up to the required feature layer
self.vision_tower = init_vision_tower_for_llava(config, quant_config)
self.vision_tower = init_vision_tower_for_llava(
config, quant_config, require_post_norm=False)
self.multi_modal_projector = LlavaOnevisionMultiModalProjector(config)
self.language_model = init_vllm_registered_model(
config.text_config, cache_config, quant_config)
Expand Down

0 comments on commit c67bc12

Please sign in to comment.