Commit

[Bugfix] Make image processor respect mm_processor_kwargs for Qwen2-VL (#10112)

Signed-off-by: Jiahao Li <[email protected]>
li-plus authored Nov 7, 2024
1 parent a6f332d · commit 999df95
Showing 1 changed file with 23 additions and 10 deletions.
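
For context, this is the user-facing path the fix restores: mm_processor_kwargs supplied when constructing the engine should now reach the Qwen2-VL image processor in the profiling paths as well, not just the input mapper. A minimal sketch of that call, assuming vLLM's offline LLM entry point; the model name and pixel budgets are illustrative:

    from vllm import LLM

    # Pixel budgets are forwarded to the HF Qwen2-VL image processor;
    # before this fix they were ignored by the max-token and
    # dummy-data profiling paths.
    llm = LLM(
        model="Qwen/Qwen2-VL-7B-Instruct",
        mm_processor_kwargs={
            "min_pixels": 28 * 28,
            "max_pixels": 1280 * 28 * 28,
        },
    )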
33 changes: 23 additions & 10 deletions vllm/model_executor/models/qwen2_vl.py
@@ -22,8 +22,8 @@
 # limitations under the License.
 """Inference-only Qwen2-VL model compatible with HuggingFace weights."""
 from functools import partial
-from typing import (Any, Callable, Iterable, List, Literal, Mapping, Optional,
-                    Tuple, Type, TypedDict, Union)
+from typing import (Any, Callable, Dict, Iterable, List, Literal, Mapping,
+                    Optional, Tuple, Type, TypedDict, Union)
 
 import torch
 import torch.nn as nn
@@ -558,6 +558,17 @@ def forward(
 # === Vision input helpers === #
 
 
+def get_mm_processor_kwargs(
+        min_pixels: Optional[int] = None,
+        max_pixels: Optional[int] = None) -> Dict[str, int]:
+    mm_processor_kwargs = {}
+    if min_pixels:
+        mm_processor_kwargs["min_pixels"] = min_pixels
+    if max_pixels:
+        mm_processor_kwargs["max_pixels"] = max_pixels
+    return mm_processor_kwargs
+
+
 def mm_input_mapper_for_qwen2_vl(
     ctx: InputContext,
     data: MultiModalData[object],
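
A note on the new helper: it tests truthiness rather than "is not None", so an unset (or zero) bound is simply omitted and the image processor falls back to its own defaults. A quick sketch of the expected behavior, with an illustrative pixel budget:

    get_mm_processor_kwargs(max_pixels=1280 * 28 * 28)
    # -> {'max_pixels': 1003520}
    get_mm_processor_kwargs()
    # -> {}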
@@ -575,12 +586,8 @@ def mm_input_mapper_for_qwen2_vl(
     model_config = ctx.model_config
     # Handle mm processor kwargs; we pass these at creation time
     # because preprocess() in transformers doesn't expose them
-    mm_processor_kwargs = {}
-    if min_pixels:
-        mm_processor_kwargs["min_pixels"] = min_pixels
-    if max_pixels:
-        mm_processor_kwargs["max_pixels"] = max_pixels
-
+    mm_processor_kwargs = get_mm_processor_kwargs(min_pixels=min_pixels,
+                                                  max_pixels=max_pixels)
     image_processor = cached_get_image_processor(
         model_config.model,
         trust_remote_code=model_config.trust_remote_code,
@@ -683,7 +690,10 @@ def get_max_qwen2_vl_mm_tokens(ctx: InputContext,
                                *,
                                min_pixels=None,
                                max_pixels=None) -> int:
-    image_processor = cached_get_image_processor(ctx.model_config.model)
+    mm_processor_kwargs = get_mm_processor_kwargs(min_pixels=min_pixels,
+                                                  max_pixels=max_pixels)
+    image_processor = cached_get_image_processor(ctx.model_config.model,
+                                                 **mm_processor_kwargs)
     max_resized_height, max_resized_width, max_llm_image_tokens = \
         _get_max_image_info(image_processor, data_type_key=data_type_key,
                             mm_count=1, min_pixels=min_pixels,
@@ -705,7 +715,10 @@ def dummy_data_for_qwen2_vl(
     min_pixels: Optional[int] = None,
     max_pixels: Optional[int] = None
 ) -> Tuple[SequenceData, Optional[MultiModalDataDict]]:
-    image_processor = cached_get_image_processor(ctx.model_config.model)
+    mm_processor_kwargs = get_mm_processor_kwargs(min_pixels=min_pixels,
+                                                  max_pixels=max_pixels)
+    image_processor = cached_get_image_processor(ctx.model_config.model,
+                                                 **mm_processor_kwargs)
 
     num_images = mm_counts["image"]
     max_resized_height, max_resized_width, max_llm_image_tokens = \
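
Both call sites above matter because, as the code comment in mm_input_mapper_for_qwen2_vl notes, these kwargs must be bound at processor-construction time; preprocess() in transformers does not expose them. Outside vLLM, the equivalent construction looks roughly like this (assuming the transformers AutoImageProcessor entry point; values illustrative):

    from transformers import AutoImageProcessor

    # min_pixels/max_pixels are forwarded to the Qwen2-VL image processor
    # and fix the resize bounds for every subsequent preprocess() call.
    processor = AutoImageProcessor.from_pretrained(
        "Qwen/Qwen2-VL-7B-Instruct",
        min_pixels=256 * 28 * 28,
        max_pixels=1280 * 28 * 28,
    )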
