Skip to content

Commit

Permalink
[Hotfix][Core][VLM] Disable chunked prefill by default and prefix caching for multimodal models (vllm-project#8425)
Browse files Browse the repository at this point in the history
  • Loading branch information
ywang96 authored and MengqingCao committed Sep 30, 2024
1 parent 7335b8d commit 29a8585
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 3 deletions.
12 changes: 11 additions & 1 deletion vllm/engine/arg_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -843,6 +843,13 @@ def create_engine_config(self) -> EngineConfig:
device_config = DeviceConfig(device=self.device)
model_config = self.create_model_config()

if model_config.is_multimodal_model:
if self.enable_prefix_caching:
logger.warning(
"--enable-prefix-caching is currently not "
"supported for multimodal models and has been disabled.")
self.enable_prefix_caching = False

cache_config = CacheConfig(
block_size=self.block_size if self.device != "neuron" else
self.max_model_len, # neuron needs block_size = max_model_len
Expand Down Expand Up @@ -874,7 +881,10 @@ def create_engine_config(self) -> EngineConfig:
# If not explicitly set, enable chunked prefill by default for
# long context (> 32K) models. This is to avoid OOM errors in the
# initial memory profiling phase.
if use_long_context:

# Chunked prefill is currently disabled for multimodal models by
# default.
if use_long_context and not model_config.is_multimodal_model:
is_gpu = device_config.device_type == "cuda"
use_sliding_window = (model_config.get_sliding_window()
is not None)
Expand Down
4 changes: 2 additions & 2 deletions vllm/model_executor/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,12 +90,12 @@
"PaliGemmaForConditionalGeneration": ("paligemma",
"PaliGemmaForConditionalGeneration"),
"Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"),
"UltravoxModel": ("ultravox", "UltravoxModel"),
"QWenLMHeadModel": ("qwen", "QWenLMHeadModel"),
"PixtralForConditionalGeneration": ("pixtral",
"PixtralForConditionalGeneration"),
"QWenLMHeadModel": ("qwen", "QWenLMHeadModel"),
"Qwen2VLForConditionalGeneration": ("qwen2_vl",
"Qwen2VLForConditionalGeneration"),
"UltravoxModel": ("ultravox", "UltravoxModel"),
}
_CONDITIONAL_GENERATION_MODELS = {
"BartModel": ("bart", "BartForConditionalGeneration"),
Expand Down

0 comments on commit 29a8585

Please sign in to comment.