From ccddc259af3f6f491bc572a3e99ce37dcf1f44c7 Mon Sep 17 00:00:00 2001 From: yue Date: Fri, 18 Oct 2024 00:59:29 +0000 Subject: [PATCH 01/10] support more simple image input format -- which is supported by open ai but not show in public doc --- vllm/entrypoints/chat_utils.py | 57 +++++++++++++++++++++------------- 1 file changed, 35 insertions(+), 22 deletions(-) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 785dcbfa83119..e62a028264053 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -57,11 +57,15 @@ class CustomChatCompletionContentPartParam(TypedDict, total=False): type: Required[str] """The type of the content part.""" +class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False): + """A simpler version of the param that only accepts a plain image_url.""" + image_url: Required[str] + ChatCompletionContentPartParam: TypeAlias = Union[ OpenAIChatCompletionContentPartParam, ChatCompletionContentPartAudioParam, ChatCompletionContentPartRefusalParam, - CustomChatCompletionContentPartParam] + CustomChatCompletionContentPartParam, CustomChatCompletionContentSimpleImageParam, str] class CustomChatCompletionMessageParam(TypedDict, total=False): @@ -401,29 +405,38 @@ def _parse_chat_message_content_parts( has_image = False for part in parts: - part_type = part["type"] - if part_type == "text": - text = _TextParser(part)["text"] + if isinstance(part, str): + text = _TextParser(part) texts.append(text) - elif part_type == "image_url": - image_url = _ImageParser(part)["image_url"] - - if image_url.get("detail", "auto") != "auto": - logger.warning( - "'image_url.detail' is currently not supported and " - "will be ignored.") - - mm_parser.parse_image(image_url["url"]) + elif isinstance(part, dict) and "image_url" in part and isinstance(part["image_url"], str): + mm_parser.parse_image(part["image_url"]) has_image = True - elif part_type == "audio_url": - audio_url = _AudioParser(part)["audio_url"] - - mm_parser.parse_audio(audio_url["url"]) - elif part_type == "refusal": - text = _RefusalParser(part)["refusal"] - texts.append(text) else: - raise NotImplementedError(f"Unknown part type: {part_type}") + part_type = part["type"] + if part_type == "text": + text = _TextParser(part)["text"] + texts.append(text) + # This is the logic that distinguish it's a text / image / audio. + + elif part_type == "image_url": + image_url = _ImageParser(part)["image_url"] + + if image_url.get("detail", "auto") != "auto": + logger.warning( + "'image_url.detail' is currently not supported and " + "will be ignored.") + + mm_parser.parse_image(image_url["url"]) + has_image = True + elif part_type == "audio_url": + audio_url = _AudioParser(part)["audio_url"] + + mm_parser.parse_audio(audio_url["url"]) + elif part_type == "refusal": + text = _RefusalParser(part)["refusal"] + texts.append(text) + else: + raise NotImplementedError(f"Unknown part type: {part_type}") text_prompt = "\n".join(texts) if keep_multimodal_content: @@ -524,7 +537,7 @@ def parse_chat_messages_futures( ) -> Tuple[List[ConversationMessage], Awaitable[Optional[MultiModalDataDict]]]: conversation: List[ConversationMessage] = [] mm_tracker = AsyncMultiModalItemTracker(model_config, tokenizer) - + for msg in messages: sub_messages = _parse_chat_message_content(msg, mm_tracker) From 82e95b2d6a466fc963dd6158273b8d3e4f01ca6a Mon Sep 17 00:00:00 2001 From: yue Date: Fri, 18 Oct 2024 01:00:23 +0000 Subject: [PATCH 02/10] nit --- vllm/entrypoints/chat_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index e62a028264053..69885b81f3ba3 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -537,7 +537,7 @@ def parse_chat_messages_futures( ) -> Tuple[List[ConversationMessage], Awaitable[Optional[MultiModalDataDict]]]: conversation: List[ConversationMessage] = [] mm_tracker = AsyncMultiModalItemTracker(model_config, tokenizer) - + for msg in messages: sub_messages = _parse_chat_message_content(msg, mm_tracker) From d1b5e00928e2db02ed0353381abefe363b1ea7be Mon Sep 17 00:00:00 2001 From: yue Date: Fri, 18 Oct 2024 18:00:40 +0000 Subject: [PATCH 03/10] handle simple audio input format as well --- vllm/entrypoints/chat_utils.py | 39 +++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 69885b81f3ba3..6b6fee3ba8afe 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -58,14 +58,30 @@ class CustomChatCompletionContentPartParam(TypedDict, total=False): """The type of the content part.""" class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False): - """A simpler version of the param that only accepts a plain image_url.""" + """A simpler version of the param that only accepts a plain image_url. + + Example: + { + "image_url": "https://example.com/image.jpg" + } + """ image_url: Required[str] +class CustomChatCompletionContentSimpleAudioParam(TypedDict, total=False): + """A simpler version of the param that only accepts a plain audio_url. + + Example: + { + "audio_url": "https://example.com/audio.mp3" + } + """ + audio_url: Required[str] ChatCompletionContentPartParam: TypeAlias = Union[ OpenAIChatCompletionContentPartParam, ChatCompletionContentPartAudioParam, ChatCompletionContentPartRefusalParam, - CustomChatCompletionContentPartParam, CustomChatCompletionContentSimpleImageParam, str] + CustomChatCompletionContentPartParam, CustomChatCompletionContentSimpleImageParam, + CustomChatCompletionContentSimpleAudioParam, str] class CustomChatCompletionMessageParam(TypedDict, total=False): @@ -390,6 +406,15 @@ def _get_full_multimodal_text_prompt(placeholder_counts: Dict[str, int], _RefusalParser = partial(cast, ChatCompletionContentPartRefusalParam) MODEL_KEEP_MULTI_MODAL_CONTENT = {'mllama'} +def _is_simple_image_part(part: ChatCompletionContentPartParam) -> bool: + """Check if the part is CustomChatCompletionContentSimpleImageParam type.""" + return isinstance(part, dict) and "image_url" in part and isinstance( + part["image_url"], str) + +def _is_simple_audio_part(part: ChatCompletionContentPartParam) -> bool: + """Check if the part is CustomChatCompletionContentSimpleAudioParam type.""" + return isinstance(part, dict) and "audio_url" in part and isinstance( + part["audio_url"], str) def _parse_chat_message_content_parts( role: str, @@ -408,11 +433,15 @@ def _parse_chat_message_content_parts( if isinstance(part, str): text = _TextParser(part) texts.append(text) - elif isinstance(part, dict) and "image_url" in part and isinstance(part["image_url"], str): - mm_parser.parse_image(part["image_url"]) + elif _is_simple_image_part(part): + mm_parser.parse_image(part["image_url"]) # type: ignore has_image = True + elif _is_simple_audio_part(part): + mm_parser.parse_audio(part["audio_url"]) # type: ignore else: - part_type = part["type"] + # If part is not string, CustomChatCompletionContentSimpleImageParam + # CustomChatCompletionContentSimpleAudioParam, process in the following way. + part_type = part["type"] # type: ignore if part_type == "text": text = _TextParser(part)["text"] texts.append(text) From 75921e99bb71ae51cb08e84974cff130c9ab4db8 Mon Sep 17 00:00:00 2001 From: yue Date: Fri, 18 Oct 2024 18:10:49 +0000 Subject: [PATCH 04/10] typing --- vllm/entrypoints/chat_utils.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 6b6fee3ba8afe..7d515ad266ec6 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -80,7 +80,8 @@ class CustomChatCompletionContentSimpleAudioParam(TypedDict, total=False): ChatCompletionContentPartParam: TypeAlias = Union[ OpenAIChatCompletionContentPartParam, ChatCompletionContentPartAudioParam, ChatCompletionContentPartRefusalParam, - CustomChatCompletionContentPartParam, CustomChatCompletionContentSimpleImageParam, + CustomChatCompletionContentPartParam, + CustomChatCompletionContentSimpleImageParam, CustomChatCompletionContentSimpleAudioParam, str] @@ -439,8 +440,8 @@ def _parse_chat_message_content_parts( elif _is_simple_audio_part(part): mm_parser.parse_audio(part["audio_url"]) # type: ignore else: - # If part is not string, CustomChatCompletionContentSimpleImageParam - # CustomChatCompletionContentSimpleAudioParam, process in the following way. + # Process not string, CustomChatCompletionContentSimpleImageParam + # CustomChatCompletionContentSimpleAudioParam parts. part_type = part["type"] # type: ignore if part_type == "text": text = _TextParser(part)["text"] From a70acab6e62694f68836e81d960a93961c59e708 Mon Sep 17 00:00:00 2001 From: yue Date: Fri, 18 Oct 2024 18:27:18 +0000 Subject: [PATCH 05/10] format --- vllm/entrypoints/chat_utils.py | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 7d515ad266ec6..90f0a3c96a923 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -57,6 +57,7 @@ class CustomChatCompletionContentPartParam(TypedDict, total=False): type: Required[str] """The type of the content part.""" + class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False): """A simpler version of the param that only accepts a plain image_url. @@ -67,6 +68,7 @@ class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False): """ image_url: Required[str] + class CustomChatCompletionContentSimpleAudioParam(TypedDict, total=False): """A simpler version of the param that only accepts a plain audio_url. @@ -77,6 +79,7 @@ class CustomChatCompletionContentSimpleAudioParam(TypedDict, total=False): """ audio_url: Required[str] + ChatCompletionContentPartParam: TypeAlias = Union[ OpenAIChatCompletionContentPartParam, ChatCompletionContentPartAudioParam, ChatCompletionContentPartRefusalParam, @@ -407,15 +410,18 @@ def _get_full_multimodal_text_prompt(placeholder_counts: Dict[str, int], _RefusalParser = partial(cast, ChatCompletionContentPartRefusalParam) MODEL_KEEP_MULTI_MODAL_CONTENT = {'mllama'} + def _is_simple_image_part(part: ChatCompletionContentPartParam) -> bool: - """Check if the part is CustomChatCompletionContentSimpleImageParam type.""" - return isinstance(part, dict) and "image_url" in part and isinstance( - part["image_url"], str) + """Check if the part is CustomChatCompletionContentSimpleImageParam.""" + image_url = part.get("image_url") if isinstance(part, dict) else None + return isinstance(image_url, str) + def _is_simple_audio_part(part: ChatCompletionContentPartParam) -> bool: - """Check if the part is CustomChatCompletionContentSimpleAudioParam type.""" - return isinstance(part, dict) and "audio_url" in part and isinstance( - part["audio_url"], str) + """Check if the part is CustomChatCompletionContentSimpleAudioParam.""" + audio_url = part.get("audio_url") if isinstance(part, dict) else None + return isinstance(audio_url, str) + def _parse_chat_message_content_parts( role: str, @@ -435,19 +441,19 @@ def _parse_chat_message_content_parts( text = _TextParser(part) texts.append(text) elif _is_simple_image_part(part): - mm_parser.parse_image(part["image_url"]) # type: ignore + mm_parser.parse_image(part["image_url"]) # type: ignore has_image = True elif _is_simple_audio_part(part): - mm_parser.parse_audio(part["audio_url"]) # type: ignore + mm_parser.parse_audio(part["audio_url"]) # type: ignore else: # Process not string, CustomChatCompletionContentSimpleImageParam # CustomChatCompletionContentSimpleAudioParam parts. - part_type = part["type"] # type: ignore + part_type = part["type"] # type: ignore if part_type == "text": text = _TextParser(part)["text"] texts.append(text) # This is the logic that distinguish it's a text / image / audio. - + elif part_type == "image_url": image_url = _ImageParser(part)["image_url"] From b8b2060119059b7d5d6737b7ea227519bf1fa98e Mon Sep 17 00:00:00 2001 From: yue Date: Fri, 18 Oct 2024 20:09:03 +0000 Subject: [PATCH 06/10] address comments - will add unittest --- vllm/entrypoints/chat_utils.py | 95 ++++++++++++++++++++-------------- 1 file changed, 57 insertions(+), 38 deletions(-) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 90f0a3c96a923..0a30dbe3d8a75 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -5,8 +5,8 @@ from collections import defaultdict from functools import lru_cache, partial from pathlib import Path -from typing import (Any, Awaitable, Dict, Generic, Iterable, List, Literal, - Mapping, Optional, Tuple, TypeVar, Union, cast) +from typing import (Any, Awaitable, Callable, Dict, Generic, Iterable, List, + Literal, Mapping, Optional, Tuple, TypeVar, Union, cast) # yapf conflicts with isort for this block # yapf: disable @@ -410,17 +410,56 @@ def _get_full_multimodal_text_prompt(placeholder_counts: Dict[str, int], _RefusalParser = partial(cast, ChatCompletionContentPartRefusalParam) MODEL_KEEP_MULTI_MODAL_CONTENT = {'mllama'} +# Define a mapping from part types to their corresponding parsing functions. +MM_PARSER_MAP: Dict[str, Callable[[ChatCompletionContentPartParam], str]] = { + "text": lambda part: _TextParser(part)["text"], + "image_url": lambda part: _ImageParser(part)["image_url"]["url"], + "audio_url": lambda part: _AudioParser(part)["audio_url"]["url"], + "refusal": lambda part: _RefusalParser(part)["refusal"], +} -def _is_simple_image_part(part: ChatCompletionContentPartParam) -> bool: - """Check if the part is CustomChatCompletionContentSimpleImageParam.""" - image_url = part.get("image_url") if isinstance(part, dict) else None - return isinstance(image_url, str) +def _parse_chat_message_content_mm_parts( + part: ChatCompletionContentPartParam) -> Tuple[str, str]: + """ + Parses a given multi modal content part based on its type. + + Args: + part: A dict containing the content part, with a potential 'type' field. + + Returns: + A tuple (part_type, content) where: + - part_type: Type of the part (e.g., 'text', 'image_url'). + - content: Parsed content or an empty string if unsupported. + + Raises: + ValueError: If the 'type' field is missing and no direct URL is found. + """ + part_type = part.get("type", None) # type: ignore + + if part_type in MM_PARSER_MAP: + content = MM_PARSER_MAP[part_type](part) # type: ignore + + # Special case for 'image_url.detail' + if part_type == "image_url" and part.get( # type: ignore + "detail") != "auto": + logger.warning("'image_url.detail' is currently not supported " + "and will be ignored.") -def _is_simple_audio_part(part: ChatCompletionContentPartParam) -> bool: - """Check if the part is CustomChatCompletionContentSimpleAudioParam.""" - audio_url = part.get("audio_url") if isinstance(part, dict) else None - return isinstance(audio_url, str) + return part_type, content # type: ignore + + # Handle missing 'type' but provided direct URL fields. + if part_type is None: + for url_type in ["image_url", "audio_url"]: + if url_type in part and isinstance( + part[url_type], # type: ignore + str): + return url_type, part[url_type] # type: ignore + + # Raise an error if no 'type' or direct URL is found. + raise ValueError("Missing 'type' field in multimodal part.") + + return part_type, "unknown part_type content" # type: ignore def _parse_chat_message_content_parts( @@ -437,40 +476,20 @@ def _parse_chat_message_content_parts( has_image = False for part in parts: - if isinstance(part, str): + if isinstance(part, str): # Handle plain text parts text = _TextParser(part) texts.append(text) - elif _is_simple_image_part(part): - mm_parser.parse_image(part["image_url"]) # type: ignore - has_image = True - elif _is_simple_audio_part(part): - mm_parser.parse_audio(part["audio_url"]) # type: ignore - else: - # Process not string, CustomChatCompletionContentSimpleImageParam - # CustomChatCompletionContentSimpleAudioParam parts. - part_type = part["type"] # type: ignore - if part_type == "text": - text = _TextParser(part)["text"] - texts.append(text) - # This is the logic that distinguish it's a text / image / audio. + else: # Handle structured dictionary parts + assert isinstance(part, dict) + part_type, content = _parse_chat_message_content_mm_parts(part) + if part_type in ["text", "refusal"]: + texts.append(content) elif part_type == "image_url": - image_url = _ImageParser(part)["image_url"] - - if image_url.get("detail", "auto") != "auto": - logger.warning( - "'image_url.detail' is currently not supported and " - "will be ignored.") - - mm_parser.parse_image(image_url["url"]) + mm_parser.parse_image(content) has_image = True elif part_type == "audio_url": - audio_url = _AudioParser(part)["audio_url"] - - mm_parser.parse_audio(audio_url["url"]) - elif part_type == "refusal": - text = _RefusalParser(part)["refusal"] - texts.append(text) + mm_parser.parse_audio(content) else: raise NotImplementedError(f"Unknown part type: {part_type}") From 457c25225190ea60b97a6815a4c1bd4f98c46d7f Mon Sep 17 00:00:00 2001 From: yue Date: Fri, 18 Oct 2024 20:09:53 +0000 Subject: [PATCH 07/10] format --- vllm/entrypoints/chat_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 0a30dbe3d8a75..6c1ad02c73919 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -60,6 +60,7 @@ class CustomChatCompletionContentPartParam(TypedDict, total=False): class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False): """A simpler version of the param that only accepts a plain image_url. + This is supported by OpenAI API, although it is not documented. Example: { From fe2304bc51e2135e1cc3066e61dee63ea8dcd697 Mon Sep 17 00:00:00 2001 From: yue Date: Fri, 18 Oct 2024 21:39:47 +0000 Subject: [PATCH 08/10] add unittest for simple image input format --- tests/entrypoints/test_chat_utils.py | 26 ++++++++++++++++++++++++++ vllm/entrypoints/chat_utils.py | 4 ++-- 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index 6ded5102c9314..1b172f88e204b 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -387,3 +387,29 @@ def test_parse_chat_messages_rejects_too_many_images_across_messages( "text": "What about these two?" }] }], phi3v_model_config, phi3v_tokenizer) + + +def test_parse_chat_messages_multiple_images_uncommon_input( + phi3v_model_config, + phi3v_tokenizer, + image_url, +): + conversation, mm_data = parse_chat_messages([{ + "role": + "user", + "content": [ + "What's in these images?", { + "image_url": image_url + }, { + "image_url": image_url + } + ] + }], phi3v_model_config, phi3v_tokenizer) + + assert conversation == [{ + "role": + "user", + "content": + "<|image_1|>\n<|image_2|>\nWhat's in these images?" + }] + _assert_mm_data_is_image_input(mm_data, 2) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 6c1ad02c73919..5a887483a0faa 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -420,7 +420,7 @@ def _get_full_multimodal_text_prompt(placeholder_counts: Dict[str, int], } -def _parse_chat_message_content_mm_parts( +def _parse_chat_message_content_mm_part( part: ChatCompletionContentPartParam) -> Tuple[str, str]: """ Parses a given multi modal content part based on its type. @@ -482,7 +482,7 @@ def _parse_chat_message_content_parts( texts.append(text) else: # Handle structured dictionary parts assert isinstance(part, dict) - part_type, content = _parse_chat_message_content_mm_parts(part) + part_type, content = _parse_chat_message_content_mm_part(part) if part_type in ["text", "refusal"]: texts.append(content) From a562e35f4725e2d1f4fd1789b9add84f558c1885 Mon Sep 17 00:00:00 2001 From: yue Date: Fri, 18 Oct 2024 22:54:15 +0000 Subject: [PATCH 09/10] address comments - better handle types, remove the ignore types, and use tuple instead of list --- vllm/entrypoints/chat_utils.py | 47 ++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 19 deletions(-) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 5a887483a0faa..b0fd139cda9a6 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -413,10 +413,14 @@ def _get_full_multimodal_text_prompt(placeholder_counts: Dict[str, int], # Define a mapping from part types to their corresponding parsing functions. MM_PARSER_MAP: Dict[str, Callable[[ChatCompletionContentPartParam], str]] = { - "text": lambda part: _TextParser(part)["text"], - "image_url": lambda part: _ImageParser(part)["image_url"]["url"], - "audio_url": lambda part: _AudioParser(part)["audio_url"]["url"], - "refusal": lambda part: _RefusalParser(part)["refusal"], + "text": + lambda part: _TextParser(part).get("text", ""), + "image_url": + lambda part: _ImageParser(part).get("image_url", {}).get("url", ""), + "audio_url": + lambda part: _AudioParser(part).get("audio_url", {}).get("url", ""), + "refusal": + lambda part: _RefusalParser(part).get("refusal", ""), } @@ -431,36 +435,42 @@ def _parse_chat_message_content_mm_part( Returns: A tuple (part_type, content) where: - part_type: Type of the part (e.g., 'text', 'image_url'). - - content: Parsed content or an empty string if unsupported. + - content: Parsed content (e.g., text, image URL). Raises: ValueError: If the 'type' field is missing and no direct URL is found. """ - part_type = part.get("type", None) # type: ignore + assert isinstance( + part, dict) # This is needed to avoid mypy errors: part.get() from str + part_type = part.get("type", None) - if part_type in MM_PARSER_MAP: - content = MM_PARSER_MAP[part_type](part) # type: ignore + if isinstance(part_type, str) and part_type in MM_PARSER_MAP: + content = MM_PARSER_MAP[part_type](part) # Special case for 'image_url.detail' - if part_type == "image_url" and part.get( # type: ignore - "detail") != "auto": + if part_type == "image_url" and part.get("detail") != "auto": logger.warning("'image_url.detail' is currently not supported " "and will be ignored.") - return part_type, content # type: ignore + return part_type, content # Handle missing 'type' but provided direct URL fields. if part_type is None: - for url_type in ["image_url", "audio_url"]: - if url_type in part and isinstance( - part[url_type], # type: ignore - str): - return url_type, part[url_type] # type: ignore + if part.get("image_url") is not None: + image_params = cast(CustomChatCompletionContentSimpleImageParam, + part) + return "image_url", image_params["image_url"] + if part.get("audio_url") is not None: + audio_params = cast(CustomChatCompletionContentSimpleAudioParam, + part) + return "audio_url", audio_params["audio_url"] # Raise an error if no 'type' or direct URL is found. raise ValueError("Missing 'type' field in multimodal part.") - return part_type, "unknown part_type content" # type: ignore + if not isinstance(part_type, str): + raise ValueError("Invalid 'type' field in multimodal part.") + return part_type, "unknown part_type content" def _parse_chat_message_content_parts( @@ -481,10 +491,9 @@ def _parse_chat_message_content_parts( text = _TextParser(part) texts.append(text) else: # Handle structured dictionary parts - assert isinstance(part, dict) part_type, content = _parse_chat_message_content_mm_part(part) - if part_type in ["text", "refusal"]: + if part_type in ("text", "refusal"): texts.append(content) elif part_type == "image_url": mm_parser.parse_image(content) From 3f2ecac1d34bacfd651834de48f1b6fecbf6d22d Mon Sep 17 00:00:00 2001 From: yue Date: Fri, 18 Oct 2024 23:22:08 +0000 Subject: [PATCH 10/10] nits --- vllm/entrypoints/chat_utils.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index b0fd139cda9a6..845ccbf57f5ae 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -459,11 +459,11 @@ def _parse_chat_message_content_mm_part( if part.get("image_url") is not None: image_params = cast(CustomChatCompletionContentSimpleImageParam, part) - return "image_url", image_params["image_url"] + return "image_url", image_params.get("image_url", "") if part.get("audio_url") is not None: audio_params = cast(CustomChatCompletionContentSimpleAudioParam, part) - return "audio_url", audio_params["audio_url"] + return "audio_url", audio_params.get("audio_url", "") # Raise an error if no 'type' or direct URL is found. raise ValueError("Missing 'type' field in multimodal part.") @@ -473,6 +473,10 @@ def _parse_chat_message_content_mm_part( return part_type, "unknown part_type content" +VALID_MESSAGE_CONTENT_MM_PART_TYPES = ("text", "refusal", "image_url", + "audio_url") + + def _parse_chat_message_content_parts( role: str, parts: Iterable[ChatCompletionContentPartParam], @@ -493,6 +497,13 @@ def _parse_chat_message_content_parts( else: # Handle structured dictionary parts part_type, content = _parse_chat_message_content_mm_part(part) + # if part_type is text/refusal/image_url/audio_url but + # content is empty, logg a warning and skip + if part_type in VALID_MESSAGE_CONTENT_MM_PART_TYPES and not content: + logger.warning("Skipping multimodal part " + "with empty / unparsable content.") + continue + if part_type in ("text", "refusal"): texts.append(content) elif part_type == "image_url":