Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Frontend] Add OpenAI Vision API Support #5237

Merged
merged 54 commits into main from gpt4v-fe
Jun 7, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
Show all changes
54 commits
Select commit Hold shift + click to select a range
8ba11d4
initial
ywang96 Jun 2, 2024
d361d20
iterate
ywang96 Jun 3, 2024
fd5aba5
Merge branch 'main' into gpt4v-fe
ywang96 Jun 3, 2024
730cda7
iterate
ywang96 Jun 3, 2024
1c0b89d
iterate
ywang96 Jun 4, 2024
520f5a0
iterate
ywang96 Jun 4, 2024
3a57a6d
iterate
ywang96 Jun 4, 2024
31b941b
adding test
ywang96 Jun 4, 2024
9b3cf48
iterate
ywang96 Jun 4, 2024
af94f8c
docstring
ywang96 Jun 4, 2024
332dd10
remove unused lib
ywang96 Jun 4, 2024
d52a907
revert hardcoded chat template
ywang96 Jun 4, 2024
58746fc
address feedback
ywang96 Jun 4, 2024
99d9197
update pytestmark
ywang96 Jun 4, 2024
0b65271
apply asyncio mark
ywang96 Jun 4, 2024
3a965d9
update doc
ywang96 Jun 4, 2024
f9b9707
update test
ywang96 Jun 5, 2024
04ebbf7
minor doc update
ywang96 Jun 5, 2024
0cdd54f
minor doc update
ywang96 Jun 5, 2024
82a0052
Clarify experiment support
ywang96 Jun 5, 2024
dd01246
note regarding prompt format when using API server
ywang96 Jun 5, 2024
e40da86
Merge branch 'main' into gpt4v-fe
ywang96 Jun 5, 2024
088ad81
fix typo
ywang96 Jun 5, 2024
daa7085
update template
ywang96 Jun 5, 2024
1b32e2f
revert and update token count
ywang96 Jun 5, 2024
c45b34e
update template
ywang96 Jun 5, 2024
d6c1322
update
ywang96 Jun 5, 2024
05fe635
update
ywang96 Jun 5, 2024
938e5c9
template format
ywang96 Jun 5, 2024
b9318bc
correct and add test for multi image
ywang96 Jun 5, 2024
199ced7
fix test
ywang96 Jun 5, 2024
9e686e0
Add unit test for `fetch_image`
DarkLight1337 Jun 5, 2024
d9fbb17
Apply formatter
DarkLight1337 Jun 5, 2024
2833ba0
address feedback
ywang96 Jun 6, 2024
6c365bd
fix notes
ywang96 Jun 6, 2024
26c38f1
use aiohttp
ywang96 Jun 6, 2024
734e50b
fix test
ywang96 Jun 6, 2024
0cd2931
test
ywang96 Jun 6, 2024
561f07f
fix test
ywang96 Jun 6, 2024
481fea8
update test
ywang96 Jun 6, 2024
9585cc6
update fixture
ywang96 Jun 6, 2024
7f9500d
fix field
ywang96 Jun 6, 2024
32d1a25
fix field
ywang96 Jun 6, 2024
1e665b7
format
ywang96 Jun 6, 2024
cce804e
fix image loading
ywang96 Jun 6, 2024
31b219c
revert change that merges fetch and parse
ywang96 Jun 6, 2024
dcf8c8d
add encoded image fixture
ywang96 Jun 6, 2024
a9a9712
Merge branch 'main' into gpt4v-fe
ywang96 Jun 6, 2024
89a452a
update fetch image and remove unused fixture
ywang96 Jun 6, 2024
4e3eca9
cleanup
ywang96 Jun 6, 2024
afadfac
fix fixture
ywang96 Jun 6, 2024
d3bae73
remove unused client close
ywang96 Jun 6, 2024
a149368
add TODO and format
ywang96 Jun 6, 2024
72d4bc4
address comment
ywang96 Jun 7, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 11 additions & 7 deletions tests/entrypoints/test_openai_vision.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
from pathlib import Path
from typing import Dict

import openai
import pytest
import pytest_asyncio
import ray
from PIL import Image

from vllm.multimodal.utils import ImageFetchAiohttp, encode_image_base64

Expand Down Expand Up @@ -62,9 +62,12 @@ def client():


@pytest_asyncio.fixture(scope="session")
async def base64_encoded_image(image_url: str) -> Image.Image:
return encode_image_base64(
await ImageFetchAiohttp.fetch_image(image_url=image_url))
async def base64_encoded_image() -> Dict[str, str]:
return {
image_url:
encode_image_base64(await ImageFetchAiohttp.fetch_image(image_url))
for image_url in TEST_IMAGE_URLS
}


@pytest.mark.asyncio
Expand Down Expand Up @@ -123,8 +126,8 @@ async def test_single_chat_session_image(server, client: openai.AsyncOpenAI,
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_single_chat_session_image_base64encoded(
server, client: openai.AsyncOpenAI, model_name: str,
base64_encoded_image: str):
server, client: openai.AsyncOpenAI, model_name: str, image_url: str,
base64_encoded_image: Dict[str, str]):

messages = [{
"role":
Expand All @@ -133,7 +136,8 @@ async def test_single_chat_session_image_base64encoded(
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{base64_encoded_image}"
"url":
f"data:image/jpeg;base64,{base64_encoded_image[image_url]}"
}
},
{
Expand Down
10 changes: 5 additions & 5 deletions tests/multimodal/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,8 @@ async def test_fetch_image_base64(url_images: Dict[str, Image.Image],
base64_image = base64.b64encode(f.read()).decode("utf-8")
data_url = f"data:{mime_type};base64,{base64_image}"

async with ImageFetchAiohttp.fetch_image(data_url) as data_image:
if _image_equals(url_image, Image.open(f)):
assert _image_equals(url_image, await data_image)
else:
pass # Lossy format; only check that image can be opened
data_image = await ImageFetchAiohttp.fetch_image(data_url)
if _image_equals(url_image, Image.open(f)):
assert _image_equals(url_image, data_image)
else:
pass # Lossy format; only check that image can be opened
27 changes: 13 additions & 14 deletions vllm/multimodal/utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import asyncio
import base64
from io import BytesIO
from typing import Optional, Union
Expand All @@ -23,30 +24,26 @@ def get_aiohttp_client(cls) -> aiohttp.ClientSession:

return cls.aiohttp_client

@classmethod
async def close_aiohttp_client(cls) -> None:
if cls.aiohttp_client:
await cls.aiohttp_client.close()
cls.aiohttp_client = None

@classmethod
async def fetch_image(cls, image_url: str) -> Image.Image:
"""Load image from a url or base64 encoded openai GPT4V format"""
"""Load PIL image from a url or base64 encoded openai GPT4V format"""

# Avoid circular import
from vllm import __version__ as VLLM_VERSION
if image_url.startswith('http'):
# Avoid circular import
from vllm import __version__ as VLLM_VERSION

client = cls.get_aiohttp_client()
headers = {"User-Agent": f"vLLM/{VLLM_VERSION}"}
client = cls.get_aiohttp_client()
headers = {"User-Agent": f"vLLM/{VLLM_VERSION}"}

if image_url.startswith('http'):
async with client.get(url=image_url, headers=headers) as response:
response.raise_for_status()
image_raw = await response.read()
image = Image.open(BytesIO(image_raw))

elif image_url.startswith('data:image'):
image = load_image_from_base64(image_url.split(',')[1])
loop = asyncio.get_event_loop()
image = await loop.run_in_executor(None, load_image_from_base64,
image_url.split(',')[1])
ywang96 marked this conversation as resolved.
Show resolved Hide resolved

else:
raise ValueError("Invalid image url: A valid image url must start "
Expand Down Expand Up @@ -75,10 +72,12 @@ def load_image_from_base64(image: Union[bytes, str]) -> Image.Image:
return Image.open(BytesIO(base64.b64decode(image)))


# TODO(ywang96): move this to a model registry for preprocessing vision
# language prompts based on the model type.
def get_full_image_text_prompt(image_prompt: str, text_prompt: str,
config: ModelConfig) -> str:
"""Combine image and text prompts for vision language model depending on
the model architecture."""
the model architecture."""

if config.hf_config.model_type == "llava":
full_prompt = f"{image_prompt}\n{text_prompt}"
Expand Down
Loading