vllm-project · ywang96 · Jun 7, 2024 · Jun 2, 2024 · Jun 3, 2024 · Jun 3, 2024
@@ -1,10 +1,10 @@
 from pathlib import Path
+from typing import Dict
 
 import openai
 import pytest
 import pytest_asyncio
 import ray
-from PIL import Image
 
 from vllm.multimodal.utils import ImageFetchAiohttp, encode_image_base64
 
@@ -62,9 +62,12 @@ def client():
 
 
 @pytest_asyncio.fixture(scope="session")
-async def base64_encoded_image(image_url: str) -> Image.Image:
-    return encode_image_base64(
-        await ImageFetchAiohttp.fetch_image(image_url=image_url))
+async def base64_encoded_image() -> Dict[str, str]:
+    return {
+        image_url:
+        encode_image_base64(await ImageFetchAiohttp.fetch_image(image_url))
+        for image_url in TEST_IMAGE_URLS
+    }
 
 
 @pytest.mark.asyncio
@@ -123,8 +126,8 @@ async def test_single_chat_session_image(server, client: openai.AsyncOpenAI,
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
 async def test_single_chat_session_image_base64encoded(
-        server, client: openai.AsyncOpenAI, model_name: str,
-        base64_encoded_image: str):
+        server, client: openai.AsyncOpenAI, model_name: str, image_url: str,
+        base64_encoded_image: Dict[str, str]):
 
     messages = [{
         "role":
@@ -133,7 +136,8 @@ async def test_single_chat_session_image_base64encoded(
             {
                 "type": "image_url",
                 "image_url": {
-                    "url": f"data:image/jpeg;base64,{base64_encoded_image}"
+                    "url":
+                    f"data:image/jpeg;base64,{base64_encoded_image[image_url]}"
                 }
             },
             {

@@ -68,8 +68,8 @@ async def test_fetch_image_base64(url_images: Dict[str, Image.Image],
         base64_image = base64.b64encode(f.read()).decode("utf-8")
         data_url = f"data:{mime_type};base64,{base64_image}"
 
-        async with ImageFetchAiohttp.fetch_image(data_url) as data_image:
-            if _image_equals(url_image, Image.open(f)):
-                assert _image_equals(url_image, await data_image)
-            else:
-                pass  # Lossy format; only check that image can be opened
+        data_image = await ImageFetchAiohttp.fetch_image(data_url)
+        if _image_equals(url_image, Image.open(f)):
+            assert _image_equals(url_image, data_image)
+        else:
+            pass  # Lossy format; only check that image can be opened
diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
@@ -1,3 +1,4 @@
+import asyncio
 import base64
 from io import BytesIO
 from typing import Optional, Union
@@ -23,30 +24,26 @@ def get_aiohttp_client(cls) -> aiohttp.ClientSession:
 
         return cls.aiohttp_client
 
-    @classmethod
-    async def close_aiohttp_client(cls) -> None:
-        if cls.aiohttp_client:
-            await cls.aiohttp_client.close()
-            cls.aiohttp_client = None
-
     @classmethod
     async def fetch_image(cls, image_url: str) -> Image.Image:
-        """Load image from a url or base64 encoded openai GPT4V format"""
+        """Load PIL image from a url or base64 encoded openai GPT4V format"""
 
-        # Avoid circular import
-        from vllm import __version__ as VLLM_VERSION
+        if image_url.startswith('http'):
+            # Avoid circular import
+            from vllm import __version__ as VLLM_VERSION
 
-        client = cls.get_aiohttp_client()
-        headers = {"User-Agent": f"vLLM/{VLLM_VERSION}"}
+            client = cls.get_aiohttp_client()
+            headers = {"User-Agent": f"vLLM/{VLLM_VERSION}"}
 
-        if image_url.startswith('http'):
             async with client.get(url=image_url, headers=headers) as response:
                 response.raise_for_status()
                 image_raw = await response.read()
             image = Image.open(BytesIO(image_raw))
 
         elif image_url.startswith('data:image'):
-            image = load_image_from_base64(image_url.split(',')[1])
+            loop = asyncio.get_event_loop()
+            image = await loop.run_in_executor(None, load_image_from_base64,
+                                               image_url.split(',')[1])
 
         else:
             raise ValueError("Invalid image url: A valid image url must start "
@@ -75,10 +72,12 @@ def load_image_from_base64(image: Union[bytes, str]) -> Image.Image:
     return Image.open(BytesIO(base64.b64decode(image)))
 
 
+# TODO(ywang96): Move this to a model registry that preprocesses
+# vision-language prompts based on the model type.
 def get_full_image_text_prompt(image_prompt: str, text_prompt: str,
                                config: ModelConfig) -> str:
     """Combine image and text prompts for vision language model depending on
-    the  model architecture."""
+    the model architecture."""
 
     if config.hf_config.model_type == "llava":
         full_prompt = f"{image_prompt}\n{text_prompt}"