FEAT: support qwenvl2 vllm engine #2428

Merged: 9 commits, Oct 12, 2024
Changes from 8 commits
21 changes: 9 additions & 12 deletions xinference/model/llm/llm_family.json
@@ -6909,18 +6909,15 @@
"model_id":"Qwen/Qwen2-VL-72B-Instruct-GPTQ-{quantization}"
}
],
"prompt_style":{
"style_name":"QWEN",
"system_prompt":"You are a helpful assistant",
"roles":[
"user",
"assistant"
],
"stop": [
"<|im_end|>",
"<|endoftext|>"
]
}
"chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
"stop_token_ids": [
151645,
151643
],
"stop": [
"<|im_end|>",
"<|endoftext|>"
]
},
{
"version": 1,
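For reviewers of the new `chat_template`: below is a minimal sketch (not part of this PR) of rendering the template locally with jinja2 to sanity-check the vision placeholders. It assumes `llm_family.json` is a top-level list of family entries and that the Qwen2-VL entry is named `qwen2-vl-instruct`; adjust the lookup if the name differs.

```python
import json

from jinja2 import Template

# Assumption: llm_family.json is a top-level list and the Qwen2-VL entry
# uses model_name "qwen2-vl-instruct".
with open("xinference/model/llm/llm_family.json") as f:
    families = json.load(f)
fam = next(x for x in families if x.get("model_name") == "qwen2-vl-instruct")

template = Template(fam["chat_template"])

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "file:///tmp/cat.png"},
            {"type": "text", "text": "Describe this image."},
        ],
    }
]

prompt = template.render(messages=messages, add_generation_prompt=True, add_vision_id=False)
print(prompt)
# Roughly expected:
# <|im_start|>system
# You are a helpful assistant.<|im_end|>
# <|im_start|>user
# <|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|>
# <|im_start|>assistant
```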
17 changes: 9 additions & 8 deletions xinference/model/llm/llm_family_modelscope.json
@@ -4627,14 +4627,15 @@
"model_hub": "modelscope"
}
],
"prompt_style": {
"style_name": "QWEN",
"system_prompt": "You are a helpful assistant",
"roles": [
"user",
"assistant"
]
}
"chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
"stop_token_ids": [
151645,
151643
],
"stop": [
"<|im_end|>",
"<|endoftext|>"
]
},
{
"version": 1,
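The same template and stop settings are mirrored here for the modelscope hub. As an optional check (assuming hub access and that the smaller instruct checkpoint shares the Qwen2 tokenizer), the new `stop_token_ids` can be verified against the `stop` strings:

```python
from transformers import AutoTokenizer

# Assumption: the 7B Qwen2-VL instruct checkpoint shares the same
# special-token ids as the 72B GPTQ variants listed in this family.
tok = AutoTokenizer.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")

print(tok.convert_tokens_to_ids("<|im_end|>"))     # expected 151645
print(tok.convert_tokens_to_ids("<|endoftext|>"))  # expected 151643
```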
3 changes: 2 additions & 1 deletion xinference/model/llm/transformers/cogvlm2.py
@@ -29,7 +29,7 @@
parse_messages,
)
from .core import PytorchChatModel, PytorchGenerateConfig
from .utils import get_max_src_len
from .utils import cache_clean, get_max_src_len

logger = logging.getLogger(__name__)

@@ -176,6 +176,7 @@ def get_query_and_history(
query = content
return query, image, history

@cache_clean
def chat(
self,
messages: List[Dict],
2 changes: 2 additions & 0 deletions xinference/model/llm/transformers/cogvlm2_video.py
@@ -28,6 +28,7 @@
parse_messages,
)
from .core import PytorchChatModel, PytorchGenerateConfig
from .utils import cache_clean

logger = logging.getLogger(__name__)

@@ -227,6 +228,7 @@ def get_query_and_history(

return query, image, video, history

@cache_clean
def chat(
self,
messages: List[Dict],
2 changes: 2 additions & 0 deletions xinference/model/llm/transformers/deepseek_vl.py
@@ -28,6 +28,7 @@
from ..llm_family import LLMFamilyV1, LLMSpecV1
from ..utils import generate_chat_completion, generate_completion_chunk
from .core import PytorchChatModel, PytorchGenerateConfig
from .utils import cache_clean

logger = logging.getLogger(__name__)

@@ -137,6 +138,7 @@ def _fill_placeholder(_url, _index):
return "".join(new_content), images
return content, []

@cache_clean
def chat(
self,
messages: List[Dict],
3 changes: 2 additions & 1 deletion xinference/model/llm/transformers/glm4v.py
@@ -26,7 +26,7 @@
from ..llm_family import LLMFamilyV1, LLMSpecV1
from ..utils import _decode_image, generate_chat_completion, generate_completion_chunk
from .core import PytorchChatModel, PytorchGenerateConfig
from .utils import get_max_src_len
from .utils import cache_clean, get_max_src_len

logger = logging.getLogger(__name__)

@@ -129,6 +129,7 @@ def _get_processed_msgs(messages: List[Dict]) -> List[Dict]:
res.append({"role": role, "content": text})
return res

@cache_clean
def chat(
self,
messages: List[Dict],
2 changes: 2 additions & 0 deletions xinference/model/llm/transformers/intern_vl.py
@@ -27,6 +27,7 @@
parse_messages,
)
from .core import PytorchChatModel, PytorchGenerateConfig
from .utils import cache_clean

logger = logging.getLogger(__name__)

@@ -326,6 +327,7 @@ def load(self, **kwargs):
use_fast=False,
)

@cache_clean
def chat(
self,
messages: List[Dict],
2 changes: 2 additions & 0 deletions xinference/model/llm/transformers/minicpmv25.py
@@ -29,6 +29,7 @@
parse_messages,
)
from .core import PytorchChatModel, PytorchGenerateConfig
from .utils import cache_clean

logger = logging.getLogger(__name__)

@@ -119,6 +120,7 @@ def _message_content_to_chat(self, content):
raise RuntimeError("Only one image per message is supported")
return content, []

@cache_clean
def chat(
self,
messages: List[Dict],
2 changes: 2 additions & 0 deletions xinference/model/llm/transformers/minicpmv26.py
@@ -30,6 +30,7 @@
parse_messages,
)
from .core import PytorchChatModel, PytorchGenerateConfig
from .utils import cache_clean

logger = logging.getLogger(__name__)

@@ -198,6 +199,7 @@ def _convert_to_specific_style(self, messages: List[Dict]) -> Tuple:
msgs.append({"role": "user", "content": images_chat + [content]})
return msgs, video_existed

@cache_clean
def chat(
self,
messages: List[Dict],
2 changes: 2 additions & 0 deletions xinference/model/llm/transformers/omnilmm.py
@@ -24,6 +24,7 @@
from ..llm_family import LLMFamilyV1, LLMSpecV1
from ..utils import generate_chat_completion, parse_messages
from .core import PytorchChatModel, PytorchGenerateConfig
from .utils import cache_clean

logger = logging.getLogger(__name__)

@@ -87,6 +88,7 @@ def _ensure_url(_url)
return images, other_content
return [], [{"type": "text", "text": content}]

@cache_clean
def chat(
self,
messages: List[Dict],
15 changes: 11 additions & 4 deletions xinference/model/llm/transformers/qwen2_audio.py
@@ -14,16 +14,22 @@
import logging
import uuid
from io import BytesIO
from typing import Dict, Iterator, List, Optional, Union
from typing import Iterator, List, Optional, Union
from urllib.request import urlopen

import numpy as np

from ....model.utils import select_device
from ....types import ChatCompletion, ChatCompletionChunk, CompletionChunk
from ....types import (
ChatCompletion,
ChatCompletionChunk,
ChatCompletionMessage,
CompletionChunk,
)
from ..llm_family import LLMFamilyV1, LLMSpecV1
from ..utils import generate_chat_completion, generate_completion_chunk
from .core import PytorchChatModel, PytorchGenerateConfig
from .utils import cache_clean

logger = logging.getLogger(__name__)

@@ -68,7 +74,7 @@ def load(self):

def _transform_messages(
self,
messages: List[Dict],
messages: List[ChatCompletionMessage],
):
import librosa

@@ -89,9 +95,10 @@

return text, audios

@cache_clean
def chat(
self,
messages: List[Dict],
messages: List[ChatCompletionMessage],
generate_config: Optional[PytorchGenerateConfig] = None,
) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
text, audios = self._transform_messages(messages)
30 changes: 2 additions & 28 deletions xinference/model/llm/transformers/qwen2_vl.py
@@ -27,6 +27,7 @@
from ..llm_family import LLMFamilyV1, LLMSpecV1
from ..utils import generate_chat_completion, generate_completion_chunk
from .core import PytorchChatModel, PytorchGenerateConfig
from .utils import cache_clean

logger = logging.getLogger(__name__)

@@ -75,34 +76,7 @@ def load(self):
self.model_path, device_map=device, trust_remote_code=True
).eval()

def _transform_messages(
self,
messages: List[ChatCompletionMessage],
):
transformed_messages = []
for msg in messages:
new_content = []
role = msg["role"]
content = msg["content"]
if isinstance(content, str):
new_content.append({"type": "text", "text": content})
elif isinstance(content, List):
for item in content: # type: ignore
if "text" in item:
new_content.append({"type": "text", "text": item["text"]})
elif "image_url" in item:
new_content.append(
{"type": "image", "image": item["image_url"]["url"]}
)
elif "video_url" in item:
new_content.append(
{"type": "video", "video": item["video_url"]["url"]}
)
new_message = {"role": role, "content": new_content}
transformed_messages.append(new_message)

return transformed_messages

@cache_clean
def chat(
self,
messages: List[ChatCompletionMessage], # type: ignore
3 changes: 2 additions & 1 deletion xinference/model/llm/transformers/qwen_vl.py
@@ -28,7 +28,7 @@
from ..llm_family import LLMFamilyV1, LLMSpecV1
from ..utils import generate_chat_completion, generate_completion_chunk
from .core import PytorchChatModel, PytorchGenerateConfig
from .utils import pad_prefill_tokens
from .utils import cache_clean, pad_prefill_tokens

logger = logging.getLogger(__name__)

@@ -137,6 +137,7 @@ def _get_prompt_and_chat_history(self, messages: List[Dict]):
prompt = self._message_content_to_qwen(messages[-1]["content"])
return prompt, qwen_history

@cache_clean
def chat(
self,
messages: List[Dict],
34 changes: 33 additions & 1 deletion xinference/model/llm/transformers/utils.py
@@ -11,7 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import asyncio
import functools
import gc
import logging
import os
@@ -777,3 +778,34 @@ def batch_inference_one_step(
for r in req_list:
r.stopped = True
r.error_msg = str(e)


def cache_clean(fn):
@functools.wraps(fn)
async def _async_wrapper(self, *args, **kwargs):
import gc

from ....device_utils import empty_cache

result = await fn(self, *args, **kwargs)

gc.collect()
empty_cache()
return result

@functools.wraps(fn)
def _wrapper(self, *args, **kwargs):
import gc

from ....device_utils import empty_cache

result = fn(self, *args, **kwargs)

gc.collect()
empty_cache()
return result

if asyncio.iscoroutinefunction(fn):
return _async_wrapper
else:
return _wrapper
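A minimal usage sketch of the new decorator (hypothetical class, not from this PR): `cache_clean` dispatches on `asyncio.iscoroutinefunction`, so the same decorator covers both sync and async `chat` implementations, running `gc.collect()` and `empty_cache()` after each call.

```python
# Hypothetical example, assumed to live inside xinference.model.llm.transformers.
from .utils import cache_clean


class DummyVisionModel:
    @cache_clean
    def chat(self, messages, generate_config=None):
        # ... run generation ...
        return {"choices": []}

    @cache_clean
    async def async_chat(self, messages, generate_config=None):
        # ... run generation ...
        return {"choices": []}
```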
2 changes: 2 additions & 0 deletions xinference/model/llm/transformers/yi_vl.py
@@ -29,6 +29,7 @@
parse_messages,
)
from .core import PytorchChatModel, PytorchGenerateConfig
from .utils import cache_clean

logger = logging.getLogger(__name__)

@@ -99,6 +100,7 @@ def _message_content_to_yi(content) -> Union[str, tuple]:
raise RuntimeError("Only one image per message is supported by Yi VL.")
return content

@cache_clean
def chat(
self,
messages: List[Dict],
Expand Down