From 25a515fff3c3948e4eb075def735448cf9add148 Mon Sep 17 00:00:00 2001 From: davidp Date: Thu, 1 Feb 2024 17:17:08 +0200 Subject: [PATCH 01/10] added lora params to the request --- vllm/entrypoints/api_server.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py index f7b8d258fae4c..3accc877e729f 100644 --- a/vllm/entrypoints/api_server.py +++ b/vllm/entrypoints/api_server.py @@ -35,13 +35,15 @@ async def generate(request: Request) -> Response: prompt = request_dict.pop("prompt") prefix_pos = request_dict.pop("prefix_pos", None) stream = request_dict.pop("stream", False) + lora_request = request_dict.pop("lora_request", None) sampling_params = SamplingParams(**request_dict) request_id = random_uuid() results_generator = engine.generate(prompt, sampling_params, request_id, - prefix_pos=prefix_pos) + prefix_pos=prefix_pos, + lora_request=lora_request) # Streaming case async def stream_results() -> AsyncGenerator[bytes, None]: From caba7b73d72d69df5152cdc00a2d61a541d3cffe Mon Sep 17 00:00:00 2001 From: davidp Date: Thu, 1 Feb 2024 17:32:13 +0200 Subject: [PATCH 02/10] added lora params to openai api --- vllm/entrypoints/openai/protocol.py | 21 +++++++++++++++++++ vllm/entrypoints/openai/serving_chat.py | 3 ++- vllm/entrypoints/openai/serving_completion.py | 4 +++- 3 files changed, 26 insertions(+), 2 deletions(-) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index fc15b7833ecf2..b6bb6afab204d 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -7,6 +7,7 @@ from vllm.utils import random_uuid from vllm.sampling_params import SamplingParams +from vllm.lora.request import LoRARequest class ErrorResponse(BaseModel): @@ -80,6 +81,16 @@ class ChatCompletionRequest(BaseModel): min_p: Optional[float] = 0.0 include_stop_str_in_output: Optional[bool] = False length_penalty: Optional[float] = 1.0 + lora_request: Optional[dict] = Field(default_factory=dict) + + def to_lora_params(self) -> Union[LoRARequest, None]: + if not self.lora_request: + return None + return LoRARequest( + lora_name=self.lora_request["lora_name"], + lora_int_id=self.lora_request["lora_int_id"], + lora_local_path=self.lora_request["lora_local_path"], + ) def to_sampling_params(self) -> SamplingParams: return SamplingParams( @@ -133,6 +144,16 @@ class CompletionRequest(BaseModel): min_p: Optional[float] = 0.0 include_stop_str_in_output: Optional[bool] = False length_penalty: Optional[float] = 1.0 + lora_request: Optional[dict] = Field(default_factory=dict) + + def to_lora_params(self) -> Union[LoRARequest, None]: + if not self.lora_request: + return None + return LoRARequest( + lora_name=self.lora_request["lora_name"], + lora_int_id=self.lora_request["lora_int_id"], + lora_local_path=self.lora_request["lora_local_path"], + ) def to_sampling_params(self): echo_without_generation = self.echo and self.max_tokens == 0 diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index a9e4c355560b8..5a0fff4a90bdd 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -64,11 +64,12 @@ async def create_chat_completion( token_ids = self._validate_prompt_and_tokenize(request, prompt=prompt) sampling_params = request.to_sampling_params() + lora_params = request.to_lora_params() except ValueError as e: return self.create_error_response(str(e)) result_generator = self.engine.generate(prompt, 
sampling_params, - request_id, token_ids) + request_id, token_ids, lora_request=lora_params) # Streaming response if request.stream: return self.chat_completion_stream_generator( diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 8c9a7ad309cea..552156f8f652b 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -284,6 +284,7 @@ async def create_completion(self, request: CompletionRequest, generators = [] try: sampling_params = request.to_sampling_params() + lora_params = request.to_lora_params() prompt_is_tokens, prompts = parse_prompt_format(request.prompt) for i, prompt in enumerate(prompts): @@ -298,7 +299,8 @@ async def create_completion(self, request: CompletionRequest, self.engine.generate(None, sampling_params, f"{request_id}-{i}", - prompt_token_ids=input_ids)) + prompt_token_ids=input_ids, + lora_request=lora_params)) except ValueError as e: return self.create_error_response(str(e)) From 10854727c492b46c048c3fe598e606ebc6818b3a Mon Sep 17 00:00:00 2001 From: davidp Date: Thu, 1 Feb 2024 18:08:12 +0200 Subject: [PATCH 03/10] fixed api request --- vllm/entrypoints/api_server.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py index 3accc877e729f..f2d96e1f1c2a2 100644 --- a/vllm/entrypoints/api_server.py +++ b/vllm/entrypoints/api_server.py @@ -9,6 +9,7 @@ from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.sampling_params import SamplingParams +from vllm.lora.request import LoRARequest from vllm.utils import random_uuid TIMEOUT_KEEP_ALIVE = 5 # seconds. @@ -36,6 +37,7 @@ async def generate(request: Request) -> Response: prefix_pos = request_dict.pop("prefix_pos", None) stream = request_dict.pop("stream", False) lora_request = request_dict.pop("lora_request", None) + lora_params = LoRARequest(**lora_request) if lora_request else None sampling_params = SamplingParams(**request_dict) request_id = random_uuid() @@ -43,7 +45,7 @@ async def generate(request: Request) -> Response: sampling_params, request_id, prefix_pos=prefix_pos, - lora_request=lora_request) + lora_request=lora_params) # Streaming case async def stream_results() -> AsyncGenerator[bytes, None]: From f26ae8fa8b2577860dc2c2794e98fea43fccc064 Mon Sep 17 00:00:00 2001 From: davidp Date: Thu, 4 Apr 2024 11:22:15 +0300 Subject: [PATCH 04/10] slight fix to api request --- vllm/entrypoints/openai/protocol.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index b6bb6afab204d..66352deb08074 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -86,11 +86,7 @@ class ChatCompletionRequest(BaseModel): def to_lora_params(self) -> Union[LoRARequest, None]: if not self.lora_request: return None - return LoRARequest( - lora_name=self.lora_request["lora_name"], - lora_int_id=self.lora_request["lora_int_id"], - lora_local_path=self.lora_request["lora_local_path"], - ) + return LoRARequest(**self.lora_request) def to_sampling_params(self) -> SamplingParams: return SamplingParams( @@ -149,11 +145,7 @@ class CompletionRequest(BaseModel): def to_lora_params(self) -> Union[LoRARequest, None]: if not self.lora_request: return None - return LoRARequest( - lora_name=self.lora_request["lora_name"], - 
lora_int_id=self.lora_request["lora_int_id"], - lora_local_path=self.lora_request["lora_local_path"], - ) + return LoRARequest(**self.lora_request) def to_sampling_params(self): echo_without_generation = self.echo and self.max_tokens == 0 From ce28f5c5298fe9d1568f9bfa5f72685e37602d8b Mon Sep 17 00:00:00 2001 From: davidp Date: Thu, 4 Apr 2024 14:46:06 +0300 Subject: [PATCH 05/10] updated to support loading new adapters in a request --- vllm/entrypoints/openai/serving_engine.py | 25 ++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 9dbd1750e631a..60e88b59122bc 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -3,6 +3,8 @@ from dataclasses import dataclass from http import HTTPStatus from typing import Dict, List, Optional, Union +import os +import hashlib from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, @@ -23,6 +25,14 @@ class LoRA: local_path: str +def positive_hash_sha256(input_string): + """ + function to generate positive hash from input string, which is used to identify the model variant for lora + sha-256 is used to keep it consistent between python versions and the sheets addon + """ + return int(hashlib.sha256(input_string.encode('utf-8')).hexdigest(), 16) % (2 ** 63) + + class OpenAIServing: def __init__(self, @@ -154,11 +164,24 @@ async def _check_model(self, request) -> Optional[ErrorResponse]: def _maybe_get_lora(self, request) -> Optional[LoRARequest]: if request.model == self.served_model: return + + # if this lora adapter was already encountered, use it. otherwise, load a new adapter from disk for lora in self.lora_requests: if request.model == lora.lora_name: return lora + + if request.lora_request and os.path.exists(request.lora_request.lora_local_path): + lora_int_id = positive_hash_sha256(request.model) + new_lora = LoRARequest( + lora_name=request.model, + lora_int_id=lora_int_id, + lora_local_path=request.lora_request.lora_local_path, + ) + self.lora_requests.append(new_lora) + return new_lora + # if _check_model has been called earlier, this will be unreachable - raise ValueError("The model `{request.model}` does not exist.") + raise ValueError(f"The model `{request.model}` does not exist.") def _validate_prompt_and_tokenize( self, From 7e074a91cac51df5b403f0e19996dc6b51955e47 Mon Sep 17 00:00:00 2001 From: davidp Date: Thu, 4 Apr 2024 15:09:14 +0300 Subject: [PATCH 06/10] updated to support loading new adapters in a request --- vllm/entrypoints/openai/serving_engine.py | 5 +---- vllm/lora/request.py | 15 ++++++++++++++- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 60e88b59122bc..9e952e48f4bee 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -4,7 +4,6 @@ from http import HTTPStatus from typing import Dict, List, Optional, Union import os -import hashlib from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, @@ -171,11 +170,9 @@ def _maybe_get_lora(self, request) -> Optional[LoRARequest]: return lora if request.lora_request and os.path.exists(request.lora_request.lora_local_path): - lora_int_id = positive_hash_sha256(request.model) new_lora = LoRARequest( lora_name=request.model, - 
lora_int_id=lora_int_id, - lora_local_path=request.lora_request.lora_local_path, + lora_local_path=request.lora_request.lora_local_path ) self.lora_requests.append(new_lora) return new_lora diff --git a/vllm/lora/request.py b/vllm/lora/request.py index bbbf4880ab81b..394c42c63a5c4 100644 --- a/vllm/lora/request.py +++ b/vllm/lora/request.py @@ -1,4 +1,14 @@ from dataclasses import dataclass +from typing import Optional +import hashlib + + +def positive_hash_sha256(input_string): + """ + function to generate positive hash from input string, which is used to identify the model variant for lora + sha-256 is used to keep it consistent between python versions and the sheets addon + """ + return int(hashlib.sha256(input_string.encode('utf-8')).hexdigest(), 16) % (2 ** 63) @dataclass @@ -16,10 +26,13 @@ class LoRARequest: """ lora_name: str - lora_int_id: int lora_local_path: str + lora_int_id: Optional[int] = 0 def __post_init__(self): + # if no int_id was given, use the name hash as id + if not self.lora_int_id: + self.lora_int_id = positive_hash_sha256(self.lora_name) if self.lora_int_id < 1: raise ValueError( f"lora_int_id must be > 0, got {self.lora_int_id}") From ba8374ddfd8afde7cf97f2702957701a1ae80208 Mon Sep 17 00:00:00 2001 From: davidp Date: Thu, 4 Apr 2024 15:10:04 +0300 Subject: [PATCH 07/10] updated docs --- docs/source/models/lora.rst | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/docs/source/models/lora.rst b/docs/source/models/lora.rst index 2278640481a91..4679ca2037c60 100644 --- a/docs/source/models/lora.rst +++ b/docs/source/models/lora.rst @@ -102,3 +102,18 @@ The following is an example request "max_tokens": 7, "temperature": 0 }' | jq + + +Alternatively, the request can specify a LoRA adapter to load dynamically from the server's local disk storage: + +.. 
code-block:: bash + + curl http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "sql-lora", + "prompt": "San Francisco is a", + "max_tokens": 7, + "temperature": 0, + "lora_request": {"lora_name":"sql-lora","lora_local_path":"/data/adapters/sql-lora"} + }' | jq \ No newline at end of file From 35a50f891ed80db65f31e79fcc6be7c8c6bd564c Mon Sep 17 00:00:00 2001 From: davidp Date: Thu, 4 Apr 2024 16:27:07 +0300 Subject: [PATCH 08/10] updated check model --- vllm/entrypoints/openai/serving_engine.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 9e952e48f4bee..84e8483404774 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -155,6 +155,8 @@ async def _check_model(self, request) -> Optional[ErrorResponse]: return if request.model in [lora.lora_name for lora in self.lora_requests]: return + elif request.lora_request and os.path.exists(request.lora_request.lora_local_path): + return return self.create_error_response( message=f"The model `{request.model}` does not exist.", err_type="NotFoundError", From 1547877d2f0075962b7937a483bbf8c420187915 Mon Sep 17 00:00:00 2001 From: davidp Date: Thu, 4 Apr 2024 16:32:33 +0300 Subject: [PATCH 09/10] removed redundant hash func --- vllm/entrypoints/openai/serving_engine.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 84e8483404774..3c3f157911856 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -24,14 +24,6 @@ class LoRA: local_path: str -def positive_hash_sha256(input_string): - """ - function to generate positive hash from input string, which is used to identify the model variant for lora - sha-256 is used to keep it consistent between python versions and the sheets addon - """ - return int(hashlib.sha256(input_string.encode('utf-8')).hexdigest(), 16) % (2 ** 63) - - class OpenAIServing: def __init__(self, From ffa0b0a2ea5f196e1a29e30544471296590bb8d3 Mon Sep 17 00:00:00 2001 From: davidp Date: Thu, 4 Apr 2024 16:46:28 +0300 Subject: [PATCH 10/10] small bugfix in dict fetch --- vllm/entrypoints/openai/serving_engine.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 3c3f157911856..7f7dc9544fcc6 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -147,7 +147,7 @@ async def _check_model(self, request) -> Optional[ErrorResponse]: return if request.model in [lora.lora_name for lora in self.lora_requests]: return - elif request.lora_request and os.path.exists(request.lora_request.lora_local_path): + elif request.lora_request and os.path.exists(request.lora_request.get("lora_local_path")): return return self.create_error_response( message=f"The model `{request.model}` does not exist.", @@ -163,10 +163,10 @@ def _maybe_get_lora(self, request) -> Optional[LoRARequest]: if request.model == lora.lora_name: return lora - if request.lora_request and os.path.exists(request.lora_request.lora_local_path): + if request.lora_request and os.path.exists(request.lora_request.get("lora_local_path")): new_lora = LoRARequest( lora_name=request.model, - lora_local_path=request.lora_request.lora_local_path + 
lora_local_path=request.lora_request.get("lora_local_path") ) self.lora_requests.append(new_lora) return new_lora
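
The series above threads a ``lora_request`` field through both the plain ``/generate`` endpoint and the OpenAI-compatible servers. As an end-to-end illustration, the following sketch sends a completion request carrying that field; the host/port, adapter name, and local path are placeholder values, the server is assumed to be running with LoRA support enabled, and the adapter directory must already exist on the server's disk for ``_maybe_get_lora`` to load it.

.. code-block:: python

    # Minimal client sketch against the OpenAI-compatible completions endpoint.
    # The host/port, adapter name, and lora_local_path are illustrative values;
    # the path must point at an adapter directory on the *server's* filesystem.
    import requests

    response = requests.post(
        "http://localhost:8000/v1/completions",
        json={
            "model": "sql-lora",              # resolved by _maybe_get_lora
            "prompt": "San Francisco is a",
            "max_tokens": 7,
            "temperature": 0,
            # Field added by this series: on first use the server builds a
            # LoRARequest from it and caches it in self.lora_requests.
            "lora_request": {
                "lora_name": "sql-lora",
                "lora_local_path": "/data/adapters/sql-lora",
            },
        },
        timeout=60,
    )
    print(response.json()["choices"][0]["text"])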
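
Patch 06 makes ``lora_int_id`` optional on ``LoRARequest`` and, when it is omitted, derives it in ``__post_init__`` from a SHA-256 hash of the adapter name, so clients only need to supply a name and a local path. The standalone sketch below reproduces the helper added to ``vllm/lora/request.py`` and shows that the derived ID is positive and stable across processes and Python versions:

.. code-block:: python

    import hashlib

    def positive_hash_sha256(input_string: str) -> int:
        # Same construction as the helper added in this series: a positive,
        # Python-version-stable integer (below 2**63) derived from SHA-256.
        return int(hashlib.sha256(input_string.encode("utf-8")).hexdigest(),
                   16) % (2 ** 63)

    # Two requests naming the same adapter map to the same lora_int_id, so the
    # engine treats them as one LoRA variant across calls and server restarts.
    assert positive_hash_sha256("sql-lora") == positive_hash_sha256("sql-lora")
    print(positive_hash_sha256("sql-lora"))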