From 25a515fff3c3948e4eb075def735448cf9add148 Mon Sep 17 00:00:00 2001 From: davidp Date: Thu, 1 Feb 2024 17:17:08 +0200 Subject: [PATCH 01/10] added lora params to the request --- vllm/entrypoints/api_server.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py index f7b8d258fae4c..3accc877e729f 100644 --- a/vllm/entrypoints/api_server.py +++ b/vllm/entrypoints/api_server.py @@ -35,13 +35,15 @@ async def generate(request: Request) -> Response: prompt = request_dict.pop("prompt") prefix_pos = request_dict.pop("prefix_pos", None) stream = request_dict.pop("stream", False) + lora_request = request_dict.pop("lora_request", None) sampling_params = SamplingParams(**request_dict) request_id = random_uuid() results_generator = engine.generate(prompt, sampling_params, request_id, - prefix_pos=prefix_pos) + prefix_pos=prefix_pos, + lora_request=lora_request) # Streaming case async def stream_results() -> AsyncGenerator[bytes, None]: From caba7b73d72d69df5152cdc00a2d61a541d3cffe Mon Sep 17 00:00:00 2001 From: davidp Date: Thu, 1 Feb 2024 17:32:13 +0200 Subject: [PATCH 02/10] added lora params to openai api --- vllm/entrypoints/openai/protocol.py | 21 +++++++++++++++++++ vllm/entrypoints/openai/serving_chat.py | 3 ++- vllm/entrypoints/openai/serving_completion.py | 4 +++- 3 files changed, 26 insertions(+), 2 deletions(-) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index fc15b7833ecf2..b6bb6afab204d 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -7,6 +7,7 @@ from vllm.utils import random_uuid from vllm.sampling_params import SamplingParams +from vllm.lora.request import LoRARequest class ErrorResponse(BaseModel): @@ -80,6 +81,16 @@ class ChatCompletionRequest(BaseModel): min_p: Optional[float] = 0.0 include_stop_str_in_output: Optional[bool] = False length_penalty: Optional[float] = 1.0 + lora_request: Optional[dict] = Field(default_factory=dict) + + def to_lora_params(self) -> Union[LoRARequest, None]: + if not self.lora_request: + return None + return LoRARequest( + lora_name=self.lora_request["lora_name"], + lora_int_id=self.lora_request["lora_int_id"], + lora_local_path=self.lora_request["lora_local_path"], + ) def to_sampling_params(self) -> SamplingParams: return SamplingParams( @@ -133,6 +144,16 @@ class CompletionRequest(BaseModel): min_p: Optional[float] = 0.0 include_stop_str_in_output: Optional[bool] = False length_penalty: Optional[float] = 1.0 + lora_request: Optional[dict] = Field(default_factory=dict) + + def to_lora_params(self) -> Union[LoRARequest, None]: + if not self.lora_request: + return None + return LoRARequest( + lora_name=self.lora_request["lora_name"], + lora_int_id=self.lora_request["lora_int_id"], + lora_local_path=self.lora_request["lora_local_path"], + ) def to_sampling_params(self): echo_without_generation = self.echo and self.max_tokens == 0 diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index a9e4c355560b8..5a0fff4a90bdd 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -64,11 +64,12 @@ async def create_chat_completion( token_ids = self._validate_prompt_and_tokenize(request, prompt=prompt) sampling_params = request.to_sampling_params() + lora_params = request.to_lora_params() except ValueError as e: return self.create_error_response(str(e)) result_generator = self.engine.generate(prompt, 
sampling_params, - request_id, token_ids) + request_id, token_ids, lora_request=lora_params) # Streaming response if request.stream: return self.chat_completion_stream_generator( diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 8c9a7ad309cea..552156f8f652b 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -284,6 +284,7 @@ async def create_completion(self, request: CompletionRequest, generators = [] try: sampling_params = request.to_sampling_params() + lora_params = request.to_lora_params() prompt_is_tokens, prompts = parse_prompt_format(request.prompt) for i, prompt in enumerate(prompts): @@ -298,7 +299,8 @@ async def create_completion(self, request: CompletionRequest, self.engine.generate(None, sampling_params, f"{request_id}-{i}", - prompt_token_ids=input_ids)) + prompt_token_ids=input_ids, + lora_request=lora_params)) except ValueError as e: return self.create_error_response(str(e)) From 10854727c492b46c048c3fe598e606ebc6818b3a Mon Sep 17 00:00:00 2001 From: davidp Date: Thu, 1 Feb 2024 18:08:12 +0200 Subject: [PATCH 03/10] fixed api request --- vllm/entrypoints/api_server.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py index 3accc877e729f..f2d96e1f1c2a2 100644 --- a/vllm/entrypoints/api_server.py +++ b/vllm/entrypoints/api_server.py @@ -9,6 +9,7 @@ from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.sampling_params import SamplingParams +from vllm.lora.request import LoRARequest from vllm.utils import random_uuid TIMEOUT_KEEP_ALIVE = 5 # seconds. @@ -36,6 +37,7 @@ async def generate(request: Request) -> Response: prefix_pos = request_dict.pop("prefix_pos", None) stream = request_dict.pop("stream", False) lora_request = request_dict.pop("lora_request", None) + lora_params = LoRARequest(**lora_request) if lora_request else None sampling_params = SamplingParams(**request_dict) request_id = random_uuid() @@ -43,7 +45,7 @@ async def generate(request: Request) -> Response: sampling_params, request_id, prefix_pos=prefix_pos, - lora_request=lora_request) + lora_request=lora_params) # Streaming case async def stream_results() -> AsyncGenerator[bytes, None]: From f26ae8fa8b2577860dc2c2794e98fea43fccc064 Mon Sep 17 00:00:00 2001 From: davidp Date: Thu, 4 Apr 2024 11:22:15 +0300 Subject: [PATCH 04/10] slight fix to api request --- vllm/entrypoints/openai/protocol.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index b6bb6afab204d..66352deb08074 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -86,11 +86,7 @@ class ChatCompletionRequest(BaseModel): def to_lora_params(self) -> Union[LoRARequest, None]: if not self.lora_request: return None - return LoRARequest( - lora_name=self.lora_request["lora_name"], - lora_int_id=self.lora_request["lora_int_id"], - lora_local_path=self.lora_request["lora_local_path"], - ) + return LoRARequest(**self.lora_request) def to_sampling_params(self) -> SamplingParams: return SamplingParams( @@ -149,11 +145,7 @@ class CompletionRequest(BaseModel): def to_lora_params(self) -> Union[LoRARequest, None]: if not self.lora_request: return None - return LoRARequest( - lora_name=self.lora_request["lora_name"], - 
lora_int_id=self.lora_request["lora_int_id"], - lora_local_path=self.lora_request["lora_local_path"], - ) + return LoRARequest(**self.lora_request) def to_sampling_params(self): echo_without_generation = self.echo and self.max_tokens == 0 From ce28f5c5298fe9d1568f9bfa5f72685e37602d8b Mon Sep 17 00:00:00 2001 From: davidp Date: Thu, 4 Apr 2024 14:46:06 +0300 Subject: [PATCH 05/10] updated to support loading new adapters in a request --- vllm/entrypoints/openai/serving_engine.py | 25 ++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 9dbd1750e631a..60e88b59122bc 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -3,6 +3,8 @@ from dataclasses import dataclass from http import HTTPStatus from typing import Dict, List, Optional, Union +import os +import hashlib from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, @@ -23,6 +25,14 @@ class LoRA: local_path: str +def positive_hash_sha256(input_string): + """ + function to generate positive hash from input string, which is used to identify the model variant for lora + sha-256 is used to keep it consistent between python versions and the sheets addon + """ + return int(hashlib.sha256(input_string.encode('utf-8')).hexdigest(), 16) % (2 ** 63) + + class OpenAIServing: def __init__(self, @@ -154,11 +164,24 @@ async def _check_model(self, request) -> Optional[ErrorResponse]: def _maybe_get_lora(self, request) -> Optional[LoRARequest]: if request.model == self.served_model: return + + # if this lora adapter was already encountered, use it. otherwise, load a new adapter from disk for lora in self.lora_requests: if request.model == lora.lora_name: return lora + + if request.lora_request and os.path.exists(request.lora_request.lora_local_path): + lora_int_id = positive_hash_sha256(request.model) + new_lora = LoRARequest( + lora_name=request.model, + lora_int_id=lora_int_id, + lora_local_path=request.lora_request.lora_local_path, + ) + self.lora_requests.append(new_lora) + return new_lora + # if _check_model has been called earlier, this will be unreachable - raise ValueError("The model `{request.model}` does not exist.") + raise ValueError(f"The model `{request.model}` does not exist.") def _validate_prompt_and_tokenize( self, From 7e074a91cac51df5b403f0e19996dc6b51955e47 Mon Sep 17 00:00:00 2001 From: davidp Date: Thu, 4 Apr 2024 15:09:14 +0300 Subject: [PATCH 06/10] updated to support loading new adapters in a request --- vllm/entrypoints/openai/serving_engine.py | 5 +---- vllm/lora/request.py | 15 ++++++++++++++- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 60e88b59122bc..9e952e48f4bee 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -4,7 +4,6 @@ from http import HTTPStatus from typing import Dict, List, Optional, Union import os -import hashlib from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, @@ -171,11 +170,9 @@ def _maybe_get_lora(self, request) -> Optional[LoRARequest]: return lora if request.lora_request and os.path.exists(request.lora_request.lora_local_path): - lora_int_id = positive_hash_sha256(request.model) new_lora = LoRARequest( lora_name=request.model, - 
lora_int_id=lora_int_id, - lora_local_path=request.lora_request.lora_local_path, + lora_local_path=request.lora_request.lora_local_path ) self.lora_requests.append(new_lora) return new_lora diff --git a/vllm/lora/request.py b/vllm/lora/request.py index bbbf4880ab81b..394c42c63a5c4 100644 --- a/vllm/lora/request.py +++ b/vllm/lora/request.py @@ -1,4 +1,14 @@ from dataclasses import dataclass +from typing import Optional +import hashlib + + +def positive_hash_sha256(input_string): + """ + function to generate positive hash from input string, which is used to identify the model variant for lora + sha-256 is used to keep it consistent between python versions and the sheets addon + """ + return int(hashlib.sha256(input_string.encode('utf-8')).hexdigest(), 16) % (2 ** 63) @dataclass @@ -16,10 +26,13 @@ class LoRARequest: """ lora_name: str - lora_int_id: int lora_local_path: str + lora_int_id: Optional[int] = 0 def __post_init__(self): + # if no int_id was given, use the name hash as id + if not self.lora_int_id: + self.lora_int_id = positive_hash_sha256(self.lora_name) if self.lora_int_id < 1: raise ValueError( f"lora_int_id must be > 0, got {self.lora_int_id}") From ba8374ddfd8afde7cf97f2702957701a1ae80208 Mon Sep 17 00:00:00 2001 From: davidp Date: Thu, 4 Apr 2024 15:10:04 +0300 Subject: [PATCH 07/10] updated docs --- docs/source/models/lora.rst | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/docs/source/models/lora.rst b/docs/source/models/lora.rst index 2278640481a91..4679ca2037c60 100644 --- a/docs/source/models/lora.rst +++ b/docs/source/models/lora.rst @@ -102,3 +102,18 @@ The following is an example request "max_tokens": 7, "temperature": 0 }' | jq + + +Alternatively, the request can specify a LoRA adapter to load dynamically from the server's local disk storage: + +.. 
code-block:: bash + + curl http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "sql-lora", + "prompt": "San Francisco is a", + "max_tokens": 7, + "temperature": 0, + "lora_request": {"lora_name":"sql-lora","lora_local_path":"/data/adapters/sql-lora"} + }' | jq \ No newline at end of file From 35a50f891ed80db65f31e79fcc6be7c8c6bd564c Mon Sep 17 00:00:00 2001 From: davidp Date: Thu, 4 Apr 2024 16:27:07 +0300 Subject: [PATCH 08/10] updated check model --- vllm/entrypoints/openai/serving_engine.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 9e952e48f4bee..84e8483404774 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -155,6 +155,8 @@ async def _check_model(self, request) -> Optional[ErrorResponse]: return if request.model in [lora.lora_name for lora in self.lora_requests]: return + elif request.lora_request and os.path.exists(request.lora_request.lora_local_path): + return return self.create_error_response( message=f"The model `{request.model}` does not exist.", err_type="NotFoundError", From 1547877d2f0075962b7937a483bbf8c420187915 Mon Sep 17 00:00:00 2001 From: davidp Date: Thu, 4 Apr 2024 16:32:33 +0300 Subject: [PATCH 09/10] removed redundant hash func --- vllm/entrypoints/openai/serving_engine.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 84e8483404774..3c3f157911856 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -24,14 +24,6 @@ class LoRA: local_path: str -def positive_hash_sha256(input_string): - """ - function to generate positive hash from input string, which is used to identify the model variant for lora - sha-256 is used to keep it consistent between python versions and the sheets addon - """ - return int(hashlib.sha256(input_string.encode('utf-8')).hexdigest(), 16) % (2 ** 63) - - class OpenAIServing: def __init__(self, From ffa0b0a2ea5f196e1a29e30544471296590bb8d3 Mon Sep 17 00:00:00 2001 From: davidp Date: Thu, 4 Apr 2024 16:46:28 +0300 Subject: [PATCH 10/10] small bugfix in dict fetch --- vllm/entrypoints/openai/serving_engine.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 3c3f157911856..7f7dc9544fcc6 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -147,7 +147,7 @@ async def _check_model(self, request) -> Optional[ErrorResponse]: return if request.model in [lora.lora_name for lora in self.lora_requests]: return - elif request.lora_request and os.path.exists(request.lora_request.lora_local_path): + elif request.lora_request and os.path.exists(request.lora_request.get("lora_local_path")): return return self.create_error_response( message=f"The model `{request.model}` does not exist.", @@ -163,10 +163,10 @@ def _maybe_get_lora(self, request) -> Optional[LoRARequest]: if request.model == lora.lora_name: return lora - if request.lora_request and os.path.exists(request.lora_request.lora_local_path): + if request.lora_request and os.path.exists(request.lora_request.get("lora_local_path")): new_lora = LoRARequest( lora_name=request.model, - lora_local_path=request.lora_request.lora_local_path + 
lora_local_path=request.lora_request.get("lora_local_path") ) self.lora_requests.append(new_lora) return new_lora
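
The series above threads a ``lora_request`` field through both the plain ``/generate`` endpoint and the OpenAI-compatible servers. As an end-to-end illustration, the following sketch sends a completion request carrying that field; the host/port, adapter name, and local path are placeholder values, the server is assumed to be running with LoRA support enabled, and the adapter directory must already exist on the server's disk for ``_maybe_get_lora`` to load it.

.. code-block:: python

    # Minimal client sketch against the OpenAI-compatible completions endpoint.
    # The host/port, adapter name, and lora_local_path are illustrative values;
    # the path must point at an adapter directory on the *server's* filesystem.
    import requests

    response = requests.post(
        "http://localhost:8000/v1/completions",
        json={
            "model": "sql-lora",              # resolved by _maybe_get_lora
            "prompt": "San Francisco is a",
            "max_tokens": 7,
            "temperature": 0,
            # Field added by this series: on first use the server builds a
            # LoRARequest from it and caches it in self.lora_requests.
            "lora_request": {
                "lora_name": "sql-lora",
                "lora_local_path": "/data/adapters/sql-lora",
            },
        },
        timeout=60,
    )
    print(response.json()["choices"][0]["text"])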
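
Patch 06 makes ``lora_int_id`` optional on ``LoRARequest`` and, when it is omitted, derives it in ``__post_init__`` from a SHA-256 hash of the adapter name, so clients only need to supply a name and a local path. The standalone sketch below reproduces the helper added to ``vllm/lora/request.py`` and shows that the derived ID is positive and stable across processes and Python versions:

.. code-block:: python

    import hashlib

    def positive_hash_sha256(input_string: str) -> int:
        # Same construction as the helper added in this series: a positive,
        # Python-version-stable integer (below 2**63) derived from SHA-256.
        return int(hashlib.sha256(input_string.encode("utf-8")).hexdigest(),
                   16) % (2 ** 63)

    # Two requests naming the same adapter map to the same lora_int_id, so the
    # engine treats them as one LoRA variant across calls and server restarts.
    assert positive_hash_sha256("sql-lora") == positive_hash_sha256("sql-lora")
    print(positive_hash_sha256("sql-lora"))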