From 6cb540b42dae27bcd1c7ee3665a065f0e7a2dfb4 Mon Sep 17 00:00:00 2001 From: sang Date: Wed, 24 Apr 2024 08:40:12 -0700 Subject: [PATCH 1/3] done --- format.sh | 2 +- vllm/lora/layers.py | 35 ++++++++++++-------- vllm/lora/lora.py | 28 +++++++++------- vllm/lora/models.py | 66 +++++++++++++++++++------------------ vllm/lora/worker_manager.py | 21 +++++++----- vllm/worker/model_runner.py | 4 +-- 6 files changed, 88 insertions(+), 68 deletions(-) diff --git a/format.sh b/format.sh index bd2e9e89e1806..4ac1842daef0a 100755 --- a/format.sh +++ b/format.sh @@ -106,7 +106,7 @@ mypy vllm/engine --config-file pyproject.toml mypy vllm/worker --config-file pyproject.toml mypy vllm/spec_decode --config-file pyproject.toml mypy vllm/model_executor/*.py --config-file pyproject.toml -# mypy vllm/lora/*.py --config-file pyproject.toml +mypy vllm/lora --config-file pyproject.toml CODESPELL_EXCLUDES=( diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index aac86351b15e1..98e74168002c4 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -176,6 +176,8 @@ class VocabParallelEmbeddingWithLoRA(BaseLayerWithLoRA): def __init__(self, base_layer: VocabParallelEmbedding) -> None: super().__init__() self.base_layer = base_layer + self.embeddings_slice: Optional[Tuple[int, int]] + self.embeddings_weights: Optional[torch.Tensor] def create_lora_weights( self, @@ -233,9 +235,10 @@ def create_lora_weights( self.lora_a_stacked.shape[0] * self.lora_a_stacked.shape[1], self.lora_a_stacked.shape[2], ) - self.indices: Optional[torch.Tensor] = None - self.indices_len: Optional[List[int]] = None - self.embeddings_indices = None + # Lazily initialized. + self.indices: torch.Tensor + self.indices_len: List[int] + self.embeddings_indices: torch.Tensor def reset_lora(self, index: int): self.lora_a_stacked[index] = 0 @@ -267,6 +270,7 @@ def set_lora( self.embeddings_tensors.shape[1], self.embeddings_tensors.shape[2] )[self.embeddings_slice[0]:self.embeddings_slice[1]] + assert self.embeddings_weights is not None self.embeddings_weights[:embeddings.shape[0]].copy_(embeddings) def set_mapping( @@ -343,11 +347,12 @@ def create_lora_weights( dtype=lora_config.lora_dtype, device=self.device, ) - - self.indices: Optional[torch.Tensor] = None - self.indices_len: Optional[List[int]] = None self.output_dim = self.lora_b_stacked.shape[2] + # lazily initialized. + self.indices: torch.Tensor + self.indices_len: List[int] + def reset_lora(self, index: int): self.lora_a_stacked[index] = 0 self.lora_b_stacked[index] = 0 @@ -475,8 +480,9 @@ def create_lora_weights( device=self.device, ) for _ in range(n_slices)) - self.indices: Optional[torch.Tensor] = None self.output_dim = self.lora_b_stacked[0].shape[2] + # Lazily initialized. + self.indices: torch.Tensor def reset_lora(self, index: int): self.lora_a_stacked[0][index] = 0 @@ -690,7 +696,8 @@ def create_lora_weights( self.kv_proj_shard_size) self.packed_indices: Optional[torch.Tensor] = None self.standard_indices: Optional[torch.Tensor] = None - self.indices_len: Optional[List[int]] = None + # lazily initialized. 
+ self.indices_len: List[int] def reset_lora(self, index: int): self.lora_a_stacked[0][index] = 0 @@ -814,8 +821,9 @@ def create_lora_weights( dtype=lora_config.lora_dtype, device=self.device, ) - self.indices: Optional[torch.Tensor] = None - self.indices_len: Optional[List[int]] = None + # Lazily initialized + self.indices: torch.Tensor + self.indices_len: List[int] def reset_lora(self, index: int): self.lora_a_stacked[index] = 0 @@ -991,9 +999,10 @@ def create_lora_weights( dtype=self.dtype, device=self.device, ) - self.indices = None - self.indices_padded = None - self.indices_len = None + # Lazily initialized. + self.indices: torch.Tensor + self.indices_len: List[int] + self.indices_padded: torch.Tensor def reset_lora(self, index: int): self.lora_a_stacked[index] = 0 diff --git a/vllm/lora/lora.py b/vllm/lora/lora.py index fefad16700fe3..d7794aa7cd35c 100644 --- a/vllm/lora/lora.py +++ b/vllm/lora/lora.py @@ -97,9 +97,9 @@ def __init__( self, module_name: str, rank: int, - lora_alphas: List[int], - lora_a: List[torch.Tensor], - lora_b: List[torch.Tensor], + lora_alphas: List[Optional[int]], + lora_a: List[Optional[torch.Tensor]], + lora_b: List[Optional[torch.Tensor]], scaling: Optional[List[float]] = None, ) -> None: super().__init__( @@ -108,17 +108,20 @@ def __init__( lora_alpha=0, lora_a=lora_a, lora_b=lora_b, - scaling=scaling, + scaling=scaling, # type: ignore embeddings_tensor=None, ) self.lora_alphas = lora_alphas if scaling is None: - self.scaling = [ - lora_alpha / self.rank for lora_alpha in self.lora_alphas + self.scaling = [ # type: ignore + lora_alpha / self.rank # type: ignore # noqa + for lora_alpha in self.lora_alphas ] @classmethod - def pack(cls, loras: List["LoRALayerWeights"]) -> "PackedLoRALayerWeights": + def pack( + cls, loras: List[Optional["LoRALayerWeights"]] + ) -> "PackedLoRALayerWeights": """Pack a list of LoRAs into a single LoRA. If LoRA is None, it signifies that the submodule does not have a LoRA. @@ -136,16 +139,19 @@ def pack(cls, loras: List["LoRALayerWeights"]) -> "PackedLoRALayerWeights": [lora.lora_alpha if lora is not None else None for lora in loras], [lora.lora_a if lora is not None else None for lora in loras], [lora.lora_b if lora is not None else None for lora in loras], - scaling=[1 if lora is not None else None for lora in loras]) + scaling=[ + 1 if lora is not None else None # type: ignore + for lora in loras + ]) return obj def optimize(self) -> "PackedLoRALayerWeights": """Optimize the LoRA by merging the scaling into lora_b.""" for i in range(len(self.lora_b)): - if self.scaling[i] == 1 or self.lora_b[i] is None: + if self.scaling[i] == 1 or self.lora_b[i] is None: # type: ignore continue - self.lora_b[i] *= self.scaling[i] - self.scaling[i] = 1 + self.lora_b[i] *= self.scaling[i] # type: ignore + self.scaling[i] = 1 # type: ignore return self @property diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 6bb9fee27d535..e89fc2502d12f 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -3,7 +3,7 @@ import math import os import re -from typing import Callable, Dict, Hashable, List, Optional, Tuple, Type +from typing import Callable, Dict, List, Optional, Tuple, Type import safetensors.torch import torch @@ -53,44 +53,46 @@ def convert_mapping( embeddings. indices_len: List of lengths of the above tensors. 
""" - indices = list(mapping.index_mapping).copy() - embedding_indices = indices.copy() - lora_indices = indices.copy() - prompt_mapping = [ + index_mapping_indices: List[int] = list(mapping.index_mapping).copy() + embedding_indices = index_mapping_indices.copy() + lora_indices = index_mapping_indices.copy() + prompt_mapping: List[int] = [ lora_index_to_id.index(x) if x > 0 else -1 for x in mapping.prompt_mapping ] lora_idx = None - for i in range(len(indices)): + for i in range(len(index_mapping_indices)): # TODO index can be slow. optimize - lora_idx = (lora_index_to_id.index(indices[i]) - if indices[i] > 0 else -1) - embedding_indices[i] = lora_idx if indices[i] > 0 else 0 - indices[i] = i + lora_idx = (lora_index_to_id.index(index_mapping_indices[i]) + if index_mapping_indices[i] > 0 else -1) + embedding_indices[i] = lora_idx if index_mapping_indices[i] > 0 else 0 + index_mapping_indices[i] = i lora_indices[i] = lora_idx - indices = torch.tensor([indices, lora_indices, embedding_indices], - dtype=torch.long, - device="cuda") - prompt_mapping = torch.tensor(prompt_mapping, - device="cuda", - dtype=torch.long) + indices = torch.tensor( + [index_mapping_indices, lora_indices, embedding_indices], + dtype=torch.long, + device="cuda") + prompt_mapping_tensor = torch.tensor(prompt_mapping, + device="cuda", + dtype=torch.long) embeddings_indices = torch.stack([ indices[2] * extra_vocab_size, indices[2] * (vocab_size + extra_vocab_size) ]) embeddings_indices[embeddings_indices == -1] = max_loras - 1 base_indices = indices[1] - sampler_indices = prompt_mapping + sampler_indices = prompt_mapping_tensor sampler_indices_padded = sampler_indices.clone() sampler_indices_padded[sampler_indices_padded == -1] = max_loras - 1 sampler_indices_padded = ( torch.arange( 0, len(sampler_indices_padded), device="cuda", dtype=torch.long) + (sampler_indices_padded * len(sampler_indices_padded))) - indices_len = (base_indices.shape[-1], sampler_indices.shape[-1], - sampler_indices_padded.shape[-1], - embeddings_indices.shape[-1]) + indices_len = [ + base_indices.shape[-1], sampler_indices.shape[-1], + sampler_indices_padded.shape[-1], embeddings_indices.shape[-1] + ] return (base_indices, sampler_indices, sampler_indices_padded, embeddings_indices, indices_len) @@ -149,6 +151,7 @@ def from_lora_tensors( if module_name not in loras: lora_embeddings_tensor = None if embeddings: + assert embedding_modules is not None embeddings_module = next( (k for k in embedding_modules if k in module_name), None) @@ -171,6 +174,7 @@ def from_lora_tensors( else: loras[module_name].lora_b = tensor.to(device=device, dtype=dtype).t() + assert embedding_padding_modules is not None if any(name in module_name for name in embedding_padding_modules ) and target_embedding_padding is not None: @@ -295,11 +299,10 @@ def __init__( self.max_num_batched_tokens, dtype=torch.long, device="cuda") - self.offsets = [] # 4 is the number of indicies tensors defined above # base_indices, sampler_indices, sampler_indices_padded, # embeddings_indices - self.indices_len = [None] * 4 + self.indices_len: List[Optional[int]] = [None] * 4 self.model: nn.Module = model if hasattr(self.model, "supported_lora_modules"): @@ -312,7 +315,7 @@ def __init__( self._registered_loras: Dict[int, LoRAModel] = {} # Dict instead of a Set for compatibility with LRUCache. 
self._active_loras: Dict[int, None] = {} - self._last_mapping = None + self._last_mapping: Optional[LoRAMapping] = None self._create_lora_modules() self.model.lora_manager = self @@ -362,15 +365,13 @@ def _deactivate_lora(self, lora_id: int): except ValueError: pass - def deactivate_lora(self, lora_id: int) -> bool: + def deactivate_lora(self, lora_id: int): """Remove a LoRA from a GPU buffer.""" if lora_id in self._active_loras: self._deactivate_lora(lora_id) self._active_loras.pop(lora_id) - return True - return False - def _add_lora(self, lora: LoRAModel) -> bool: + def _add_lora(self, lora: LoRAModel): self._create_merged_loras_inplace(lora) self._registered_loras[lora.id] = lora @@ -418,7 +419,7 @@ def list_loras(self) -> Dict[int, LoRAModel]: def get_lora(self, lora_id: int) -> Optional[LoRAModel]: return self._registered_loras.get(lora_id, None) - def remove_all_loras(self) -> bool: + def remove_all_loras(self): """Remove all LoRAModels from the manager.""" self._registered_loras.clear() self.lora_index_to_id = [None] * self.lora_slots @@ -467,6 +468,7 @@ def create_dummy_lora( continue parts = module_name.split(".") if module_name not in self.packed_modules: + assert embedding_modules is not None if parts[-1] in embedding_modules: input_dim = (module.base_layer.org_vocab_size + self.lora_config.lora_extra_vocab_size if @@ -500,7 +502,7 @@ def create_dummy_lora( else: parts = module_name.split(".") replacements = self.packed_modules_mapping[parts[-1]] - subloras = [] + subloras: List[Optional["LoRALayerWeights"]] = [] for i, r in enumerate(replacements): lora = LoRALayerWeights.create_dummy_lora_weights( module_name + "." + r, @@ -538,7 +540,7 @@ def _register_packed_modules(self, module_full_name: str) -> None: def _create_merged_loras_inplace(self, lora_model: LoRAModel) -> None: for module_name, new_module_names in self.packed_modules.items(): - replacement_loras = [] + replacement_loras: List[Optional[LoRALayerWeights]] = [] has_replacement = False for r in new_module_names: lora = lora_model.get_lora(r) @@ -557,12 +559,12 @@ def _create_merged_loras_inplace(self, lora_model: LoRAModel) -> None: class LoRALRUCache(LRUCache[LoRAModel]): - def __init__(self, capacity: int, deactivate_lora_fn: Callable[[Hashable], + def __init__(self, capacity: int, deactivate_lora_fn: Callable[[int], None]): super().__init__(capacity) self.deactivate_lora_fn = deactivate_lora_fn - def _on_remove(self, key: Hashable, value: LoRAModel): + def _on_remove(self, key: int, value: LoRAModel): logger.debug(f"Removing LoRA. int id: {key}") self.deactivate_lora_fn(key) return super()._on_remove(key, value) diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py index 5356b79537b05..ec3c10c591a18 100644 --- a/vllm/lora/worker_manager.py +++ b/vllm/lora/worker_manager.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod, abstractproperty -from typing import Any, Dict, List, Optional, Set, Type +from typing import Any, Dict, List, Set, Type import torch @@ -37,7 +37,7 @@ def create_lora_manager( ... @abstractmethod - def set_active_loras(self, lora_requests: List[LoRARequest], + def set_active_loras(self, lora_requests: Set[LoRARequest], lora_mapping: LoRAMapping) -> None: ... @@ -54,7 +54,7 @@ def remove_lora(self, lora_id: int) -> bool: ... @abstractmethod - def remove_all_loras(self) -> bool: + def remove_all_loras(self): ... 
@abstractmethod @@ -81,10 +81,11 @@ def __init__( embedding_padding_modules: List[str], lora_model_cls: Type[LoRAModel] = LoRAModel, ): - self._lora_manager: Optional[LoRAModelManager] = None self._lora_model_cls = lora_model_cls self.embedding_modules = embedding_modules self.embedding_padding_modules = embedding_padding_modules + # Lazily initialized by create_lora_manager. + self._lora_manager: LoRAModelManager super().__init__(max_num_seqs, max_num_batched_tokens, vocab_size, lora_config, device) @@ -104,7 +105,7 @@ def create_lora_manager( lora_config=self.lora_config, lora_manager_cls=self._lora_manager_cls, ) - self._lora_manager: LoRAModelManager = lora_manager + self._lora_manager = lora_manager return lora_manager.model def set_active_loras(self, lora_requests: Set[LoRARequest], @@ -188,7 +189,7 @@ def add_lora(self, lora_request: LoRARequest) -> bool: def remove_lora(self, lora_id: int) -> bool: return self._lora_manager.remove_lora(lora_id) - def remove_all_loras(self) -> bool: + def remove_all_loras(self): self._lora_manager.remove_all_loras() def list_loras(self) -> Set[int]: @@ -217,10 +218,10 @@ def create_lora_manager( lora_config=self.lora_config, max_num_batched_tokens=self.max_num_batched_tokens, ) - self._lora_manager: LRUCacheLoRAModelManager = lora_manager + self._lora_manager = lora_manager return lora_manager.model - def _apply_loras(self, lora_requests: List[LoRARequest]) -> None: + def _apply_loras(self, lora_requests: Set[LoRARequest]) -> None: loras_map = { lora_request.lora_int_id: lora_request for lora_request in lora_requests if lora_request @@ -237,12 +238,14 @@ def add_lora(self, lora_request: LoRARequest) -> bool: if lora_request.lora_int_id not in self.list_loras(): # Remove before we load the new lora to save memory if len(self._lora_manager) + 1 > self._lora_manager.capacity: + assert isinstance(self._lora_manager, LRUCacheLoRAModelManager) self._lora_manager.remove_oldest_lora() lora = self._load_lora(lora_request) loaded = self._lora_manager.add_lora(lora) else: # If the lora is already loaded, just touch it to # update its position in the caches - loaded = self._lora_manager.get_lora(lora_request.lora_int_id) + loaded = self._lora_manager.get_lora( + lora_request.lora_int_id) is not None self._lora_manager.activate_lora(lora_request.lora_int_id) return loaded diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 31e08789dfd1f..33dbf8d90c35d 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -928,10 +928,10 @@ def profile_run(self) -> None: torch.cuda.synchronize() return - def remove_all_loras(self) -> bool: + def remove_all_loras(self): if not self.lora_manager: raise RuntimeError("LoRA is not enabled.") - return self.lora_manager.remove_all_loras() + self.lora_manager.remove_all_loras() def set_active_loras(self, lora_requests: Set[LoRARequest], lora_mapping: LoRAMapping) -> None: From 707193043acfd32f79baf115d271db393b933d61 Mon Sep 17 00:00:00 2001 From: sang Date: Wed, 24 Apr 2024 08:43:48 -0700 Subject: [PATCH 2/3] update mypy yaml --- .github/workflows/mypy.yaml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml index 9f1855696e20a..089c7d18ad6f2 100644 --- a/.github/workflows/mypy.yaml +++ b/.github/workflows/mypy.yaml @@ -33,8 +33,6 @@ jobs: - name: Mypy run: | mypy vllm/attention --config-file pyproject.toml - # TODO(sang): Fix nested dir - mypy vllm/core/*.py --follow-imports=skip --config-file pyproject.toml 
mypy vllm/distributed --config-file pyproject.toml mypy vllm/entrypoints --config-file pyproject.toml mypy vllm/executor --config-file pyproject.toml @@ -44,8 +42,9 @@ jobs: mypy vllm/engine --config-file pyproject.toml mypy vllm/worker --config-file pyproject.toml mypy vllm/spec_decode --config-file pyproject.toml + mypy vllm/lora --config-file pyproject.toml + # TODO(sang): Fix nested dir mypy vllm/model_executor/*.py --config-file pyproject.toml - # TODO(sang): Fix nested dir - # mypy vllm/lora/*.py --config-file pyproject.toml + mypy vllm/core/*.py --follow-imports=skip --config-file pyproject.toml From 08afd2037550429ccd4490c71d435fbc631e654b Mon Sep 17 00:00:00 2001 From: sang Date: Wed, 24 Apr 2024 21:52:29 -0700 Subject: [PATCH 3/3] test passing --- vllm/lora/models.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/lora/models.py b/vllm/lora/models.py index e89fc2502d12f..c249497a4d893 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -365,11 +365,13 @@ def _deactivate_lora(self, lora_id: int): except ValueError: pass - def deactivate_lora(self, lora_id: int): + def deactivate_lora(self, lora_id: int) -> bool: """Remove a LoRA from a GPU buffer.""" if lora_id in self._active_loras: self._deactivate_lora(lora_id) self._active_loras.pop(lora_id) + return True + return False def _add_lora(self, lora: LoRAModel): self._create_merged_loras_inplace(lora) @@ -560,7 +562,7 @@ def _create_merged_loras_inplace(self, lora_model: LoRAModel) -> None: class LoRALRUCache(LRUCache[LoRAModel]): def __init__(self, capacity: int, deactivate_lora_fn: Callable[[int], - None]): + bool]): super().__init__(capacity) self.deactivate_lora_fn = deactivate_lora_fn
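
The typing pattern applied throughout these commits: attributes that used to be declared as `Optional[...] = None` and filled in later are now declared with a bare annotation (e.g. `self.indices: torch.Tensor`), so mypy records the real, non-Optional type while the value is still assigned lazily, instead of requiring `assert ... is not None` or `# type: ignore` at every use site. A minimal sketch of the idea follows; the class and method names here are illustrative placeholders, not code taken from vllm/lora.

    from typing import List

    import torch


    class LazyInitExample:
        """Illustrative only; mirrors the lazy-initialization annotations in this patch."""

        def __init__(self) -> None:
            # Annotation without assignment: mypy treats the attribute as
            # torch.Tensor / List[int], while the actual value is set later.
            self.indices: torch.Tensor
            self.indices_len: List[int]

        def set_mapping(self, indices: torch.Tensor,
                        indices_len: List[int]) -> None:
            # First assignment. Reading either attribute before this call
            # raises AttributeError at runtime rather than returning None.
            self.indices = indices
            self.indices_len = indices_len

The trade-off is that a missed initialization surfaces as an AttributeError at first access instead of a None check, which is why the patch pairs these declarations with comments such as "Lazily initialized by create_lora_manager."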