From 0e4076a692c6874d3e65c4c748f89e9441a85c2a Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Tue, 26 Mar 2024 23:14:06 -0700
Subject: [PATCH] [Misc] Minor fix in KVCache type (#3652)

---
 docs/source/models/adding_model.rst | 4 ++--
 vllm/model_executor/models/llava.py | 6 ++----
 vllm/worker/neuron_model_runner.py  | 2 --
 3 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/docs/source/models/adding_model.rst b/docs/source/models/adding_model.rst
index bf243a044769f..45ef0340aae25 100644
--- a/docs/source/models/adding_model.rst
+++ b/docs/source/models/adding_model.rst
@@ -56,8 +56,8 @@ Next, you need to rewrite the :code:`forward` methods of your model by following
     -    return_dict: Optional[bool] = None,
     -) -> Union[Tuple, CausalLMOutputWithPast]:
     +    positions: torch.Tensor,
-    +    kv_caches: List[KVCache],
-    +    input_metadata: InputMetadata,
+    +    kv_caches: List[torch.Tensor],
+    +    attn_metadata: AttentionMetadata,
     +) -> Optional[SamplerOutput]:
 
 1. Update the code by considering that :code:`input_ids` and :code:`positions` are now flattened tensors.
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index c824efdf04684..c2571d0893c8d 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -1,4 +1,4 @@
-from typing import List, Optional, Tuple
+from typing import List, Optional
 
 import torch
 from torch import nn
@@ -19,8 +19,6 @@
                                               hf_model_weights_iterator)
 from vllm.sequence import SamplerOutput
 
-KVCache = Tuple[torch.Tensor, torch.Tensor]
-
 _KEYS_TO_MODIFY_MAPPING = {
     "language_model.lm_head": "lm_head",
     "language_model.model": "language_model",
@@ -102,7 +100,7 @@ def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        kv_caches: List[KVCache],
+        kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
         image_input: Optional[torch.Tensor] = None
     ) -> SamplerOutput:  # noqa: E501
diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py
index ded22b9a3ac0f..fff721a80c204 100644
--- a/vllm/worker/neuron_model_runner.py
+++ b/vllm/worker/neuron_model_runner.py
@@ -14,8 +14,6 @@
 
 logger = init_logger(__name__)
 
-KVCache = Tuple[torch.Tensor, torch.Tensor]
-
 
 class NeuronModelRunner: