diff --git a/docs/source/models/adding_model.rst b/docs/source/models/adding_model.rst
index bf243a044769f..45ef0340aae25 100644
--- a/docs/source/models/adding_model.rst
+++ b/docs/source/models/adding_model.rst
@@ -56,8 +56,8 @@ Next, you need to rewrite the :code:`forward` methods of your model by following
     -    return_dict: Optional[bool] = None,
     -) -> Union[Tuple, CausalLMOutputWithPast]:
     +    positions: torch.Tensor,
-    +    kv_caches: List[KVCache],
-    +    input_metadata: InputMetadata,
+    +    kv_caches: List[torch.Tensor],
+    +    attn_metadata: AttentionMetadata,
     +) -> Optional[SamplerOutput]:
 
 1. Update the code by considering that :code:`input_ids` and :code:`positions` are now flattened tensors.
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index c824efdf04684..c2571d0893c8d 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -1,4 +1,4 @@
-from typing import List, Optional, Tuple
+from typing import List, Optional
 
 import torch
 from torch import nn
@@ -19,8 +19,6 @@
                                               hf_model_weights_iterator)
 from vllm.sequence import SamplerOutput
 
-KVCache = Tuple[torch.Tensor, torch.Tensor]
-
 _KEYS_TO_MODIFY_MAPPING = {
     "language_model.lm_head": "lm_head",
     "language_model.model": "language_model",
@@ -102,7 +100,7 @@ def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        kv_caches: List[KVCache],
+        kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
         image_input: Optional[torch.Tensor] = None
     ) -> SamplerOutput:  # noqa: E501
diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py
index ded22b9a3ac0f..fff721a80c204 100644
--- a/vllm/worker/neuron_model_runner.py
+++ b/vllm/worker/neuron_model_runner.py
@@ -14,8 +14,6 @@
 
 logger = init_logger(__name__)
 
-KVCache = Tuple[torch.Tensor, torch.Tensor]
-
 
 class NeuronModelRunner:
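
Note (not part of the diff): after these changes, the documented :code:`forward` signature for a vLLM model reads roughly as sketched below. This is a minimal illustration assembled from the hunks above; the import paths (:code:`vllm.attention`, :code:`vllm.sequence`) and the enclosing model class are assumptions for context, not shown in this diff.

    from typing import List, Optional

    import torch

    from vllm.attention import AttentionMetadata  # assumed import path
    from vllm.sequence import SamplerOutput       # assumed import path


    class MyModelForCausalLM(torch.nn.Module):  # hypothetical model class
        def forward(
            self,
            input_ids: torch.Tensor,           # flattened token ids
            positions: torch.Tensor,           # flattened position ids
            kv_caches: List[torch.Tensor],     # plain tensors; the KVCache tuple alias is removed
            attn_metadata: AttentionMetadata,  # replaces input_metadata: InputMetadata
        ) -> Optional[SamplerOutput]:
            ...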