From c281f95805cbfd4813bedb40c8bcbf05e4f7ed1a Mon Sep 17 00:00:00 2001
From: whyiug
Date: Mon, 4 Mar 2024 16:34:09 +0800
Subject: [PATCH 1/6] Update Qwen2Model and Qwen2ForCausalLM classes

---
 vllm/model_executor/models/qwen2.py | 63 +++++++++++++++++++++++++----
 1 file changed, 55 insertions(+), 8 deletions(-)

diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index e823e6f8c3dbe..180a894e43573 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -39,14 +39,14 @@
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    VocabParallelEmbedding, ParallelLMHead)
+    VocabParallelEmbedding, ParallelLMHead, DEFAULT_VOCAB_PADDING_SIZE)
 from vllm.model_executor.parallel_utils.parallel_state import (
     get_tensor_model_parallel_world_size)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.weight_utils import (default_weight_loader,
                                               hf_model_weights_iterator)
 from vllm.sequence import SamplerOutput
-
+from vllm.config import LoRAConfig
 KVCache = Tuple[torch.Tensor, torch.Tensor]
 
 
@@ -225,15 +225,19 @@ def __init__(
         self,
         config: Qwen2Config,
         linear_method: Optional[LinearMethodBase] = None,
+        lora_config: Optional[LoRAConfig] = None,
     ) -> None:
         super().__init__()
         self.config = config
         self.padding_idx = config.pad_token_id
-        self.vocab_size = config.vocab_size
-
+        lora_vocab = (lora_config.lora_extra_vocab_size *
+                      (lora_config.max_loras or 1)) if lora_config else 0
+        self.vocab_size = config.vocab_size + lora_vocab
+        self.org_vocab_size = config.vocab_size
         self.embed_tokens = VocabParallelEmbedding(
-            config.vocab_size,
+            self.vocab_size,
             config.hidden_size,
+            org_num_embeddings=config.vocab_size,
         )
         self.layers = nn.ModuleList([
             Qwen2DecoderLayer(config, layer_idx, linear_method)
@@ -264,18 +268,56 @@ def forward(
 
 
 class Qwen2ForCausalLM(nn.Module):
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
+
+    # LoRA specific attributes
+    supported_lora_modules = [
+        "qkv_proj",
+        "o_proj",
+        "gate_up_proj",
+        "down_proj",
+        "embed_tokens",
+        "lm_head",
+    ]
+    embedding_modules = {
+        "embed_tokens": "input_embeddings",
+        "lm_head": "output_embeddings",
+    }
+    embedding_padding_modules = ["lm_head"]
 
     def __init__(
         self,
         config: Qwen2Config,
         linear_method: Optional[LinearMethodBase] = None,
+        lora_config: Optional[LoRAConfig] = None,
     ) -> None:
         super().__init__()
         self.config = config
         self.linear_method = linear_method
-        self.model = Qwen2Model(config, linear_method)
-        self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
-        self.sampler = Sampler(config.vocab_size)
+        self.model = Qwen2Model(config, linear_method, lora_config=lora_config)
+        self.unpadded_vocab_size = config.vocab_size
+        if lora_config:
+            self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
+        self.lm_head = ParallelLMHead(
+            self.unpadded_vocab_size,
+            config.hidden_size,
+            org_num_embeddings=config.vocab_size,
+            padding_size=DEFAULT_VOCAB_PADDING_SIZE
+            # We need bigger padding if using lora for kernel
+            # compatibility
+            if not lora_config else lora_config.lora_vocab_padding_size,
+        )
+        self.sampler = Sampler(self.unpadded_vocab_size, config.vocab_size)
 
     def forward(
         self,
@@ -315,6 +357,11 @@ def load_weights(self,
                 model_name_or_path, cache_dir, load_format, revision):
"rotary_emb.inv_freq" in name: continue + if ("rotary_emb.cos_cached" in name + or "rotary_emb.sin_cached" in name): + # Models trained using ColossalAI may include these tensors in + # the checkpoint. Skip them. + continue for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: continue From 55a3cc1603ccd2bbefb6493e4f123c1506aa140b Mon Sep 17 00:00:00 2001 From: whyiug Date: Mon, 4 Mar 2024 19:26:23 +0800 Subject: [PATCH 2/6] Add new narrow value in bgmv_kernel function and remove unused code in Qwen2Model class --- csrc/punica/bgmv/bgmv_config.h | 1 + vllm/model_executor/models/qwen2.py | 44 +++++++---------------------- 2 files changed, 11 insertions(+), 34 deletions(-) diff --git a/csrc/punica/bgmv/bgmv_config.h b/csrc/punica/bgmv/bgmv_config.h index d5fee9c40d00c..9e444d72209d1 100644 --- a/csrc/punica/bgmv/bgmv_config.h +++ b/csrc/punica/bgmv/bgmv_config.h @@ -36,6 +36,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, narrow, 10240) \ f(in_T, out_T, W_T, narrow, 11008) \ f(in_T, out_T, W_T, narrow, 12288) \ + f(in_T, out_T, W_T, narrow, 13696) \ f(in_T, out_T, W_T, narrow, 13824) \ f(in_T, out_T, W_T, narrow, 14336) \ f(in_T, out_T, W_T, narrow, 16384) \ diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 180a894e43573..9fec207f2a1e9 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -39,7 +39,7 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( - VocabParallelEmbedding, ParallelLMHead, DEFAULT_VOCAB_PADDING_SIZE) + VocabParallelEmbedding, ParallelLMHead) from vllm.model_executor.parallel_utils.parallel_state import ( get_tensor_model_parallel_world_size) from vllm.model_executor.sampling_metadata import SamplingMetadata @@ -225,19 +225,15 @@ def __init__( self, config: Qwen2Config, linear_method: Optional[LinearMethodBase] = None, - lora_config: Optional[LoRAConfig] = None, ) -> None: super().__init__() self.config = config self.padding_idx = config.pad_token_id - lora_vocab = (lora_config.lora_extra_vocab_size * - (lora_config.max_loras or 1)) if lora_config else 0 - self.vocab_size = config.vocab_size + lora_vocab - self.org_vocab_size = config.vocab_size + self.vocab_size = config.vocab_size + self.embed_tokens = VocabParallelEmbedding( - self.vocab_size, + config.vocab_size, config.hidden_size, - org_num_embeddings=config.vocab_size, ) self.layers = nn.ModuleList([ Qwen2DecoderLayer(config, layer_idx, linear_method) @@ -286,14 +282,9 @@ class Qwen2ForCausalLM(nn.Module): "o_proj", "gate_up_proj", "down_proj", - "embed_tokens", - "lm_head", ] - embedding_modules = { - "embed_tokens": "input_embeddings", - "lm_head": "output_embeddings", - } - embedding_padding_modules = ["lm_head"] + embedding_modules = {} + embedding_padding_modules = [] def __init__( self, @@ -301,23 +292,13 @@ def __init__( linear_method: Optional[LinearMethodBase] = None, lora_config: Optional[LoRAConfig] = None, ) -> None: + del lora_config super().__init__() self.config = config self.linear_method = linear_method - self.model = Qwen2Model(config, linear_method, lora_config=lora_config) - self.unpadded_vocab_size = config.vocab_size - if lora_config: - self.unpadded_vocab_size += lora_config.lora_extra_vocab_size - self.lm_head = ParallelLMHead( - self.unpadded_vocab_size, - config.hidden_size, - 
-            org_num_embeddings=config.vocab_size,
-            padding_size=DEFAULT_VOCAB_PADDING_SIZE
-            # We need bigger padding if using lora for kernel
-            # compatibility
-            if not lora_config else lora_config.lora_vocab_padding_size,
-        )
-        self.sampler = Sampler(self.unpadded_vocab_size, config.vocab_size)
+        self.model = Qwen2Model(config, linear_method)
+        self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
+        self.sampler = Sampler(config.vocab_size)
 
     def forward(
         self,
@@ -357,11 +338,6 @@ def load_weights(self,
                 model_name_or_path, cache_dir, load_format, revision):
             if "rotary_emb.inv_freq" in name:
                 continue
-            if ("rotary_emb.cos_cached" in name
-                    or "rotary_emb.sin_cached" in name):
-                # Models trained using ColossalAI may include these tensors in
-                # the checkpoint. Skip them.
-                continue
             for (param_name, weight_name, shard_id) in stacked_params_mapping:
                 if weight_name not in name:
                     continue

From 5f9ca94063dbc1687deccd67b13eb3d3b2ff2bbf Mon Sep 17 00:00:00 2001
From: whyiug
Date: Mon, 4 Mar 2024 19:29:36 +0800
Subject: [PATCH 3/6] Add import statement for KVCache

---
 vllm/model_executor/models/qwen2.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index 9fec207f2a1e9..731fe7823703f 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -47,6 +47,7 @@
                                               hf_model_weights_iterator)
 from vllm.sequence import SamplerOutput
 from vllm.config import LoRAConfig
+
 KVCache = Tuple[torch.Tensor, torch.Tensor]
 
 

From 53f2bc9caed19ee96b18137fcd1852d8b6e21e3f Mon Sep 17 00:00:00 2001
From: whyiug
Date: Wed, 6 Mar 2024 01:19:34 +0800
Subject: [PATCH 4/6] Add LoRAConfig support to QWenLMHeadModel

---
 vllm/model_executor/models/qwen.py | 30 +++++++++++++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py
index 37af84c7cd53f..f0bb5cc2ed6ff 100644
--- a/vllm/model_executor/models/qwen.py
+++ b/vllm/model_executor/models/qwen.py
@@ -28,6 +28,7 @@
 from vllm.model_executor.weight_utils import (default_weight_loader,
                                               hf_model_weights_iterator)
 from vllm.sequence import SamplerOutput
+from vllm.config import LoRAConfig
 
 KVCache = Tuple[torch.Tensor, torch.Tensor]
 
@@ -219,12 +220,36 @@ def forward(
 
 
 class QWenLMHeadModel(nn.Module):
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "w2",
+            "w1",
+        ],
+    }
+
+    # LoRA specific attributes
+    supported_lora_modules = [
+        "qkv_proj",
+        "gate_up_proj",
+        "o_proj",
+        "down_proj",
+        "c_proj",
+    ]
+    embedding_modules = {}
+    embedding_padding_modules = []
 
     def __init__(
         self,
         config: PretrainedConfig,
         linear_method: Optional[LinearMethodBase] = None,
-    ):
+        lora_config: Optional[LoRAConfig] = None,
+    ) -> None:
+        del lora_config
         super().__init__()
         self.config = config
         self.linear_method = linear_method
@@ -259,6 +284,9 @@ def load_weights(self,
                      revision: Optional[str] = None):
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
             ("gate_up_proj", "w2", 0),
             ("gate_up_proj", "w1", 1),
         ]

From be749fe99a4bef91b61fcdd6945d0a4ae1928444 Mon Sep 17 00:00:00 2001
From: whyiug
Date: Wed, 6 Mar 2024 16:47:33 +0800
Subject: [PATCH 5/6] Add support for new narrow value in bgmv_kernel for Qwen1.5-0.5B

---
 csrc/punica/bgmv/bgmv_config.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/csrc/punica/bgmv/bgmv_config.h b/csrc/punica/bgmv/bgmv_config.h
index 9e444d72209d1..3eb84ceb4d534 100644
--- a/csrc/punica/bgmv/bgmv_config.h
+++ b/csrc/punica/bgmv/bgmv_config.h
@@ -21,6 +21,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
   f(in_T, out_T, W_T, narrow, 2048) \
   f(in_T, out_T, W_T, narrow, 2560) \
   f(in_T, out_T, W_T, narrow, 2752) \
+  f(in_T, out_T, W_T, narrow, 2816) \
   f(in_T, out_T, W_T, narrow, 3072) \
   f(in_T, out_T, W_T, narrow, 3456) \
   f(in_T, out_T, W_T, narrow, 3584) \

From 23234cd61a33942009cb51234ab8130dca4a488f Mon Sep 17 00:00:00 2001
From: whyiug
Date: Thu, 7 Mar 2024 18:44:25 +0800
Subject: [PATCH 6/6] Refactor QWenLMHeadModel class in qwen.py

---
 vllm/model_executor/models/qwen.py | 30 +-----------------------------
 1 file changed, 1 insertion(+), 29 deletions(-)

diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py
index f0bb5cc2ed6ff..37af84c7cd53f 100644
--- a/vllm/model_executor/models/qwen.py
+++ b/vllm/model_executor/models/qwen.py
@@ -28,7 +28,6 @@
 from vllm.model_executor.weight_utils import (default_weight_loader,
                                               hf_model_weights_iterator)
 from vllm.sequence import SamplerOutput
-from vllm.config import LoRAConfig
 
 KVCache = Tuple[torch.Tensor, torch.Tensor]
 
@@ -220,36 +219,12 @@ def forward(
 
 
 class QWenLMHeadModel(nn.Module):
-    packed_modules_mapping = {
-        "qkv_proj": [
-            "q_proj",
-            "k_proj",
-            "v_proj",
-        ],
-        "gate_up_proj": [
-            "w2",
-            "w1",
-        ],
-    }
-
-    # LoRA specific attributes
-    supported_lora_modules = [
-        "qkv_proj",
-        "gate_up_proj",
-        "o_proj",
-        "down_proj",
-        "c_proj",
-    ]
-    embedding_modules = {}
-    embedding_padding_modules = []
 
     def __init__(
         self,
         config: PretrainedConfig,
         linear_method: Optional[LinearMethodBase] = None,
-        lora_config: Optional[LoRAConfig] = None,
-    ) -> None:
-        del lora_config
+    ):
         super().__init__()
         self.config = config
         self.linear_method = linear_method
@@ -284,9 +259,6 @@ def load_weights(self,
                      revision: Optional[str] = None):
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
-            ("qkv_proj", "q_proj", "q"),
-            ("qkv_proj", "k_proj", "k"),
-            ("qkv_proj", "v_proj", "v"),
             ("gate_up_proj", "w2", 0),
             ("gate_up_proj", "w1", 1),
         ]
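
Note (not part of the patch series): a minimal usage sketch of how the LoRA support this series enables for Qwen/Qwen2 models might be exercised, assuming vLLM's offline LoRA interface as of early 2024 (enable_lora on LLM plus LoRARequest). The model name and adapter path below are placeholders, not values taken from the patches.

from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest

# Load a Qwen1.5 base model with LoRA serving enabled. max_loras and
# max_lora_rank must be large enough for the adapter being attached.
llm = LLM(model="Qwen/Qwen1.5-0.5B",
          enable_lora=True,
          max_loras=1,
          max_lora_rank=8)

sampling_params = SamplingParams(temperature=0.7, max_tokens=64)

# Attach a hypothetical adapter checkpoint by local path; the integer id
# must be unique per adapter within one engine instance.
lora_request = LoRARequest("qwen-lora-example", 1, "/path/to/qwen-lora-adapter")

outputs = llm.generate(
    ["Give me a short introduction to large language models."],
    sampling_params,
    lora_request=lora_request)
print(outputs[0].outputs[0].text)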