[CORE] Quantized lm-head Framework #4442
Changes from 158 commits
@@ -0,0 +1,87 @@
"""Compares the outputs of hf vs vllm for medium sized models. | ||
|
||
There is not bitwise correctness for fp16 inference. | ||
As a result, in this test, we just confirm that the top selected tokens of the | ||
Marlin/GPTQ models are in the top 3 selections of each other. | ||
|
||
Run `pytest tests/models/test_models_medium_logprobs.py`. | ||
""" | ||
import pytest | ||
|
||
from tests.models.utils import check_logprobs_close | ||
|
||
MAX_MODEL_LEN = 1024 | ||
|
||
MODELS = [ | ||
    # # arctic - skip: size in automation
    # "baichuan-inc/Baichuan2-7B-Chat",
    # "bigscience/bloom-560m",
    # # "THUDM/chatglm3-6b", skip: hf implementation broken
    # # commandr - skip: size in automation
    # # dbrx - skip: size in automation
    # "Deci/DeciLM-7B-instruct",
    # # deepseek_v2 - skip: size in automation
    # "deepseek-ai/deepseek-coder-1.3b-instruct",
    # "tiiuae/falcon-rw-1b",
    # "google/gemma-1.1-2b-it",
    # # "google/gemma-2-9b-it", skip: not supported in transformers yet
    # "bigcode/tiny_starcoder_py",
    # "EleutherAI/gpt-j-6b",
    # "EleutherAI/pythia-410m",
    # "gpt2",
    # "internlm/internlm2-chat-7b",
    # # jais - skip: size in automation
    # "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    # "openbmb/MiniCPM-2B-128k",
    # # mixtral - skip: size in automation
    # "mosaicml/mpt-7b-instruct",
    # # "allenai/OLMo-1B", # skip: broken in transformers
    "facebook/opt-125m",
    # orion - skip: size in automation
    "microsoft/phi-2",
    "microsoft/Phi-3-small-8k-instruct",
    "Qwen/Qwen-1_8B",
    "Qwen/Qwen1.5-1.8B",
    "Qwen/Qwen2-0.5B-Instruct",
    # qwen2 moe - skip: size in automation
    "stabilityai/stablelm-2-1_6b-chat",
    "bigcode/starcoder2-3b",
    "xverse/XVERSE-7B",
]

Review comment (on the MODELS list): How long does it take to run all the models listed here? Can some of them be removed to reduce the CI time?
Author reply: I can just remove them. I just wanted to prove the accuracy was right.

@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(
    vllm_runner,
    hf_runner,
    example_prompts,
    model,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    # Run HF.
    hf_model = hf_runner(model_name=model, dtype=dtype)
    hf_outputs = hf_model.generate_greedy_logprobs_limit(
        example_prompts, max_tokens, num_logprobs)
    del hf_model

    # Run vLLM.
    vllm_model = vllm_runner(model_name=model,
                             dtype=dtype,
                             max_model_len=MAX_MODEL_LEN,
                             tensor_parallel_size=1)
    vllm_outputs = vllm_model.generate_greedy_logprobs(example_prompts,
                                                       max_tokens,
                                                       num_logprobs)
    del vllm_model

    check_logprobs_close(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )
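
For context, `check_logprobs_close` is the helper that encodes the "in each other's top selections" criterion from the docstring. Below is a minimal sketch of that kind of comparison, assuming each output is a `(token_ids, text, logprobs)` tuple where `logprobs` is a per-position dict mapping candidate token id to logprob; the real helper in `tests/models/utils.py` may differ in its exact signature and messages.

```python
# Hedged sketch of a logprob-closeness check; not the PR's implementation.
from typing import Dict, List, Sequence, Tuple

Output = Tuple[List[int], str, List[Dict[int, float]]]


def check_logprobs_close_sketch(outputs_0_lst: Sequence[Output],
                                outputs_1_lst: Sequence[Output],
                                name_0: str, name_1: str) -> None:
    for prompt_idx, (out_0, out_1) in enumerate(zip(outputs_0_lst,
                                                    outputs_1_lst)):
        ids_0, text_0, logprobs_0 = out_0
        ids_1, text_1, logprobs_1 = out_1
        for pos, (tok_0, tok_1) in enumerate(zip(ids_0, ids_1)):
            if tok_0 == tok_1:
                continue
            # Tokens differ: each side's pick must at least appear among the
            # other side's top-k candidates at this position.
            assert tok_0 in logprobs_1[pos] and tok_1 in logprobs_0[pos], (
                f"prompt {prompt_idx}, position {pos}: "
                f"{name_0}={tok_0!r} vs {name_1}={tok_1!r}\n"
                f"{name_0}: {text_0!r}\n{name_1}: {text_1!r}")
            # After the first tolerated divergence the continuations are
            # conditioned on different prefixes, so stop comparing.
            break
```

Stopping at the first tolerated divergence is the key design choice: once the two greedy decodes pick different tokens, later positions are no longer comparable token-by-token.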
@@ -0,0 +1,45 @@
"""Tests whether gptq models with quantized lm_head can be loaded. | ||
Run `pytest tests/quantization/test_quant_lm_head_true.py --forked`. | ||
""" | ||
from typing import Tuple | ||
|
||
import pytest | ||
import torch | ||
|
||
from vllm.model_executor.layers.linear import UnquantizedLinearMethod | ||
from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod | ||
from vllm.model_executor.layers.quantization.gptq_marlin import ( | ||
GPTQMarlinLinearMethod) | ||
from vllm.model_executor.layers.quantization.marlin import MarlinLinearMethod | ||
|
||
PROMPT = "On the surface of Mars, we found" | ||
|
||
MODELS_QUANT = [( | ||
"LnL-AI/TinyLlama-1.1B-intermediate-step-1341k-3T-autoround-lm_head-symFalse", | ||
True), ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", False), | ||
("neuralmagic/Meta-Llama-3-8B-Instruct-FP8", False)] | ||
|
||
|
||
@pytest.mark.parametrize("model_lm_head_quant", MODELS_QUANT) | ||
def test_lm_head( | ||
vllm_runner, | ||
model_lm_head_quant: Tuple[str, bool], | ||
) -> None: | ||
model, lm_head_quantized = model_lm_head_quant | ||
vllm_model = vllm_runner(model, dtype=torch.float16, max_model_len=2048) | ||
|
||
lm_head_layer = (vllm_model.model.llm_engine.model_executor.driver_worker. | ||
model_runner.model.lm_head) | ||
|
||
if lm_head_quantized: | ||
assert isinstance( | ||
lm_head_layer.linear_method, | ||
(GPTQLinearMethod, GPTQMarlinLinearMethod, MarlinLinearMethod)) | ||
else: | ||
assert isinstance(lm_head_layer.linear_method, UnquantizedLinearMethod) | ||
|
||
print( | ||
vllm_model.generate_greedy(prompts=["Hello my name is"], | ||
max_tokens=10)[0][1]) | ||
del vllm_model |
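
Outside the test harness, the same checkpoints can be exercised through vLLM's public `LLM` entrypoint. The snippet below is a usage sketch only; the model name is the first entry of `MODELS_QUANT` above, and whether the `lm_head` actually ends up quantized is decided by the checkpoint's quantization config, not by any flag passed here.

```python
# Usage sketch (assumes a CUDA GPU and access to the Hugging Face Hub);
# mirrors the greedy, 10-token generation used in the test above.
from vllm import LLM, SamplingParams

llm = LLM(
    model=("LnL-AI/TinyLlama-1.1B-intermediate-step-1341k-3T"
           "-autoround-lm_head-symFalse"),
    dtype="float16",
    max_model_len=2048,
)

sampling_params = SamplingParams(temperature=0.0, max_tokens=10)
outputs = llm.generate(["Hello my name is"], sampling_params)
print(outputs[0].outputs[0].text)
```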
Review comment: Why do we have to re-download Falcon?
Author reply: trust_remote_code did not work for Falcon; I'm not sure why.
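
For reference, the sketch below shows where `trust_remote_code` is normally threaded through on both sides of these comparisons, using the Falcon checkpoint from the commented-out model list above and the standard `transformers`/vLLM entrypoints; it does not claim to reproduce or explain the failure mentioned in the reply.

```python
# Sketch only: standard ways to pass trust_remote_code to both backends.
from transformers import AutoModelForCausalLM, AutoTokenizer
from vllm import LLM

model_id = "tiiuae/falcon-rw-1b"

# Hugging Face side: allow model code shipped with the checkpoint to run.
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
hf_model = AutoModelForCausalLM.from_pretrained(model_id,
                                                trust_remote_code=True)

# vLLM side: the LLM entrypoint accepts the same flag.
vllm_model = LLM(model=model_id, trust_remote_code=True, dtype="float16")
```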