support gptq true_sequential and quant_lm_head (#1977)
Signed-off-by: Kaihui-intel <[email protected]>
Kaihui-intel authored Aug 19, 2024
1 parent 68b1f8b commit 7ba9fdc
Showing 8 changed files with 430 additions and 99 deletions.
3 changes: 2 additions & 1 deletion .azure-pipelines/scripts/ut/run_itrex.sh
@@ -18,7 +18,8 @@ bash /intel-extension-for-transformers/.github/workflows/script/install_binary.s
sed -i '/neural-compressor.git/d' /intel-extension-for-transformers/tests/requirements.txt
pip install -r /intel-extension-for-transformers/tests/requirements.txt
# workaround
pip install onnx==1.15.0
pip install onnx==1.16.0
pip install onnxruntime==1.18.0
echo "pip list itrex ut deps..."
pip list
LOG_DIR=/neural-compressor/log_dir
5 changes: 3 additions & 2 deletions docs/source/3x/PT_WeightOnlyQuant.md
@@ -111,9 +111,10 @@ model = convert(model)
| model_path (str) | Model path that is used to load state_dict per layer | |
| use_double_quant (bool) | Enables double quantization | False |
| act_order (bool) | Whether to sort Hessian's diagonal values to rearrange channel-wise quantization order | False |
| percdamp (float) | Percentage of Hessian's diagonal values' average, which will be added to Hessian's diagonal to increase numerical stability | 0.01. |
| percdamp (float) | Percentage of Hessian's diagonal values' average, which will be added to Hessian's diagonal to increase numerical stability | 0.01 |
| block_size (int) | Execute GPTQ quantization per block, block shape = [C_out, block_size] | 128 |
| static_groups (bool) | Whether to calculate group wise quantization parameters in advance. This option mitigates actorder's extra computational requirements. | False. |
| static_groups (bool) | Whether to calculate group wise quantization parameters in advance. This option mitigates actorder's extra computational requirements. | False |
| true_sequential (bool) | Whether to quantize layers within a transformer block in their original order. This can lead to higher accuracy but a slower overall quantization process. | False |
> **Note:** `model_path` is only used when use_layer_wise=True. Stay tuned for further `layer-wise` updates.
(Python usage example collapsed in this diff view.)
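As a stand-in for the collapsed example, here is a minimal sketch of the two new options using the 3.x `prepare`/`convert` flow exercised by the updated tests below (the model name and dummy calibration input are illustrative, not part of the doc):

```python
import torch
from transformers import AutoModelForCausalLM
from neural_compressor.torch.quantization import GPTQConfig, convert, prepare

# Tiny model and a dummy calibration input, chosen only for illustration.
model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-GPTJForCausalLM")
example_inputs = torch.ones([1, 10], dtype=torch.long)

# The two options added by this commit: quantize a transformer block's layers in
# their original order, and quantize the lm_head layer as well.
quant_config = GPTQConfig(true_sequential=True, quant_lm_head=True)

model = prepare(model, quant_config)
model(example_inputs)  # calibration pass; GPTQ collects Hessian statistics here
model = convert(model)
```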
422 changes: 334 additions & 88 deletions neural_compressor/torch/algorithms/weight_only/gptq.py

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions neural_compressor/torch/algorithms/weight_only/rtn.py
@@ -177,6 +177,8 @@ def convert(
if dtype != "int" and "int" in dtype:
bits = int(dtype.lstrip("int"))
dtype = "int"
else:
continue
log_msg = (
f"RTN quantization config: bits={bits}, group_size={group_size}, "
+ f"scheme={scheme}, quantile={quantile}"
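For context on the hunk above: the guarded branch maps dtype strings such as `"int4"` onto a plain `"int"` dtype plus a bit width, and the new `else: continue` simply moves on when that pattern does not apply. A standalone sketch of just the string handling (a simplification; the real loop in `rtn.py` carries more state):

```python
def parse_int_dtype(dtype: str):
    """Return (dtype, bits): "int4" -> ("int", 4); other dtype strings are returned unchanged."""
    if dtype != "int" and "int" in dtype:
        bits = int(dtype.lstrip("int"))  # strip the leading "int" characters, keep the digits
        return "int", bits
    return dtype, None

print(parse_int_dtype("int8"))  # ('int', 8)
print(parse_int_dtype("nf4"))   # ('nf4', None)
```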
2 changes: 2 additions & 0 deletions neural_compressor/torch/quantization/algorithm_entry.py
@@ -159,11 +159,13 @@ def gptq_entry(
"percdamp": quant_config.percdamp,
"block_size": quant_config.block_size,
"static_groups": quant_config.static_groups,
"true_sequential": quant_config.true_sequential,
}
kwargs.update(
{
"use_layer_wise": quant_config.use_layer_wise,
"model_path": quant_config.model_path,
"quant_lm_head": quant_config.quant_lm_head,
}
)
kwargs.pop("example_inputs")
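Taken together, the hunk above builds the keyword arguments handed to the GPTQ algorithm. A sketch that only restates the fields visible in this diff (the surrounding `gptq_entry` logic is omitted):

```python
from neural_compressor.torch.quantization import GPTQConfig

quant_config = GPTQConfig(true_sequential=True, quant_lm_head=True)

# Fields shown in the diff; the two "new" entries are the ones this commit adds.
kwargs = {
    "percdamp": quant_config.percdamp,
    "block_size": quant_config.block_size,
    "static_groups": quant_config.static_groups,
    "true_sequential": quant_config.true_sequential,  # new
    "use_layer_wise": quant_config.use_layer_wise,
    "model_path": quant_config.model_path,
    "quant_lm_head": quant_config.quant_lm_head,      # new
}
print(kwargs)
```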
9 changes: 7 additions & 2 deletions neural_compressor/torch/quantization/config.py
@@ -351,6 +351,7 @@ class GPTQConfig(TorchBaseConfig):
"percdamp",
"block_size",
"static_groups",
"true_sequential",
]

def __init__(
@@ -376,6 +377,7 @@ def __init__(
percdamp: float = 0.01,
block_size: int = 2048,
static_groups: bool = False,
true_sequential: bool = False,
# Tuning space
white_list: Optional[List[OP_NAME_OR_MODULE_TYPE]] = DEFAULT_WHITE_LIST,
):
@@ -404,10 +406,12 @@ def __init__(
static_groups (bool): Whether to calculate group wise quantization parameters in advance.
This option mitigates actorder's extra computational requirements.
Default is False.
true_sequential (bool): Whether to quantize layers within a transformer block in their original order.
This can lead to higher accuracy but a slower overall quantization process.
Default is False.
white_list (Optional[List[OP_NAME_OR_MODULE_TYPE]]): White list of operator names or module types.
Default is DEFAULT_WHITE_LIST.
"""
assert not quant_lm_head, "GPTQ doesn't support lm_head quantization currently, it's coming soon!"
super().__init__(white_list=white_list)
self.dtype = dtype
self.bits = bits
Expand All @@ -428,6 +432,7 @@ def __init__(
self.percdamp = percdamp
self.block_size = block_size
self.static_groups = static_groups
self.true_sequential = true_sequential
self.quant_lm_head = quant_lm_head
self._post_init() # initialize global & local configuration

@@ -599,7 +604,7 @@ def __init__(
double_quant_bits (int): Number of bits used to represent double_quant scale, default is 4.
double_quant_use_sym (bool): Indicates whether double_quant scale are symmetric, default is True.
double_quant_group_size (int): Size of double_quant groups, default is 32.
quant_lm_head (bool): Indicates whether quantize the lm_head layer in transformers。 Default is False.
quant_lm_head (bool): Indicates whether to quantize the lm_head layer in transformers, default is False.
use_auto_scale (bool): Enables best scales search based on activation distribution, default is True.
use_auto_clip (bool): Enables clip range search. Defaults to True.
folding(bool): Allow insert mul before linear when the scale cannot be absorbed by last layer,
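The docstring fix above appears to belong to the AWQ config — the `use_auto_scale`/`use_auto_clip`/`folding` parameters suggest as much, though the class name sits outside the hunk, so treat that as an assumption. If so, enabling the documented option would look roughly like:

```python
from neural_compressor.torch.quantization import AWQConfig  # assumed owner of the docstring above

# Also quantize the tied lm_head layer; per the docstring, the default is False.
awq_config = AWQConfig(quant_lm_head=True)
```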
73 changes: 69 additions & 4 deletions test/3x/torch/quantization/weight_only/test_gptq.py
@@ -182,9 +182,10 @@ def test_act_order(self):
# compare atol, this case is an ideal case.
assert atol_false > atol_true, "act_order=True doesn't help accuracy, maybe is reasonable, please double check."

def test_layer_wise(self):
@pytest.mark.parametrize("quant_lm_head", [False, True])
def test_layer_wise(self, quant_lm_head):
model = copy.deepcopy(self.tiny_gptj)
quant_config = GPTQConfig()
quant_config = GPTQConfig(quant_lm_head=quant_lm_head)
model = prepare(model, quant_config)
run_fn(model)
model = convert(model)
@@ -194,12 +195,76 @@ def test_layer_wise(self):

model = load_empty_model("hf-internal-testing/tiny-random-GPTJForCausalLM")

quant_config = GPTQConfig(use_layer_wise=True, model_path="hf-internal-testing/tiny-random-GPTJForCausalLM")
quant_config = GPTQConfig(
use_layer_wise=True,
quant_lm_head=quant_lm_head,
model_path="hf-internal-testing/tiny-random-GPTJForCausalLM",
)
model = prepare(model, quant_config)
run_fn(model)
model = convert(model)
out = model(self.example_inputs)[0]

# remove lwq tmp directory
from neural_compressor.torch.algorithms.layer_wise.utils import LWQ_WORKSPACE

shutil.rmtree(LWQ_WORKSPACE, ignore_errors=True)
assert torch.equal(
out, q_label
), f"use_layer_wise=True and quant_lm_head={quant_lm_head} output should be same. Please double check."

def test_true_sequential(self):
# true_sequential=False
model = copy.deepcopy(self.tiny_gptj)
quant_config = GPTQConfig(
true_sequential=False,
)
model = prepare(model, quant_config)
run_fn(model)
model = convert(model)
out = model(self.example_inputs)[0]
atol_false = (out - self.label).amax()
# true_sequential=True
model = copy.deepcopy(self.tiny_gptj)
quant_config = GPTQConfig(
true_sequential=True,
)
model = prepare(model, quant_config)
run_fn(model)
model = convert(model)
out = model(self.example_inputs)[0]
atol_true = (out - self.label).amax()
# compare atol, this case is an ideal case.
assert (
atol_false < atol_true
), "true_sequential=True doesn't help accuracy, maybe is reasonable, please double check."

def test_quant_lm_head(self):
# quant_lm_head=False
model = copy.deepcopy(self.tiny_gptj)
quant_config = GPTQConfig(
quant_lm_head=False,
)
model = prepare(model, quant_config)
run_fn(model)
model = convert(model)
out = model(self.example_inputs)[0]
assert torch.equal(out, q_label), "use_layer_wise=True output should be same. Please double check."
atol_false = (out - self.label).amax()
# quant_lm_head=True
model = copy.deepcopy(self.tiny_gptj)
quant_config = GPTQConfig(
quant_lm_head=True,
)
model = prepare(model, quant_config)
run_fn(model)
model = convert(model)
out = model(self.example_inputs)[0]
atol_true = (out - self.label).amax()
# compare atol, this case is an ideal case.
assert (
atol_false < atol_true
), "quant_lm_head=True doesn't help accuracy, maybe is reasonable, please double check."
assert get_woq_linear_num(model, "INCWeightOnlyLinear") == 31, "Incorrect number of INCWeightOnlyLinear modules"

@pytest.mark.parametrize("dtype", ["nf4", "int4"])
@pytest.mark.parametrize("double_quant_bits", [6])
13 changes: 11 additions & 2 deletions test/3x/torch/quantization/weight_only/test_rtn.py
@@ -174,6 +174,15 @@ def test_quant_lm_head(self):
), "The tied lm_head weight is not deep copied, please check!"

def test_layer_wise(self):
# use_layer_wise=False
model = copy.deepcopy(self.tiny_gptj)
quant_config = RTNConfig(
use_layer_wise=False,
)
model = prepare(model, quant_config)
model = convert(model)
out0 = model(self.example_inputs)[0]

from neural_compressor.torch import load_empty_model

model = load_empty_model("hf-internal-testing/tiny-random-GPTJForCausalLM")
@@ -182,8 +191,8 @@
)
model = prepare(model, quant_config)
model = convert(model)
out = model(self.example_inputs)[0]
assert torch.equal(out, self.q_label), "use_layer_wise=True output should be same. Please double check."
out1 = model(self.example_inputs)[0]
assert torch.equal(out1, out0), "use_layer_wise=True output should be same. Please double check."

@pytest.mark.parametrize(
"dtype",
