From 8d2bf278eaa7c51ad0947104974f7c60dd29b64b Mon Sep 17 00:00:00 2001
From: xinhe
Date: Tue, 12 Sep 2023 09:31:18 +0800
Subject: [PATCH] fix bug in nf4/fp4 (#1241)

Signed-off-by: Xin He
---
 .../adaptor/torch_utils/weight_only.py  |  2 +-
 .../test_weight_only_adaptor.py         | 21 +++++++++----------
 test/algorithm/test_smooth_quant.py     |  3 +--
 3 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/neural_compressor/adaptor/torch_utils/weight_only.py b/neural_compressor/adaptor/torch_utils/weight_only.py
index 2400c0ee56d..f3c22691092 100644
--- a/neural_compressor/adaptor/torch_utils/weight_only.py
+++ b/neural_compressor/adaptor/torch_utils/weight_only.py
@@ -78,7 +78,7 @@ def quantize_4bit(tensor, quantile=1.0, data_type="nf4", return_int=False):
     allow_data = FLOAT_MAPPING[data_type]
     allow_data_bit = INT_MAPPING[data_type]
     # get scale and update tensor
-    scale = tensor.max(1)[0] * quantile / max(allow_data)
+    scale = tensor.abs().max(1)[0] * quantile / max(allow_data)
     scale.unsqueeze_(dim=-1)
     tensor = tensor / scale
     mid_data = [(allow_data[i] + allow_data[i + 1]) / 2 for i in range(len(allow_data) - 1)]
diff --git a/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py b/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py
index 186d0b2aa62..9a6d41a61e2 100644
--- a/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py
+++ b/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py
@@ -239,11 +239,10 @@ def test_RTN_int_quant(self):
         self.assertTrue(isinstance(inc_model.model.fc1, WeightOnlyLinear))
         self.assertTrue(model_size1 / model_size2 > 2)

-    def test_RTN_fp4_quant(self):
-        for dtype in ["nf4", "fp4", "fp4_e2m1_bnb", "fp4_e2m1"]:
-            input = torch.randn(3, 30)
-            model = Model()
-            out1 = model(input)
+    def test_RTN_4bit_quant(self):
+        for dtype in ["int4", "nf4", "fp4", "fp4_e2m1_bnb", "fp4_e2m1"]:
+            model = copy.deepcopy(self.gptj)
+            out1 = model(self.lm_input)
             conf = PostTrainingQuantConfig(
                 approach="weight_only",
                 op_type_dict={
@@ -251,19 +250,19 @@ def test_RTN_fp4_quant(self):
                     "weight": {
                         "dtype": dtype,  # select from int, nf4, or fp4
                         # nf4/fp4 have fixed bits and scheme.
-                        "group_size": 32,  # -1 (per-channel)
+                        "group_size": 64,  # -1 (per-channel)
                         "algorithm": "RTN",
                     },
                 },
             },
         )
         q_model = quantization.fit(model, conf)
-            out2 = q_model(input)
-            self.assertTrue(torch.all(torch.isclose(out1, out2, atol=5e-1)))
-            self.assertFalse(torch.all(out1 == out2))
+            out2 = q_model(self.lm_input)
+            self.assertTrue(torch.all(torch.isclose(out1[0], out2[0], atol=1e-1)))
+            self.assertFalse(torch.all(out1[0] == out2[0]))
             compressed_model = q_model.export_compressed_model()
-            out3 = compressed_model(input)
-            self.assertTrue(torch.all(out3 == out2))
+            out3 = compressed_model(self.lm_input)
+            self.assertTrue(torch.all(out3[0] == out2[0]))

     def test_AWQ_quant(self):
         conf = PostTrainingQuantConfig(
diff --git a/test/algorithm/test_smooth_quant.py b/test/algorithm/test_smooth_quant.py
index 4c774dbca79..af9284caacf 100644
--- a/test/algorithm/test_smooth_quant.py
+++ b/test/algorithm/test_smooth_quant.py
@@ -780,8 +780,7 @@ def calib_func(model):
         output2 = q_model.model(input_ids)
         assert isinstance(q_model.model.fc1, SQLinearWrapper)
         # set a big atol to avoid random issue
-        print(output1, output2)
-        self.assertTrue(torch.allclose(output1, output2, atol=1e-02))
+        self.assertTrue(torch.allclose(output1, output2, atol=2e-02))
         q_model.save("saved_result")
         from neural_compressor.utils.pytorch import load
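
The one-line change in quantize_4bit is the actual bug fix; the test changes
exercise it on a real model and relax a flaky smooth-quant tolerance. The old
code derived each row's scale from its signed maximum, so a row whose
largest-magnitude weight was negative got a scale that was too small (or even
negative), pushing the normalized values outside the range covered by the
nf4/fp4 lookup tables. A minimal standalone sketch of the failure mode follows;
the toy tensor is illustrative, and 1.0 stands in for max(allow_data) (the NF4
levels span [-1, 1]):

    import torch

    # Toy 1 x 3 weight block whose largest-magnitude entry is negative.
    tensor = torch.tensor([[-8.0, 2.0, 1.0]])
    quantile, max_allow = 1.0, 1.0  # max_allow plays the role of max(allow_data)

    # Before the fix: signed per-row max -> scale = 2.0, and
    # -8.0 / 2.0 = -4.0 lands far outside the representable [-1, 1].
    old_scale = tensor.max(1)[0] * quantile / max_allow

    # After the fix: absolute per-row max -> scale = 8.0, so every
    # normalized value falls inside [-1, 1] as the table lookup expects.
    new_scale = tensor.abs().max(1)[0] * quantile / max_allow

    print(tensor / old_scale.unsqueeze(-1))  # tensor([[-4.0000, 1.0000, 0.5000]])
    print(tensor / new_scale.unsqueeze(-1))  # tensor([[-1.0000, 0.2500, 0.1250]])

Because the nf4/fp4 tables span ranges that are symmetric about zero, dividing
by the absolute maximum keeps the scale positive and maps every entry into the
representable range, so negative-dominated rows quantize as faithfully as
positive-dominated ones.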