fix bug in nf4/fp4 (#1241)
Signed-off-by: Xin He <[email protected]>
xin3he authored Sep 12, 2023
1 parent 2019f41 commit 8d2bf27
Showing 3 changed files with 12 additions and 14 deletions.
2 changes: 1 addition & 1 deletion neural_compressor/adaptor/torch_utils/weight_only.py
@@ -78,7 +78,7 @@ def quantize_4bit(tensor, quantile=1.0, data_type="nf4", return_int=False):
     allow_data = FLOAT_MAPPING[data_type]
     allow_data_bit = INT_MAPPING[data_type]
     # get scale and update tensor
-    scale = tensor.max(1)[0] * quantile / max(allow_data)
+    scale = tensor.abs().max(1)[0] * quantile / max(allow_data)
     scale.unsqueeze_(dim=-1)
     tensor = tensor / scale
     mid_data = [(allow_data[i] + allow_data[i + 1]) / 2 for i in range(len(allow_data) - 1)]
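The fix is small but matters for correctness: tensor.max(1)[0] takes the signed per-row maximum, so a row whose largest-magnitude entry is negative derives its scale from the wrong element, and dividing by that scale pushes values outside the representable nf4/fp4 range (they then clamp to the end of the lookup table). tensor.abs().max(1)[0] scales by magnitude, which is what the symmetric 4-bit mapping assumes. A minimal standalone sketch of the effect; the NF4 levels below are rounded illustrations, not the exact FLOAT_MAPPING table from weight_only.py:

    import torch

    # Rounded NF4 levels for illustration; the real table is FLOAT_MAPPING["nf4"].
    NF4 = [-1.0, -0.6962, -0.5251, -0.3949, -0.2844, -0.1848, -0.0911, 0.0,
           0.0796, 0.1609, 0.2461, 0.3379, 0.4407, 0.5626, 0.7230, 1.0]

    def per_row_scale(tensor, use_abs):
        # Scale so the largest entry (by signed value or by magnitude) maps to max(NF4) == 1.0.
        row_max = tensor.abs().max(1)[0] if use_abs else tensor.max(1)[0]
        return (row_max / max(NF4)).unsqueeze(-1)

    row = torch.tensor([[-4.0, 0.5, 1.0]])            # largest magnitude is negative
    print(row / per_row_scale(row, use_abs=False))    # [[-4.0, 0.5, 1.0]]  -> -4.0 falls outside [-1, 1]
    print(row / per_row_scale(row, use_abs=True))     # [[-1.0, 0.125, 0.25]] -> everything representable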
21 changes: 10 additions & 11 deletions test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py
@@ -239,31 +239,30 @@ def test_RTN_int_quant(self):
         self.assertTrue(isinstance(inc_model.model.fc1, WeightOnlyLinear))
         self.assertTrue(model_size1 / model_size2 > 2)
 
-    def test_RTN_fp4_quant(self):
-        for dtype in ["nf4", "fp4", "fp4_e2m1_bnb", "fp4_e2m1"]:
-            input = torch.randn(3, 30)
-            model = Model()
-            out1 = model(input)
+    def test_RTN_4bit_quant(self):
+        for dtype in ["int4", "nf4", "fp4", "fp4_e2m1_bnb", "fp4_e2m1"]:
+            model = copy.deepcopy(self.gptj)
+            out1 = model(self.lm_input)
             conf = PostTrainingQuantConfig(
                 approach="weight_only",
                 op_type_dict={
                     ".*": { # re.match
                         "weight": {
                             "dtype": dtype, # select from int, nf4, or fp4
                             # nf4/fp4 have fixed bits and scheme.
-                            "group_size": 32, # -1 (per-channel)
+                            "group_size": 64, # -1 (per-channel)
                             "algorithm": "RTN",
                         },
                     },
                 },
             )
             q_model = quantization.fit(model, conf)
-            out2 = q_model(input)
-            self.assertTrue(torch.all(torch.isclose(out1, out2, atol=5e-1)))
-            self.assertFalse(torch.all(out1 == out2))
+            out2 = q_model(self.lm_input)
+            self.assertTrue(torch.all(torch.isclose(out1[0], out2[0], atol=1e-1)))
+            self.assertFalse(torch.all(out1[0] == out2[0]))
             compressed_model = q_model.export_compressed_model()
-            out3 = compressed_model(input)
-            self.assertTrue(torch.all(out3 == out2))
+            out3 = compressed_model(self.lm_input)
+            self.assertTrue(torch.all(out3[0] == out2[0]))
 
     def test_AWQ_quant(self):
         conf = PostTrainingQuantConfig(
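Outside the test harness, the flow this test exercises looks roughly like the following; the tiny nn.Sequential model and tensor shapes are stand-ins (the test itself runs a GPT-J checkpoint kept in self.gptj with a cached self.lm_input), while the config mirrors the diff above:

    import torch
    from neural_compressor import PostTrainingQuantConfig, quantization

    # Stand-in model; any module with nn.Linear layers is handled the same way.
    model = torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.ReLU(), torch.nn.Linear(64, 8))
    example_input = torch.randn(2, 64)
    fp32_out = model(example_input)

    conf = PostTrainingQuantConfig(
        approach="weight_only",
        op_type_dict={
            ".*": {  # re.match: apply to every matching op
                "weight": {
                    "dtype": "nf4",      # or fp4 / fp4_e2m1_bnb / fp4_e2m1 / int4
                    "group_size": 64,    # -1 means per-channel
                    "algorithm": "RTN",  # round-to-nearest, no calibration data needed
                },
            },
        },
    )
    q_model = quantization.fit(model, conf)
    q4_out = q_model(example_input)
    print((fp32_out - q4_out).abs().max())  # expected to stay small; the GPT-J test checks atol=1e-1 on logits

    # Pack the fake-quantized weights into the compressed representation;
    # its outputs should match q_model exactly, as the test asserts.
    compressed_model = q_model.export_compressed_model()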
3 changes: 1 addition & 2 deletions test/algorithm/test_smooth_quant.py
@@ -780,8 +780,7 @@ def calib_func(model):
         output2 = q_model.model(input_ids)
         assert isinstance(q_model.model.fc1, SQLinearWrapper)
         # set a big atol to avoid random issue
-        print(output1, output2)
-        self.assertTrue(torch.allclose(output1, output2, atol=1e-02))
+        self.assertTrue(torch.allclose(output1, output2, atol=2e-02))
 
         q_model.save("saved_result")
         from neural_compressor.utils.pytorch import load
