From 8d2bf278eaa7c51ad0947104974f7c60dd29b64b Mon Sep 17 00:00:00 2001
From: xinhe
Date: Tue, 12 Sep 2023 09:31:18 +0800
Subject: [PATCH] fix bug in nf4/fp4 (#1241)

Signed-off-by: Xin He
---
 .../adaptor/torch_utils/weight_only.py  |  2 +-
 .../test_weight_only_adaptor.py         | 21 +++++++++----------
 test/algorithm/test_smooth_quant.py     |  3 +--
 3 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/neural_compressor/adaptor/torch_utils/weight_only.py b/neural_compressor/adaptor/torch_utils/weight_only.py
index 2400c0ee56d..f3c22691092 100644
--- a/neural_compressor/adaptor/torch_utils/weight_only.py
+++ b/neural_compressor/adaptor/torch_utils/weight_only.py
@@ -78,7 +78,7 @@ def quantize_4bit(tensor, quantile=1.0, data_type="nf4", return_int=False):
     allow_data = FLOAT_MAPPING[data_type]
     allow_data_bit = INT_MAPPING[data_type]
     # get scale and update tensor
-    scale = tensor.max(1)[0] * quantile / max(allow_data)
+    scale = tensor.abs().max(1)[0] * quantile / max(allow_data)
     scale.unsqueeze_(dim=-1)
     tensor = tensor / scale
     mid_data = [(allow_data[i] + allow_data[i + 1]) / 2 for i in range(len(allow_data) - 1)]
diff --git a/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py b/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py
index 186d0b2aa62..9a6d41a61e2 100644
--- a/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py
+++ b/test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py
@@ -239,11 +239,10 @@ def test_RTN_int_quant(self):
         self.assertTrue(isinstance(inc_model.model.fc1, WeightOnlyLinear))
         self.assertTrue(model_size1 / model_size2 > 2)

-    def test_RTN_fp4_quant(self):
-        for dtype in ["nf4", "fp4", "fp4_e2m1_bnb", "fp4_e2m1"]:
-            input = torch.randn(3, 30)
-            model = Model()
-            out1 = model(input)
+    def test_RTN_4bit_quant(self):
+        for dtype in ["int4", "nf4", "fp4", "fp4_e2m1_bnb", "fp4_e2m1"]:
+            model = copy.deepcopy(self.gptj)
+            out1 = model(self.lm_input)
             conf = PostTrainingQuantConfig(
                 approach="weight_only",
                 op_type_dict={
@@ -251,19 +250,19 @@ def test_RTN_fp4_quant(self):
                     "weight": {
                         "dtype": dtype,  # select from int, nf4, or fp4
                         # nf4/fp4 have fixed bits and scheme.
-                        "group_size": 32,  # -1 (per-channel)
+                        "group_size": 64,  # -1 (per-channel)
                         "algorithm": "RTN",
                     },
                 },
             },
         )
         q_model = quantization.fit(model, conf)
-            out2 = q_model(input)
-            self.assertTrue(torch.all(torch.isclose(out1, out2, atol=5e-1)))
-            self.assertFalse(torch.all(out1 == out2))
+            out2 = q_model(self.lm_input)
+            self.assertTrue(torch.all(torch.isclose(out1[0], out2[0], atol=1e-1)))
+            self.assertFalse(torch.all(out1[0] == out2[0]))
             compressed_model = q_model.export_compressed_model()
-            out3 = compressed_model(input)
-            self.assertTrue(torch.all(out3 == out2))
+            out3 = compressed_model(self.lm_input)
+            self.assertTrue(torch.all(out3[0] == out2[0]))

     def test_AWQ_quant(self):
         conf = PostTrainingQuantConfig(
diff --git a/test/algorithm/test_smooth_quant.py b/test/algorithm/test_smooth_quant.py
index 4c774dbca79..af9284caacf 100644
--- a/test/algorithm/test_smooth_quant.py
+++ b/test/algorithm/test_smooth_quant.py
@@ -780,8 +780,7 @@ def calib_func(model):
         output2 = q_model.model(input_ids)
         assert isinstance(q_model.model.fc1, SQLinearWrapper)
         # set a big atol to avoid random issue
-        print(output1, output2)
-        self.assertTrue(torch.allclose(output1, output2, atol=1e-02))
+        self.assertTrue(torch.allclose(output1, output2, atol=2e-02))
         q_model.save("saved_result")
         from neural_compressor.utils.pytorch import load
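
The one-line change in quantize_4bit is the actual bug fix; the test changes
exercise it on a real model and relax a flaky smooth-quant tolerance. The old
code derived each row's scale from its signed maximum, so a row whose
largest-magnitude weight was negative got a scale that was too small (or even
negative), pushing the normalized values outside the range covered by the
nf4/fp4 lookup tables. A minimal standalone sketch of the failure mode follows;
the toy tensor is illustrative, and 1.0 stands in for max(allow_data) (the NF4
levels span [-1, 1]):

    import torch

    # Toy 1 x 3 weight block whose largest-magnitude entry is negative.
    tensor = torch.tensor([[-8.0, 2.0, 1.0]])
    quantile, max_allow = 1.0, 1.0  # max_allow plays the role of max(allow_data)

    # Before the fix: signed per-row max -> scale = 2.0, and
    # -8.0 / 2.0 = -4.0 lands far outside the representable [-1, 1].
    old_scale = tensor.max(1)[0] * quantile / max_allow

    # After the fix: absolute per-row max -> scale = 8.0, so every
    # normalized value falls inside [-1, 1] as the table lookup expects.
    new_scale = tensor.abs().max(1)[0] * quantile / max_allow

    print(tensor / old_scale.unsqueeze(-1))  # tensor([[-4.0000, 1.0000, 0.5000]])
    print(tensor / new_scale.unsqueeze(-1))  # tensor([[-1.0000, 0.2500, 0.1250]])

Because the nf4/fp4 tables span ranges that are symmetric about zero, dividing
by the absolute maximum keeps the scale positive and maps every entry into the
representable range, so negative-dominated rows quantize as faithfully as
positive-dominated ones.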