diff --git a/neural_compressor/adaptor/onnxrt.py b/neural_compressor/adaptor/onnxrt.py
index 037c88a3b15..d81c05a92cc 100644
--- a/neural_compressor/adaptor/onnxrt.py
+++ b/neural_compressor/adaptor/onnxrt.py
@@ -172,6 +172,7 @@ def smooth_quant(
         op_types=["MatMul", "Gemm", "Conv", "FusedConv"],
         scales_per_op=True,
         record_max_info=False,
+        weight_clip=True,
     ):
         """Get augmented model with smooth quant.

diff --git a/neural_compressor/adaptor/pytorch.py b/neural_compressor/adaptor/pytorch.py
index 3d28437e849..e1cd20a8998 100644
--- a/neural_compressor/adaptor/pytorch.py
+++ b/neural_compressor/adaptor/pytorch.py
@@ -1761,6 +1761,7 @@ def smooth_quant(
         scales_per_op=None,
         force_re_smooth=False,
         record_max_info=False,
+        weight_clip=True,
     ):
         """Convert the model by smooth quant.

@@ -1804,7 +1805,9 @@ def smooth_quant(
             kwargs["percentile"] = percentile
         if scales_per_op is not None:
             kwargs["scales_per_op"] = scales_per_op
-        model._model = self.sq.transform(alpha=alpha, folding=folding, calib_iter=calib_iter, **kwargs)
+        model._model = self.sq.transform(
+            alpha=alpha, folding=folding, calib_iter=calib_iter, weight_clip=weight_clip, **kwargs
+        )
         if self.sq.record_max_info:
             model.sq_max_info = self.sq.max_value_info
         return model
@@ -1833,7 +1836,9 @@ def _apply_pre_optimization(self, model, tune_cfg, recover=False):
                 absorb_layer = op_name
                 absorbed_layer = info["absorbed_layer"]
                 input_minmax = info["input_minmax"]
-                weight_max = info["weight_max"].clamp(min=1e-5)
+                weight_max = info["weight_max"]
+                if self.sq.weight_clip:
+                    weight_max = weight_max.clamp(min=1e-5)
                 abs_input_max = torch.max(torch.abs(input_minmax[0]), torch.abs(input_minmax[1]))
                 input_power = torch.pow(abs_input_max, alpha)
                 weight_power = torch.pow(weight_max, 1 - alpha)
@@ -1877,7 +1882,9 @@ def qdq_quantize(self, model, tune_cfg):
                 alpha = info["alpha"]
                 absorbed_layer = info["absorbed_layer"]
                 input_minmax = info["input_minmax"]
-                weight_max = info["weight_max"].clamp(min=1e-5)
+                weight_max = info["weight_max"]
+                if self.sq.weight_clip:
+                    weight_max = weight_max.clamp(min=1e-5)
                 abs_input_max = torch.max(torch.abs(input_minmax[0]), torch.abs(input_minmax[1]))
                 input_power = torch.pow(abs_input_max, alpha)
                 weight_power = torch.pow(weight_max, 1 - alpha)
@@ -3279,7 +3286,9 @@ def qdq_quantize(self, model, q_model, tune_cfg, dataloader, q_func):
                 absorbed_layer = info["absorbed_layer"]
                 input_minmax = info["input_minmax"]
                 # for peft model,lora_B weights is 0.
-                weight_max = info["weight_max"].clamp(min=1e-5)
+                weight_max = info["weight_max"]
+                if self.sq.weight_clip:
+                    weight_max = weight_max.clamp(min=1e-5)
                 abs_input_max = torch.max(torch.abs(input_minmax[0]), torch.abs(input_minmax[1]))
                 input_power = torch.pow(abs_input_max, alpha)
                 weight_power = torch.pow(weight_max, 1 - alpha)
diff --git a/neural_compressor/adaptor/tensorflow.py b/neural_compressor/adaptor/tensorflow.py
index b28cf65175f..338fbfe82e8 100644
--- a/neural_compressor/adaptor/tensorflow.py
+++ b/neural_compressor/adaptor/tensorflow.py
@@ -1822,6 +1822,7 @@ def smooth_quant(
         op_types=["MatMul", "Conv2D"],
         scales_per_op=True,
         record_max_info=False,
+        weight_clip=True,
     ):
         """Convert the model by smooth quant.
diff --git a/neural_compressor/adaptor/torch_utils/smooth_quant.py b/neural_compressor/adaptor/torch_utils/smooth_quant.py
index b91b95e8563..d2b15cae0f0 100644
--- a/neural_compressor/adaptor/torch_utils/smooth_quant.py
+++ b/neural_compressor/adaptor/torch_utils/smooth_quant.py
@@ -330,6 +330,7 @@ def __init__(self, model, dataloader=None, example_inputs=None, q_func=None, tra
         self.self_absorb_layers = {}
         self.absorb_to_layer = {}
         self.adjust_alpha_space = False
+        self.weight_clip = True

     def _get_device(self):
         """Get the model device
@@ -577,6 +578,8 @@ def _cal_scales(self, absorb_to_layer, input_maxes, alpha=0.5, tuning=False):
                 weights.append(weight)

             weight_max_per_channel = torch.max(torch.abs(torch.cat(weights, dim=0)), dim=0)[0]
+            if self.weight_clip:
+                weight_max_per_channel = weight_max_per_channel.clamp(min=1e-5)
             if self.record_max_info and not tuning:
                 # the input of layers with same absorb layer is the same.
                 input_minmax = [self.input_mins[layer_names[0]], self.input_maxes[layer_names[0]]]
@@ -946,6 +949,7 @@ def transform(
         scales_per_op=False,
         calib_iter=100,
         auto_alpha_args={"alpha_min": 0.0, "alpha_max": 1.0, "alpha_step": 0.1, "shared_criterion": "mean"},
+        weight_clip=True,
     ):
         """The main entry of smooth quant
         :param alpha: Alpha value to balance the quantization difficulty of activation and weight, please refer
@@ -971,6 +975,7 @@ def transform(

         alpha = numpy.clip(alpha, 0.0, 1.0)

+        self.weight_clip = weight_clip
         self.recover()
         need_calibration = self._check_need_calibration(alpha, percentile, op_types, scales_per_op, calib_iter)
         with torch.no_grad():
diff --git a/neural_compressor/algorithm/smooth_quant.py b/neural_compressor/algorithm/smooth_quant.py
index faffca4c2e7..89848e545c6 100644
--- a/neural_compressor/algorithm/smooth_quant.py
+++ b/neural_compressor/algorithm/smooth_quant.py
@@ -52,6 +52,7 @@ def __init__(self, alpha=0.5):
         self.op_types = None
         self.scales_per_op = None
         self.tune_cfg = None
+        self.weight_clip = None

     def __call__(self, origin_model, q_model, adaptor, dataloader, calib_iter):
         """Return the processed model via SmoothQuant algorithm.
@@ -80,6 +81,7 @@ def __call__(self, origin_model, q_model, adaptor, dataloader, calib_iter):
         kwargs["scales_per_op"] = self.scales_per_op
         kwargs["folding"] = self.folding
         kwargs["record_max_info"] = True
+        kwargs["weight_clip"] = self.weight_clip
         q_model = adaptor.smooth_quant(
             origin_model,
             dataloader,
diff --git a/neural_compressor/strategy/strategy.py b/neural_compressor/strategy/strategy.py
index a81b3d16b17..c07f6552051 100644
--- a/neural_compressor/strategy/strategy.py
+++ b/neural_compressor/strategy/strategy.py
@@ -948,6 +948,9 @@ def set_param_for_pre_tuning_algos(self, algo_scheduler, config, fp32_model) ->
            if self.framework == "pytorch_ipex":
                smooth_quant_args["folding"] = None  # will reset it to True if IPEX version < 2.1.
            sq_algo.folding = smooth_quant_args["folding"]
+           sq_algo.weight_clip = smooth_quant_args.get(
+               "weight_clip", True
+           )  # make weight_clip a default-on option.
logger.debug(f"Set smooth quant with alpha {sq_algo.alpha} as the pre-tuning algo.") algo_scheduler.append_algorithm("pre_quantization", sq_algo) diff --git a/test/algorithm/test_smooth_quant.py b/test/algorithm/test_smooth_quant.py index 3f0cb63a4d9..c4bfc646bae 100644 --- a/test/algorithm/test_smooth_quant.py +++ b/test/algorithm/test_smooth_quant.py @@ -1427,5 +1427,41 @@ def calib_func(model): out2 = q_model.model(example_input)[0] +class TestInputConfig(unittest.TestCase): + @classmethod + def setUpClass(self): + class RandDataloader: + def __init__(self): + self.batch_size = 1 + + def __iter__(self): + yield torch.rand((1, 3)) + + self.linear_dl = RandDataloader() + + @classmethod + def test_sq_weight_clipping(self): + class Model(torch.nn.Module): + device = torch.device("cpu") + + def __init__(self): + super(Model, self).__init__() + self.fc1 = torch.nn.Linear(3, 4) + self.norm = LlamaRMSNorm(4) + self.fc2 = torch.nn.Linear(4, 3) + + def forward(self, x): + out = self.fc1(x) + out = self.norm(out) + out = self.fc2(out) + return out + + model = Model() + + sq = TorchSmoothQuant(model, self.linear_dl) + sq.transform(alpha="auto", calib_iter=1, folding=True, weight_clip=False) + assert sq.weight_clip is False + + if __name__ == "__main__": unittest.main()