From 7057e3b2a3ae1f9911516fb173287e1832f77afd Mon Sep 17 00:00:00 2001
From: "Wang, Mengni"
Date: Fri, 17 Nov 2023 10:32:58 +0800
Subject: [PATCH] add attr to MatMulNBits (#1378)

Signed-off-by: Mengni Wang
---
 neural_compressor/adaptor/onnxrt.py      | 11 ++++++-
 .../adaptor/ox_utils/weight_only.py      | 32 +++++++++++++++++--
 2 files changed, 40 insertions(+), 3 deletions(-)

diff --git a/neural_compressor/adaptor/onnxrt.py b/neural_compressor/adaptor/onnxrt.py
index 142ceb37ad9..10f173d981b 100644
--- a/neural_compressor/adaptor/onnxrt.py
+++ b/neural_compressor/adaptor/onnxrt.py
@@ -1666,6 +1666,7 @@ def quantize(self, tune_cfg, model, data_loader, q_func=None):
             actorder = self.recipes.get("gptq_args", {}).get("actorder", False)
             mse = self.recipes.get("gptq_args", {}).get("mse", False)
             perchannel = self.recipes.get("gptq_args", {}).get("perchannel", True)
+            accuracy_level = self.recipes.get("gptq_args", {}).get("accuracy_level", 0)
             calib_sampling_size = tune_cfg.get("calib_sampling_size", 1)
             tmp_model = gptq_quantize(
                 tmp_model,
@@ -1677,6 +1678,7 @@ def quantize(self, tune_cfg, model, data_loader, q_func=None):
                 actorder=actorder,
                 mse=mse,
                 perchannel=perchannel,
+                accuracy_level=accuracy_level,
             )
         if "AWQ" in algos:
             from neural_compressor.adaptor.ox_utils.weight_only import awq_quantize
@@ -1684,6 +1686,7 @@ def quantize(self, tune_cfg, model, data_loader, q_func=None):
             assert data_loader is not None, "AWQ WOQ algorithm needs to pass 'calib_dataloader' to quantization.fit()"
             enable_auto_scale = self.recipes.get("awq_args", {}).get("enable_auto_scale", True)
             enable_mse_search = self.recipes.get("awq_args", {}).get("enable_mse_search", True)
+            accuracy_level = self.recipes.get("awq_args", {}).get("accuracy_level", 0)
             calib_sampling_size = tune_cfg.get("calib_sampling_size", 1)
             tmp_model = awq_quantize(
                 tmp_model,
@@ -1692,11 +1695,17 @@ def quantize(self, tune_cfg, model, data_loader, q_func=None):
                 n_samples=calib_sampling_size,
                 enable_auto_scale=enable_auto_scale,
                 enable_mse_search=enable_mse_search,
+                accuracy_level=accuracy_level,
             )
         elif "RTN" in algos:
             from neural_compressor.adaptor.ox_utils.weight_only import rtn_quantize

-            tmp_model = rtn_quantize(tmp_model, quant_config)
+            accuracy_level = self.recipes.get("rtn_args", {}).get("accuracy_level", 0)
+            tmp_model = rtn_quantize(
+                tmp_model,
+                quant_config,
+                accuracy_level=accuracy_level,
+            )
         tmp_model.q_config = copy.deepcopy(quant_config)
         self._dump_model_op_stats(tmp_model, tune_cfg)
         tmp_model.topological_sort()
diff --git a/neural_compressor/adaptor/ox_utils/weight_only.py b/neural_compressor/adaptor/ox_utils/weight_only.py
index 5138f827fd8..a7f6170f51b 100644
--- a/neural_compressor/adaptor/ox_utils/weight_only.py
+++ b/neural_compressor/adaptor/ox_utils/weight_only.py
@@ -56,7 +56,15 @@ def get_blob_size(group_size, has_zp):  # pragma: no cover


 def make_matmul_weight_only_node(
-    node, weight_shape, num_bits, group_size, k_blocks, q_weight, scale, zero_point
+    node,
+    weight_shape,
+    num_bits,
+    group_size,
+    k_blocks,
+    q_weight,
+    scale,
+    zero_point,
+    accuracy_level=0,
 ):  # pragma: no cover
     """Build MatMulFpQ4 node.

@@ -69,6 +77,9 @@ def make_matmul_weight_only_node(
         q_weight (array): quantized weight
         scale (array): scale
         zero_point (array): zero point
+        accuracy_level (int): accuracy level. Supports 0 (unset), 1 (fp32 compute type of jblas kernel),
+            2 (fp16 compute type of jblas kernel), 3 (bf16 compute type of jblas kernel),
+            4 (int8 compute type of jblas kernel)

     Returns:
         matmul_weight_only_node: MatMulFpQ4 or MatMulNBits node
@@ -125,6 +136,9 @@ def make_matmul_weight_only_node(
         kwargs["N"] = weight_shape[1]
         kwargs["bits"] = num_bits
         kwargs["block_size"] = group_size
+        if accuracy_level > 0:
+            # requires onnxruntime > 1.16.2
+            kwargs["accuracy_level"] = accuracy_level

     else:
         offset = 5 if zero_point is not None else 4
@@ -274,6 +288,7 @@ def rtn_quantize(
     group_size=32,
     scheme="asym",
     ratios={},
+    accuracy_level=0,
 ):
     """Quant the model with round to nearst method.

@@ -294,6 +309,9 @@ def rtn_quantize(
         group_size (int, optional): how many elements share one scale/zp. Default is 32.
         scheme (str, optional): sym or asym. Defaults to "asym".
         ratios (dict, optional): percentile of clip. Defaults to {}.
+        accuracy_level (int): accuracy level. Supports 0 (unset), 1 (fp32 compute type of jblas kernel),
+            2 (fp16 compute type of jblas kernel), 3 (bf16 compute type of jblas kernel),
+            4 (int8 compute type of jblas kernel)

     Returns:
         model: fake quantized ONNXModel
@@ -344,6 +362,7 @@ def rtn_quantize(
                 q_weight=q_weight.astype("uint8"),
                 scale=scale,
                 zero_point=zp if scheme == "asym" else None,
+                accuracy_level=accuracy_level,
             )

             model.add_initializers(new_inits)
@@ -664,6 +683,7 @@ def awq_quantize(
     n_samples=128,
     enable_auto_scale=True,
     enable_mse_search=True,
+    accuracy_level=0,
 ):
     """Quant the model with Activation-aware Weight quantization(AWQ) method.

@@ -687,6 +707,9 @@ def awq_quantize(
         n_samples (int, optional): calibration sample number.
         enable_auto_scale (bool, optional): whether enable scale for salient weight. Defaults to True.
         enable_mse_search (bool, optional): whether enable clip for weight by checking mse. Defaults to True.
+        accuracy_level (int): accuracy level. Supports 0 (unset), 1 (fp32 compute type of jblas kernel),
+            2 (fp16 compute type of jblas kernel), 3 (bf16 compute type of jblas kernel),
+            4 (int8 compute type of jblas kernel)

     Returns:
         model: fake quantized ONNXModel
@@ -773,7 +796,7 @@ def awq_quantize(
         model.remove_tensors_from_outputs(output_names)
         model.model.graph.output.MergeFrom(org_output)

-    model = rtn_quantize(model, weight_config, num_bits, group_size, scheme, full_ratio)
+    model = rtn_quantize(model, weight_config, num_bits, group_size, scheme, full_ratio, accuracy_level)

     return model

@@ -934,6 +957,7 @@ def gptq_quantize(
     actorder=False,
     mse=False,
     perchannel=True,
+    accuracy_level=0,
 ):
     """Quant the model with GPTQ method.

@@ -960,6 +984,9 @@ def gptq_quantize(
         actorder (bool, optional): whether rearrange Hessian matrix considering the diag's value.
         mse (bool, optional): whether get scale and zero point with mse error.
         perchannel (bool, optional): whether quantize weight per-channel.
+        accuracy_level (int): accuracy level. Supports 0 (unset), 1 (fp32 compute type of jblas kernel),
+            2 (fp16 compute type of jblas kernel), 3 (bf16 compute type of jblas kernel),
+            4 (int8 compute type of jblas kernel)

     Returns:
         model: fake quantized ONNXModel
@@ -1076,6 +1103,7 @@ def gptq_quantize(
             q_weight=q_weight.astype("uint8"),
             scale=scale,
             zero_point=zp if scheme == "asym" else None,
+            accuracy_level=accuracy_level,
         )

         model.add_initializers(new_inits)
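
A minimal usage sketch of the new knob, assuming the neural_compressor 2.x
PostTrainingQuantConfig/quantization.fit API; the model path, the choice of the
"rtn_args" recipe, and the saved-output check are illustrative, not part of this patch:

    # Route accuracy_level=4 (int8 compute type of jblas kernel) through the
    # "rtn_args" recipe read by the adaptor above; needs onnxruntime > 1.16.2.
    from neural_compressor import quantization
    from neural_compressor.config import PostTrainingQuantConfig

    config = PostTrainingQuantConfig(
        approach="weight_only",
        recipes={"rtn_args": {"accuracy_level": 4}},
    )
    q_model = quantization.fit("model.onnx", config)  # "model.onnx" is a placeholder path
    q_model.save("quantized_model.onnx")

    # Confirm the attribute landed on the MatMulNBits nodes.
    import onnx

    for node in onnx.load("quantized_model.onnx").graph.node:
        if node.op_type == "MatMulNBits":
            print(node.name, [(a.name, a.i) for a in node.attribute if a.name == "accuracy_level"])

With the default accuracy_level=0 the attribute is not attached at all (the
make_matmul_weight_only_node guard only fires for accuracy_level > 0), so models
produced for older onnxruntime versions keep the previous behavior.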