From 9887c6cb3b1f4435c1167c0e94c9dd079c5005ed Mon Sep 17 00:00:00 2001
From: Mengni Wang
Date: Thu, 2 Nov 2023 16:24:20 +0800
Subject: [PATCH 1/6] add attr to MatMulNBits

Signed-off-by: Mengni Wang
---
 neural_compressor/adaptor/onnxrt.py           | 11 ++++++++++-
 .../adaptor/ox_utils/weight_only.py           | 22 ++++++++++++++++++++--
 2 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/neural_compressor/adaptor/onnxrt.py b/neural_compressor/adaptor/onnxrt.py
index 037c88a3b15..826fe8f8cf9 100644
--- a/neural_compressor/adaptor/onnxrt.py
+++ b/neural_compressor/adaptor/onnxrt.py
@@ -1659,6 +1659,7 @@ def quantize(self, tune_cfg, model, data_loader, q_func=None):
             actorder = self.recipes.get("gptq_args", {}).get("actorder", False)
             mse = self.recipes.get("gptq_args", {}).get("mse", False)
             perchannel = self.recipes.get("gptq_args", {}).get("perchannel", True)
+            compute_type = self.recipes.get("gptq_args", {}).get("compute_type", -1)
             calib_sampling_size = tune_cfg.get("calib_sampling_size", 1)
             tmp_model = gptq_quantize(
                 tmp_model,
@@ -1670,6 +1671,7 @@ def quantize(self, tune_cfg, model, data_loader, q_func=None):
                 actorder=actorder,
                 mse=mse,
                 perchannel=perchannel,
+                compute_type=compute_type,
             )
         if "AWQ" in algos:
             from neural_compressor.adaptor.ox_utils.weight_only import awq_quantize
@@ -1677,6 +1679,7 @@ def quantize(self, tune_cfg, model, data_loader, q_func=None):
             assert data_loader is not None, "AWQ WOQ algorithm needs to pass 'calib_dataloader' to quantization.fit()"
             enable_auto_scale = self.recipes.get("awq_args", {}).get("enable_auto_scale", True)
             enable_mse_search = self.recipes.get("awq_args", {}).get("enable_mse_search", True)
+            compute_type = self.recipes.get("awq_args", {}).get("compute_type", -1)
             calib_sampling_size = tune_cfg.get("calib_sampling_size", 1)
             tmp_model = awq_quantize(
                 tmp_model,
@@ -1685,11 +1688,17 @@ def quantize(self, tune_cfg, model, data_loader, q_func=None):
                 n_samples=calib_sampling_size,
                 enable_auto_scale=enable_auto_scale,
                 enable_mse_search=enable_mse_search,
+                compute_type=compute_type,
             )
         elif "RTN" in algos:
             from neural_compressor.adaptor.ox_utils.weight_only import rtn_quantize

-            tmp_model = rtn_quantize(tmp_model, quant_config)
+            compute_type = self.recipes.get("rtn_args", {}).get("compute_type", -1)
+            tmp_model = rtn_quantize(
+                tmp_model,
+                quant_config,
+                compute_type=compute_type,
+            )
             tmp_model.q_config = copy.deepcopy(quant_config)
         self._dump_model_op_stats(tmp_model, tune_cfg)
         tmp_model.topological_sort()
diff --git a/neural_compressor/adaptor/ox_utils/weight_only.py b/neural_compressor/adaptor/ox_utils/weight_only.py
index 5138f827fd8..97028bee22f 100644
--- a/neural_compressor/adaptor/ox_utils/weight_only.py
+++ b/neural_compressor/adaptor/ox_utils/weight_only.py
@@ -56,7 +56,15 @@ def get_blob_size(group_size, has_zp):  # pragma: no cover


 def make_matmul_weight_only_node(
-    node, weight_shape, num_bits, group_size, k_blocks, q_weight, scale, zero_point
+    node,
+    weight_shape,
+    num_bits,
+    group_size,
+    k_blocks,
+    q_weight,
+    scale,
+    zero_point,
+    compute_type=-1,
 ):  # pragma: no cover
     """Build MatMulFpQ4 node.
@@ -69,6 +77,7 @@ def make_matmul_weight_only_node(
         q_weight (array): quantized weight
         scale (array): scale
         zero_point (array): zero point
+        compute_type (int): compute type

     Returns:
         matmul_weight_only_node: MatMulFpQ4 or MatMulNBits node
@@ -125,6 +134,7 @@ def make_matmul_weight_only_node(
         kwargs["N"] = weight_shape[1]
         kwargs["bits"] = num_bits
         kwargs["block_size"] = group_size
+        kwargs["compute_type"] = compute_type
     else:
         offset = 5 if zero_point is not None else 4
@@ -274,6 +284,7 @@ def rtn_quantize(
     group_size=32,
     scheme="asym",
     ratios={},
+    compute_type=-1,
 ):
     """Quantize the model with the round-to-nearest method.
@@ -294,6 +305,7 @@ def rtn_quantize(
         group_size (int, optional): how many elements share one scale/zp. Default is 32.
         scheme (str, optional): sym or asym. Defaults to "asym".
         ratios (dict, optional): percentile of clip. Defaults to {}.
+        compute_type (int): compute type

     Returns:
         model: fake quantized ONNXModel
@@ -344,6 +356,7 @@ def rtn_quantize(
                 q_weight=q_weight.astype("uint8"),
                 scale=scale,
                 zero_point=zp if scheme == "asym" else None,
+                compute_type=compute_type,
             )

             model.add_initializers(new_inits)
@@ -664,6 +677,7 @@ def awq_quantize(
     n_samples=128,
     enable_auto_scale=True,
     enable_mse_search=True,
+    compute_type=-1,
 ):
     """Quantize the model with the Activation-aware Weight Quantization (AWQ) method.
@@ -687,6 +701,7 @@ def awq_quantize(
         n_samples (int, optional): calibration sample number.
         enable_auto_scale (bool, optional): whether to enable scale for salient weight. Defaults to True.
         enable_mse_search (bool, optional): whether to enable clip for weight by checking mse. Defaults to True.
+        compute_type (int): compute type

     Returns:
         model: fake quantized ONNXModel
@@ -773,7 +788,7 @@ def awq_quantize(
         model.remove_tensors_from_outputs(output_names)
         model.model.graph.output.MergeFrom(org_output)

-    model = rtn_quantize(model, weight_config, num_bits, group_size, scheme, full_ratio)
+    model = rtn_quantize(model, weight_config, num_bits, group_size, scheme, full_ratio, compute_type)
     return model
@@ -934,6 +949,7 @@ def gptq_quantize(
     actorder=False,
     mse=False,
     perchannel=True,
+    compute_type=-1,
 ):
     """Quantize the model with the GPTQ method.
@@ -960,6 +976,7 @@ def gptq_quantize(
         actorder (bool, optional): whether to rearrange Hessian matrix considering the diag's value.
         mse (bool, optional): whether to get scale and zero point with mse error.
         perchannel (bool, optional): whether to quantize weight per-channel.
+        compute_type (int): compute type

     Returns:
         model: fake quantized ONNXModel
@@ -1076,6 +1093,7 @@ def gptq_quantize(
                 q_weight=q_weight.astype("uint8"),
                 scale=scale,
                 zero_point=zp if scheme == "asym" else None,
+                compute_type=compute_type,
             )

             model.add_initializers(new_inits)

From 27fbe7ca06546fab1673754d85b3463897ce6f48 Mon Sep 17 00:00:00 2001
From: "Wang, Mengni"
Date: Fri, 10 Nov 2023 11:00:36 +0800
Subject: [PATCH 2/6] Update onnxrt.py

---
 neural_compressor/adaptor/onnxrt.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/neural_compressor/adaptor/onnxrt.py b/neural_compressor/adaptor/onnxrt.py
index 826fe8f8cf9..2525ada1aca 100644
--- a/neural_compressor/adaptor/onnxrt.py
+++ b/neural_compressor/adaptor/onnxrt.py
@@ -1659,7 +1659,7 @@ def quantize(self, tune_cfg, model, data_loader, q_func=None):
             actorder = self.recipes.get("gptq_args", {}).get("actorder", False)
             mse = self.recipes.get("gptq_args", {}).get("mse", False)
             perchannel = self.recipes.get("gptq_args", {}).get("perchannel", True)
-            compute_type = self.recipes.get("gptq_args", {}).get("compute_type", -1)
+            accuracy_level = self.recipes.get("gptq_args", {}).get("accuracy_level", 0)
             calib_sampling_size = tune_cfg.get("calib_sampling_size", 1)
             tmp_model = gptq_quantize(
                 tmp_model,
@@ -1671,7 +1671,7 @@ def quantize(self, tune_cfg, model, data_loader, q_func=None):
                 actorder=actorder,
                 mse=mse,
                 perchannel=perchannel,
-                compute_type=compute_type,
+                accuracy_level=accuracy_level,
             )
         if "AWQ" in algos:
             from neural_compressor.adaptor.ox_utils.weight_only import awq_quantize
@@ -1679,7 +1679,7 @@ def quantize(self, tune_cfg, model, data_loader, q_func=None):
             assert data_loader is not None, "AWQ WOQ algorithm needs to pass 'calib_dataloader' to quantization.fit()"
             enable_auto_scale = self.recipes.get("awq_args", {}).get("enable_auto_scale", True)
             enable_mse_search = self.recipes.get("awq_args", {}).get("enable_mse_search", True)
-            compute_type = self.recipes.get("awq_args", {}).get("compute_type", -1)
+            accuracy_level = self.recipes.get("awq_args", {}).get("accuracy_level", 0)
             calib_sampling_size = tune_cfg.get("calib_sampling_size", 1)
             tmp_model = awq_quantize(
                 tmp_model,
@@ -1688,16 +1688,16 @@ def quantize(self, tune_cfg, model, data_loader, q_func=None):
                 n_samples=calib_sampling_size,
                 enable_auto_scale=enable_auto_scale,
                 enable_mse_search=enable_mse_search,
-                compute_type=compute_type,
+                accuracy_level=accuracy_level,
             )
         elif "RTN" in algos:
             from neural_compressor.adaptor.ox_utils.weight_only import rtn_quantize

-            compute_type = self.recipes.get("rtn_args", {}).get("compute_type", -1)
+            accuracy_level = self.recipes.get("rtn_args", {}).get("accuracy_level", 0)
             tmp_model = rtn_quantize(
                 tmp_model,
                 quant_config,
-                compute_type=compute_type,
+                accuracy_level=accuracy_level,
             )
             tmp_model.q_config = copy.deepcopy(quant_config)
         self._dump_model_op_stats(tmp_model, tune_cfg)

From e18fc75406192db1dcee80e03fc8c2b03603e970 Mon Sep 17 00:00:00 2001
From: "Wang, Mengni"
Date: Fri, 10 Nov 2023 11:03:01 +0800
Subject: [PATCH 3/6] Update weight_only.py

---
 .../adaptor/ox_utils/weight_only.py | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/neural_compressor/adaptor/ox_utils/weight_only.py b/neural_compressor/adaptor/ox_utils/weight_only.py
index 97028bee22f..453555ccc34 100644
--- a/neural_compressor/adaptor/ox_utils/weight_only.py
+++ b/neural_compressor/adaptor/ox_utils/weight_only.py
@@ -64,7 +64,7 @@ def make_matmul_weight_only_node(
     q_weight,
     scale,
     zero_point,
-    compute_type=-1,
+    accuracy_level=-1,
 ):  # pragma: no cover
     """Build MatMulFpQ4 node.
@@ -77,7 +77,7 @@ def make_matmul_weight_only_node(
         q_weight (array): quantized weight
         scale (array): scale
         zero_point (array): zero point
-        compute_type (int): compute type
+        accuracy_level (int): accuracy level

     Returns:
         matmul_weight_only_node: MatMulFpQ4 or MatMulNBits node
@@ -134,7 +134,8 @@ def make_matmul_weight_only_node(
         kwargs["N"] = weight_shape[1]
         kwargs["bits"] = num_bits
         kwargs["block_size"] = group_size
-        kwargs["compute_type"] = compute_type
+        if accuracy_level > 0:
+            kwargs["accuracy_level"] = accuracy_level
     else:
         offset = 5 if zero_point is not None else 4
@@ -284,7 +285,7 @@ def rtn_quantize(
     group_size=32,
     scheme="asym",
     ratios={},
-    compute_type=-1,
+    accuracy_level=0,
 ):
     """Quantize the model with the round-to-nearest method.
@@ -305,7 +306,7 @@ def rtn_quantize(
         group_size (int, optional): how many elements share one scale/zp. Default is 32.
         scheme (str, optional): sym or asym. Defaults to "asym".
         ratios (dict, optional): percentile of clip. Defaults to {}.
-        compute_type (int): compute type
+        accuracy_level (int): accuracy level

     Returns:
         model: fake quantized ONNXModel
@@ -356,7 +357,7 @@ def rtn_quantize(
                 q_weight=q_weight.astype("uint8"),
                 scale=scale,
                 zero_point=zp if scheme == "asym" else None,
-                compute_type=compute_type,
+                accuracy_level=accuracy_level,
             )

             model.add_initializers(new_inits)
@@ -677,7 +678,7 @@ def awq_quantize(
     n_samples=128,
     enable_auto_scale=True,
     enable_mse_search=True,
-    compute_type=-1,
+    accuracy_level=0,
 ):
     """Quantize the model with the Activation-aware Weight Quantization (AWQ) method.
@@ -701,7 +702,7 @@ def awq_quantize(
         n_samples (int, optional): calibration sample number.
         enable_auto_scale (bool, optional): whether to enable scale for salient weight. Defaults to True.
         enable_mse_search (bool, optional): whether to enable clip for weight by checking mse. Defaults to True.
-        compute_type (int): compute type
+        accuracy_level (int): accuracy level

     Returns:
         model: fake quantized ONNXModel
@@ -788,7 +789,7 @@ def awq_quantize(
         model.remove_tensors_from_outputs(output_names)
         model.model.graph.output.MergeFrom(org_output)

-    model = rtn_quantize(model, weight_config, num_bits, group_size, scheme, full_ratio, compute_type)
+    model = rtn_quantize(model, weight_config, num_bits, group_size, scheme, full_ratio, accuracy_level)
     return model
@@ -949,7 +950,7 @@ def gptq_quantize(
     actorder=False,
     mse=False,
     perchannel=True,
-    compute_type=-1,
+    accuracy_level=0,
 ):
     """Quantize the model with the GPTQ method.
@@ -976,7 +977,7 @@ def gptq_quantize(
         actorder (bool, optional): whether to rearrange Hessian matrix considering the diag's value.
         mse (bool, optional): whether to get scale and zero point with mse error.
         perchannel (bool, optional): whether to quantize weight per-channel.
-        compute_type (int): compute type
+        accuracy_level (int): accuracy level

     Returns:
         model: fake quantized ONNXModel
@@ -1093,7 +1094,7 @@ def gptq_quantize(
                 q_weight=q_weight.astype("uint8"),
                 scale=scale,
                 zero_point=zp if scheme == "asym" else None,
-                compute_type=compute_type,
+                accuracy_level=accuracy_level,
             )

             model.add_initializers(new_inits)

From 060a0610724c18dd58b673eb37af373f3425b05a Mon Sep 17 00:00:00 2001
From: "Wang, Mengni"
Date: Fri, 10 Nov 2023 11:03:38 +0800
Subject: [PATCH 4/6] Update weight_only.py

---
 neural_compressor/adaptor/ox_utils/weight_only.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/neural_compressor/adaptor/ox_utils/weight_only.py b/neural_compressor/adaptor/ox_utils/weight_only.py
index 453555ccc34..eccd9c19981 100644
--- a/neural_compressor/adaptor/ox_utils/weight_only.py
+++ b/neural_compressor/adaptor/ox_utils/weight_only.py
@@ -64,7 +64,7 @@ def make_matmul_weight_only_node(
     q_weight,
     scale,
     zero_point,
-    accuracy_level=-1,
+    accuracy_level=0,
 ):  # pragma: no cover
     """Build MatMulFpQ4 node.

From b47814be78009c9d6bbda5b144532c102d316614 Mon Sep 17 00:00:00 2001
From: "Wang, Mengni"
Date: Thu, 16 Nov 2023 20:20:33 +0800
Subject: [PATCH 5/6] Update weight_only.py

---
 .../adaptor/ox_utils/weight_only.py | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/neural_compressor/adaptor/ox_utils/weight_only.py b/neural_compressor/adaptor/ox_utils/weight_only.py
index eccd9c19981..d737ab146ca 100644
--- a/neural_compressor/adaptor/ox_utils/weight_only.py
+++ b/neural_compressor/adaptor/ox_utils/weight_only.py
@@ -77,7 +77,9 @@ def make_matmul_weight_only_node(
         q_weight (array): quantized weight
         scale (array): scale
         zero_point (array): zero point
-        accuracy_level (int): accuracy level
+        accuracy_level (int): accuracy level. Support 0 (unset), 1 (fp32 compute type of jblas kernel),
+            2 (fp16 compute type of jblas kernel), 3 (bf16 compute type of jblas kernel),
+            4 (int8 compute type of jblas kernel)

     Returns:
         matmul_weight_only_node: MatMulFpQ4 or MatMulNBits node
@@ -134,7 +136,8 @@ def make_matmul_weight_only_node(
         kwargs["N"] = weight_shape[1]
         kwargs["bits"] = num_bits
         kwargs["block_size"] = group_size
-        if accuracy_level > 0:
+        if accuracy_level > 0:  # pragma: no cover
+            # require onnxruntime > 1.16.2
             kwargs["accuracy_level"] = accuracy_level
     else:
@@ -306,7 +309,9 @@ def rtn_quantize(
         group_size (int, optional): how many elements share one scale/zp. Default is 32.
         scheme (str, optional): sym or asym. Defaults to "asym".
         ratios (dict, optional): percentile of clip. Defaults to {}.
-        accuracy_level (int): accuracy level
+        accuracy_level (int): accuracy level. Support 0 (unset), 1 (fp32 compute type of jblas kernel),
+            2 (fp16 compute type of jblas kernel), 3 (bf16 compute type of jblas kernel),
+            4 (int8 compute type of jblas kernel)

     Returns:
         model: fake quantized ONNXModel
@@ -702,7 +707,9 @@ def awq_quantize(
         n_samples (int, optional): calibration sample number.
         enable_auto_scale (bool, optional): whether to enable scale for salient weight. Defaults to True.
         enable_mse_search (bool, optional): whether to enable clip for weight by checking mse. Defaults to True.
-        accuracy_level (int): accuracy level
+        accuracy_level (int): accuracy level. Support 0 (unset), 1 (fp32 compute type of jblas kernel),
+            2 (fp16 compute type of jblas kernel), 3 (bf16 compute type of jblas kernel),
+            4 (int8 compute type of jblas kernel)

     Returns:
         model: fake quantized ONNXModel
@@ -977,7 +984,9 @@ def gptq_quantize(
         actorder (bool, optional): whether to rearrange Hessian matrix considering the diag's value.
         mse (bool, optional): whether to get scale and zero point with mse error.
         perchannel (bool, optional): whether to quantize weight per-channel.
-        accuracy_level (int): accuracy level
+        accuracy_level (int): accuracy level. Support 0 (unset), 1 (fp32 compute type of jblas kernel),
+            2 (fp16 compute type of jblas kernel), 3 (bf16 compute type of jblas kernel),
+            4 (int8 compute type of jblas kernel)

     Returns:
         model: fake quantized ONNXModel

From 22a7697ed23cb2cc86a056c607619a757f5009bb Mon Sep 17 00:00:00 2001
From: "Wang, Mengni"
Date: Thu, 16 Nov 2023 20:21:32 +0800
Subject: [PATCH 6/6] Update weight_only.py

---
 neural_compressor/adaptor/ox_utils/weight_only.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/neural_compressor/adaptor/ox_utils/weight_only.py b/neural_compressor/adaptor/ox_utils/weight_only.py
index d737ab146ca..a7f6170f51b 100644
--- a/neural_compressor/adaptor/ox_utils/weight_only.py
+++ b/neural_compressor/adaptor/ox_utils/weight_only.py
@@ -136,7 +136,7 @@ def make_matmul_weight_only_node(
         kwargs["N"] = weight_shape[1]
         kwargs["bits"] = num_bits
         kwargs["block_size"] = group_size
-        if accuracy_level > 0:  # pragma: no cover
+        if accuracy_level > 0:
             # require onnxruntime > 1.16.2
             kwargs["accuracy_level"] = accuracy_level
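
Taken together, the series leaves make_matmul_weight_only_node with an accuracy_level argument (default 0) that is written onto the MatMulNBits node only when positive, since onnxruntime releases up to 1.16.2 reject the attribute. Below is a minimal sketch of that final behavior, assuming the helper forwards its kwargs to onnx.helper.make_node as node attributes (as the kwargs["N"]/kwargs["bits"] handling in the hunks suggests); the function name and argument list here are illustrative, not the library's exact signature:

    from onnx import helper

    def make_matmul_nbits_node(name, inputs, output, K, N, bits=4, block_size=32, accuracy_level=0):
        # Attributes that MatMulNBits always carries.
        kwargs = {"K": K, "N": N, "bits": bits, "block_size": block_size}
        if accuracy_level > 0:
            # Attach the compute-type hint only when explicitly requested;
            # per the patch comment, reading it requires onnxruntime > 1.16.2.
            kwargs["accuracy_level"] = accuracy_level
        # MatMulNBits lives in the com.microsoft contrib-op domain.
        return helper.make_node(
            "MatMulNBits", inputs, [output], name=name, domain="com.microsoft", **kwargs
        )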
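On the configuration side, each algorithm reads the level from its own recipes entry (rtn_args, gptq_args, or awq_args), defaulting to 0. A hypothetical usage sketch follows; the recipes keys and accuracy_level values come from the patches, while the config class and fit call follow the usual neural_compressor 2.x API and should be treated as assumptions:

    from neural_compressor import PostTrainingQuantConfig, quantization

    # Request the int8 jblas compute type (level 4) for RTN weight-only quantization.
    # Levels: 0 = unset, 1 = fp32, 2 = fp16, 3 = bf16, 4 = int8.
    conf = PostTrainingQuantConfig(
        approach="weight_only",
        recipes={"rtn_args": {"accuracy_level": 4}},
    )
    q_model = quantization.fit("model.onnx", conf)  # hypothetical model path

RTN needs no calibration data, which is why the visible hunks assert a calib_dataloader only on the AWQ path.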