From 9887c6cb3b1f4435c1167c0e94c9dd079c5005ed Mon Sep 17 00:00:00 2001
From: Mengni Wang
Date: Thu, 2 Nov 2023 16:24:20 +0800
Subject: [PATCH 1/6] add attr to MatMulNBits

Signed-off-by: Mengni Wang
---
 neural_compressor/adaptor/onnxrt.py           | 11 ++++++++++-
 .../adaptor/ox_utils/weight_only.py           | 22 ++++++++++++++++++++--
 2 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/neural_compressor/adaptor/onnxrt.py b/neural_compressor/adaptor/onnxrt.py
index 037c88a3b15..826fe8f8cf9 100644
--- a/neural_compressor/adaptor/onnxrt.py
+++ b/neural_compressor/adaptor/onnxrt.py
@@ -1659,6 +1659,7 @@ def quantize(self, tune_cfg, model, data_loader, q_func=None):
             actorder = self.recipes.get("gptq_args", {}).get("actorder", False)
             mse = self.recipes.get("gptq_args", {}).get("mse", False)
             perchannel = self.recipes.get("gptq_args", {}).get("perchannel", True)
+            compute_type = self.recipes.get("gptq_args", {}).get("compute_type", -1)
             calib_sampling_size = tune_cfg.get("calib_sampling_size", 1)
             tmp_model = gptq_quantize(
                 tmp_model,
@@ -1670,6 +1671,7 @@ def quantize(self, tune_cfg, model, data_loader, q_func=None):
                 actorder=actorder,
                 mse=mse,
                 perchannel=perchannel,
+                compute_type=compute_type,
             )
         if "AWQ" in algos:
             from neural_compressor.adaptor.ox_utils.weight_only import awq_quantize
@@ -1677,6 +1679,7 @@ def quantize(self, tune_cfg, model, data_loader, q_func=None):
             assert data_loader is not None, "AWQ WOQ algorithm needs to pass 'calib_dataloader' to quantization.fit()"
             enable_auto_scale = self.recipes.get("awq_args", {}).get("enable_auto_scale", True)
             enable_mse_search = self.recipes.get("awq_args", {}).get("enable_mse_search", True)
+            compute_type = self.recipes.get("awq_args", {}).get("compute_type", -1)
             calib_sampling_size = tune_cfg.get("calib_sampling_size", 1)
             tmp_model = awq_quantize(
                 tmp_model,
@@ -1685,11 +1688,17 @@ def quantize(self, tune_cfg, model, data_loader, q_func=None):
                 n_samples=calib_sampling_size,
                 enable_auto_scale=enable_auto_scale,
                 enable_mse_search=enable_mse_search,
+                compute_type=compute_type,
             )
         elif "RTN" in algos:
             from neural_compressor.adaptor.ox_utils.weight_only import rtn_quantize

-            tmp_model = rtn_quantize(tmp_model, quant_config)
+            compute_type = self.recipes.get("rtn_args", {}).get("compute_type", -1)
+            tmp_model = rtn_quantize(
+                tmp_model,
+                quant_config,
+                compute_type=compute_type,
+            )
             tmp_model.q_config = copy.deepcopy(quant_config)
         self._dump_model_op_stats(tmp_model, tune_cfg)
         tmp_model.topological_sort()
diff --git a/neural_compressor/adaptor/ox_utils/weight_only.py b/neural_compressor/adaptor/ox_utils/weight_only.py
index 5138f827fd8..97028bee22f 100644
--- a/neural_compressor/adaptor/ox_utils/weight_only.py
+++ b/neural_compressor/adaptor/ox_utils/weight_only.py
@@ -56,7 +56,15 @@ def get_blob_size(group_size, has_zp):  # pragma: no cover


 def make_matmul_weight_only_node(
-    node, weight_shape, num_bits, group_size, k_blocks, q_weight, scale, zero_point
+    node,
+    weight_shape,
+    num_bits,
+    group_size,
+    k_blocks,
+    q_weight,
+    scale,
+    zero_point,
+    compute_type=-1,
 ):  # pragma: no cover
     """Build MatMulFpQ4 node.
@@ -69,6 +77,7 @@ def make_matmul_weight_only_node(
         q_weight (array): quantized weight
         scale (array): scale
         zero_point (array): zero point
+        compute_type (int): compute type

     Returns:
         matmul_weight_only_node: MatMulFpQ4 or MatMulNBits node
@@ -125,6 +134,7 @@ def make_matmul_weight_only_node(
         kwargs["N"] = weight_shape[1]
         kwargs["bits"] = num_bits
         kwargs["block_size"] = group_size
+        kwargs["compute_type"] = compute_type
     else:
         offset = 5 if zero_point is not None else 4
@@ -274,6 +284,7 @@ def rtn_quantize(
     group_size=32,
     scheme="asym",
     ratios={},
+    compute_type=-1,
 ):
     """Quantize the model with the round-to-nearest method.
@@ -294,6 +305,7 @@ def rtn_quantize(
         group_size (int, optional): how many elements share one scale/zp. Default is 32.
         scheme (str, optional): sym or asym. Defaults to "asym".
         ratios (dict, optional): percentile of clip. Defaults to {}.
+        compute_type (int): compute type

     Returns:
         model: fake quantized ONNXModel
@@ -344,6 +356,7 @@ def rtn_quantize(
                 q_weight=q_weight.astype("uint8"),
                 scale=scale,
                 zero_point=zp if scheme == "asym" else None,
+                compute_type=compute_type,
             )

             model.add_initializers(new_inits)
@@ -664,6 +677,7 @@ def awq_quantize(
     n_samples=128,
     enable_auto_scale=True,
     enable_mse_search=True,
+    compute_type=-1,
 ):
     """Quantize the model with the Activation-aware Weight Quantization (AWQ) method.
@@ -687,6 +701,7 @@ def awq_quantize(
         n_samples (int, optional): calibration sample number.
         enable_auto_scale (bool, optional): whether to enable scale for salient weight. Defaults to True.
         enable_mse_search (bool, optional): whether to enable clip for weight by checking mse. Defaults to True.
+        compute_type (int): compute type

     Returns:
         model: fake quantized ONNXModel
@@ -773,7 +788,7 @@ def awq_quantize(
         model.remove_tensors_from_outputs(output_names)
         model.model.graph.output.MergeFrom(org_output)

-    model = rtn_quantize(model, weight_config, num_bits, group_size, scheme, full_ratio)
+    model = rtn_quantize(model, weight_config, num_bits, group_size, scheme, full_ratio, compute_type)
     return model
@@ -934,6 +949,7 @@ def gptq_quantize(
     actorder=False,
     mse=False,
     perchannel=True,
+    compute_type=-1,
 ):
     """Quantize the model with the GPTQ method.
@@ -960,6 +976,7 @@ def gptq_quantize(
         actorder (bool, optional): whether to rearrange Hessian matrix considering the diag's value.
         mse (bool, optional): whether to get scale and zero point with mse error.
         perchannel (bool, optional): whether to quantize weight per-channel.
+        compute_type (int): compute type

     Returns:
         model: fake quantized ONNXModel
@@ -1076,6 +1093,7 @@ def gptq_quantize(
                 q_weight=q_weight.astype("uint8"),
                 scale=scale,
                 zero_point=zp if scheme == "asym" else None,
+                compute_type=compute_type,
             )

             model.add_initializers(new_inits)

From 27fbe7ca06546fab1673754d85b3463897ce6f48 Mon Sep 17 00:00:00 2001
From: "Wang, Mengni"
Date: Fri, 10 Nov 2023 11:00:36 +0800
Subject: [PATCH 2/6] Update onnxrt.py

---
 neural_compressor/adaptor/onnxrt.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/neural_compressor/adaptor/onnxrt.py b/neural_compressor/adaptor/onnxrt.py
index 826fe8f8cf9..2525ada1aca 100644
--- a/neural_compressor/adaptor/onnxrt.py
+++ b/neural_compressor/adaptor/onnxrt.py
@@ -1659,7 +1659,7 @@ def quantize(self, tune_cfg, model, data_loader, q_func=None):
             actorder = self.recipes.get("gptq_args", {}).get("actorder", False)
             mse = self.recipes.get("gptq_args", {}).get("mse", False)
             perchannel = self.recipes.get("gptq_args", {}).get("perchannel", True)
-            compute_type = self.recipes.get("gptq_args", {}).get("compute_type", -1)
+            accuracy_level = self.recipes.get("gptq_args", {}).get("accuracy_level", 0)
             calib_sampling_size = tune_cfg.get("calib_sampling_size", 1)
             tmp_model = gptq_quantize(
                 tmp_model,
@@ -1671,7 +1671,7 @@ def quantize(self, tune_cfg, model, data_loader, q_func=None):
                 actorder=actorder,
                 mse=mse,
                 perchannel=perchannel,
-                compute_type=compute_type,
+                accuracy_level=accuracy_level,
             )
         if "AWQ" in algos:
             from neural_compressor.adaptor.ox_utils.weight_only import awq_quantize
@@ -1679,7 +1679,7 @@ def quantize(self, tune_cfg, model, data_loader, q_func=None):
             assert data_loader is not None, "AWQ WOQ algorithm needs to pass 'calib_dataloader' to quantization.fit()"
             enable_auto_scale = self.recipes.get("awq_args", {}).get("enable_auto_scale", True)
             enable_mse_search = self.recipes.get("awq_args", {}).get("enable_mse_search", True)
-            compute_type = self.recipes.get("awq_args", {}).get("compute_type", -1)
+            accuracy_level = self.recipes.get("awq_args", {}).get("accuracy_level", 0)
             calib_sampling_size = tune_cfg.get("calib_sampling_size", 1)
             tmp_model = awq_quantize(
                 tmp_model,
@@ -1688,16 +1688,16 @@ def quantize(self, tune_cfg, model, data_loader, q_func=None):
                 n_samples=calib_sampling_size,
                 enable_auto_scale=enable_auto_scale,
                 enable_mse_search=enable_mse_search,
-                compute_type=compute_type,
+                accuracy_level=accuracy_level,
             )
         elif "RTN" in algos:
             from neural_compressor.adaptor.ox_utils.weight_only import rtn_quantize

-            compute_type = self.recipes.get("rtn_args", {}).get("compute_type", -1)
+            accuracy_level = self.recipes.get("rtn_args", {}).get("accuracy_level", 0)
             tmp_model = rtn_quantize(
                 tmp_model,
                 quant_config,
-                compute_type=compute_type,
+                accuracy_level=accuracy_level,
             )
             tmp_model.q_config = copy.deepcopy(quant_config)
         self._dump_model_op_stats(tmp_model, tune_cfg)

From e18fc75406192db1dcee80e03fc8c2b03603e970 Mon Sep 17 00:00:00 2001
From: "Wang, Mengni"
Date: Fri, 10 Nov 2023 11:03:01 +0800
Subject: [PATCH 3/6] Update weight_only.py

---
 .../adaptor/ox_utils/weight_only.py | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/neural_compressor/adaptor/ox_utils/weight_only.py b/neural_compressor/adaptor/ox_utils/weight_only.py
index 97028bee22f..453555ccc34 100644
--- a/neural_compressor/adaptor/ox_utils/weight_only.py
+++ b/neural_compressor/adaptor/ox_utils/weight_only.py
@@ -64,7 +64,7 @@ def make_matmul_weight_only_node(
     q_weight,
     scale,
     zero_point,
-    compute_type=-1,
+    accuracy_level=-1,
 ):  # pragma: no cover
     """Build MatMulFpQ4 node.
@@ -77,7 +77,7 @@ def make_matmul_weight_only_node(
         q_weight (array): quantized weight
         scale (array): scale
         zero_point (array): zero point
-        compute_type (int): compute type
+        accuracy_level (int): accuracy level

     Returns:
         matmul_weight_only_node: MatMulFpQ4 or MatMulNBits node
@@ -134,7 +134,8 @@ def make_matmul_weight_only_node(
         kwargs["N"] = weight_shape[1]
         kwargs["bits"] = num_bits
         kwargs["block_size"] = group_size
-        kwargs["compute_type"] = compute_type
+        if accuracy_level > 0:
+            kwargs["accuracy_level"] = accuracy_level
     else:
         offset = 5 if zero_point is not None else 4
@@ -284,7 +285,7 @@ def rtn_quantize(
     group_size=32,
     scheme="asym",
     ratios={},
-    compute_type=-1,
+    accuracy_level=0,
 ):
     """Quantize the model with the round-to-nearest method.
@@ -305,7 +306,7 @@ def rtn_quantize(
         group_size (int, optional): how many elements share one scale/zp. Default is 32.
         scheme (str, optional): sym or asym. Defaults to "asym".
         ratios (dict, optional): percentile of clip. Defaults to {}.
-        compute_type (int): compute type
+        accuracy_level (int): accuracy level

     Returns:
         model: fake quantized ONNXModel
@@ -356,7 +357,7 @@ def rtn_quantize(
                 q_weight=q_weight.astype("uint8"),
                 scale=scale,
                 zero_point=zp if scheme == "asym" else None,
-                compute_type=compute_type,
+                accuracy_level=accuracy_level,
             )

             model.add_initializers(new_inits)
@@ -677,7 +678,7 @@ def awq_quantize(
     n_samples=128,
     enable_auto_scale=True,
     enable_mse_search=True,
-    compute_type=-1,
+    accuracy_level=0,
 ):
     """Quantize the model with the Activation-aware Weight Quantization (AWQ) method.
@@ -701,7 +702,7 @@ def awq_quantize(
         n_samples (int, optional): calibration sample number.
         enable_auto_scale (bool, optional): whether to enable scale for salient weight. Defaults to True.
         enable_mse_search (bool, optional): whether to enable clip for weight by checking mse. Defaults to True.
-        compute_type (int): compute type
+        accuracy_level (int): accuracy level

     Returns:
         model: fake quantized ONNXModel
@@ -788,7 +789,7 @@ def awq_quantize(
         model.remove_tensors_from_outputs(output_names)
         model.model.graph.output.MergeFrom(org_output)

-    model = rtn_quantize(model, weight_config, num_bits, group_size, scheme, full_ratio, compute_type)
+    model = rtn_quantize(model, weight_config, num_bits, group_size, scheme, full_ratio, accuracy_level)
     return model
@@ -949,7 +950,7 @@ def gptq_quantize(
     actorder=False,
     mse=False,
     perchannel=True,
-    compute_type=-1,
+    accuracy_level=0,
 ):
     """Quantize the model with the GPTQ method.
@@ -976,7 +977,7 @@ def gptq_quantize(
         actorder (bool, optional): whether to rearrange Hessian matrix considering the diag's value.
         mse (bool, optional): whether to get scale and zero point with mse error.
         perchannel (bool, optional): whether to quantize weight per-channel.
-        compute_type (int): compute type
+        accuracy_level (int): accuracy level

     Returns:
         model: fake quantized ONNXModel
@@ -1093,7 +1094,7 @@ def gptq_quantize(
                 q_weight=q_weight.astype("uint8"),
                 scale=scale,
                 zero_point=zp if scheme == "asym" else None,
-                compute_type=compute_type,
+                accuracy_level=accuracy_level,
             )

             model.add_initializers(new_inits)

From 060a0610724c18dd58b673eb37af373f3425b05a Mon Sep 17 00:00:00 2001
From: "Wang, Mengni"
Date: Fri, 10 Nov 2023 11:03:38 +0800
Subject: [PATCH 4/6] Update weight_only.py

---
 neural_compressor/adaptor/ox_utils/weight_only.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/neural_compressor/adaptor/ox_utils/weight_only.py b/neural_compressor/adaptor/ox_utils/weight_only.py
index 453555ccc34..eccd9c19981 100644
--- a/neural_compressor/adaptor/ox_utils/weight_only.py
+++ b/neural_compressor/adaptor/ox_utils/weight_only.py
@@ -64,7 +64,7 @@ def make_matmul_weight_only_node(
     q_weight,
     scale,
     zero_point,
-    accuracy_level=-1,
+    accuracy_level=0,
 ):  # pragma: no cover
     """Build MatMulFpQ4 node.

From b47814be78009c9d6bbda5b144532c102d316614 Mon Sep 17 00:00:00 2001
From: "Wang, Mengni"
Date: Thu, 16 Nov 2023 20:20:33 +0800
Subject: [PATCH 5/6] Update weight_only.py

---
 .../adaptor/ox_utils/weight_only.py | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/neural_compressor/adaptor/ox_utils/weight_only.py b/neural_compressor/adaptor/ox_utils/weight_only.py
index eccd9c19981..d737ab146ca 100644
--- a/neural_compressor/adaptor/ox_utils/weight_only.py
+++ b/neural_compressor/adaptor/ox_utils/weight_only.py
@@ -77,7 +77,9 @@ def make_matmul_weight_only_node(
         q_weight (array): quantized weight
         scale (array): scale
         zero_point (array): zero point
-        accuracy_level (int): accuracy level
+        accuracy_level (int): accuracy level. Support 0 (unset), 1 (fp32 compute type of jblas kernel),
+            2 (fp16 compute type of jblas kernel), 3 (bf16 compute type of jblas kernel),
+            4 (int8 compute type of jblas kernel)

     Returns:
         matmul_weight_only_node: MatMulFpQ4 or MatMulNBits node
@@ -134,7 +136,8 @@ def make_matmul_weight_only_node(
         kwargs["N"] = weight_shape[1]
         kwargs["bits"] = num_bits
         kwargs["block_size"] = group_size
-        if accuracy_level > 0:
+        if accuracy_level > 0:  # pragma: no cover
+            # require onnxruntime > 1.16.2
             kwargs["accuracy_level"] = accuracy_level
     else:
@@ -306,7 +309,9 @@ def rtn_quantize(
         group_size (int, optional): how many elements share one scale/zp. Default is 32.
         scheme (str, optional): sym or asym. Defaults to "asym".
         ratios (dict, optional): percentile of clip. Defaults to {}.
-        accuracy_level (int): accuracy level
+        accuracy_level (int): accuracy level. Support 0 (unset), 1 (fp32 compute type of jblas kernel),
+            2 (fp16 compute type of jblas kernel), 3 (bf16 compute type of jblas kernel),
+            4 (int8 compute type of jblas kernel)

     Returns:
         model: fake quantized ONNXModel
@@ -702,7 +707,9 @@ def awq_quantize(
         n_samples (int, optional): calibration sample number.
         enable_auto_scale (bool, optional): whether to enable scale for salient weight. Defaults to True.
         enable_mse_search (bool, optional): whether to enable clip for weight by checking mse. Defaults to True.
-        accuracy_level (int): accuracy level
+        accuracy_level (int): accuracy level. Support 0 (unset), 1 (fp32 compute type of jblas kernel),
+            2 (fp16 compute type of jblas kernel), 3 (bf16 compute type of jblas kernel),
+            4 (int8 compute type of jblas kernel)

     Returns:
         model: fake quantized ONNXModel
@@ -977,7 +984,9 @@ def gptq_quantize(
         actorder (bool, optional): whether to rearrange Hessian matrix considering the diag's value.
         mse (bool, optional): whether to get scale and zero point with mse error.
         perchannel (bool, optional): whether to quantize weight per-channel.
-        accuracy_level (int): accuracy level
+        accuracy_level (int): accuracy level. Support 0 (unset), 1 (fp32 compute type of jblas kernel),
+            2 (fp16 compute type of jblas kernel), 3 (bf16 compute type of jblas kernel),
+            4 (int8 compute type of jblas kernel)

     Returns:
         model: fake quantized ONNXModel

From 22a7697ed23cb2cc86a056c607619a757f5009bb Mon Sep 17 00:00:00 2001
From: "Wang, Mengni"
Date: Thu, 16 Nov 2023 20:21:32 +0800
Subject: [PATCH 6/6] Update weight_only.py

---
 neural_compressor/adaptor/ox_utils/weight_only.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/neural_compressor/adaptor/ox_utils/weight_only.py b/neural_compressor/adaptor/ox_utils/weight_only.py
index d737ab146ca..a7f6170f51b 100644
--- a/neural_compressor/adaptor/ox_utils/weight_only.py
+++ b/neural_compressor/adaptor/ox_utils/weight_only.py
@@ -136,7 +136,7 @@ def make_matmul_weight_only_node(
         kwargs["N"] = weight_shape[1]
         kwargs["bits"] = num_bits
         kwargs["block_size"] = group_size
-        if accuracy_level > 0:  # pragma: no cover
+        if accuracy_level > 0:
             # require onnxruntime > 1.16.2
             kwargs["accuracy_level"] = accuracy_level
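
Taken together, the series leaves make_matmul_weight_only_node with an accuracy_level argument (default 0) that is written onto the MatMulNBits node only when positive, since onnxruntime releases up to 1.16.2 reject the attribute. Below is a minimal sketch of that final behavior, assuming the helper forwards its kwargs to onnx.helper.make_node as node attributes (as the kwargs["N"]/kwargs["bits"] handling in the hunks suggests); the function name and argument list here are illustrative, not the library's exact signature:

    from onnx import helper

    def make_matmul_nbits_node(name, inputs, output, K, N, bits=4, block_size=32, accuracy_level=0):
        # Attributes that MatMulNBits always carries.
        kwargs = {"K": K, "N": N, "bits": bits, "block_size": block_size}
        if accuracy_level > 0:
            # Attach the compute-type hint only when explicitly requested;
            # per the patch comment, reading it requires onnxruntime > 1.16.2.
            kwargs["accuracy_level"] = accuracy_level
        # MatMulNBits lives in the com.microsoft contrib-op domain.
        return helper.make_node(
            "MatMulNBits", inputs, [output], name=name, domain="com.microsoft", **kwargs
        )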
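On the configuration side, each algorithm reads the level from its own recipes entry (rtn_args, gptq_args, or awq_args), defaulting to 0. A hypothetical usage sketch follows; the recipes keys and accuracy_level values come from the patches, while the config class and fit call follow the usual neural_compressor 2.x API and should be treated as assumptions:

    from neural_compressor import PostTrainingQuantConfig, quantization

    # Request the int8 jblas compute type (level 4) for RTN weight-only quantization.
    # Levels: 0 = unset, 1 = fp32, 2 = fp16, 3 = bf16, 4 = int8.
    conf = PostTrainingQuantConfig(
        approach="weight_only",
        recipes={"rtn_args": {"accuracy_level": 4}},
    )
    q_model = quantization.fit("model.onnx", conf)  # hypothetical model path

RTN needs no calibration data, which is why the visible hunks assert a calib_dataloader only on the AWQ path.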