Commit: format code
yufenglee committed Oct 9, 2023
1 parent 1b2d7ee commit 3b59970
Showing 8 changed files with 35 additions and 25 deletions.
onnxruntime/contrib_ops/cpu/quantization/dequantize_blockwise.h

@@ -58,7 +58,7 @@ void QuantizeBlockwise(
zero_points[zp_idx] = ((zero_points_tmp[zp_idx * 2]) | (zero_points_tmp[zp_idx * 2 + 1] << 4));
}
if (total_block_count & 1) {
-      zero_points[total_block_count / 2] = (zero_points[total_block_count / 2] &0xf0) | zero_points_tmp[total_block_count - 1];
+      zero_points[total_block_count / 2] = (zero_points[total_block_count / 2] & 0xf0) | zero_points_tmp[total_block_count - 1];

[GitHub Actions / cpplint warning] onnxruntime/contrib_ops/cpu/quantization/dequantize_blockwise.h:61: Lines should be <= 120 characters long [whitespace/line_length] [2]
}
}
}
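Note: the change above is whitespace-only, but the surrounding loop is worth spelling out: two 4-bit zero points are packed per byte (even index in the low nibble, odd index in the high nibble), and an odd block count leaves the last value alone in the low nibble of the final byte. A minimal NumPy sketch of that scheme (names are illustrative, not from the source):

```python
import numpy as np

def pack_zero_points(zp_unpacked: np.ndarray) -> np.ndarray:
    """Pack 4-bit zero points two-per-byte: even index -> low nibble,
    odd index -> high nibble, mirroring the C++ loop above."""
    n = zp_unpacked.size
    packed = np.zeros((n + 1) // 2, dtype=np.uint8)
    packed[: n // 2] = zp_unpacked[0 : n - n % 2 : 2] | (zp_unpacked[1::2] << 4)
    if n & 1:  # odd count: last value goes in the low nibble of the final byte
        packed[-1] = (packed[-1] & 0xF0) | zp_unpacked[-1]
    return packed

zp = np.array([1, 2, 3, 4, 5], dtype=np.uint8)
print(pack_zero_points(zp))  # [33 67  5] == [0x21, 0x43, 0x05]
```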
4 changes: 2 additions & 2 deletions onnxruntime/python/onnxruntime_pybind_quant.cc
@@ -10,7 +10,7 @@

namespace pybind11 {
namespace detail {
-// python3 -c 'import numpy as np; print(np.dtype(np.float16).num)'
+// python3 -c 'import numpy as np; print(np.dtype(np.float16).num)'
constexpr int NPY_FLOAT16 = 23;
template <>
struct npy_format_descriptor<onnxruntime::MLFloat16> {
@@ -19,7 +19,7 @@ struct npy_format_descriptor<onnxruntime::MLFloat16> {
handle ptr = npy_api::get().PyArray_DescrFromType_(NPY_FLOAT16);
return reinterpret_borrow<pybind11::dtype>(ptr);
}
-  static std::string format() {
+  static std::string format() {
// following: https://docs.python.org/3/library/struct.html#format-characters
return "e";
}
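The comment and the format() return value above encode two small facts worth verifying: NumPy's type number for float16 is 23, and "e" is the struct format character for IEEE 754 half precision. A quick check, assuming only NumPy is installed:

```python
import struct
import numpy as np

# NumPy's internal type number for float16 -- the NPY_FLOAT16 constant above.
print(np.dtype(np.float16).num)  # 23

# "e" packs an IEEE 754 half-precision float into 2 bytes,
# which is why format() returns "e" for MLFloat16.
raw = struct.pack("e", 1.5)
print(np.frombuffer(raw, dtype=np.float16)[0])  # 1.5
```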
@@ -34,8 +34,8 @@ def report(self):
def profile_dequantize_int4_func(n, k, dtype, func):
np.random.seed(0)
output = np.random.rand(n, k).astype(dtype)
-    quant = np.random.randint(low=0, high=127, size=(n, (k+31)//32, 16)).astype('uint8')
-    scales = np.random.rand(n, (k+31)//32).astype(dtype)
+    quant = np.random.randint(low=0, high=127, size=(n, (k + 31) // 32, 16)).astype("uint8")
+    scales = np.random.rand(n, (k + 31) // 32).astype(dtype)

output_d = ke.DeviceArray(output)
quant_d = ke.DeviceArray(quant)
@@ -55,11 +55,10 @@ def profile_with_args(n, k, dtype, sort):


def profile():
-    dims = [4096, 12288,]
for dt in dtypes:
-        for n,k in ((4096, 4096), (4096, 12288), (12288, 4096)):
-            profile_with_args(n, k, dt, True)
-            print()
+        for n, k in ((4096, 4096), (4096, 12288), (12288, 4096)):
+            profile_with_args(n, k, dt, True)
+            print()


if __name__ == "__main__":
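The shapes in profile_dequantize_int4_func follow from the blockwise int4 layout: each 16-byte uint8 block holds 32 4-bit values, so a row of k weights needs (k + 31) // 32 blocks, with one scale per block. A hedged reference sketch of the corresponding dequantization (the nibble order and the implied zero point of 8 are assumptions; the real kernel also supports explicit per-block zero points):

```python
import numpy as np

def dequantize_int4_blocks(quant, scales):
    """quant: (n, k_blocks, 16) uint8 -- 32 4-bit values per 16-byte block.
    scales: (n, k_blocks). Returns (n, k_blocks * 32) floats."""
    low = (quant & 0x0F).astype(np.int8)          # even-indexed values (assumed)
    high = ((quant >> 4) & 0x0F).astype(np.int8)  # odd-indexed values (assumed)
    vals = np.stack([low, high], axis=-1).reshape(quant.shape[0], quant.shape[1], 32)
    # symmetric-style dequant with an implied zero point of 8 (an assumption)
    return ((vals - 8) * scales[..., None]).reshape(quant.shape[0], -1)

n, k = 4, 64
quant = np.random.randint(0, 256, size=(n, (k + 31) // 32, 16), dtype=np.uint8)
scales = np.random.rand(n, (k + 31) // 32).astype(np.float32)
print(dequantize_int4_blocks(quant, scales).shape)  # (4, 64)
```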
22 changes: 11 additions & 11 deletions onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py
@@ -5,30 +5,30 @@
# --------------------------------------------------------------------------

import argparse
-import struct
-from pathlib import Path
-from typing import List, Tuple
-
+import logging
+import os
+from typing import List, Tuple
+
+import coloredlogs
import numpy as np
import numpy.typing as npt
import onnx
from onnx.onnx_pb import GraphProto, ModelProto, NodeProto, TensorProto
-import os
-
-from .onnx_model import ONNXModel
-from .quant_utils import attribute_to_kwarg, load_model_with_shape_infer
-import coloredlogs
from onnxruntime.capi._pybind_state import quantize_matmul_4bits
+
+from .onnx_model import ONNXModel
+from .quant_utils import attribute_to_kwarg

logger = logging.getLogger(__name__)


class MatMul4BitsQuantizer:
"""Perform 4b quantization of constant MatMul weights"""

-    def __init__(self, model: ModelProto, block_size: int, is_symmetric: bool, nodes_to_exclude=[]):
+    def __init__(self, model: ModelProto, block_size: int, is_symmetric: bool, nodes_to_exclude=None):
+        if nodes_to_exclude is None:
+            nodes_to_exclude = []
self.model = ONNXModel(model)
self.block_size = block_size
self.is_symmetric = is_symmetric
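The nodes_to_exclude change fixes a classic Python pitfall rather than formatting: a mutable default like [] is created once at function definition time and shared across every call. A minimal demonstration of the bug and the pattern the diff adopts:

```python
def bad(item, acc=[]):       # one shared list for every call
    acc.append(item)
    return acc

def good(item, acc=None):    # the pattern the diff adopts
    if acc is None:
        acc = []
    acc.append(item)
    return acc

print(bad(1), bad(2))    # [1, 2] [1, 2] -- state leaks between calls
print(good(1), good(2))  # [1] [2]
```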
@@ -97,13 +97,13 @@ def _q4_matmul_node_weight(self, node: NodeProto, graph_stack: List[GraphProto])
Bs_graph.input.remove(input)
break

-        scales_tensor = onnx.numpy_helper.from_array(scales)  # noqa: N806
+        scales_tensor = onnx.numpy_helper.from_array(scales)
scales_tensor.name = B.name + "_scales"
Bs_graph.initializer.extend([B_quant, scales_tensor])

input_names = [node.input[0], B_quant.name, scales_tensor.name]
if not self.is_symmetric:
-            zp_tensor = onnx.numpy_helper.from_array(zero_points)  # noqa: N806
+            zp_tensor = onnx.numpy_helper.from_array(zero_points)
zp_tensor.name = B.name + "_zero_points"
Bs_graph.initializer.extend([zp_tensor])
input_names.append(zp_tensor.name)
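For context, _q4_matmul_node_weight attaches the packed weight, the per-block scales, and (in the asymmetric case) the zero points as graph initializers. A rough sketch of how per-block asymmetric 4-bit parameters can be derived for one block of weights (a simplification for illustration; the actual rounding and packing live in quantize_matmul_4bits):

```python
import numpy as np

def block_quant_params(w: np.ndarray):
    """Per-block scale and zero point for asymmetric 4-bit quantization
    of a 1-D block of weights (simplified; the real kernel packs nibbles)."""
    wmin = min(float(w.min()), 0.0)
    wmax = max(float(w.max()), 0.0)
    scale = (wmax - wmin) / 15.0 or 1.0   # 16 levels; avoid divide-by-zero
    zp = int(round(-wmin / scale))        # integer level that maps back to 0.0
    q = np.clip(np.round(w / scale) + zp, 0, 15).astype(np.uint8)
    return q, scale, zp

w = np.random.randn(32).astype(np.float32)
q, scale, zp = block_quant_params(w)
print(np.abs((q.astype(np.float32) - zp) * scale - w).max())  # small error
```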
9 changes: 9 additions & 0 deletions onnxruntime/python/tools/transformers/llama2_export.py
@@ -0,0 +1,9 @@
+from optimum.onnxruntime import ORTModelForCausalLM
+
+name = "meta-llama/Llama-2-7b-hf"
+model = ORTModelForCausalLM.from_pretrained(
+    name,
+    export=True,
+    use_auth_token=True,
+)
+model.save_pretrained(name.split("/")[-1] + "-onnx")
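The new llama2_export.py script downloads the gated Llama-2 weights, so use_auth_token=True presumes a prior huggingface-cli login. A hedged example of loading the exported directory back for inference (assumes transformers is installed alongside optimum):

```python
from optimum.onnxruntime import ORTModelForCausalLM
from transformers import AutoTokenizer

# Directory written by model.save_pretrained(...) in the script above.
model = ORTModelForCausalLM.from_pretrained("Llama-2-7b-hf-onnx")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", use_auth_token=True)

inputs = tokenizer("Hello, my name is", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=16)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```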
@@ -75,7 +75,7 @@ void RunTest(int64_t M, int64_t N, int64_t K, int64_t block_size, bool has_zerop
int64_t buf_size = number_of_block * (block_size * 4 / 8);
std::vector<uint8_t> input1_vals(buf_size);
std::vector<float> scales(number_of_block);
-  std::vector<uint8_t> zp((N *block_per_k + 1) / 2);
+  std::vector<uint8_t> zp((N * block_per_k + 1) / 2);

QuantizeDequantize(input1_f_vals, input1_vals, scales, has_zeropoint ? &zp : nullptr, N, K, block_size);

9 changes: 5 additions & 4 deletions onnxruntime/test/python/quantization/test_op_matmul_4bits.py
@@ -21,12 +21,11 @@
class TestOpMatMul4Bits(unittest.TestCase):
@classmethod
def setUpClass(cls):
-        #cls._tmp_model_dir = tempfile.TemporaryDirectory(prefix="test_matmulfpq4.")
-        cls._tmp_model_dir = Path("test_models")
+        cls._tmp_model_dir = tempfile.TemporaryDirectory(prefix="test_matmulfpq4.")

@classmethod
def tearDownClass(cls):
-        #cls._tmp_model_dir.cleanup()
+        cls._tmp_model_dir.cleanup()
pass

[Code scanning / CodeQL warning] Unnecessary 'pass' statement.

def fill_int4_data(self, shape: Union[int, Tuple[int, ...]], symmetric: bool) -> np.ndarray:
@@ -114,7 +113,9 @@ def quant_test(
block_size: int,
is_symmetric: bool,
):
-        model_int4_path = str(Path(self._tmp_model_dir.name).joinpath(f"MatMulNBits_{block_size}_{is_symmetric}.onnx").absolute())
+        model_int4_path = str(
+            Path(self._tmp_model_dir.name).joinpath(f"MatMulNBits_{block_size}_{is_symmetric}.onnx").absolute()
+        )

# Quantize fp32 model to int4 model
model = quant_utils.load_model_with_shape_infer(Path(model_fp32_path))
@@ -6,6 +6,7 @@
# --------------------------------------------------------------------------

import unittest
+
import numpy as np
import numpy.typing as npt

