Update AutoRound commit version #1941

Merged
merged 8 commits on Jul 23, 2024
2 changes: 1 addition & 1 deletion .azure-pipelines/scripts/ut/env_setup.sh
@@ -92,7 +92,7 @@ elif [[ $(echo "${test_case}" | grep -c "tf pruning") != 0 ]]; then
fi

if [[ $(echo "${test_case}" | grep -c "api") != 0 ]] || [[ $(echo "${test_case}" | grep -c "adaptor") != 0 ]]; then
-pip install git+https://github.com/intel/auto-round.git@24b2e74070f2b4e6f26ff069ec75af74cf5b177c
+pip install git+https://github.com/intel/auto-round.git@e24b9074af6cdb099e31c92eb81b7f5e9a4a244e
fi

# test deps
4 changes: 2 additions & 2 deletions neural_compressor/adaptor/pytorch.py
@@ -4926,7 +4926,7 @@ def autoround_quantize(self, model, tune_cfg, dataloader):
act_group_size = self.recipes["autoround_args"].get("act_group_size", None)
act_sym = self.recipes["autoround_args"].get("act_sym", None)
act_dynamic = self.recipes["autoround_args"].get("act_dynamic", True)
-multimodal = self.recipes["autoround_args"].get("multimodal", False)
+quant_block_list = self.recipes["autoround_args"].get("quant_block_list", None)
use_layer_wise = self.recipes["autoround_args"].get("use_layer_wise", False)

if dataloader is not None:
@@ -4959,7 +4959,7 @@ def autoround_quantize(self, model, tune_cfg, dataloader):
dynamic_max_gap=dynamic_max_gap,
data_type=data_type,
scale_dtype=scale_dtype,
-multimodal=multimodal,
+quant_block_list=quant_block_list,
act_bits=act_bits,
act_group_size=act_group_size,
act_sym=act_sym,
6 changes: 3 additions & 3 deletions neural_compressor/adaptor/torch_utils/weight_only.py
@@ -706,7 +706,7 @@ def autoround_quantize(
dynamic_max_gap: int = -1,
data_type: str = "int", ##only support int for now
scale_dtype: str = "fp16",
-multimodal: bool = False,
+quant_block_list: list = None,
act_bits: int = 32,
act_group_size: int = None,
act_sym: bool = None,
@@ -761,7 +761,7 @@ def autoround_quantize(
data_type (str): The data type to be used (default is "int").
scale_dtype (str): The data type of quantization scale to be used (default is "float32"), different kernels
have different choices.
-multimodal(bool): Enable multimodal model quantization, (default is "False").
+quant_block_list (list): A list whose elements are list of block's layer names to be quantized.
act_bits (int): Number of bits for activation quantization. Default is 32.
act_group_size (int): Group size for activation quantization. Default is None.
act_sym (bool): Whether to use symmetric activation quantization. Default is None.
@@ -800,7 +800,7 @@ def autoround_quantize(
dynamic_max_gap=dynamic_max_gap,
data_type=data_type, ## only support data_type
scale_dtype=scale_dtype,
-multimodal=multimodal,
+quant_block_list=quant_block_list,
act_bits=act_bits,
act_group_size=act_group_size,
act_sym=act_sym,
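For context on the new parameter: per the updated docstring, `quant_block_list` is a list whose elements are lists of block layer names, replacing the boolean `multimodal` switch. Below is a minimal sketch of what a value might look like and how it would reach the 2.x adaptor, which reads it from the "autoround_args" recipe as shown in pytorch.py above; the block names are hypothetical and depend on the target model.

```python
# Sketch only: hypothetical GPT-J-style block names; adjust for the target model.
quant_block_list = [
    [
        "transformer.h.0",
        "transformer.h.1",
        "transformer.h.2",
    ]
]

# The 2.x adaptor picks this up from the "autoround_args" recipe
# (see self.recipes["autoround_args"].get("quant_block_list", None) above).
recipes = {"autoround_args": {"quant_block_list": quant_block_list}}
```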
10 changes: 5 additions & 5 deletions neural_compressor/torch/algorithms/weight_only/autoround.py
@@ -55,7 +55,7 @@ def __init__(
dynamic_max_gap: int = -1,
data_type: str = "int",
scale_dtype: str = "fp16",
-multimodal: bool = False,
+quant_block_list: list = None,
act_bits: int = 32,
act_group_size: int = None,
act_sym: bool = None,
@@ -113,8 +113,8 @@ def __init__(
dynamic_max_gap (int): The dynamic maximum gap (default is -1).
data_type (str): The data type to be used (default is "int").
scale_dtype (str): The data type of quantization scale to be used (default is "float16"), different kernels
-have different choices.
-multimodal(bool): Enable multimodal model quantization, (default is "False").
+have different choices.
+quant_block_list (list): A list whose elements are list of block's layer names to be quantized.
act_bits (int): Number of bits for activation quantization. Default is 32.
act_group_size (int): Group size for activation quantization. Default is None.
act_sym (bool): Whether to use symmetric activation quantization. Default is None.
@@ -146,7 +146,7 @@ def __init__(
self.dynamic_max_gap = dynamic_max_gap
self.data_type = data_type
self.scale_dtype = scale_dtype
-self.multimodal = multimodal
+self.quant_block_list = quant_block_list
self.act_bits = act_bits
self.act_group_size = act_group_size
self.act_sym = act_sym
@@ -202,7 +202,7 @@ def convert(self, model: torch.nn.Module, *args, **kwargs):
dynamic_max_gap=self.dynamic_max_gap,
data_type=self.data_type,
scale_dtype=self.scale_dtype,
-multimodal=self.multimodal,
+quant_block_list=self.quant_block_list,
act_bits=self.act_bits,
act_group_size=self.act_group_size,
act_sym=self.act_sym,
13 changes: 9 additions & 4 deletions neural_compressor/torch/quantization/algorithm_entry.py
@@ -567,9 +567,14 @@ def autoround_quantize_entry(
if quant_config.name != AUTOROUND or quant_config.dtype == "fp32":
continue
else:
+dtype = quant_config.dtype
+bits = quant_config.bits
+if dtype != "int" and "int" in dtype:
+    bits = int(dtype.lstrip("int"))
+    dtype = "int"
weight_config[op_name] = {
-"data_type": quant_config.dtype,
-"bits": quant_config.bits,
+"data_type": dtype,
+"bits": bits,
"sym": quant_config.use_sym,
"group_size": quant_config.group_size,
"act_bits": quant_config.act_bits,
@@ -595,7 +600,7 @@
not_use_best_mse = quant_config.not_use_best_mse
dynamic_max_gap = quant_config.dynamic_max_gap
scale_dtype = quant_config.scale_dtype
-multimodal = quant_config.multimodal
+quant_block_list = quant_config.quant_block_list
low_cpu_mem_usage = quant_config.use_layer_wise

kwargs.pop("example_inputs")
Expand All @@ -622,7 +627,7 @@ def autoround_quantize_entry(
not_use_best_mse=not_use_best_mse,
dynamic_max_gap=dynamic_max_gap,
scale_dtype=scale_dtype,
-multimodal=multimodal,
+quant_block_list=quant_block_list,
low_cpu_mem_usage=low_cpu_mem_usage,
)
model = quantizer.execute(model=model, mode=mode, *args, **kwargs)
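The new branch in `autoround_quantize_entry` accepts dtype strings such as "int4": the bit width is taken from the suffix and `data_type` is normalized back to "int". Here is a standalone sketch of that logic; the helper name is hypothetical, since the PR inlines the code.

```python
def normalize_int_dtype(dtype: str, bits: int):
    """Mirror of the added branch: an "intN" dtype overrides `bits` and becomes "int"."""
    if dtype != "int" and "int" in dtype:
        bits = int(dtype.lstrip("int"))  # "int4" -> "4" -> 4 (lstrip drops leading i/n/t)
        dtype = "int"
    return dtype, bits


assert normalize_int_dtype("int4", 8) == ("int", 4)  # suffix overrides the bits field
assert normalize_int_dtype("int", 4) == ("int", 4)   # plain "int" is left unchanged
```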
6 changes: 3 additions & 3 deletions neural_compressor/torch/quantization/config.py
@@ -917,7 +917,7 @@ def __init__(
dynamic_max_gap: int = -1,
scale_dtype: str = "fp16",
use_layer_wise: bool = False,
-multimodal: bool = False,
+quant_block_list: list = None,
white_list: Optional[List[OP_NAME_OR_MODULE_TYPE]] = DEFAULT_WHITE_LIST,
):
"""Init AUTOROUND weight-only quantization config.
@@ -951,7 +951,7 @@ def __init__(
scale_dtype (str): The data type of quantization scale to be used (default is "float16"), different kernels
have different choices.
use_layer_wise (bool): Enables quantize model per layer. Defaults to False.
-multimodal(bool): Enable multimodal model quantization, (default is "False").
+quant_block_list (list): A list whose elements are list of block's layer names to be quantized.
white_list (Optional[List[OP_NAME_OR_MODULE_TYPE]]): White list of operator names or module types.
Default is DEFAULT_WHITE_LIST.
"""
@@ -983,7 +983,7 @@ def __init__(
self.dynamic_max_gap = dynamic_max_gap
self.scale_dtype = scale_dtype
self.use_layer_wise = use_layer_wise
-self.multimodal = multimodal
+self.quant_block_list = quant_block_list
self._post_init()

@classmethod
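With the config change, `quant_block_list` can be passed straight through `AutoRoundConfig` in the 3.x API. A minimal sketch combining it with the "int4" dtype exercised by the new test below, assuming the imports used by the test file; the block names, model, dataloader, and calibration function are placeholders supplied by the caller.

```python
from neural_compressor.torch.quantization import AutoRoundConfig, convert, prepare


def quantize_with_autoround(fp32_model, run_fn, dataloader):
    """Hypothetical wrapper around the prepare/convert flow used in the test."""
    quant_config = AutoRoundConfig(
        dtype="int4",          # parsed into data_type="int", bits=4 by autoround_quantize_entry
        nsamples=32,
        seqlen=10,
        iters=10,
        scale_dtype="fp32",
        quant_block_list=[["transformer.h.0", "transformer.h.1"]],  # hypothetical block names
    )
    model = prepare(model=fp32_model, quant_config=quant_config)
    run_fn(model, dataloader)  # caller-supplied calibration loop
    return convert(model)
```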
17 changes: 17 additions & 0 deletions test/3x/torch/quantization/weight_only/test_autoround.py
@@ -80,6 +80,23 @@ def test_autoround(self, quant_lm_head):
if quant_lm_head is True:
assert isinstance(q_model.lm_head, WeightOnlyLinear), "quantization for lm_head failed."

+    def test_int4_dtype(self):
+        fp32_model = copy.deepcopy(self.gptj)
+        quant_config = AutoRoundConfig(dtype="int4", nsamples=32, seqlen=10, iters=10, scale_dtype="fp32")
+        logger.info(f"Test AutoRound with config {quant_config}")
+
+        # prepare + convert API
+        model = prepare(model=fp32_model, quant_config=quant_config)
+
+        run_fn(model, self.dataloader)
+        q_model = convert(model)
+        out = q_model(self.inp)[0]
+        assert torch.allclose(out, self.label, atol=1e-1)
+        assert "transformer.h.0.attn.k_proj" in q_model.autoround_config.keys()
+        assert "scale" in q_model.autoround_config["transformer.h.0.attn.k_proj"].keys()
+        assert torch.float32 == q_model.autoround_config["transformer.h.0.attn.k_proj"]["scale_dtype"]
+        assert isinstance(q_model.transformer.h[0].attn.k_proj, WeightOnlyLinear), "packing model failed."

def test_autoround_with_quantize_API(self):
gpt_j_model = copy.deepcopy(self.gptj)

2 changes: 1 addition & 1 deletion test/3x/torch/requirements.txt
@@ -1,4 +1,4 @@
-auto_round @ git+https://github.com/intel/auto-round.git@24b2e74070f2b4e6f26ff069ec75af74cf5b177c
+auto_round @ git+https://github.com/intel/auto-round.git@e24b9074af6cdb099e31c92eb81b7f5e9a4a244e
expecttest
intel_extension_for_pytorch
numpy
2 changes: 1 addition & 1 deletion test/requirements.txt
@@ -1,6 +1,6 @@
--find-links https://download.pytorch.org/whl/torch_stable.html
accelerate==0.21.0
-auto-round @ git+https://github.com/intel/auto-round.git@24b2e74070f2b4e6f26ff069ec75af74cf5b177c
+auto-round @ git+https://github.com/intel/auto-round.git@e24b9074af6cdb099e31c92eb81b7f5e9a4a244e
dynast==1.6.0rc1
horovod
intel-extension-for-pytorch