diff --git a/.azure-pipelines/scripts/ut/env_setup.sh b/.azure-pipelines/scripts/ut/env_setup.sh
index 84e7fc654ec..3715c485631 100644
--- a/.azure-pipelines/scripts/ut/env_setup.sh
+++ b/.azure-pipelines/scripts/ut/env_setup.sh
@@ -92,7 +92,7 @@ elif [[ $(echo "${test_case}" | grep -c "tf pruning") != 0 ]]; then
 fi
 
 if [[ $(echo "${test_case}" | grep -c "api") != 0 ]] || [[ $(echo "${test_case}" | grep -c "adaptor") != 0 ]]; then
-    pip install git+https://github.com/intel/auto-round.git@24b2e74070f2b4e6f26ff069ec75af74cf5b177c
+    pip install git+https://github.com/intel/auto-round.git@e24b9074af6cdb099e31c92eb81b7f5e9a4a244e
 fi
 
 # test deps
diff --git a/neural_compressor/adaptor/pytorch.py b/neural_compressor/adaptor/pytorch.py
index 4df070d080f..490008bffa9 100644
--- a/neural_compressor/adaptor/pytorch.py
+++ b/neural_compressor/adaptor/pytorch.py
@@ -4926,7 +4926,7 @@ def autoround_quantize(self, model, tune_cfg, dataloader):
         act_group_size = self.recipes["autoround_args"].get("act_group_size", None)
         act_sym = self.recipes["autoround_args"].get("act_sym", None)
         act_dynamic = self.recipes["autoround_args"].get("act_dynamic", True)
-        multimodal = self.recipes["autoround_args"].get("multimodal", False)
+        quant_block_list = self.recipes["autoround_args"].get("quant_block_list", None)
         use_layer_wise = self.recipes["autoround_args"].get("use_layer_wise", False)
 
         if dataloader is not None:
@@ -4959,7 +4959,7 @@ def autoround_quantize(self, model, tune_cfg, dataloader):
             dynamic_max_gap=dynamic_max_gap,
             data_type=data_type,
             scale_dtype=scale_dtype,
-            multimodal=multimodal,
+            quant_block_list=quant_block_list,
             act_bits=act_bits,
             act_group_size=act_group_size,
             act_sym=act_sym,
diff --git a/neural_compressor/adaptor/torch_utils/weight_only.py b/neural_compressor/adaptor/torch_utils/weight_only.py
index 5e21b97d10a..570124ae9c1 100644
--- a/neural_compressor/adaptor/torch_utils/weight_only.py
+++ b/neural_compressor/adaptor/torch_utils/weight_only.py
@@ -706,7 +706,7 @@ def autoround_quantize(
     dynamic_max_gap: int = -1,
     data_type: str = "int",  ##only support int for now
     scale_dtype: str = "fp16",
-    multimodal: bool = False,
+    quant_block_list: list = None,
     act_bits: int = 32,
     act_group_size: int = None,
     act_sym: bool = None,
@@ -761,7 +761,7 @@ def autoround_quantize(
         data_type (str): The data type to be used (default is "int").
         scale_dtype (str): The data type of quantization scale to be used (default is "float32"),
                            different kernels have different choices.
-        multimodal(bool): Enable multimodal model quantization, (default is "False").
+        quant_block_list (list): A list whose elements are lists of layer names for the blocks to be quantized.
         act_bits (int): Number of bits for activation quantization. Default is 32.
         act_group_size (int): Group size for activation quantization. Default is None.
         act_sym (bool): Whether to use symmetric activation quantization. Default is None.
@@ -800,7 +800,7 @@ def autoround_quantize(
         dynamic_max_gap=dynamic_max_gap,
         data_type=data_type,  ## only support data_type
         scale_dtype=scale_dtype,
-        multimodal=multimodal,
+        quant_block_list=quant_block_list,
         act_bits=act_bits,
         act_group_size=act_group_size,
         act_sym=act_sym,
diff --git a/neural_compressor/torch/algorithms/weight_only/autoround.py b/neural_compressor/torch/algorithms/weight_only/autoround.py
index 9ff488573c0..9931a9e87b3 100644
--- a/neural_compressor/torch/algorithms/weight_only/autoround.py
+++ b/neural_compressor/torch/algorithms/weight_only/autoround.py
@@ -55,7 +55,7 @@ def __init__(
         dynamic_max_gap: int = -1,
         data_type: str = "int",
         scale_dtype: str = "fp16",
-        multimodal: bool = False,
+        quant_block_list: list = None,
         act_bits: int = 32,
         act_group_size: int = None,
         act_sym: bool = None,
@@ -113,8 +113,8 @@ def __init__(
             dynamic_max_gap (int): The dynamic maximum gap (default is -1).
             data_type (str): The data type to be used (default is "int").
             scale_dtype (str): The data type of quantization scale to be used (default is "float16"), different kernels
-            have different choices.
-            multimodal(bool): Enable multimodal model quantization, (default is "False").
+                have different choices.
+            quant_block_list (list): A list whose elements are lists of layer names for the blocks to be quantized.
             act_bits (int): Number of bits for activation quantization. Default is 32.
             act_group_size (int): Group size for activation quantization. Default is None.
             act_sym (bool): Whether to use symmetric activation quantization. Default is None.
@@ -146,7 +146,7 @@ def __init__(
         self.dynamic_max_gap = dynamic_max_gap
         self.data_type = data_type
         self.scale_dtype = scale_dtype
-        self.multimodal = multimodal
+        self.quant_block_list = quant_block_list
         self.act_bits = act_bits
         self.act_group_size = act_group_size
         self.act_sym = act_sym
@@ -202,7 +202,7 @@ def convert(self, model: torch.nn.Module, *args, **kwargs):
             dynamic_max_gap=self.dynamic_max_gap,
             data_type=self.data_type,
             scale_dtype=self.scale_dtype,
-            multimodal=self.multimodal,
+            quant_block_list=self.quant_block_list,
             act_bits=self.act_bits,
             act_group_size=self.act_group_size,
             act_sym=self.act_sym,
diff --git a/neural_compressor/torch/quantization/algorithm_entry.py b/neural_compressor/torch/quantization/algorithm_entry.py
index 1493d176484..014e48129e7 100644
--- a/neural_compressor/torch/quantization/algorithm_entry.py
+++ b/neural_compressor/torch/quantization/algorithm_entry.py
@@ -567,9 +567,14 @@ def autoround_quantize_entry(
         if quant_config.name != AUTOROUND or quant_config.dtype == "fp32":
             continue
         else:
+            dtype = quant_config.dtype
+            bits = quant_config.bits
+            if dtype != "int" and "int" in dtype:
+                bits = int(dtype.lstrip("int"))
+                dtype = "int"
             weight_config[op_name] = {
-                "data_type": quant_config.dtype,
-                "bits": quant_config.bits,
+                "data_type": dtype,
+                "bits": bits,
                 "sym": quant_config.use_sym,
                 "group_size": quant_config.group_size,
                 "act_bits": quant_config.act_bits,
@@ -595,7 +600,7 @@ def autoround_quantize_entry(
     not_use_best_mse = quant_config.not_use_best_mse
     dynamic_max_gap = quant_config.dynamic_max_gap
     scale_dtype = quant_config.scale_dtype
-    multimodal = quant_config.multimodal
+    quant_block_list = quant_config.quant_block_list
     low_cpu_mem_usage = quant_config.use_layer_wise
 
     kwargs.pop("example_inputs")
@@ -622,7 +627,7 @@ def autoround_quantize_entry(
         not_use_best_mse=not_use_best_mse,
         dynamic_max_gap=dynamic_max_gap,
         scale_dtype=scale_dtype,
-        multimodal=multimodal,
+        quant_block_list=quant_block_list,
         low_cpu_mem_usage=low_cpu_mem_usage,
     )
     model = quantizer.execute(model=model, mode=mode, *args, **kwargs)
diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py
index 10dbd01657d..c4b4f6bf04a 100644
--- a/neural_compressor/torch/quantization/config.py
+++ b/neural_compressor/torch/quantization/config.py
@@ -917,7 +917,7 @@ def __init__(
         dynamic_max_gap: int = -1,
         scale_dtype: str = "fp16",
         use_layer_wise: bool = False,
-        multimodal: bool = False,
+        quant_block_list: list = None,
         white_list: Optional[List[OP_NAME_OR_MODULE_TYPE]] = DEFAULT_WHITE_LIST,
     ):
         """Init AUTOROUND weight-only quantization config.
@@ -951,7 +951,7 @@ def __init__(
             scale_dtype (str): The data type of quantization scale to be used (default is "float16"),
                                different kernels have different choices.
             use_layer_wise (bool): Enables quantize model per layer. Defaults to False.
-            multimodal(bool): Enable multimodal model quantization, (default is "False").
+            quant_block_list (list): A list whose elements are lists of layer names for the blocks to be quantized.
             white_list (Optional[List[OP_NAME_OR_MODULE_TYPE]]): White list of operator names or module types.
                                                                  Default is DEFAULT_WHITE_LIST.
         """
@@ -983,7 +983,7 @@ def __init__(
         self.dynamic_max_gap = dynamic_max_gap
         self.scale_dtype = scale_dtype
         self.use_layer_wise = use_layer_wise
-        self.multimodal = multimodal
+        self.quant_block_list = quant_block_list
         self._post_init()
 
     @classmethod
diff --git a/test/3x/torch/quantization/weight_only/test_autoround.py b/test/3x/torch/quantization/weight_only/test_autoround.py
index f5351656595..b30c6c644bc 100644
--- a/test/3x/torch/quantization/weight_only/test_autoround.py
+++ b/test/3x/torch/quantization/weight_only/test_autoround.py
@@ -80,6 +80,23 @@ def test_autoround(self, quant_lm_head):
         if quant_lm_head is True:
             assert isinstance(q_model.lm_head, WeightOnlyLinear), "quantization for lm_head failed."
 
+    def test_int4_dtype(self):
+        fp32_model = copy.deepcopy(self.gptj)
+        quant_config = AutoRoundConfig(dtype="int4", nsamples=32, seqlen=10, iters=10, scale_dtype="fp32")
+        logger.info(f"Test AutoRound with config {quant_config}")
+
+        # prepare + convert API
+        model = prepare(model=fp32_model, quant_config=quant_config)
+
+        run_fn(model, self.dataloader)
+        q_model = convert(model)
+        out = q_model(self.inp)[0]
+        assert torch.allclose(out, self.label, atol=1e-1)
+        assert "transformer.h.0.attn.k_proj" in q_model.autoround_config.keys()
+        assert "scale" in q_model.autoround_config["transformer.h.0.attn.k_proj"].keys()
+        assert torch.float32 == q_model.autoround_config["transformer.h.0.attn.k_proj"]["scale_dtype"]
+        assert isinstance(q_model.transformer.h[0].attn.k_proj, WeightOnlyLinear), "packing model failed."
+
     def test_autoround_with_quantize_API(self):
         gpt_j_model = copy.deepcopy(self.gptj)
 
diff --git a/test/3x/torch/requirements.txt b/test/3x/torch/requirements.txt
index 3a71a9603d5..c17e22d6f77 100644
--- a/test/3x/torch/requirements.txt
+++ b/test/3x/torch/requirements.txt
@@ -1,4 +1,4 @@
-auto_round @ git+https://github.com/intel/auto-round.git@24b2e74070f2b4e6f26ff069ec75af74cf5b177c
+auto_round @ git+https://github.com/intel/auto-round.git@e24b9074af6cdb099e31c92eb81b7f5e9a4a244e
 expecttest
 intel_extension_for_pytorch
 numpy
diff --git a/test/requirements.txt b/test/requirements.txt
index 386b3ee4f2b..1999f21e668 100644
--- a/test/requirements.txt
+++ b/test/requirements.txt
@@ -1,6 +1,6 @@
 --find-links https://download.pytorch.org/whl/torch_stable.html
 accelerate==0.21.0
-auto-round @ git+https://github.com/intel/auto-round.git@24b2e74070f2b4e6f26ff069ec75af74cf5b177c
+auto-round @ git+https://github.com/intel/auto-round.git@e24b9074af6cdb099e31c92eb81b7f5e9a4a244e
 dynast==1.6.0rc1
 horovod
 intel-extension-for-pytorch
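Reviewer note: the new branch in `autoround_quantize_entry` lets an "intN"-style dtype string (such as "int4", exercised by the new `test_int4_dtype`) stand in for `dtype="int"` plus an explicit `bits` value. A minimal standalone sketch of that normalization, assuming nothing beyond the logic shown in the diff (the `normalize_dtype` helper name is illustrative, not part of the patch):

```python
def normalize_dtype(dtype: str, bits: int) -> tuple:
    """Split an "intN"-style dtype string into ("int", N); leave plain "int" alone."""
    if dtype != "int" and "int" in dtype:
        # str.lstrip("int") strips the characters 'i', 'n', 't' from the left,
        # leaving only the numeric suffix: "int4" -> "4".
        bits = int(dtype.lstrip("int"))
        dtype = "int"
    return dtype, bits

assert normalize_dtype("int4", 8) == ("int", 4)  # bits derived from the dtype string
assert normalize_dtype("int", 8) == ("int", 8)   # plain "int" keeps the configured bits
```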
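On the API change itself: `quant_block_list` replaces the `multimodal` flag with an explicit description of which blocks to quantize. A hypothetical usage sketch against the new `AutoRoundConfig`; the GPT-J-style block names are illustrative only, and the nested-list shape follows the updated docstring (each element is a list of layer names forming one block chain):

```python
from neural_compressor.torch.quantization import AutoRoundConfig

quant_config = AutoRoundConfig(
    dtype="int4",  # normalized to data_type="int", bits=4, as sketched above
    quant_block_list=[["transformer.h.0", "transformer.h.1"]],  # illustrative block names
)
```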