From a1feaf5b222e91ec2feebe7d4598417b423e82c4 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Fri, 19 Jul 2024 09:56:24 +0800 Subject: [PATCH 1/5] update commit version Signed-off-by: Kaihui-intel --- .azure-pipelines/scripts/ut/env_setup.sh | 2 +- test/3x/torch/requirements.txt | 2 +- test/requirements.txt | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.azure-pipelines/scripts/ut/env_setup.sh b/.azure-pipelines/scripts/ut/env_setup.sh index 84e7fc654ec..4f1986be13d 100644 --- a/.azure-pipelines/scripts/ut/env_setup.sh +++ b/.azure-pipelines/scripts/ut/env_setup.sh @@ -92,7 +92,7 @@ elif [[ $(echo "${test_case}" | grep -c "tf pruning") != 0 ]]; then fi if [[ $(echo "${test_case}" | grep -c "api") != 0 ]] || [[ $(echo "${test_case}" | grep -c "adaptor") != 0 ]]; then - pip install git+https://github.com/intel/auto-round.git@24b2e74070f2b4e6f26ff069ec75af74cf5b177c + pip install git+https://github.com/intel/auto-round.git@61cf9eef4a3ccb5a2d83a557deb709091a548581 fi # test deps diff --git a/test/3x/torch/requirements.txt b/test/3x/torch/requirements.txt index 3a71a9603d5..24ab6143ab3 100644 --- a/test/3x/torch/requirements.txt +++ b/test/3x/torch/requirements.txt @@ -1,4 +1,4 @@ -auto_round @ git+https://github.com/intel/auto-round.git@24b2e74070f2b4e6f26ff069ec75af74cf5b177c +auto_round @ git+https://github.com/intel/auto-round.git@61cf9eef4a3ccb5a2d83a557deb709091a548581 expecttest intel_extension_for_pytorch numpy diff --git a/test/requirements.txt b/test/requirements.txt index 386b3ee4f2b..48454803e4a 100644 --- a/test/requirements.txt +++ b/test/requirements.txt @@ -1,6 +1,6 @@ --find-links https://download.pytorch.org/whl/torch_stable.html accelerate==0.21.0 -auto-round @ git+https://github.com/intel/auto-round.git@24b2e74070f2b4e6f26ff069ec75af74cf5b177c +auto-round @ git+https://github.com/intel/auto-round.git@61cf9eef4a3ccb5a2d83a557deb709091a548581 dynast==1.6.0rc1 horovod intel-extension-for-pytorch From e538a63110b71c303bcceef265b7acaeed9df5f2 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Mon, 22 Jul 2024 16:18:20 +0800 Subject: [PATCH 2/5] update int4 usage Signed-off-by: Kaihui-intel --- neural_compressor/torch/quantization/algorithm_entry.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/neural_compressor/torch/quantization/algorithm_entry.py b/neural_compressor/torch/quantization/algorithm_entry.py index 1493d176484..23ce9f31dac 100644 --- a/neural_compressor/torch/quantization/algorithm_entry.py +++ b/neural_compressor/torch/quantization/algorithm_entry.py @@ -567,9 +567,14 @@ def autoround_quantize_entry( if quant_config.name != AUTOROUND or quant_config.dtype == "fp32": continue else: + dtype = quant_config.dtype + bits = quant_config.bits + if dtype != "int" and "int" in dtype: + bits = int(dtype.lstrip("int")) + dtype = "int" weight_config[op_name] = { - "data_type": quant_config.dtype, - "bits": quant_config.bits, + "data_type": dtype, + "bits": bits, "sym": quant_config.use_sym, "group_size": quant_config.group_size, "act_bits": quant_config.act_bits, From ad7ac9d117db4baf95205e0192cbb15e7954e8ce Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 23 Jul 2024 10:14:23 +0800 Subject: [PATCH 3/5] update commit version Signed-off-by: Kaihui-intel --- .azure-pipelines/scripts/ut/env_setup.sh | 2 +- test/3x/torch/requirements.txt | 2 +- test/requirements.txt | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.azure-pipelines/scripts/ut/env_setup.sh b/.azure-pipelines/scripts/ut/env_setup.sh index 4f1986be13d..3715c485631 100644 --- a/.azure-pipelines/scripts/ut/env_setup.sh +++ b/.azure-pipelines/scripts/ut/env_setup.sh @@ -92,7 +92,7 @@ elif [[ $(echo "${test_case}" | grep -c "tf pruning") != 0 ]]; then fi if [[ $(echo "${test_case}" | grep -c "api") != 0 ]] || [[ $(echo "${test_case}" | grep -c "adaptor") != 0 ]]; then - pip install git+https://github.com/intel/auto-round.git@61cf9eef4a3ccb5a2d83a557deb709091a548581 + pip install git+https://github.com/intel/auto-round.git@e24b9074af6cdb099e31c92eb81b7f5e9a4a244e fi # test deps diff --git a/test/3x/torch/requirements.txt b/test/3x/torch/requirements.txt index 24ab6143ab3..c17e22d6f77 100644 --- a/test/3x/torch/requirements.txt +++ b/test/3x/torch/requirements.txt @@ -1,4 +1,4 @@ -auto_round @ git+https://github.com/intel/auto-round.git@61cf9eef4a3ccb5a2d83a557deb709091a548581 +auto_round @ git+https://github.com/intel/auto-round.git@e24b9074af6cdb099e31c92eb81b7f5e9a4a244e expecttest intel_extension_for_pytorch numpy diff --git a/test/requirements.txt b/test/requirements.txt index 48454803e4a..1999f21e668 100644 --- a/test/requirements.txt +++ b/test/requirements.txt @@ -1,6 +1,6 @@ --find-links https://download.pytorch.org/whl/torch_stable.html accelerate==0.21.0 -auto-round @ git+https://github.com/intel/auto-round.git@61cf9eef4a3ccb5a2d83a557deb709091a548581 +auto-round @ git+https://github.com/intel/auto-round.git@e24b9074af6cdb099e31c92eb81b7f5e9a4a244e dynast==1.6.0rc1 horovod intel-extension-for-pytorch From f7cfe72235a1434212a711a40a8032cc28905eee Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 23 Jul 2024 12:47:16 +0800 Subject: [PATCH 4/5] add int4 ut Signed-off-by: Kaihui-intel --- .../quantization/weight_only/test_autoround.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/test/3x/torch/quantization/weight_only/test_autoround.py b/test/3x/torch/quantization/weight_only/test_autoround.py index f5351656595..b30c6c644bc 100644 --- a/test/3x/torch/quantization/weight_only/test_autoround.py +++ b/test/3x/torch/quantization/weight_only/test_autoround.py @@ -80,6 +80,23 @@ def test_autoround(self, quant_lm_head): if quant_lm_head is True: assert isinstance(q_model.lm_head, WeightOnlyLinear), "quantization for lm_head failed." + def test_int4_dtype(self): + fp32_model = copy.deepcopy(self.gptj) + quant_config = AutoRoundConfig(dtype="int4", nsamples=32, seqlen=10, iters=10, scale_dtype="fp32") + logger.info(f"Test AutoRound with config {quant_config}") + + # prepare + convert API + model = prepare(model=fp32_model, quant_config=quant_config) + + run_fn(model, self.dataloader) + q_model = convert(model) + out = q_model(self.inp)[0] + assert torch.allclose(out, self.label, atol=1e-1) + assert "transformer.h.0.attn.k_proj" in q_model.autoround_config.keys() + assert "scale" in q_model.autoround_config["transformer.h.0.attn.k_proj"].keys() + assert torch.float32 == q_model.autoround_config["transformer.h.0.attn.k_proj"]["scale_dtype"] + assert isinstance(q_model.transformer.h[0].attn.k_proj, WeightOnlyLinear), "packing model failed." + def test_autoround_with_quantize_API(self): gpt_j_model = copy.deepcopy(self.gptj) From 650d4356fc01b38ede2463835b695e2e41016cea Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 23 Jul 2024 13:04:37 +0800 Subject: [PATCH 5/5] update config Signed-off-by: Kaihui-intel --- neural_compressor/adaptor/pytorch.py | 4 ++-- neural_compressor/adaptor/torch_utils/weight_only.py | 6 +++--- .../torch/algorithms/weight_only/autoround.py | 8 ++++---- neural_compressor/torch/quantization/algorithm_entry.py | 4 ++-- neural_compressor/torch/quantization/config.py | 6 +++--- 5 files changed, 14 insertions(+), 14 deletions(-) diff --git a/neural_compressor/adaptor/pytorch.py b/neural_compressor/adaptor/pytorch.py index 4df070d080f..490008bffa9 100644 --- a/neural_compressor/adaptor/pytorch.py +++ b/neural_compressor/adaptor/pytorch.py @@ -4926,7 +4926,7 @@ def autoround_quantize(self, model, tune_cfg, dataloader): act_group_size = self.recipes["autoround_args"].get("act_group_size", None) act_sym = self.recipes["autoround_args"].get("act_sym", None) act_dynamic = self.recipes["autoround_args"].get("act_dynamic", True) - multimodal = self.recipes["autoround_args"].get("multimodal", False) + quant_block_list = self.recipes["autoround_args"].get("quant_block_list", None) use_layer_wise = self.recipes["autoround_args"].get("use_layer_wise", False) if dataloader is not None: @@ -4959,7 +4959,7 @@ def autoround_quantize(self, model, tune_cfg, dataloader): dynamic_max_gap=dynamic_max_gap, data_type=data_type, scale_dtype=scale_dtype, - multimodal=multimodal, + quant_block_list=quant_block_list, act_bits=act_bits, act_group_size=act_group_size, act_sym=act_sym, diff --git a/neural_compressor/adaptor/torch_utils/weight_only.py b/neural_compressor/adaptor/torch_utils/weight_only.py index 5e21b97d10a..570124ae9c1 100644 --- a/neural_compressor/adaptor/torch_utils/weight_only.py +++ b/neural_compressor/adaptor/torch_utils/weight_only.py @@ -706,7 +706,7 @@ def autoround_quantize( dynamic_max_gap: int = -1, data_type: str = "int", ##only support int for now scale_dtype: str = "fp16", - multimodal: bool = False, + quant_block_list: list = None, act_bits: int = 32, act_group_size: int = None, act_sym: bool = None, @@ -761,7 +761,7 @@ def autoround_quantize( data_type (str): The data type to be used (default is "int"). scale_dtype (str): The data type of quantization scale to be used (default is "float32"), different kernels have different choices. - multimodal(bool): Enable multimodal model quantization, (default is "False"). + quant_block_list (list): A list whose elements are list of block's layer names to be quantized. act_bits (int): Number of bits for activation quantization. Default is 32. act_group_size (int): Group size for activation quantization. Default is None. act_sym (bool): Whether to use symmetric activation quantization. Default is None. @@ -800,7 +800,7 @@ def autoround_quantize( dynamic_max_gap=dynamic_max_gap, data_type=data_type, ## only support data_type scale_dtype=scale_dtype, - multimodal=multimodal, + quant_block_list=quant_block_list, act_bits=act_bits, act_group_size=act_group_size, act_sym=act_sym, diff --git a/neural_compressor/torch/algorithms/weight_only/autoround.py b/neural_compressor/torch/algorithms/weight_only/autoround.py index 6f5a022cfee..a8ca1e2984f 100644 --- a/neural_compressor/torch/algorithms/weight_only/autoround.py +++ b/neural_compressor/torch/algorithms/weight_only/autoround.py @@ -53,7 +53,7 @@ def __init__( dynamic_max_gap: int = -1, data_type: str = "int", scale_dtype: str = "fp16", - multimodal: bool = False, + quant_block_list: list = None, act_bits: int = 32, act_group_size: int = None, act_sym: bool = None, @@ -112,7 +112,7 @@ def __init__( data_type (str): The data type to be used (default is "int"). scale_dtype (str): The data type of quantization scale to be used (default is "float16"), different kernels have different choices. - multimodal(bool): Enable multimodal model quantization, (default is "False"). + quant_block_list (list): A list whose elements are list of block's layer names to be quantized. act_bits (int): Number of bits for activation quantization. Default is 32. act_group_size (int): Group size for activation quantization. Default is None. act_sym (bool): Whether to use symmetric activation quantization. Default is None. @@ -144,7 +144,7 @@ def __init__( self.dynamic_max_gap = dynamic_max_gap self.data_type = data_type self.scale_dtype = scale_dtype - self.multimodal = multimodal + self.quant_block_list = quant_block_list self.act_bits = act_bits self.act_group_size = act_group_size self.act_sym = act_sym @@ -191,7 +191,7 @@ def convert(self, model: torch.nn.Module, *args, **kwargs): dynamic_max_gap=self.dynamic_max_gap, data_type=self.data_type, scale_dtype=self.scale_dtype, - multimodal=self.multimodal, + quant_block_list=self.quant_block_list, act_bits=self.act_bits, act_group_size=self.act_group_size, act_sym=self.act_sym, diff --git a/neural_compressor/torch/quantization/algorithm_entry.py b/neural_compressor/torch/quantization/algorithm_entry.py index 23ce9f31dac..014e48129e7 100644 --- a/neural_compressor/torch/quantization/algorithm_entry.py +++ b/neural_compressor/torch/quantization/algorithm_entry.py @@ -600,7 +600,7 @@ def autoround_quantize_entry( not_use_best_mse = quant_config.not_use_best_mse dynamic_max_gap = quant_config.dynamic_max_gap scale_dtype = quant_config.scale_dtype - multimodal = quant_config.multimodal + quant_block_list = quant_config.quant_block_list low_cpu_mem_usage = quant_config.use_layer_wise kwargs.pop("example_inputs") @@ -627,7 +627,7 @@ def autoround_quantize_entry( not_use_best_mse=not_use_best_mse, dynamic_max_gap=dynamic_max_gap, scale_dtype=scale_dtype, - multimodal=multimodal, + quant_block_list=quant_block_list, low_cpu_mem_usage=low_cpu_mem_usage, ) model = quantizer.execute(model=model, mode=mode, *args, **kwargs) diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py index 75e6460a53e..2a59ca107c9 100644 --- a/neural_compressor/torch/quantization/config.py +++ b/neural_compressor/torch/quantization/config.py @@ -773,7 +773,7 @@ def __init__( dynamic_max_gap: int = -1, scale_dtype: str = "fp16", use_layer_wise: bool = False, - multimodal: bool = False, + quant_block_list: list = None, white_list: Optional[List[OP_NAME_OR_MODULE_TYPE]] = DEFAULT_WHITE_LIST, ): """Init AUTOROUND weight-only quantization config. @@ -807,7 +807,7 @@ def __init__( scale_dtype (str): The data type of quantization scale to be used (default is "float16"), different kernels have different choices. use_layer_wise (bool): Enables quantize model per layer. Defaults to False. - multimodal(bool): Enable multimodal model quantization, (default is "False"). + quant_block_list (list): A list whose elements are list of block's layer names to be quantized. """ super().__init__(white_list=white_list) self.dtype = dtype @@ -837,7 +837,7 @@ def __init__( self.dynamic_max_gap = dynamic_max_gap self.scale_dtype = scale_dtype self.use_layer_wise = use_layer_wise - self.multimodal = multimodal + self.quant_block_list = quant_block_list self._post_init() @classmethod