From 9994a865142ca9994675e95c648d5833bcc5573f Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 16 Jul 2024 12:38:43 +0800 Subject: [PATCH 1/4] support autoround v2.1 Signed-off-by: Kaihui-intel --- .azure-pipelines/scripts/ut/env_setup.sh | 2 +- neural_compressor/adaptor/pytorch.py | 22 ++++++-- .../adaptor/torch_utils/auto_round.py | 6 +-- .../adaptor/torch_utils/weight_only.py | 52 +++++++++++++------ neural_compressor/model/torch_model.py | 2 +- .../test_weight_only_adaptor_pytorch.py | 2 +- test/requirements.txt | 2 +- 7 files changed, 60 insertions(+), 28 deletions(-) diff --git a/.azure-pipelines/scripts/ut/env_setup.sh b/.azure-pipelines/scripts/ut/env_setup.sh index d5876b07cef..84e7fc654ec 100644 --- a/.azure-pipelines/scripts/ut/env_setup.sh +++ b/.azure-pipelines/scripts/ut/env_setup.sh @@ -92,7 +92,7 @@ elif [[ $(echo "${test_case}" | grep -c "tf pruning") != 0 ]]; then fi if [[ $(echo "${test_case}" | grep -c "api") != 0 ]] || [[ $(echo "${test_case}" | grep -c "adaptor") != 0 ]]; then - pip install auto-round + pip install git+https://github.com/intel/auto-round.git@24b2e74070f2b4e6f26ff069ec75af74cf5b177c fi # test deps diff --git a/neural_compressor/adaptor/pytorch.py b/neural_compressor/adaptor/pytorch.py index 530e15f2308..4df070d080f 100644 --- a/neural_compressor/adaptor/pytorch.py +++ b/neural_compressor/adaptor/pytorch.py @@ -4905,13 +4905,13 @@ def autoround_quantize(self, model, tune_cfg, dataloader): enable_minmax_tuning = self.recipes["autoround_args"].get("enable_minmax_tuning", True) lr = self.recipes["autoround_args"].get("lr", None) minmax_lr = self.recipes["autoround_args"].get("minmax_lr", None) - low_gpu_mem_usage = self.recipes["autoround_args"].get("low_gpu_mem_usage", True) + low_gpu_mem_usage = self.recipes["autoround_args"].get("low_gpu_mem_usage", False) iters = self.recipes["autoround_args"].get("iters", 200) seqlen = self.recipes["autoround_args"].get("seqlen", 2048) - n_samples = self.recipes["autoround_args"].get("n_samples", 512) + nsamples = self.recipes["autoround_args"].get("nsamples", 128) sampler = self.recipes["autoround_args"].get("sampler", "rand") seed = self.recipes["autoround_args"].get("seed", 42) - n_blocks = self.recipes["autoround_args"].get("n_blocks", 1) + nblocks = self.recipes["autoround_args"].get("nblocks", 1) gradient_accumulate_steps = self.recipes["autoround_args"].get("gradient_accumulate_steps", 1) not_use_best_mse = self.recipes["autoround_args"].get("not_use_best_mse", False) dynamic_max_gap = self.recipes["autoround_args"].get("dynamic_max_gap", -1) @@ -4922,6 +4922,12 @@ def autoround_quantize(self, model, tune_cfg, dataloader): bits = self.recipes["autoround_args"].get("bits", 4) group_size = self.recipes["autoround_args"].get("group_size", 128) sym = self.recipes["autoround_args"].get("scheme", "asym") == "sym" + act_bits = self.recipes["autoround_args"].get("act_bits", 32) + act_group_size = self.recipes["autoround_args"].get("act_group_size", None) + act_sym = self.recipes["autoround_args"].get("act_sym", None) + act_dynamic = self.recipes["autoround_args"].get("act_dynamic", True) + multimodal = self.recipes["autoround_args"].get("multimodal", False) + use_layer_wise = self.recipes["autoround_args"].get("use_layer_wise", False) if dataloader is not None: dataset = dataloader @@ -4944,15 +4950,21 @@ def autoround_quantize(self, model, tune_cfg, dataloader): low_gpu_mem_usage=low_gpu_mem_usage, iters=iters, seqlen=seqlen, - n_samples=n_samples, + nsamples=nsamples, sampler=sampler, seed=seed, - 
n_blocks=n_blocks, + nblocks=nblocks, gradient_accumulate_steps=gradient_accumulate_steps, not_use_best_mse=not_use_best_mse, dynamic_max_gap=dynamic_max_gap, data_type=data_type, scale_dtype=scale_dtype, + multimodal=multimodal, + act_bits=act_bits, + act_group_size=act_group_size, + act_sym=act_sym, + act_dynamic=act_dynamic, + use_layer_wise=use_layer_wise, ) return model, autoround_config diff --git a/neural_compressor/adaptor/torch_utils/auto_round.py b/neural_compressor/adaptor/torch_utils/auto_round.py index 78eca517221..a0de5ef2a75 100644 --- a/neural_compressor/adaptor/torch_utils/auto_round.py +++ b/neural_compressor/adaptor/torch_utils/auto_round.py @@ -13,7 +13,7 @@ # limitations under the License. -def get_dataloader(tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=42, bs=8, n_samples=512): +def get_dataloader(tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=42, bs=8, nsamples=128): """Generate a DataLoader for calibration using specified parameters. Args: @@ -25,7 +25,7 @@ def get_dataloader(tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=42 split (str, optional): The data split to use. Defaults to None. seed (int, optional): The random seed for reproducibility. Defaults to 42. bs (int, optional): The batch size. Defaults to 4. - n_samples (int, optional): The total number of samples to include. Defaults to 512. + n_samples (int, optional): The total number of samples to include. Defaults to 128. Returns: DataLoader: The DataLoader for the calibrated dataset. @@ -33,6 +33,6 @@ def get_dataloader(tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=42 from auto_round.calib_dataset import get_dataloader # pylint: disable=E0401 dataloader = get_dataloader( - tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=seed, bs=bs, n_samples=n_samples + tokenizer, seqlen, dataset_name=dataset_name, seed=seed, bs=bs, nsamples=nsamples ) return dataloader diff --git a/neural_compressor/adaptor/torch_utils/weight_only.py b/neural_compressor/adaptor/torch_utils/weight_only.py index c6bcf5f09e2..2adb6ec48e3 100644 --- a/neural_compressor/adaptor/torch_utils/weight_only.py +++ b/neural_compressor/adaptor/torch_utils/weight_only.py @@ -694,18 +694,24 @@ def autoround_quantize( enable_minmax_tuning: bool = True, lr: float = None, minmax_lr: float = None, - low_gpu_mem_usage: bool = True, + low_gpu_mem_usage: bool = False, iters: int = 200, seqlen: int = 2048, - n_samples: int = 512, + nsamples: int = 128, sampler: str = "rand", seed: int = 42, - n_blocks: int = 1, + nblocks: int = 1, gradient_accumulate_steps: int = 1, not_use_best_mse: bool = False, dynamic_max_gap: int = -1, data_type: str = "int", ##only support int for now scale_dtype: str = "fp16", + multimodal: bool = False, + act_bits: int = 32, + act_group_size: int = None, + act_sym: bool = None, + act_dynamic: bool = True, + use_layer_wise: bool = False, **kwargs, ): """Run autoround weight-only quantization. @@ -717,15 +723,19 @@ def autoround_quantize( sym (bool): Whether symmetric quantization is to be used (default is False). weight_config (dict): Configuration for weight quantization (default is an empty dictionary). weight_config={ - 'layer1':##layer_name - { - 'data_type': 'int', - 'bits': 4, - 'group_size': 32, - 'sym': False - } - ... 
- } + 'layer1':##layer_name + { + 'data_type': 'int', + 'bits': 4, + 'group_size': 32, + 'sym': False, + 'act_data_type': None, + 'act_bits': 32, + 'act_sym': None, + 'act_dynamic': True, + } + ..., + } enable_full_range (bool): Whether to enable full range quantization (default is False). batch_size (int): Batch size for training (default is 8). amp (bool): Whether to use automatic mixed precision (default is True). @@ -737,7 +747,7 @@ def autoround_quantize( enable_minmax_tuning (bool): Whether to enable weight min-max tuning (default is True). lr (float): The learning rate (default is None, will be set to 1.0/iters). minmax_lr (float): The learning rate for min-max tuning (default is None, it will be set to lr automatically). - low_gpu_mem_usage (bool): Whether to use low GPU memory (default is True). + low_gpu_mem_usage (bool): Whether to use low GPU memory (default is False). iters (int): Number of iterations (default is 200). seqlen (int): Data length of the sequence for tuning (default is 2048). n_samples (int): Number of samples (default is 512). @@ -750,7 +760,11 @@ def autoround_quantize( data_type (str): The data type to be used (default is "int"). scale_dtype (str): The data type of quantization scale to be used (default is "float32"), different kernels have different choices. - + multimodal(bool): Enable multimodal model quantization, (default is "False"). + act_bits (int): Number of bits for activation quantization. Default is 32. + act_group_size (int): Group size for activation quantization. Default is None. + act_sym (bool): Whether to use symmetric activation quantization. Default is None. + act_dynamic (bool): Whether to use dynamic activation quantization. Default is True. Returns: The quantized model. """ @@ -776,15 +790,21 @@ def autoround_quantize( low_gpu_mem_usage=low_gpu_mem_usage, iters=iters, seqlen=seqlen, - n_samples=n_samples, + nsamples=nsamples, sampler=sampler, seed=seed, - n_blocks=n_blocks, + nblocks=nblocks, gradient_accumulate_steps=gradient_accumulate_steps, not_use_best_mse=not_use_best_mse, dynamic_max_gap=dynamic_max_gap, data_type=data_type, ## only support data_type scale_dtype=scale_dtype, + multimodal=multimodal, + act_bits=act_bits, + act_group_size=act_group_size, + act_sym=act_sym, + act_dynamic=act_dynamic, + low_cpu_mem_usage=use_layer_wise, **kwargs, ) qdq_model, weight_config = rounder.quantize() diff --git a/neural_compressor/model/torch_model.py b/neural_compressor/model/torch_model.py index 7338f196d46..91e189777b0 100644 --- a/neural_compressor/model/torch_model.py +++ b/neural_compressor/model/torch_model.py @@ -609,7 +609,7 @@ def export_compressed_model( self.model = pack_model( self.model, - weight_config=autoround_config, + layer_config=autoround_config, enable_full_range=enable_full_range, compression_dtype=compression_dtype, compression_dim=compression_dim, diff --git a/test/adaptor/pytorch_adaptor/test_weight_only_adaptor_pytorch.py b/test/adaptor/pytorch_adaptor/test_weight_only_adaptor_pytorch.py index c3839a80b8a..61456b923f8 100644 --- a/test/adaptor/pytorch_adaptor/test_weight_only_adaptor_pytorch.py +++ b/test/adaptor/pytorch_adaptor/test_weight_only_adaptor_pytorch.py @@ -760,7 +760,7 @@ def test_AutoRound_quant(self): tokenizer = transformers.AutoTokenizer.from_pretrained( "hf-internal-testing/tiny-random-GPTJForCausalLM", trust_remote_code=True ) - dataloader = get_dataloader(tokenizer, 32, dataset_name="NeelNanda/pile-10k", seed=42, bs=8, n_samples=20) + dataloader = get_dataloader(tokenizer, 32, 
dataset_name="NeelNanda/pile-10k", seed=42, bs=8, nsamples=20) fp32_model = copy.deepcopy(self.gptj) conf = PostTrainingQuantConfig( approach="weight_only", diff --git a/test/requirements.txt b/test/requirements.txt index 3a24001cfd2..386b3ee4f2b 100644 --- a/test/requirements.txt +++ b/test/requirements.txt @@ -1,6 +1,6 @@ --find-links https://download.pytorch.org/whl/torch_stable.html accelerate==0.21.0 -auto-round +auto-round @ git+https://github.com/intel/auto-round.git@24b2e74070f2b4e6f26ff069ec75af74cf5b177c dynast==1.6.0rc1 horovod intel-extension-for-pytorch From 8712130561016a8a4e7866ef50036c0926424b4a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 16 Jul 2024 04:43:31 +0000 Subject: [PATCH 2/4] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- neural_compressor/adaptor/torch_utils/auto_round.py | 4 +--- neural_compressor/adaptor/torch_utils/weight_only.py | 1 + 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/neural_compressor/adaptor/torch_utils/auto_round.py b/neural_compressor/adaptor/torch_utils/auto_round.py index a0de5ef2a75..b5aca6258dd 100644 --- a/neural_compressor/adaptor/torch_utils/auto_round.py +++ b/neural_compressor/adaptor/torch_utils/auto_round.py @@ -32,7 +32,5 @@ def get_dataloader(tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=42 """ from auto_round.calib_dataset import get_dataloader # pylint: disable=E0401 - dataloader = get_dataloader( - tokenizer, seqlen, dataset_name=dataset_name, seed=seed, bs=bs, nsamples=nsamples - ) + dataloader = get_dataloader(tokenizer, seqlen, dataset_name=dataset_name, seed=seed, bs=bs, nsamples=nsamples) return dataloader diff --git a/neural_compressor/adaptor/torch_utils/weight_only.py b/neural_compressor/adaptor/torch_utils/weight_only.py index 2adb6ec48e3..ffd5789c612 100644 --- a/neural_compressor/adaptor/torch_utils/weight_only.py +++ b/neural_compressor/adaptor/torch_utils/weight_only.py @@ -715,6 +715,7 @@ def autoround_quantize( **kwargs, ): """Run autoround weight-only quantization. + Args: model: The PyTorch model to be quantized. tokenizer: An optional tokenizer for processing input data. If none is provided, a dataloader must be supplied. From 8e472f91e71bd8a3ba0c3bbc946a106f1d4853bd Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 16 Jul 2024 12:44:21 +0800 Subject: [PATCH 3/4] fix docstring Signed-off-by: Kaihui-intel --- neural_compressor/adaptor/torch_utils/auto_round.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neural_compressor/adaptor/torch_utils/auto_round.py b/neural_compressor/adaptor/torch_utils/auto_round.py index b5aca6258dd..9fd8667e801 100644 --- a/neural_compressor/adaptor/torch_utils/auto_round.py +++ b/neural_compressor/adaptor/torch_utils/auto_round.py @@ -25,7 +25,7 @@ def get_dataloader(tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=42 split (str, optional): The data split to use. Defaults to None. seed (int, optional): The random seed for reproducibility. Defaults to 42. bs (int, optional): The batch size. Defaults to 4. - n_samples (int, optional): The total number of samples to include. Defaults to 128. + nsamples (int, optional): The total number of samples to include. Defaults to 128. Returns: DataLoader: The DataLoader for the calibrated dataset. 
From 4f00f30eefd035f37ab54a5e001e8d97cb7cc6c3 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 16 Jul 2024 14:08:58 +0800 Subject: [PATCH 4/4] update docstring & layer_config Signed-off-by: Kaihui-intel --- neural_compressor/adaptor/torch_utils/weight_only.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/neural_compressor/adaptor/torch_utils/weight_only.py b/neural_compressor/adaptor/torch_utils/weight_only.py index ffd5789c612..5e21b97d10a 100644 --- a/neural_compressor/adaptor/torch_utils/weight_only.py +++ b/neural_compressor/adaptor/torch_utils/weight_only.py @@ -751,10 +751,10 @@ def autoround_quantize( low_gpu_mem_usage (bool): Whether to use low GPU memory (default is False). iters (int): Number of iterations (default is 200). seqlen (int): Data length of the sequence for tuning (default is 2048). - n_samples (int): Number of samples (default is 512). + nsamples (int): Number of samples (default is 128). sampler (str): The sampling method (default is "rand"). seed (int): The random seed (default is 42). - n_blocks (int): Number of blocks (default is 1). + nblocks (int): Number of blocks (default is 1). gradient_accumulate_steps (int): Number of gradient accumulation steps (default is 1). not_use_best_mse (bool): Whether to use mean squared error (default is False). dynamic_max_gap (int): The dynamic maximum gap (default is -1). @@ -777,7 +777,7 @@ def autoround_quantize( bits=bits, group_size=group_size, sym=sym, - weight_config=weight_config, + layer_config=weight_config, enable_full_range=enable_full_range, ##for symmetric, TODO support later batch_size=batch_size, amp=amp,
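# --- Usage sketch: new autoround_args recipe keys (illustrative, not part of this patch series) ---
# Patch 1/4 reads these keys via self.recipes["autoround_args"].get(...), so the
# sketch below shows how they would flow through the INC 2.x PostTrainingQuantConfig
# path.  The op_type_dict layout, the quantization.fit() call and the export step
# are assumptions based on the existing weight-only test flow; only the
# "autoround_args" keys and their defaults come from this diff.
import transformers

from neural_compressor import PostTrainingQuantConfig, quantization
from neural_compressor.adaptor.torch_utils.auto_round import get_dataloader

model_name = "hf-internal-testing/tiny-random-GPTJForCausalLM"  # tiny model from the unit test
fp32_model = transformers.AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
dataloader = get_dataloader(tokenizer, 32, dataset_name="NeelNanda/pile-10k", seed=42, bs=8, nsamples=20)

conf = PostTrainingQuantConfig(
    approach="weight_only",
    op_type_dict={
        ".*": {  # assumed catch-all pattern, mirroring other weight-only configs
            "weight": {
                "bits": 4,
                "group_size": 128,
                "scheme": "asym",
                "algorithm": "AUTOROUND",
            },
        },
    },
    recipes={
        # Keys below are exactly those read in patch 1/4; values show the new
        # defaults, shrunk where noted to keep the example fast.
        "autoround_args": {
            "nsamples": 20,              # renamed from n_samples (default now 128)
            "nblocks": 1,                # renamed from n_blocks
            "seqlen": 32,                # matches the calibration dataloader above
            "iters": 20,                 # illustrative small value; default is 200
            "low_gpu_mem_usage": False,  # default flipped from True to False
            "act_bits": 32,              # new activation-quantization controls
            "act_group_size": None,
            "act_sym": None,
            "act_dynamic": True,
            "multimodal": False,
            "use_layer_wise": False,     # forwarded to AutoRound as low_cpu_mem_usage
        },
    },
)

q_model = quantization.fit(fp32_model, conf, calib_dataloader=dataloader)
# Export goes through pack_model(), whose argument is now layer_config= rather
# than weight_config= (see the torch_model.py hunk in patch 1/4).
q_model.export_compressed_model()
# ----------------------------------------------------------------------------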