From aecb058f65b5c4e4037d16c3a991dcb212d044e7 Mon Sep 17 00:00:00 2001
From: Kaihui-intel
Date: Wed, 6 Mar 2024 09:47:09 +0800
Subject: [PATCH 1/2] remove autoround args

Signed-off-by: Kaihui-intel
---
 neural_compressor/adaptor/pytorch.py          |  8 ---
 .../adaptor/torch_utils/weight_only.py        | 16 +-----
 neural_compressor/model/torch_model.py        |  4 +-
 .../test_weight_only_adaptor_pytorch.py       | 55 -------------------
 .../test_weight_only_quantization.py          | 30 ----------
 5 files changed, 5 insertions(+), 108 deletions(-)

diff --git a/neural_compressor/adaptor/pytorch.py b/neural_compressor/adaptor/pytorch.py
index e08ab7fc794..c937cda3d70 100644
--- a/neural_compressor/adaptor/pytorch.py
+++ b/neural_compressor/adaptor/pytorch.py
@@ -4918,8 +4918,6 @@ def autoround_quantize(self, model, tune_cfg, dataloader):
         # auto round recipes
         enable_full_range = self.recipes["autoround_args"].get("enable_full_range", False)
         bs = self.recipes["autoround_args"].get("bs", 8)
-        amp = self.recipes["autoround_args"].get("amp", True)
-        device = self.recipes["autoround_args"].get("device", "cpu")
         lr_scheduler = self.recipes["autoround_args"].get("lr_scheduler", None)
         dataset_name = self.recipes["autoround_args"].get("dataset_name", "NeelNanda/pile-10k")
         dataset_split = self.recipes["autoround_args"].get("dataset_split", "train")
@@ -4939,8 +4937,6 @@ def autoround_quantize(self, model, tune_cfg, dataloader):
         dynamic_max_gap = self.recipes["autoround_args"].get("dynamic_max_gap", -1)
         data_type = self.recipes["autoround_args"].get("data_type", "int")  ##only support data_type
         scale_dtype = self.recipes["autoround_args"].get("scale_dtype", "fp16")
-        # autoround export
-        export_args = self.recipes["autoround_args"].get("export_args", {"format": None})
 
         model, autoround_config = autoround_quantize(
             model=model,
@@ -4951,8 +4947,6 @@ def autoround_quantize(self, model, tune_cfg, dataloader):
             weight_config=weight_config,
             enable_full_range=enable_full_range,
             bs=bs,
-            amp=amp,
-            device=device,
             lr_scheduler=lr_scheduler,
             dataloader=dataloader,
             dataset_name=dataset_name,
@@ -4973,8 +4967,6 @@ def autoround_quantize(self, model, tune_cfg, dataloader):
             dynamic_max_gap=dynamic_max_gap,
             data_type=data_type,
             scale_dtype=scale_dtype,
-            # export arguments
-            export_args=export_args,
         )
         return model, autoround_config
 
diff --git a/neural_compressor/adaptor/torch_utils/weight_only.py b/neural_compressor/adaptor/torch_utils/weight_only.py
index d3816323432..d40cc2f5fab 100644
--- a/neural_compressor/adaptor/torch_utils/weight_only.py
+++ b/neural_compressor/adaptor/torch_utils/weight_only.py
@@ -682,7 +682,7 @@ def autoround_quantize(
     enable_full_range: bool = False,  ##for symmetric, TODO support later
     bs: int = 8,
     amp: bool = True,
-    device="cuda:0",
+    device=None,
     lr_scheduler=None,
     dataloader=None,  ## to support later
     dataset_name: str = "NeelNanda/pile-10k",
@@ -703,7 +703,6 @@ def autoround_quantize(
     dynamic_max_gap: int = -1,
     data_type: str = "int",  ##only support data_type
     scale_dtype="fp16",
-    export_args: dict = {"format": None, "inplace": True},
     **kwargs,
 ):
     """Run autoround weight-only quantization.
@@ -726,8 +725,8 @@ def autoround_quantize(
                     }
         enable_full_range (bool): Whether to enable full range quantization (default is False).
         bs (int): Batch size for training (default is 8).
-        amp (bool): Whether to use automatic mixed precision (default is True).
-        device: The device to be used for tuning (default is "cuda:0").
+        amp (bool): Whether to use automatic mixed precision (default is True). Automatically detected and set.
+        device: The device to be used for tuning (default is None). Automatically detected and set.
         lr_scheduler: The learning rate scheduler to be used.
         dataloader: The dataloader for input data (to be supported in future).
         dataset_name (str): The default dataset name (default is "NeelNanda/pile-10k").
@@ -747,8 +746,6 @@ def autoround_quantize(
         not_use_best_mse (bool): Whether to use mean squared error (default is False).
         dynamic_max_gap (int): The dynamic maximum gap (default is -1).
         data_type (str): The data type to be used (default is "int").
-        export_args (dict): The arguments for exporting compressed model, default is {"format": None, "inplace": True}.
-            Supported format: "itrex", "auto_gptq".
         **kwargs: Additional keyword arguments.
 
     Returns:
@@ -790,11 +787,4 @@ def autoround_quantize(
         **kwargs,
     )
     qdq_model, weight_config = rounder.quantize()
-    if export_args["format"] is not None:
-        output_dir = export_args.get("output_dir", None)
-        format = export_args["format"]
-        inplace = export_args.get("inplace", True)
-        use_triton = export_args.get("use_triton", False)
-        model = rounder.save_quantized(output_dir=output_dir, format=format, inplace=inplace, use_triton=use_triton)
-        return model, weight_config
     return qdq_model, weight_config
diff --git a/neural_compressor/model/torch_model.py b/neural_compressor/model/torch_model.py
index 3685c5c208d..7aea57770c5 100644
--- a/neural_compressor/model/torch_model.py
+++ b/neural_compressor/model/torch_model.py
@@ -559,9 +559,9 @@ def export_compressed_model(
                 new_module.pack(int_weight, gptq_scale, gptq_zp, m.bias, gptq_perm)
                 set_module(self.model, k, new_module)
         elif autoround_config:
-            from auto_round.export.export_to_itrex import compress_model  # pylint: disable=E0401
+            from auto_round.export.export_to_itrex.export import _pack_model  # pylint: disable=E0401
 
-            self.model = compress_model(
+            self.model = _pack_model(
                 self.model,
                 weight_config=autoround_config,
                 enable_full_range=enable_full_range,
diff --git a/test/adaptor/pytorch_adaptor/test_weight_only_adaptor_pytorch.py b/test/adaptor/pytorch_adaptor/test_weight_only_adaptor_pytorch.py
index 8ca5c73d50c..8c2bb7844fd 100644
--- a/test/adaptor/pytorch_adaptor/test_weight_only_adaptor_pytorch.py
+++ b/test/adaptor/pytorch_adaptor/test_weight_only_adaptor_pytorch.py
@@ -778,11 +778,9 @@ def test_AutoRound_quant(self):
             recipes={
                 "autoround_args": {
                     "n_samples": 20,
-                    "amp": False,
                     "seq_len": 10,
                     "iters": 10,
                     "scale_dtype": "fp32",
-                    "device": "cpu",
                 },
             },
         )
@@ -809,59 +807,6 @@ def test_AutoRound_quant(self):
         self.assertTrue(isinstance(q_model.model.transformer.h[0].attn.k_proj, WeightOnlyLinear))
         self.assertTrue(isinstance(export_model.transformer.h[0].attn.k_proj, WeightOnlyLinear))
 
-        fp32_model = copy.deepcopy(self.gptj)
-
-        conf = PostTrainingQuantConfig(
-            approach="weight_only",
-            op_type_dict={
-                ".*": {  # re.match
-                    "weight": {
-                        "dtype": "int",
-                        "bits": 4,
-                        "group_size": 32,  # -1 (per-channel)
-                        "scheme": "sym",
-                        "algorithm": "AUTOROUND",
-                    },
-                },
-            },
-            op_name_dict={
-                ".*lm_head": {  # re.match
-                    "weight": {"dtype": "fp32"},
-                },
-            },
-            recipes={
-                "autoround_args": {
-                    "n_samples": 20,
-                    "amp": False,
-                    "seq_len": 10,
-                    "iters": 10,
-                    "scale_dtype": "fp32",
-                    "device": "cpu",
-                    "export_args": {"format": "itrex", "inplace": False},
-                },
-            },
-        )
-        """All export arguments.
- - "export_args": { - "format": "itrex", # "iterx", "auto_gptq", default is None - "output_dir": None, # saved path - "inplace": False, - "use_triton": False, - } - """ - input = torch.ones([1, 512], dtype=torch.long) - fp32_model = copy.deepcopy(self.gptj) - out1 = fp32_model(input) - export_model = quantization.fit( - fp32_model, - conf, - calib_dataloader=dataloader, - ) - out2 = export_model.model(input) - self.assertTrue(torch.allclose(out1[0], out2[0], atol=1e-01)) - self.assertTrue(isinstance(export_model.model.transformer.h[0].attn.k_proj, WeightOnlyLinear)) - if __name__ == "__main__": unittest.main() diff --git a/test/quantization/test_weight_only_quantization.py b/test/quantization/test_weight_only_quantization.py index 254742329e1..b990f9aee9f 100644 --- a/test/quantization/test_weight_only_quantization.py +++ b/test/quantization/test_weight_only_quantization.py @@ -278,8 +278,6 @@ def test_autoround_int_quant(self): model=model, tokenizer=self.tokenizer, n_samples=20, - device=device, - amp=False, seqlen=10, iters=10, scale_dtype="fp32", @@ -292,34 +290,6 @@ def test_autoround_int_quant(self): self.assertFalse(torch.all(out1[0] == out2[0])) self.assertTrue(torch.all(out2[0] == out3[0])) - def test_autoround_export(self): - model = copy.deepcopy(self.gptj) - device = "cpu" - model = model - out1 = model(self.lm_input) - export_model, weight_config1 = autoround_quantize( - model=model, - tokenizer=self.tokenizer, - n_samples=20, - device=device, - amp=False, - seqlen=10, - iters=10, - scale_dtype="fp32", - export_args={"format": "itrex", "inplace": True}, - ) - export_model = export_model - model = model - out2 = model(self.lm_input) - out3 = export_model(self.lm_input) - self.assertTrue(torch.all(torch.isclose(out1[0], out2[0], atol=1e-1))) - self.assertFalse(torch.all(out1[0] == out2[0])) - self.assertTrue(torch.all(out2[0] == out3[0])) - - from auto_round.export.export_to_itrex.model_wrapper import WeightOnlyLinear - - self.assertTrue(isinstance(export_model.transformer.h[0].attn.k_proj, WeightOnlyLinear)) - if __name__ == "__main__": unittest.main() From 3af0996e38951b610912f78d5feea9fec3887180 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Wed, 6 Mar 2024 10:30:16 +0800 Subject: [PATCH 2/2] update autoround version Signed-off-by: Kaihui-intel --- .azure-pipelines/scripts/ut/env_setup.sh | 2 +- test/requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.azure-pipelines/scripts/ut/env_setup.sh b/.azure-pipelines/scripts/ut/env_setup.sh index f3bfd6fe7db..70a77354f58 100644 --- a/.azure-pipelines/scripts/ut/env_setup.sh +++ b/.azure-pipelines/scripts/ut/env_setup.sh @@ -99,7 +99,7 @@ elif [[ $(echo "${test_case}" | grep -c "tf pruning") != 0 ]]; then fi if [[ $(echo "${test_case}" | grep -c "api") != 0 ]] || [[ $(echo "${test_case}" | grep -c "adaptor") != 0 ]]; then - pip install git+https://github.com/intel/auto-round.git@b65830f3f6cb32d92a5c8ba5f80ace12d517357b + pip install git+https://github.com/intel/auto-round.git@6815f8b66be456ecbef2d0beb33dbc4efeefdc04 fi # test deps diff --git a/test/requirements.txt b/test/requirements.txt index cc0205b2d72..ca603fd7afc 100644 --- a/test/requirements.txt +++ b/test/requirements.txt @@ -1,7 +1,7 @@ --find-links https://download.pytorch.org/whl/torch_stable.html accelerate==0.21.0 dynast==1.6.0rc1 -git+https://github.com/intel/auto-round.git@b65830f3f6cb32d92a5c8ba5f80ace12d517357b +git+https://github.com/intel/auto-round.git@6815f8b66be456ecbef2d0beb33dbc4efeefdc04 horovod 
 intel-extension-for-pytorch
 intel-tensorflow>=2.12.0
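
---
For reference, a minimal sketch of the quantization flow this series leaves in place. It assumes the fixtures from the touched tests (`fp32_model` and `dataloader` stand in for the GPT-J model and calibration dataloader built there), and the exact `export_compressed_model()` arguments are not spelled out in this diff:

    from neural_compressor import PostTrainingQuantConfig, quantization

    # "amp", "device", and "export_args" are no longer passed via "autoround_args":
    # AutoRound now detects the device/AMP mode itself, and ITREX-format packing
    # is done afterwards through INCModel.export_compressed_model(), which calls
    # auto_round's _pack_model() under the hood.
    conf = PostTrainingQuantConfig(
        approach="weight_only",
        op_type_dict={
            ".*": {  # re.match
                "weight": {
                    "dtype": "int",
                    "bits": 4,
                    "group_size": 32,
                    "scheme": "sym",
                    "algorithm": "AUTOROUND",
                },
            },
        },
        recipes={
            "autoround_args": {
                "n_samples": 20,
                "seq_len": 10,
                "iters": 10,
                "scale_dtype": "fp32",
            },
        },
    )
    # fp32_model and dataloader are assumed test fixtures, not defined here.
    q_model = quantization.fit(fp32_model, conf, calib_dataloader=dataloader)
    # Exact kwargs for the autoround path are not shown in this diff; see the
    # retained test_AutoRound_quant test for the invocation used in CI.
    export_model = q_model.export_compressed_model()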