From f9bc76bebee7e3ceae0615b2f99566c8bcb7c0fa Mon Sep 17 00:00:00 2001
From: Yiyang Cai <49231152+YIYANGCAI@users.noreply.github.com>
Date: Sun, 28 Jan 2024 20:55:46 +0800
Subject: [PATCH] Remove gptq_debug options in examples (#1569)

Signed-off-by: YIYANGCAI
Signed-off-by: chensuyue
---
 examples/.config/model_params_pytorch.json     | 16 +-------
 .../quantization/llm/README.md                 | 17 +++-----
 .../quantization/llm/run_benchmark.sh          | 12 +++---
 .../quantization/llm/run_clm_no_trainer.py     | 39 ++-----------------
 .../quantization/llm/run_gptj_mlperf_int4.py   |  2 +-
 .../quantization/llm/run_quant.sh              | 12 +++---
 neural_compressor/adaptor/torch_utils/gptq.py  |  3 +-
 7 files changed, 25 insertions(+), 76 deletions(-)

diff --git a/examples/.config/model_params_pytorch.json b/examples/.config/model_params_pytorch.json
index d610b4ba8b3..1532affd5c1 100644
--- a/examples/.config/model_params_pytorch.json
+++ b/examples/.config/model_params_pytorch.json
@@ -492,13 +492,6 @@
       "main_script": "run_clm_no_trainer.py",
       "batch_size": 8
     },
-    "opt_125m_woq_gptq_debug_int4":{
-      "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/llm",
-      "dataset_location": "",
-      "input_model": "",
-      "main_script": "run_clm_no_trainer.py",
-      "batch_size": 8
-    },
     "opt_125m_woq_teq":{
       "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/llm",
       "dataset_location": "",
       "input_model": "",
@@ -583,13 +576,6 @@
       "main_script": "run_clm_no_trainer.py",
       "batch_size": 1
     },
-    "gpt_j_woq_gptq_debug_int4":{
-      "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/llm",
-      "dataset_location": "",
-      "input_model": "",
-      "main_script": "run_clm_no_trainer.py",
-      "batch_size": 1
-    },
     "gpt_j_woq_gptq_int4":{
       "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/llm",
       "dataset_location": "",
       "input_model": "",
@@ -618,7 +604,7 @@
       "main_script": "run_clm_no_trainer.py",
       "batch_size": 1
     },
-    "falcon_7b_woq_gptq_debug_int4":{
+    "falcon_7b_woq_gptq_int4":{
       "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/llm",
       "dataset_location": "",
       "input_model": "",
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/README.md
index 6fa558ab8ef..97240bea87a 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/README.md
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/README.md
@@ -61,10 +61,9 @@ python run_clm_no_trainer.py \
     --woq_scheme asym \
     --woq_group_size 128 \
     --gptq_pad_max_length 2048 \
-    --gptq_use_max_length \
-    --gptq_debug
+    --gptq_use_max_length
 ```

-**Notes**: Weight-only quantization based on fake quantization is previewly supported and supports RTN, GPTQ[1], AWQ[2], TEQ algorithms. For more details, please refer to [link](https://github.com/intel/neural-compressor/blob/master/docs/source/quantization_weight_only.md). Our GPTQ API support various CLMs including GPTJ, OPTs, Blooms, Llamas, Falcons, MPTs, ChatGLMs, etc. Simply replace the "--model" argument with other models to quantize different CLMs with GPTQ.
+**Notes**: Weight-only quantization based on fake quantization is supported in preview, including the RTN, GPTQ[1], AWQ[2], and TEQ algorithms. For more details, please refer to [link](https://github.com/intel/neural-compressor/blob/master/docs/source/quantization_weight_only.md). Our GPTQ API supports various CLMs including GPTJ, OPTs, Blooms, Llamas, Falcons, MPTs, ChatGLMs, etc. Simply replace the "--model" argument with other models to quantize different CLMs with GPTQ.

 #### Accuracy with lm_eval
@@ -111,8 +110,7 @@ python run_clm_no_trainer.py \
     --woq_scheme asym \
     --woq_group_size 128 \
     --gptq_pad_max_length 2048 \
-    --gptq_use_max_length \
-    --gptq_debug
+    --gptq_use_max_length
 ```

 #### Accuracy with lm_eval
@@ -158,8 +156,7 @@ python run_clm_no_trainer.py \
     --woq_scheme asym \
     --woq_group_size 128 \
     --gptq_pad_max_length 2048 \
-    --gptq_use_max_length \
-    --gptq_debug
+    --gptq_use_max_length
 ```

 #### Accuracy with lm_eval
@@ -202,8 +199,7 @@ python run_clm_no_trainer.py \
     --woq_scheme asym \
     --woq_group_size 128 \
     --gptq_pad_max_length 2048 \
-    --gptq_use_max_length \
-    --gptq_debug
+    --gptq_use_max_length
 ```
 #### Accuracy with lm_eval
 ```bash
@@ -244,8 +240,7 @@ python run_clm_no_trainer.py \
     --woq_scheme asym \
     --woq_group_size 128 \
     --gptq_pad_max_length 2048 \
-    --gptq_use_max_length \
-    --gptq_debug
+    --gptq_use_max_length
 ```
 #### Accuracy with lm_eval
 ```bash
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_benchmark.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_benchmark.sh
index b32d1ff4dea..0277a26c79c 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_benchmark.sh
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_benchmark.sh
@@ -79,10 +79,10 @@ function run_benchmark {
         model_name_or_path="facebook/opt-125m"
         approach="weight_only"
         extra_cmd=$extra_cmd" --woq_algo GPTQ"
-    elif [ "${topology}" = "opt_125m_woq_gptq_debug_int4" ]; then
+    elif [ "${topology}" = "opt_125m_woq_gptq_int4" ]; then
         model_name_or_path="facebook/opt-125m"
         approach="weight_only"
-        extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_scheme asym --woq_group_size 128 --gptq_use_max_length --gptq_debug"
+        extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_scheme asym --woq_group_size 128 --gptq_use_max_length"
     elif [ "${topology}" = "opt_125m_woq_teq" ]; then
         model_name_or_path="facebook/opt-125m"
         approach="weight_only"
@@ -106,17 +106,17 @@ function run_benchmark {
         model_name_or_path="EleutherAI/gpt-j-6b"
         approach="weight_only"
         extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_enable_mse_search"
-    elif [ "${topology}" = "gpt_j_woq_gptq_debug_int4" ]; then
+    elif [ "${topology}" = "gpt_j_woq_gptq_int4" ]; then
         model_name_or_path="EleutherAI/gpt-j-6b"
         approach="weight_only"
-        extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --gptq_use_max_length --gptq_debug"
+        extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --gptq_use_max_length"
     elif [ "${topology}" = "falcon_7b_sq" ]; then
         model_name_or_path="tiiuae/falcon-7b-instruct"
         extra_cmd=$extra_cmd" --sq --alpha 0.5"
-    elif [ "${topology}" = "falcon_7b_woq_gptq_debug_int4" ]; then
+    elif [ "${topology}" = "falcon_7b_woq_gptq_int4" ]; then
         model_name_or_path="tiiuae/falcon-7b-instruct"
         approach="weight_only"
-        extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --gptq_use_max_length --gptq_debug"
+        extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --gptq_use_max_length"
     fi

     python -u run_clm_no_trainer.py \
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py
index fc3799f5fd9..74a31bfdb35 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py
@@ -77,7 +77,6 @@
 parser.add_argument('--gptq_pad_max_length', type=int, default=2048, help='Calibration dataset sequence max length, \
                                                                            this should align with your model config, \
                                                                            and your dataset builder args: args.pad_max_length')
-parser.add_argument('--gptq_debug', action='store_true', help='Whether to use debug model ')
 parser.add_argument('--gptq_static_groups', action='store_true', help='Use determined group to do quantization')
 # ==============code generation args===========
 parser.add_argument("--code_generation", action="store_true")
@@ -292,35 +291,6 @@ def calib_func(prepared_model):
             op_name_dict=op_name_dict,
             recipes=recipes,
         )
-
-        # for test on various models, keep the code of directly call gptq_quantize
-        if args.gptq_debug:
-
-            from neural_compressor.adaptor.torch_utils.weight_only import gptq_quantize
-
-            gptq_conf = {
-                ".*": {
-                    'wbits': args.woq_bits,  # 1-8 bits
-                    'group_size': args.woq_group_size,  # -1 (per-channel)
-                    'sym': (args.woq_scheme == "sym"),
-                    'act_order': args.gptq_actorder,
-                    'static_groups': args.gptq_static_groups,
-                }
-            }
-            q_model_gptq_debug, gptq_config = gptq_quantize(
-                user_model,
-                weight_config=gptq_conf,
-                dataloader=calib_dataloader,
-                nsamples=args.gptq_nsamples,
-                use_max_length=args.gptq_use_max_length,
-                pad_max_length=args.gptq_pad_max_length,
-            )
-
-            # save the fake quantized model
-            os.makedirs(args.output_dir, exist_ok=True)
-            torch.save(q_model_gptq_debug, os.path.join(args.output_dir, "gptq_best_model.pt"))
-            exit(0)
-
-    else:
         if re.search("gpt", user_model.config.model_type):
             op_type_dict = {
@@ -371,12 +341,9 @@ def eval_func(model):
     if args.ipex:
         user_model = load(os.path.abspath(os.path.expanduser(args.output_dir)))
     else:
-        if args.gptq_debug:
-            user_model = torch.load(os.path.join(args.output_dir, "gptq_best_model.pt"))
-        else:
-            user_model, _ = get_user_model()
-            kwargs = {'weight_only': True} if args.approach == 'weight_only' else {}
-            user_model = load(os.path.abspath(os.path.expanduser(args.output_dir)), user_model, **kwargs)
+        user_model, _ = get_user_model()
+        kwargs = {'weight_only': True} if args.approach == 'weight_only' else {}
+        user_model = load(os.path.abspath(os.path.expanduser(args.output_dir)), user_model, **kwargs)
 else:
     user_model, _ = get_user_model()
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_gptj_mlperf_int4.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_gptj_mlperf_int4.py
index fa50e1ca192..e2172f54a56 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_gptj_mlperf_int4.py
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_gptj_mlperf_int4.py
@@ -315,7 +315,7 @@ def forward(self, *inp, **kwargs):
             'percdamp': 0.01,
             'act_order':args.act_order,
             'block_size': args.block_size,
-            'nsampeles': args.nsamples,
+            'nsamples': args.nsamples,
             'use_max_length': args.use_max_length,
             'pad_max_length': args.pad_max_length
         },
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_quant.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_quant.sh
index 179cb3cec22..e4986785d10 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_quant.sh
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_quant.sh
@@ -50,10 +50,10 @@ function run_tuning {
         model_name_or_path="facebook/opt-125m"
         approach="weight_only"
         extra_cmd=$extra_cmd" --woq_algo GPTQ"
-    elif [ "${topology}" = "opt_125m_woq_gptq_debug_int4" ]; then
+    elif [ "${topology}" = "opt_125m_woq_gptq_int4" ]; then
         model_name_or_path="facebook/opt-125m"
         approach="weight_only"
-        extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_scheme asym --woq_group_size 128 --gptq_use_max_length --gptq_debug"
+        extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_scheme asym --woq_group_size 128 --gptq_use_max_length"
     elif [ "${topology}" = "opt_125m_woq_teq" ]; then
         model_name_or_path="facebook/opt-125m"
         approach="weight_only"
@@ -77,17 +77,17 @@ function run_tuning {
         model_name_or_path="EleutherAI/gpt-j-6b"
         approach="weight_only"
         extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_enable_mse_search"
-    elif [ "${topology}" = "gpt_j_woq_gptq_debug_int4" ]; then
+    elif [ "${topology}" = "gpt_j_woq_gptq_int4" ]; then
         model_name_or_path="EleutherAI/gpt-j-6b"
         approach="weight_only"
-        extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --gptq_use_max_length --gptq_debug"
+        extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --gptq_use_max_length"
     elif [ "${topology}" = "falcon_7b_sq" ]; then
         model_name_or_path="tiiuae/falcon-7b-instruct"
         extra_cmd=$extra_cmd" --sq --alpha 0.5"
-    elif [ "${topology}" = "falcon_7b_woq_gptq_debug_int4" ]; then
+    elif [ "${topology}" = "falcon_7b_woq_gptq_int4" ]; then
         model_name_or_path="tiiuae/falcon-7b-instruct"
         approach="weight_only"
-        extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --gptq_use_max_length --gptq_debug"
+        extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --gptq_use_max_length"
     fi

     python -u run_clm_no_trainer.py \
diff --git a/neural_compressor/adaptor/torch_utils/gptq.py b/neural_compressor/adaptor/torch_utils/gptq.py
index 371150b779c..7b0ee1c35f7 100644
--- a/neural_compressor/adaptor/torch_utils/gptq.py
+++ b/neural_compressor/adaptor/torch_utils/gptq.py
@@ -133,7 +133,8 @@ def find_layers(module, layers=[nn.Conv2d, nn.Conv1d, nn.Linear, transformers.Co
         return {name: module}
     else:
         # use string type to find name:
-        if type(module).__name__ in ["Linear"]:
+        # if type(module).__name__ in ["Linear"]:
+        if isinstance(module, (nn.Conv2d, nn.Conv1d, nn.Linear, transformers.Conv1D)):
             return {name: module}
         else:
             pass
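The final hunk switches `find_layers` from comparing `type(module).__name__` against the string "Linear" to an `isinstance` check, so subclasses of the supported layer types are also picked up. Below is a minimal, self-contained sketch of that discovery pattern; the helper name is illustrative, and `transformers.Conv1D` is left out so the snippet depends only on PyTorch.

```python
import torch.nn as nn

def collect_quantizable_layers(module, prefix=""):
    """Recursively map qualified names to layers that GPTQ-style algorithms can quantize."""
    # isinstance() also matches subclasses of nn.Linear, which the old
    # type(module).__name__ in ["Linear"] string check silently skipped.
    if isinstance(module, (nn.Conv1d, nn.Conv2d, nn.Linear)):
        return {prefix: module}
    found = {}
    for child_name, child in module.named_children():
        qualified = f"{prefix}.{child_name}" if prefix else child_name
        found.update(collect_quantizable_layers(child, qualified))
    return found

# Example: the two Linear layers inside a small block are discovered.
block = nn.Sequential(nn.Linear(16, 16), nn.ReLU(), nn.Linear(16, 4))
print(collect_quantizable_layers(block).keys())  # dict_keys(['0', '2'])
```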
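With the `--gptq_debug` shortcut removed, the example scripts rely only on the standard `PostTrainingQuantConfig` / `quantization.fit` weight-only path that remains visible in the retained context of `run_clm_no_trainer.py`. The sketch below shows roughly how the deleted direct `gptq_quantize` call maps onto that path; the helper function and the exact weight-config/recipe keys are assumptions inferred from the flags in this patch (`--woq_bits`, `--woq_group_size`, `--woq_scheme`, `--gptq_use_max_length`, `--gptq_pad_max_length`), not an authoritative API reference.

```python
# A hedged sketch of the standard weight-only GPTQ flow, assuming an INC 2.x API.
from neural_compressor import PostTrainingQuantConfig, quantization

def gptq_weight_only_quantize(model, calib_dataloader, output_dir="saved_results"):
    conf = PostTrainingQuantConfig(
        approach="weight_only",
        op_name_dict={
            ".*": {  # match every quantizable op by name
                "weight": {
                    "bits": 4,            # --woq_bits
                    "group_size": 128,    # --woq_group_size
                    "scheme": "asym",     # --woq_scheme
                    "algorithm": "GPTQ",  # --woq_algo
                },
            },
        },
        recipes={
            "gptq_args": {                # key names assumed from the example flags
                "act_order": False,       # --gptq_actorder
                "use_max_length": True,   # --gptq_use_max_length
                "pad_max_length": 2048,   # --gptq_pad_max_length
            },
        },
    )
    # Calibrate and quantize through the unified entry point instead of
    # calling gptq_quantize() directly as the removed debug branch did.
    q_model = quantization.fit(model, conf, calib_dataloader=calib_dataloader)
    q_model.save(output_dir)  # replaces the torch.save(...) of the debug path
    return q_model
```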