Commit
Remove gptq_debug options in examples (#1569)
Signed-off-by: YIYANGCAI <[email protected]>
Signed-off-by: chensuyue <[email protected]>
YIYANGCAI authored Jan 28, 2024
1 parent b08725a commit f9bc76b
Showing 7 changed files with 25 additions and 76 deletions.
16 changes: 1 addition & 15 deletions examples/.config/model_params_pytorch.json
@@ -492,13 +492,6 @@
"main_script": "run_clm_no_trainer.py",
"batch_size": 8
},
"opt_125m_woq_gptq_debug_int4":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/llm",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 8
},
"opt_125m_woq_teq":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/llm",
"dataset_location": "",
@@ -583,13 +576,6 @@
"main_script": "run_clm_no_trainer.py",
"batch_size": 1
},
"gpt_j_woq_gptq_debug_int4":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/llm",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 1
},
"gpt_j_woq_gptq_int4":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/llm",
"dataset_location": "",
@@ -618,7 +604,7 @@
"main_script": "run_clm_no_trainer.py",
"batch_size": 1
},
"falcon_7b_woq_gptq_debug_int4":{
"falcon_7b_woq_gptq_int4":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/llm",
"dataset_location": "",
"input_model": "",
@@ -61,10 +61,9 @@ python run_clm_no_trainer.py \
--woq_scheme asym \
--woq_group_size 128 \
--gptq_pad_max_length 2048 \
- --gptq_use_max_length \
- --gptq_debug
+ --gptq_use_max_length
```
- **Notes**: Weight-only quantization based on fake quantization is previewly supported and supports RTN, GPTQ[1], AWQ[2], TEQ algorithms. For more details, please refer to [link](https://github.com/intel/neural-compressor/blob/master/docs/source/quantization_weight_only.md). Our GPTQ API support various CLMs including GPTJ, OPTs, Blooms, Llamas, Falcons, MPTs, ChatGLMs, etc. Simply replace the "--model" argument with other models to quantize different CLMs with GPTQ.
+ **Notes**: Weight-only quantization based on fake quantization is supported in preview, including RTN, GPTQ[1], AWQ[2], TEQ algorithms. For more details, please refer to [link](https://github.com/intel/neural-compressor/blob/master/docs/source/quantization_weight_only.md). Our GPTQ API support various CLMs including GPTJ, OPTs, Blooms, Llamas, Falcons, MPTs, ChatGLMs, etc. Simply replace the "--model" argument with other models to quantize different CLMs with GPTQ.


#### Accuracy with lm_eval
@@ -111,8 +110,7 @@ python run_clm_no_trainer.py \
--woq_scheme asym \
--woq_group_size 128 \
--gptq_pad_max_length 2048 \
- --gptq_use_max_length \
- --gptq_debug
+ --gptq_use_max_length
```

#### Accuracy with lm_eval
@@ -158,8 +156,7 @@ python run_clm_no_trainer.py \
--woq_scheme asym \
--woq_group_size 128 \
--gptq_pad_max_length 2048 \
- --gptq_use_max_length \
- --gptq_debug
+ --gptq_use_max_length
```

#### Accuracy with lm_eval
@@ -202,8 +199,7 @@ python run_clm_no_trainer.py \
--woq_scheme asym \
--woq_group_size 128 \
--gptq_pad_max_length 2048 \
- --gptq_use_max_length \
- --gptq_debug
+ --gptq_use_max_length
```
#### Accuracy with lm_eval
```bash
@@ -244,8 +240,7 @@ python run_clm_no_trainer.py \
--woq_scheme asym \
--woq_group_size 128 \
--gptq_pad_max_length 2048 \
- --gptq_use_max_length \
- --gptq_debug
+ --gptq_use_max_length
```
#### Accuracy with lm_eval
```bash
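The **Notes** paragraph above covers the weight-only GPTQ flow that remains once `--gptq_debug` is dropped. For orientation only, here is a minimal sketch of how those README commands map onto the INC 2.x Python API inside `run_clm_no_trainer.py`; the exact recipe key and the helper names (`user_model`, `calib_dataloader`) are assumptions based on the 2.x `PostTrainingQuantConfig` examples, not text taken from this commit.

```python
from neural_compressor import PostTrainingQuantConfig, quantization

# Sketch: weight-only GPTQ roughly matching
#   --woq_algo GPTQ --woq_bits 4 --woq_scheme asym --woq_group_size 128 --gptq_use_max_length
conf = PostTrainingQuantConfig(
    approach="weight_only",
    op_type_dict={
        ".*": {                      # apply to every quantizable op
            "weight": {
                "bits": [4],
                "group_size": [128],
                "scheme": ["asym"],
                "algorithm": ["GPTQ"],
            },
        },
    },
    recipes={
        "gptq_args": {               # assumed recipe key, mirroring the recipes dict shown further down in this diff
            "use_max_length": True,  # pack calibration samples up to pad_max_length tokens
            "pad_max_length": 2048,
        },
    },
)
# user_model and calib_dataloader come from the example's own helpers.
q_model = quantization.fit(user_model, conf, calib_dataloader=calib_dataloader)
q_model.save("./saved_results")
```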
@@ -79,10 +79,10 @@ function run_benchmark {
model_name_or_path="facebook/opt-125m"
approach="weight_only"
extra_cmd=$extra_cmd" --woq_algo GPTQ"
elif [ "${topology}" = "opt_125m_woq_gptq_debug_int4" ]; then
elif [ "${topology}" = "opt_125m_woq_gptq_int4" ]; then
model_name_or_path="facebook/opt-125m"
approach="weight_only"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_scheme asym --woq_group_size 128 --gptq_use_max_length --gptq_debug"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_scheme asym --woq_group_size 128 --gptq_use_max_length"
elif [ "${topology}" = "opt_125m_woq_teq" ]; then
model_name_or_path="facebook/opt-125m"
approach="weight_only"
@@ -106,17 +106,17 @@ function run_benchmark {
model_name_or_path="EleutherAI/gpt-j-6b"
approach="weight_only"
extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_enable_mse_search"
elif [ "${topology}" = "gpt_j_woq_gptq_debug_int4" ]; then
elif [ "${topology}" = "gpt_j_woq_gptq_int4" ]; then
model_name_or_path="EleutherAI/gpt-j-6b"
approach="weight_only"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --gptq_use_max_length --gptq_debug"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --gptq_use_max_length"
elif [ "${topology}" = "falcon_7b_sq" ]; then
model_name_or_path="tiiuae/falcon-7b-instruct"
extra_cmd=$extra_cmd" --sq --alpha 0.5"
elif [ "${topology}" = "falcon_7b_woq_gptq_debug_int4" ]; then
elif [ "${topology}" = "falcon_7b_woq_gptq_int4" ]; then
model_name_or_path="tiiuae/falcon-7b-instruct"
approach="weight_only"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --gptq_use_max_length --gptq_debug"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --gptq_use_max_length"
fi

python -u run_clm_no_trainer.py \
@@ -77,7 +77,6 @@
parser.add_argument('--gptq_pad_max_length', type=int, default=2048, help='Calibration dataset sequence max length, \
this should align with your model config, \
and your dataset builder args: args.pad_max_length')
- parser.add_argument('--gptq_debug', action='store_true', help='Whether to use debug model ')
parser.add_argument('--gptq_static_groups', action='store_true', help='Use determined group to do quantization')
# ==============code generation args===========
parser.add_argument("--code_generation", action="store_true")
@@ -292,35 +291,6 @@ def calib_func(prepared_model):
op_name_dict=op_name_dict,
recipes=recipes,
)

- # for test on various models, keep the code of directly call gptq_quantize
- if args.gptq_debug:
-
-     from neural_compressor.adaptor.torch_utils.weight_only import gptq_quantize
-
-     gptq_conf = {
-         ".*": {
-             'wbits': args.woq_bits, # 1-8 bits
-             'group_size': args.woq_group_size, # -1 (per-channel)
-             'sym': (args.woq_scheme == "sym"),
-             'act_order': args.gptq_actorder,
-             'static_groups': args.gptq_static_groups,
-         }
-     }
-     q_model_gptq_debug, gptq_config = gptq_quantize(
-         user_model,
-         weight_config=gptq_conf,
-         dataloader=calib_dataloader,
-         nsamples=args.gptq_nsamples,
-         use_max_length=args.gptq_use_max_length,
-         pad_max_length=args.gptq_pad_max_length,
-     )
-
-     # save the fake quantized model
-     os.makedirs(args.output_dir, exist_ok=True)
-     torch.save(q_model_gptq_debug, os.path.join(args.output_dir, "gptq_best_model.pt"))
-     exit(0)

else:
if re.search("gpt", user_model.config.model_type):
op_type_dict = {
@@ -371,12 +341,9 @@ def eval_func(model):
if args.ipex:
user_model = load(os.path.abspath(os.path.expanduser(args.output_dir)))
else:
- if args.gptq_debug:
-     user_model = torch.load(os.path.join(args.output_dir, "gptq_best_model.pt"))
- else:
-     user_model, _ = get_user_model()
-     kwargs = {'weight_only': True} if args.approach == 'weight_only' else {}
-     user_model = load(os.path.abspath(os.path.expanduser(args.output_dir)), user_model, **kwargs)
+ user_model, _ = get_user_model()
+ kwargs = {'weight_only': True} if args.approach == 'weight_only' else {}
+ user_model = load(os.path.abspath(os.path.expanduser(args.output_dir)), user_model, **kwargs)
else:
user_model, _ = get_user_model()

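With the `--gptq_debug` reload branch removed, evaluation always rebuilds the fp32 model and restores the saved weight-only checkpoint through neural_compressor's load utility, as the surviving `+` lines above show. A minimal sketch of that path, assuming the example's `get_user_model()` helper and the `neural_compressor.utils.pytorch.load` import used by the INC 2.x examples:

```python
import os
from neural_compressor.utils.pytorch import load  # assumed import path, as in the INC 2.x examples

# Rebuild the fp32 model, then restore the weight-only quantized state
# saved earlier by quantization.fit(...).save(output_dir).
user_model, _ = get_user_model()                   # helper defined in run_clm_no_trainer.py
kwargs = {"weight_only": True}                     # only set for the weight_only approach
user_model = load(os.path.abspath(os.path.expanduser("./saved_results")), user_model, **kwargs)
```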
@@ -315,7 +315,7 @@ def forward(self, *inp, **kwargs):
'percdamp': 0.01,
'act_order':args.act_order,
'block_size': args.block_size,
- 'nsampeles': args.nsamples,
+ 'nsamples': args.nsamples,
'use_max_length': args.use_max_length,
'pad_max_length': args.pad_max_length
},
@@ -50,10 +50,10 @@ function run_tuning {
model_name_or_path="facebook/opt-125m"
approach="weight_only"
extra_cmd=$extra_cmd" --woq_algo GPTQ"
elif [ "${topology}" = "opt_125m_woq_gptq_debug_int4" ]; then
elif [ "${topology}" = "opt_125m_woq_gptq_int4" ]; then
model_name_or_path="facebook/opt-125m"
approach="weight_only"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_scheme asym --woq_group_size 128 --gptq_use_max_length --gptq_debug"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_scheme asym --woq_group_size 128 --gptq_use_max_length"
elif [ "${topology}" = "opt_125m_woq_teq" ]; then
model_name_or_path="facebook/opt-125m"
approach="weight_only"
@@ -77,17 +77,17 @@ function run_tuning {
model_name_or_path="EleutherAI/gpt-j-6b"
approach="weight_only"
extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_enable_mse_search"
elif [ "${topology}" = "gpt_j_woq_gptq_debug_int4" ]; then
elif [ "${topology}" = "gpt_j_woq_gptq_int4" ]; then
model_name_or_path="EleutherAI/gpt-j-6b"
approach="weight_only"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --gptq_use_max_length --gptq_debug"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --gptq_use_max_length"
elif [ "${topology}" = "falcon_7b_sq" ]; then
model_name_or_path="tiiuae/falcon-7b-instruct"
extra_cmd=$extra_cmd" --sq --alpha 0.5"
elif [ "${topology}" = "falcon_7b_woq_gptq_debug_int4" ]; then
elif [ "${topology}" = "falcon_7b_woq_gptq_int4" ]; then
model_name_or_path="tiiuae/falcon-7b-instruct"
approach="weight_only"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --gptq_use_max_length --gptq_debug"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --gptq_use_max_length"
fi

python -u run_clm_no_trainer.py \
3 changes: 2 additions & 1 deletion neural_compressor/adaptor/torch_utils/gptq.py
@@ -133,7 +133,8 @@ def find_layers(module, layers=[nn.Conv2d, nn.Conv1d, nn.Linear, transformers.Co
return {name: module}
else:
# use string type to find name:
- if type(module).__name__ in ["Linear"]:
+ # if type(module).__name__ in ["Linear"]:
+ if isinstance(module, (nn.Conv2d, nn.Conv1d, nn.Linear, transformers.Conv1D)):
return {name: module}
else:
pass
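The `find_layers` change above swaps a literal class-name check for `isinstance`, so modules such as `transformers.Conv1D` (used by GPT-2 style blocks, and referenced in gptq.py) and the convolution types are now matched too, not only modules whose class is named exactly `Linear`. A small, self-contained illustration of the difference; the constructor arguments are only for demonstration:

```python
import torch.nn as nn
import transformers

layer = transformers.Conv1D(768, 768)  # Hugging Face Conv1D(nf, nx), e.g. GPT-2 attention projections

# Old check: only the literal class name "Linear" matched, so this layer was skipped.
print(type(layer).__name__ in ["Linear"])                                          # False

# New check: every supported layer type is matched and therefore quantized.
print(isinstance(layer, (nn.Conv2d, nn.Conv1d, nn.Linear, transformers.Conv1D)))   # True
```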
