Commit
Remove gptq_debug options in examples (#1569)
Signed-off-by: YIYANGCAI <[email protected]>
Signed-off-by: chensuyue <[email protected]>
YIYANGCAI authored Jan 28, 2024
1 parent b08725a commit f9bc76b
Showing 7 changed files with 25 additions and 76 deletions.
16 changes: 1 addition & 15 deletions examples/.config/model_params_pytorch.json
@@ -492,13 +492,6 @@
"main_script": "run_clm_no_trainer.py",
"batch_size": 8
},
"opt_125m_woq_gptq_debug_int4":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/llm",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 8
},
"opt_125m_woq_teq":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/llm",
"dataset_location": "",
@@ -583,13 +576,6 @@
"main_script": "run_clm_no_trainer.py",
"batch_size": 1
},
"gpt_j_woq_gptq_debug_int4":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/llm",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 1
},
"gpt_j_woq_gptq_int4":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/llm",
"dataset_location": "",
@@ -618,7 +604,7 @@
"main_script": "run_clm_no_trainer.py",
"batch_size": 1
},
"falcon_7b_woq_gptq_debug_int4":{
"falcon_7b_woq_gptq_int4":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/llm",
"dataset_location": "",
"input_model": "",
@@ -61,10 +61,9 @@ python run_clm_no_trainer.py \
--woq_scheme asym \
--woq_group_size 128 \
--gptq_pad_max_length 2048 \
- --gptq_use_max_length \
- --gptq_debug
+ --gptq_use_max_length
```
- **Notes**: Weight-only quantization based on fake quantization is previewly supported and supports RTN, GPTQ[1], AWQ[2], TEQ algorithms. For more details, please refer to [link](https://github.com/intel/neural-compressor/blob/master/docs/source/quantization_weight_only.md). Our GPTQ API support various CLMs including GPTJ, OPTs, Blooms, Llamas, Falcons, MPTs, ChatGLMs, etc. Simply replace the "--model" argument with other models to quantize different CLMs with GPTQ.
+ **Notes**: Weight-only quantization based on fake quantization is supported in preview, including RTN, GPTQ[1], AWQ[2], TEQ algorithms. For more details, please refer to [link](https://github.com/intel/neural-compressor/blob/master/docs/source/quantization_weight_only.md). Our GPTQ API support various CLMs including GPTJ, OPTs, Blooms, Llamas, Falcons, MPTs, ChatGLMs, etc. Simply replace the "--model" argument with other models to quantize different CLMs with GPTQ.


#### Accuracy with lm_eval
@@ -111,8 +110,7 @@ python run_clm_no_trainer.py \
--woq_scheme asym \
--woq_group_size 128 \
--gptq_pad_max_length 2048 \
- --gptq_use_max_length \
- --gptq_debug
+ --gptq_use_max_length
```

#### Accuracy with lm_eval
@@ -158,8 +156,7 @@ python run_clm_no_trainer.py \
--woq_scheme asym \
--woq_group_size 128 \
--gptq_pad_max_length 2048 \
- --gptq_use_max_length \
- --gptq_debug
+ --gptq_use_max_length
```

#### Accuracy with lm_eval
@@ -202,8 +199,7 @@ python run_clm_no_trainer.py \
--woq_scheme asym \
--woq_group_size 128 \
--gptq_pad_max_length 2048 \
- --gptq_use_max_length \
- --gptq_debug
+ --gptq_use_max_length
```
#### Accuracy with lm_eval
```bash
@@ -244,8 +240,7 @@ python run_clm_no_trainer.py \
--woq_scheme asym \
--woq_group_size 128 \
--gptq_pad_max_length 2048 \
- --gptq_use_max_length \
- --gptq_debug
+ --gptq_use_max_length
```
#### Accuracy with lm_eval
```bash
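The **Notes** paragraph above covers the weight-only GPTQ flow that remains once `--gptq_debug` is dropped. For orientation only, here is a minimal sketch of how those README commands map onto the INC 2.x Python API inside `run_clm_no_trainer.py`; the exact recipe key and the helper names (`user_model`, `calib_dataloader`) are assumptions based on the 2.x `PostTrainingQuantConfig` examples, not text taken from this commit.

```python
from neural_compressor import PostTrainingQuantConfig, quantization

# Sketch: weight-only GPTQ roughly matching
#   --woq_algo GPTQ --woq_bits 4 --woq_scheme asym --woq_group_size 128 --gptq_use_max_length
conf = PostTrainingQuantConfig(
    approach="weight_only",
    op_type_dict={
        ".*": {                      # apply to every quantizable op
            "weight": {
                "bits": [4],
                "group_size": [128],
                "scheme": ["asym"],
                "algorithm": ["GPTQ"],
            },
        },
    },
    recipes={
        "gptq_args": {               # assumed recipe key, mirroring the recipes dict shown further down in this diff
            "use_max_length": True,  # pack calibration samples up to pad_max_length tokens
            "pad_max_length": 2048,
        },
    },
)
# user_model and calib_dataloader come from the example's own helpers.
q_model = quantization.fit(user_model, conf, calib_dataloader=calib_dataloader)
q_model.save("./saved_results")
```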
@@ -79,10 +79,10 @@ function run_benchmark {
model_name_or_path="facebook/opt-125m"
approach="weight_only"
extra_cmd=$extra_cmd" --woq_algo GPTQ"
elif [ "${topology}" = "opt_125m_woq_gptq_debug_int4" ]; then
elif [ "${topology}" = "opt_125m_woq_gptq_int4" ]; then
model_name_or_path="facebook/opt-125m"
approach="weight_only"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_scheme asym --woq_group_size 128 --gptq_use_max_length --gptq_debug"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_scheme asym --woq_group_size 128 --gptq_use_max_length"
elif [ "${topology}" = "opt_125m_woq_teq" ]; then
model_name_or_path="facebook/opt-125m"
approach="weight_only"
@@ -106,17 +106,17 @@ function run_benchmark {
model_name_or_path="EleutherAI/gpt-j-6b"
approach="weight_only"
extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_enable_mse_search"
elif [ "${topology}" = "gpt_j_woq_gptq_debug_int4" ]; then
elif [ "${topology}" = "gpt_j_woq_gptq_int4" ]; then
model_name_or_path="EleutherAI/gpt-j-6b"
approach="weight_only"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --gptq_use_max_length --gptq_debug"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --gptq_use_max_length"
elif [ "${topology}" = "falcon_7b_sq" ]; then
model_name_or_path="tiiuae/falcon-7b-instruct"
extra_cmd=$extra_cmd" --sq --alpha 0.5"
elif [ "${topology}" = "falcon_7b_woq_gptq_debug_int4" ]; then
elif [ "${topology}" = "falcon_7b_woq_gptq_int4" ]; then
model_name_or_path="tiiuae/falcon-7b-instruct"
approach="weight_only"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --gptq_use_max_length --gptq_debug"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --gptq_use_max_length"
fi

python -u run_clm_no_trainer.py \
@@ -77,7 +77,6 @@
parser.add_argument('--gptq_pad_max_length', type=int, default=2048, help='Calibration dataset sequence max length, \
this should align with your model config, \
and your dataset builder args: args.pad_max_length')
- parser.add_argument('--gptq_debug', action='store_true', help='Whether to use debug model ')
parser.add_argument('--gptq_static_groups', action='store_true', help='Use determined group to do quantization')
# ==============code generation args===========
parser.add_argument("--code_generation", action="store_true")
@@ -292,35 +291,6 @@ def calib_func(prepared_model):
op_name_dict=op_name_dict,
recipes=recipes,
)

- # for test on various models, keep the code of directly call gptq_quantize
- if args.gptq_debug:
-
-     from neural_compressor.adaptor.torch_utils.weight_only import gptq_quantize
-
-     gptq_conf = {
-         ".*": {
-             'wbits': args.woq_bits, # 1-8 bits
-             'group_size': args.woq_group_size, # -1 (per-channel)
-             'sym': (args.woq_scheme == "sym"),
-             'act_order': args.gptq_actorder,
-             'static_groups': args.gptq_static_groups,
-         }
-     }
-     q_model_gptq_debug, gptq_config = gptq_quantize(
-         user_model,
-         weight_config=gptq_conf,
-         dataloader=calib_dataloader,
-         nsamples=args.gptq_nsamples,
-         use_max_length=args.gptq_use_max_length,
-         pad_max_length=args.gptq_pad_max_length,
-     )
-
-     # save the fake quantized model
-     os.makedirs(args.output_dir, exist_ok=True)
-     torch.save(q_model_gptq_debug, os.path.join(args.output_dir, "gptq_best_model.pt"))
-     exit(0)

else:
if re.search("gpt", user_model.config.model_type):
op_type_dict = {
@@ -371,12 +341,9 @@ def eval_func(model):
if args.ipex:
user_model = load(os.path.abspath(os.path.expanduser(args.output_dir)))
else:
- if args.gptq_debug:
-     user_model = torch.load(os.path.join(args.output_dir, "gptq_best_model.pt"))
- else:
-     user_model, _ = get_user_model()
-     kwargs = {'weight_only': True} if args.approach == 'weight_only' else {}
-     user_model = load(os.path.abspath(os.path.expanduser(args.output_dir)), user_model, **kwargs)
+ user_model, _ = get_user_model()
+ kwargs = {'weight_only': True} if args.approach == 'weight_only' else {}
+ user_model = load(os.path.abspath(os.path.expanduser(args.output_dir)), user_model, **kwargs)
else:
user_model, _ = get_user_model()

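With the `--gptq_debug` reload branch removed, evaluation always rebuilds the fp32 model and restores the saved weight-only checkpoint through neural_compressor's load utility, as the surviving `+` lines above show. A minimal sketch of that path, assuming the example's `get_user_model()` helper and the `neural_compressor.utils.pytorch.load` import used by the INC 2.x examples:

```python
import os
from neural_compressor.utils.pytorch import load  # assumed import path, as in the INC 2.x examples

# Rebuild the fp32 model, then restore the weight-only quantized state
# saved earlier by quantization.fit(...).save(output_dir).
user_model, _ = get_user_model()                   # helper defined in run_clm_no_trainer.py
kwargs = {"weight_only": True}                     # only set for the weight_only approach
user_model = load(os.path.abspath(os.path.expanduser("./saved_results")), user_model, **kwargs)
```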
@@ -315,7 +315,7 @@ def forward(self, *inp, **kwargs):
'percdamp': 0.01,
'act_order':args.act_order,
'block_size': args.block_size,
- 'nsampeles': args.nsamples,
+ 'nsamples': args.nsamples,
'use_max_length': args.use_max_length,
'pad_max_length': args.pad_max_length
},
@@ -50,10 +50,10 @@ function run_tuning {
model_name_or_path="facebook/opt-125m"
approach="weight_only"
extra_cmd=$extra_cmd" --woq_algo GPTQ"
elif [ "${topology}" = "opt_125m_woq_gptq_debug_int4" ]; then
elif [ "${topology}" = "opt_125m_woq_gptq_int4" ]; then
model_name_or_path="facebook/opt-125m"
approach="weight_only"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_scheme asym --woq_group_size 128 --gptq_use_max_length --gptq_debug"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_scheme asym --woq_group_size 128 --gptq_use_max_length"
elif [ "${topology}" = "opt_125m_woq_teq" ]; then
model_name_or_path="facebook/opt-125m"
approach="weight_only"
@@ -77,17 +77,17 @@ function run_tuning {
model_name_or_path="EleutherAI/gpt-j-6b"
approach="weight_only"
extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_enable_mse_search"
elif [ "${topology}" = "gpt_j_woq_gptq_debug_int4" ]; then
elif [ "${topology}" = "gpt_j_woq_gptq_int4" ]; then
model_name_or_path="EleutherAI/gpt-j-6b"
approach="weight_only"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --gptq_use_max_length --gptq_debug"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --gptq_use_max_length"
elif [ "${topology}" = "falcon_7b_sq" ]; then
model_name_or_path="tiiuae/falcon-7b-instruct"
extra_cmd=$extra_cmd" --sq --alpha 0.5"
elif [ "${topology}" = "falcon_7b_woq_gptq_debug_int4" ]; then
elif [ "${topology}" = "falcon_7b_woq_gptq_int4" ]; then
model_name_or_path="tiiuae/falcon-7b-instruct"
approach="weight_only"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --gptq_use_max_length --gptq_debug"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --gptq_use_max_length"
fi

python -u run_clm_no_trainer.py \
3 changes: 2 additions & 1 deletion neural_compressor/adaptor/torch_utils/gptq.py
@@ -133,7 +133,8 @@ def find_layers(module, layers=[nn.Conv2d, nn.Conv1d, nn.Linear, transformers.Co
return {name: module}
else:
# use string type to find name:
- if type(module).__name__ in ["Linear"]:
+ # if type(module).__name__ in ["Linear"]:
+ if isinstance(module, (nn.Conv2d, nn.Conv1d, nn.Linear, transformers.Conv1D)):
return {name: module}
else:
pass
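The `find_layers` change above swaps a literal class-name check for `isinstance`, so modules such as `transformers.Conv1D` (used by GPT-2 style blocks, and referenced in gptq.py) and the convolution types are now matched too, not only modules whose class is named exactly `Linear`. A small, self-contained illustration of the difference; the constructor arguments are only for demonstration:

```python
import torch.nn as nn
import transformers

layer = transformers.Conv1D(768, 768)  # Hugging Face Conv1D(nf, nx), e.g. GPT-2 attention projections

# Old check: only the literal class name "Linear" matched, so this layer was skipped.
print(type(layer).__name__ in ["Linear"])                                          # False

# New check: every supported layer type is matched and therefore quantized.
print(isinstance(layer, (nn.Conv2d, nn.Conv1d, nn.Linear, transformers.Conv1D)))   # True
```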
