Skip to content

Commit

Permalink
Modify WOQ examples structure (#1866)
Browse files Browse the repository at this point in the history
Signed-off-by: Kaihui-intel <[email protected]>
Signed-off-by: chensuyue <[email protected]>
  • Loading branch information
Kaihui-intel authored Jun 14, 2024
1 parent 498af74 commit 48c5e3a
Show file tree
Hide file tree
Showing 11 changed files with 233 additions and 305 deletions.
6 changes: 3 additions & 3 deletions .azure-pipelines/scripts/models/run_pytorch_models_trigger.sh
Original file line number Diff line number Diff line change
Expand Up @@ -53,15 +53,15 @@ elif [ "${model}" == "resnet18_fx" ]; then
tuning_cmd="bash run_quant.sh --topology=resnet18 --dataset_location=${dataset_location} --input_model=${input_model}"
benchmark_cmd="bash run_benchmark.sh --topology=resnet18 --dataset_location=${dataset_location} --mode=performance --batch_size=${batch_size} --iters=500"
elif [ "${model}" == "opt_125m_woq_gptq_int4" ]; then
model_src_dir="nlp/huggingface_models/language-modeling/quantization/llm"
model_src_dir="nlp/huggingface_models/language-modeling/quantization/weight_only"
inc_new_api=3x_pt
tuning_cmd="bash run_quant.sh --topology=opt_125m_woq_gptq_int4"
elif [ "${model}" == "opt_125m_woq_gptq_int4_dq_bnb" ]; then
model_src_dir="nlp/huggingface_models/language-modeling/quantization/llm"
model_src_dir="nlp/huggingface_models/language-modeling/quantization/weight_only"
inc_new_api=3x_pt
tuning_cmd="bash run_quant.sh --topology=opt_125m_woq_gptq_int4_dq_bnb"
elif [ "${model}" == "opt_125m_woq_gptq_int4_dq_ggml" ]; then
model_src_dir="nlp/huggingface_models/language-modeling/quantization/llm"
model_src_dir="nlp/huggingface_models/language-modeling/quantization/weight_only"
inc_new_api=3x_pt
tuning_cmd="bash run_quant.sh --topology=opt_125m_woq_gptq_int4_dq_ggml"
fi
Expand Down
2 changes: 1 addition & 1 deletion docs/3x/PT_WeightOnlyQuant.md
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,7 @@ loaded_model = load(

## Examples

Users can also refer to [examples](https://github.com/intel/neural-compressor/blob/master/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm) on how to quantize a model with WeightOnlyQuant.
Users can also refer to the [examples](https://github.com/intel/neural-compressor/blob/master/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only) to learn how to quantize a model with WeightOnlyQuant.

## Reference

Expand Down
218 changes: 151 additions & 67 deletions examples/.config/model_params_pytorch_3x.json
Original file line number Diff line number Diff line change
@@ -1,67 +1,151 @@
{
"pytorch": {
"gpt_j_ipex":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 1
},
"gpt_j_ipex_sq":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 1
},
"llama2_7b_ipex":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 1
},
"llama2_7b_ipex_sq":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 1
},
"opt_125m_ipex":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 8
},
"opt_125m_ipex_sq":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 8
},
"dlrm_ipex": {
"model_src_dir": "recommendation/dlrm/static_quant/ipex",
"dataset_location": "/mnt/local_disk3/dataset/dlrm/dlrm/input",
"input_model": "/mnt/local_disk3/dataset/dlrm/dlrm/dlrm_weight/tb00_40M.pt",
"main_script": "dlrm_s_pytorch.py",
"batch_size": 16384
},
"resnet18_pt2e_static":{
"model_src_dir": "cv/static_quant",
"dataset_location": "/tf_dataset/pytorch/ImageNet/raw",
"input_model": "",
"main_script": "main.py",
"batch_size": 1
},
"opt_125m_pt2e_static":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 1
}
}
}
{
"pytorch": {
"opt_125m_woq_gptq_int4":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 1
},
"opt_125m_woq_gptq_int4_dq_bnb":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 1
},
"opt_125m_woq_gptq_int4_dq_ggml":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 8
},
"llama2_7b_gptq_int4":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 8
},
"llama2_7b_gptq_int4_dq_bnb":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 8
},
"llama2_7b_gptq_int4_dq_ggml":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 8
},
"gpt_j_woq_rtn_int4":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 8
},
"gpt_j_woq_rtn_int4_dq_bnb":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 8
},
"gpt_j_woq_rtn_int4_dq_ggml":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 8
},
"gpt_j_woq_gptq_int4":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 8
},
"gpt_j_woq_gptq_int4_dq_bnb":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 8
},
"gpt_j_woq_gptq_int4_dq_ggml":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 8
},
"gpt_j_ipex":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 1
},
"gpt_j_ipex_sq":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 1
},
"llama2_7b_ipex":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 1
},
"llama2_7b_ipex_sq":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 1
},
"opt_125m_ipex":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 8
},
"opt_125m_ipex_sq":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 8
},
"dlrm_ipex": {
"model_src_dir": "recommendation/dlrm/static_quant/ipex",
"dataset_location": "/mnt/local_disk3/dataset/dlrm/dlrm/input",
"input_model": "/mnt/local_disk3/dataset/dlrm/dlrm/dlrm_weight/tb00_40M.pt",
"main_script": "dlrm_s_pytorch.py",
"batch_size": 16384
},
"resnet18_pt2e_static":{
"model_src_dir": "cv/static_quant",
"dataset_location": "/tf_dataset/pytorch/ImageNet/raw",
"input_model": "",
"main_script": "main.py",
"batch_size": 1
},
"opt_125m_pt2e_static":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 1
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -21,27 +21,14 @@ Here is how to run the scripts:
### GPT-J-6b

#### Quantization
```bash
# "--sq" is used to enable smooth quant
python run_clm_no_trainer.py \
--model EleutherAI/gpt-j-6B \
--quantize \
--sq \
--alpha 1.0 \
--ipex \
--output_dir "saved_results"
```
**Note**: Smooth quantization here is based on torch.jit. Without past key values in `example_inputs`, the quantized model cannot be used for text generation.

```bash
# "--approach weight_only" is used to enable weight only quantization.
# "--woq_algo GPTQ" is used to enable GPTQ algorithms
# "--double_quant_type BNB_NF4" is used to enable double quant algorithms
python run_clm_no_trainer.py \
--model EleutherAI/gpt-j-6B \
--dataset NeelNanda/pile-10k \
--quantize \
--approach weight_only \
--woq_algo GPTQ \
--woq_bits 4 \
--woq_scheme asym \
Expand All @@ -57,7 +44,6 @@ python run_clm_no_trainer.py \
--model EleutherAI/gpt-j-6B \
--dataset NeelNanda/pile-10k \
--quantize \
--approach weight_only \
--woq_algo RTN \
--woq_bits 4 \
--woq_scheme asym \
Expand All @@ -74,23 +60,12 @@ python run_clm_no_trainer.py \
#### Quantization

```bash
# "--sq" is used to enable smooth quant
python run_clm_no_trainer.py \
--model facebook/opt-125m \
--quantize \
--sq \
--alpha 0.5 \
--ipex \
--output_dir "saved_results"

# "--approach weight_only" is used to enable weight only quantization.
# "--woq_algo GPTQ" is used to enable GPTQ algorithms
# "--double_quant_type BNB_NF4" is used to enable double quant algorithms
python run_clm_no_trainer.py \
--model facebook/opt-125m \
--dataset NeelNanda/pile-10k \
--quantize \
--approach weight_only \
--woq_algo GPTQ \
--woq_bits 4 \
--woq_scheme asym \
Expand All @@ -106,7 +81,6 @@ python run_clm_no_trainer.py \
--model facebook/opt-125m \
--dataset NeelNanda/pile-10k \
--quantize \
--approach weight_only \
--woq_algo RTN \
--woq_bits 4 \
--woq_scheme asym \
Expand All @@ -121,23 +95,12 @@ python run_clm_no_trainer.py \
#### Quantization

```bash
# "--sq" is used to enable smooth quant
python run_clm_no_trainer.py \
--model meta-llama/Llama-2-7b-hf \
--quantize \
--sq \
--alpha 0.8 \
--ipex \
--output_dir "saved_results"

# "--approach weight_only" is used to enable weight only quantization.
# "--double_quant_type BNB_NF4" is used to enable double quant algorithms
# "--woq_algo GPTQ" is used to enable GPTQ algorithms
python run_clm_no_trainer.py \
--model meta-llama/Llama-2-7b-hf \
--dataset NeelNanda/pile-10k \
--quantize \
--approach weight_only \
--woq_algo GPTQ \
--woq_bits 4 \
--woq_scheme asym \
Expand All @@ -153,7 +116,6 @@ python run_clm_no_trainer.py \
--model meta-llama/Llama-2-7b-hf \
--dataset NeelNanda/pile-10k \
--quantize \
--approach weight_only \
--woq_algo RTN \
--woq_bits 4 \
--woq_scheme asym \
Expand Down
Loading

0 comments on commit 48c5e3a

Please sign in to comment.