Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Modify WOQ examples structure #1866

Merged
merged 12 commits into from
Jun 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .azure-pipelines/scripts/models/run_pytorch_models_trigger.sh
Original file line number Diff line number Diff line change
Expand Up @@ -53,15 +53,15 @@ elif [ "${model}" == "resnet18_fx" ]; then
tuning_cmd="bash run_quant.sh --topology=resnet18 --dataset_location=${dataset_location} --input_model=${input_model}"
benchmark_cmd="bash run_benchmark.sh --topology=resnet18 --dataset_location=${dataset_location} --mode=performance --batch_size=${batch_size} --iters=500"
elif [ "${model}" == "opt_125m_woq_gptq_int4" ]; then
model_src_dir="nlp/huggingface_models/language-modeling/quantization/llm"
model_src_dir="nlp/huggingface_models/language-modeling/quantization/weight_only"
inc_new_api=3x_pt
tuning_cmd="bash run_quant.sh --topology=opt_125m_woq_gptq_int4"
elif [ "${model}" == "opt_125m_woq_gptq_int4_dq_bnb" ]; then
model_src_dir="nlp/huggingface_models/language-modeling/quantization/llm"
model_src_dir="nlp/huggingface_models/language-modeling/quantization/weight_only"
inc_new_api=3x_pt
tuning_cmd="bash run_quant.sh --topology=opt_125m_woq_gptq_int4_dq_bnb"
elif [ "${model}" == "opt_125m_woq_gptq_int4_dq_ggml" ]; then
model_src_dir="nlp/huggingface_models/language-modeling/quantization/llm"
model_src_dir="nlp/huggingface_models/language-modeling/quantization/weight_only"
inc_new_api=3x_pt
tuning_cmd="bash run_quant.sh --topology=opt_125m_woq_gptq_int4_dq_ggml"
fi
Expand Down
2 changes: 1 addition & 1 deletion docs/3x/PT_WeightOnlyQuant.md
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,7 @@ loaded_model = load(

## Examples

Users can also refer to [examples](https://github.com/intel/neural-compressor/blob/master/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm) on how to quantize a model with WeightOnlyQuant.
Users can also refer to [examples](https://github.com/intel/neural-compressor/blob/master/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only) on how to quantize a model with WeightOnlyQuant.

## Reference

Expand Down
218 changes: 151 additions & 67 deletions examples/.config/model_params_pytorch_3x.json
Original file line number Diff line number Diff line change
@@ -1,67 +1,151 @@
{
"pytorch": {
"gpt_j_ipex":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 1
},
"gpt_j_ipex_sq":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 1
},
"llama2_7b_ipex":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 1
},
"llama2_7b_ipex_sq":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 1
},
"opt_125m_ipex":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 8
},
"opt_125m_ipex_sq":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 8
},
"dlrm_ipex": {
"model_src_dir": "recommendation/dlrm/static_quant/ipex",
"dataset_location": "/mnt/local_disk3/dataset/dlrm/dlrm/input",
"input_model": "/mnt/local_disk3/dataset/dlrm/dlrm/dlrm_weight/tb00_40M.pt",
"main_script": "dlrm_s_pytorch.py",
"batch_size": 16384
},
"resnet18_pt2e_static":{
"model_src_dir": "cv/static_quant",
"dataset_location": "/tf_dataset/pytorch/ImageNet/raw",
"input_model": "",
"main_script": "main.py",
"batch_size": 1
},
"opt_125m_pt2e_static":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 1
}
}
}
{
"pytorch": {
"opt_125m_woq_gptq_int4":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 1
},
"opt_125m_woq_gptq_int4_dq_bnb":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 1
},
"opt_125m_woq_gptq_int4_dq_ggml":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 8
},
"llama2_7b_gptq_int4":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 8
},
"llama2_7b_gptq_int4_dq_bnb":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 8
},
"llama2_7b_gptq_int4_dq_ggml":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 8
},
"gpt_j_woq_rtn_int4":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 8
},
"gpt_j_woq_rtn_int4_dq_bnb":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 8
},
"gpt_j_woq_rtn_int4_dq_ggml":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 8
},
"gpt_j_woq_gptq_int4":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 8
},
"gpt_j_woq_gptq_int4_dq_bnb":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 8
},
"gpt_j_woq_gptq_int4_dq_ggml":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 8
},
"gpt_j_ipex":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 1
},
"gpt_j_ipex_sq":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 1
},
"llama2_7b_ipex":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 1
},
"llama2_7b_ipex_sq":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 1
},
"opt_125m_ipex":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 8
},
"opt_125m_ipex_sq":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 8
},
"dlrm_ipex": {
"model_src_dir": "recommendation/dlrm/static_quant/ipex",
"dataset_location": "/mnt/local_disk3/dataset/dlrm/dlrm/input",
"input_model": "/mnt/local_disk3/dataset/dlrm/dlrm/dlrm_weight/tb00_40M.pt",
"main_script": "dlrm_s_pytorch.py",
"batch_size": 16384
},
"resnet18_pt2e_static":{
"model_src_dir": "cv/static_quant",
"dataset_location": "/tf_dataset/pytorch/ImageNet/raw",
"input_model": "",
"main_script": "main.py",
"batch_size": 1
},
"opt_125m_pt2e_static":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 1
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -21,27 +21,14 @@ Here is how to run the scripts:
### GPT-J-6b

#### Quantization
```bash
# "--sq" is used to enable smooth quant
python run_clm_no_trainer.py \
--model EleutherAI/gpt-j-6B \
--quantize \
--sq \
--alpha 1.0 \
--ipex \
--output_dir "saved_results"
```
**Notes**: Smooth quantization here is based on torch.jit. Without past key value in example_inputs, the quantized model cannot be used for text-generation.

```bash
# "--approach weight_only" is used to enable weight only quantization.
# "--woq_algo GPTQ" is used to enable GPTQ algorithms
# "--double_quant_type BNB_NF4" is used to enable double quant algorithms
python run_clm_no_trainer.py \
--model EleutherAI/gpt-j-6B \
--dataset NeelNanda/pile-10k \
--quantize \
--approach weight_only \
--woq_algo GPTQ \
--woq_bits 4 \
--woq_scheme asym \
Expand All @@ -57,7 +44,6 @@ python run_clm_no_trainer.py \
--model EleutherAI/gpt-j-6B \
--dataset NeelNanda/pile-10k \
--quantize \
--approach weight_only \
--woq_algo RTN \
--woq_bits 4 \
--woq_scheme asym \
Expand All @@ -74,23 +60,12 @@ python run_clm_no_trainer.py \
#### Quantization

```bash
# "--sq" is used to enable smooth quant
python run_clm_no_trainer.py \
--model facebook/opt-125m \
--quantize \
--sq \
--alpha 0.5 \
--ipex \
--output_dir "saved_results"

# "--approach weight_only" is used to enable weight only quantization.
# "--woq_algo GPTQ" is used to enable GPTQ algorithms
# "--double_quant_type BNB_NF4" is used to enable double quant algorithms
python run_clm_no_trainer.py \
--model facebook/opt-125m \
--dataset NeelNanda/pile-10k \
--quantize \
--approach weight_only \
--woq_algo GPTQ \
--woq_bits 4 \
--woq_scheme asym \
Expand All @@ -106,7 +81,6 @@ python run_clm_no_trainer.py \
--model facebook/opt-125m \
--dataset NeelNanda/pile-10k \
--quantize \
--approach weight_only \
--woq_algo RTN \
--woq_bits 4 \
--woq_scheme asym \
Expand All @@ -121,23 +95,12 @@ python run_clm_no_trainer.py \
#### Quantization

```bash
# "--sq" is used to enable smooth quant
python run_clm_no_trainer.py \
--model meta-llama/Llama-2-7b-hf \
--quantize \
--sq \
--alpha 0.8 \
--ipex \
--output_dir "saved_results"

# "--approach weight_only" is used to enable weight only quantization.
# "--double_quant_type BNB_NF4" is used to enable double quant algorithms
# "--woq_algo GPTQ" is used to enable GPTQ algorithms
python run_clm_no_trainer.py \
--model meta-llama/Llama-2-7b-hf \
--dataset NeelNanda/pile-10k \
--quantize \
--approach weight_only \
--woq_algo GPTQ \
--woq_bits 4 \
--woq_scheme asym \
Expand All @@ -153,7 +116,6 @@ python run_clm_no_trainer.py \
--model meta-llama/Llama-2-7b-hf \
--dataset NeelNanda/pile-10k \
--quantize \
--approach weight_only \
--woq_algo RTN \
--woq_bits 4 \
--woq_scheme asym \
Expand Down
Loading
Loading