diff --git a/.azure-pipelines/scripts/models/run_pytorch_models_trigger.sh b/.azure-pipelines/scripts/models/run_pytorch_models_trigger.sh index 32bd2eb0109..896e377b755 100644 --- a/.azure-pipelines/scripts/models/run_pytorch_models_trigger.sh +++ b/.azure-pipelines/scripts/models/run_pytorch_models_trigger.sh @@ -53,15 +53,15 @@ elif [ "${model}" == "resnet18_fx" ]; then tuning_cmd="bash run_quant.sh --topology=resnet18 --dataset_location=${dataset_location} --input_model=${input_model}" benchmark_cmd="bash run_benchmark.sh --topology=resnet18 --dataset_location=${dataset_location} --mode=performance --batch_size=${batch_size} --iters=500" elif [ "${model}" == "opt_125m_woq_gptq_int4" ]; then - model_src_dir="nlp/huggingface_models/language-modeling/quantization/llm" + model_src_dir="nlp/huggingface_models/language-modeling/quantization/weight_only" inc_new_api=3x_pt tuning_cmd="bash run_quant.sh --topology=opt_125m_woq_gptq_int4" elif [ "${model}" == "opt_125m_woq_gptq_int4_dq_bnb" ]; then - model_src_dir="nlp/huggingface_models/language-modeling/quantization/llm" + model_src_dir="nlp/huggingface_models/language-modeling/quantization/weight_only" inc_new_api=3x_pt tuning_cmd="bash run_quant.sh --topology=opt_125m_woq_gptq_int4_dq_bnb" elif [ "${model}" == "opt_125m_woq_gptq_int4_dq_ggml" ]; then - model_src_dir="nlp/huggingface_models/language-modeling/quantization/llm" + model_src_dir="nlp/huggingface_models/language-modeling/quantization/weight_only" inc_new_api=3x_pt tuning_cmd="bash run_quant.sh --topology=opt_125m_woq_gptq_int4_dq_ggml" fi diff --git a/docs/3x/PT_WeightOnlyQuant.md b/docs/3x/PT_WeightOnlyQuant.md index 37cc934592a..b115b38fce3 100644 --- a/docs/3x/PT_WeightOnlyQuant.md +++ b/docs/3x/PT_WeightOnlyQuant.md @@ -258,7 +258,7 @@ loaded_model = load( ## Examples -Users can also refer to [examples](https://github.com/intel/neural-compressor/blob/master/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm) on how to quantize a model with WeightOnlyQuant. +Users can also refer to [examples](https://github.com/intel/neural-compressor/blob/master/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only) on how to quantize a model with WeightOnlyQuant. 
## Reference diff --git a/examples/.config/model_params_pytorch_3x.json b/examples/.config/model_params_pytorch_3x.json index dfedb7486d3..bbbab60bdbc 100644 --- a/examples/.config/model_params_pytorch_3x.json +++ b/examples/.config/model_params_pytorch_3x.json @@ -1,67 +1,151 @@ -{ - "pytorch": { - "gpt_j_ipex":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 1 - }, - "gpt_j_ipex_sq":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 1 - }, - "llama2_7b_ipex":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 1 - }, - "llama2_7b_ipex_sq":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 1 - }, - "opt_125m_ipex":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 8 - }, - "opt_125m_ipex_sq":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 8 - }, - "dlrm_ipex": { - "model_src_dir": "recommendation/dlrm/static_quant/ipex", - "dataset_location": "/mnt/local_disk3/dataset/dlrm/dlrm/input", - "input_model": "/mnt/local_disk3/dataset/dlrm/dlrm/dlrm_weight/tb00_40M.pt", - "main_script": "dlrm_s_pytorch.py", - "batch_size": 16384 - }, - "resnet18_pt2e_static":{ - "model_src_dir": "cv/static_quant", - "dataset_location": "/tf_dataset/pytorch/ImageNet/raw", - "input_model": "", - "main_script": "main.py", - "batch_size": 1 - }, - "opt_125m_pt2e_static":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 1 - } - } -} +{ + "pytorch": { + "opt_125m_woq_gptq_int4":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, + "opt_125m_woq_gptq_int4_dq_bnb":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, + "opt_125m_woq_gptq_int4_dq_ggml":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 8 + }, + "llama2_7b_gptq_int4":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 8 + }, + "llama2_7b_gptq_int4_dq_bnb":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 8 + }, + 
"llama2_7b_gptq_int4_dq_ggml":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 8 + }, + "gpt_j_woq_rtn_int4":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 8 + }, + "gpt_j_woq_rtn_int4_dq_bnb":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 8 + }, + "gpt_j_woq_rtn_int4_dq_ggml":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 8 + }, + "gpt_j_woq_gptq_int4":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 8 + }, + "gpt_j_woq_gptq_int4_dq_bnb":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 8 + }, + "gpt_j_woq_gptq_int4_dq_ggml":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 8 + }, + "gpt_j_ipex":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, + "gpt_j_ipex_sq":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, + "llama2_7b_ipex":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, + "llama2_7b_ipex_sq":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, + "opt_125m_ipex":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 8 + }, + "opt_125m_ipex_sq":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 8 + }, + "dlrm_ipex": { + "model_src_dir": "recommendation/dlrm/static_quant/ipex", + "dataset_location": "/mnt/local_disk3/dataset/dlrm/dlrm/input", + "input_model": "/mnt/local_disk3/dataset/dlrm/dlrm/dlrm_weight/tb00_40M.pt", + "main_script": "dlrm_s_pytorch.py", + "batch_size": 16384 + }, + "resnet18_pt2e_static":{ + "model_src_dir": "cv/static_quant", + "dataset_location": "/tf_dataset/pytorch/ImageNet/raw", + "input_model": "", + "main_script": "main.py", + "batch_size": 1 + }, + "opt_125m_pt2e_static":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e", + 
"dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + } + } +} diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/README.md b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/README.md similarity index 79% rename from examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/README.md rename to examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/README.md index 1659ae41e75..889d7b42682 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/README.md +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/README.md @@ -21,27 +21,14 @@ Here is how to run the scripts: ### GPT-J-6b #### Quantization -```bash -# "--sq" is used to enable smooth quant -python run_clm_no_trainer.py \ - --model EleutherAI/gpt-j-6B \ - --quantize \ - --sq \ - --alpha 1.0 \ - --ipex \ - --output_dir "saved_results" -``` -**Notes**: Smooth quantization here is based on torch.jit. Without past key value in example_inputs, the quantized model cannot be used for text-generation. ```bash -# "--approach weight_only" is used to enable weight only quantization. # "--woq_algo GPTQ" is used to enable GPTQ algorithms # "--double_quant_type BNB_NF4" is used to enable double quant algorithms python run_clm_no_trainer.py \ --model EleutherAI/gpt-j-6B \ --dataset NeelNanda/pile-10k \ --quantize \ - --approach weight_only \ --woq_algo GPTQ \ --woq_bits 4 \ --woq_scheme asym \ @@ -57,7 +44,6 @@ python run_clm_no_trainer.py \ --model EleutherAI/gpt-j-6B \ --dataset NeelNanda/pile-10k \ --quantize \ - --approach weight_only \ --woq_algo RTN \ --woq_bits 4 \ --woq_scheme asym \ @@ -74,23 +60,12 @@ python run_clm_no_trainer.py \ #### Quantization ```bash -# "--sq" is used to enable smooth quant -python run_clm_no_trainer.py \ - --model facebook/opt-125m \ - --quantize \ - --sq \ - --alpha 0.5 \ - --ipex \ - --output_dir "saved_results" - -# "--approach weight_only" is used to enable weight only quantization. # "--woq_algo GPTQ" is used to enable GPTQ algorithms # "--double_quant_type BNB_NF4" is used to enable double quant algorithms python run_clm_no_trainer.py \ --model facebook/opt-125m \ --dataset NeelNanda/pile-10k \ --quantize \ - --approach weight_only \ --woq_algo GPTQ \ --woq_bits 4 \ --woq_scheme asym \ @@ -106,7 +81,6 @@ python run_clm_no_trainer.py \ --model facebook/opt-125m \ --dataset NeelNanda/pile-10k \ --quantize \ - --approach weight_only \ --woq_algo RTN \ --woq_bits 4 \ --woq_scheme asym \ @@ -121,23 +95,12 @@ python run_clm_no_trainer.py \ #### Quantization ```bash -# "--sq" is used to enable smooth quant -python run_clm_no_trainer.py \ - --model meta-llama/Llama-2-7b-hf \ - --quantize \ - --sq \ - --alpha 0.8 \ - --ipex \ - --output_dir "saved_results" - -# "--approach weight_only" is used to enable weight only quantization. 
# "--double_quant_type BNB_NF4" is used to enable double quant algorithms # "--woq_algo GPTQ" is used to enable GPTQ algorithms python run_clm_no_trainer.py \ --model meta-llama/Llama-2-7b-hf \ --dataset NeelNanda/pile-10k \ --quantize \ - --approach weight_only \ --woq_algo GPTQ \ --woq_bits 4 \ --woq_scheme asym \ @@ -153,7 +116,6 @@ python run_clm_no_trainer.py \ --model meta-llama/Llama-2-7b-hf \ --dataset NeelNanda/pile-10k \ --quantize \ - --approach weight_only \ --woq_algo RTN \ --woq_bits 4 \ --woq_scheme asym \ diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/requirements.txt b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/requirements.txt similarity index 100% rename from examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/requirements.txt rename to examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/requirements.txt diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_benchmark.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh similarity index 78% rename from examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_benchmark.sh rename to examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh index 8002b61ad10..9e1d766128e 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_benchmark.sh +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh @@ -12,7 +12,6 @@ function main { function init_params { iters=100 batch_size=16 - approach=static tuned_checkpoint=saved_results task=lambada_openai echo ${max_eval_samples} @@ -73,83 +72,52 @@ function run_benchmark { if [ "${topology}" = "opt_125m_woq_gptq_int4" ]; then model_name_or_path="facebook/opt-125m" - approach="weight_only" extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length" elif [ "${topology}" = "opt_125m_woq_gptq_int4_dq_bnb" ]; then model_name_or_path="facebook/opt-125m" - approach="weight_only" extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length" extra_cmd=$extra_cmd" --double_quant_type BNB_NF4" elif [ "${topology}" = "opt_125m_woq_gptq_int4_dq_ggml" ]; then model_name_or_path="facebook/opt-125m" - approach="weight_only" extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length --gptq_percdamp 0.1 --gptq_actorder" extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K" - elif [ "${topology}" = "opt_125m_ipex" ]; then - model_name_or_path="facebook/opt-125m" - extra_cmd=$extra_cmd" --ipex" - elif [ "${topology}" = "opt_125m_ipex_sq" ]; then - model_name_or_path="facebook/opt-125m" - extra_cmd=$extra_cmd" --ipex --sq --alpha 0.5" elif [ "${topology}" = "llama2_7b_gptq_int4" ]; then model_name_or_path="meta-llama/Llama-2-7b-hf" - approach="weight_only" extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length" elif [ "${topology}" = "llama2_7b_gptq_int4_dq_bnb" ]; then model_name_or_path="meta-llama/Llama-2-7b-hf" - approach="weight_only" extra_cmd=$extra_cmd" 
--woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length" extra_cmd=$extra_cmd" --double_quant_type BNB_NF4" elif [ "${topology}" = "llama2_7b_gptq_int4_dq_ggml" ]; then model_name_or_path="meta-llama/Llama-2-7b-hf" - approach="weight_only" extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length" extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K" - elif [ "${topology}" = "llama2_7b_ipex" ]; then - model_name_or_path="meta-llama/Llama-2-7b-hf" - extra_cmd=$extra_cmd" --ipex" - elif [ "${topology}" = "llama2_7b_ipex_sq" ]; then - model_name_or_path="meta-llama/Llama-2-7b-hf" - extra_cmd=$extra_cmd" --ipex --sq --alpha 0.8" elif [ "${topology}" = "gpt_j_woq_rtn_int4" ]; then model_name_or_path="EleutherAI/gpt-j-6b" - approach="weight_only" extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search" elif [ "${topology}" = "gpt_j_woq_rtn_int4_dq_bnb" ]; then - model_name_or_path="EleutherAI/gpt-j-6b" - approach="weight_only" + model_name_or_path="EleutherAI/gpt-j-6b" extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search" extra_cmd=$extra_cmd" --double_quant_type BNB_NF4" elif [ "${topology}" = "gpt_j_woq_rtn_int4_dq_ggml" ]; then - model_name_or_path="EleutherAI/gpt-j-6b" - approach="weight_only" + model_name_or_path="EleutherAI/gpt-j-6b" extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search" extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K" elif [ "${topology}" = "gpt_j_woq_gptq_int4" ]; then model_name_or_path="EleutherAI/gpt-j-6b" - approach="weight_only" extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length" elif [ "${topology}" = "gpt_j_woq_gptq_int4_dq_bnb" ]; then model_name_or_path="EleutherAI/gpt-j-6b" - approach="weight_only" extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length" extra_cmd=$extra_cmd" --double_quant_type BNB_NF4" elif [ "${topology}" = "gpt_j_woq_gptq_int4_dq_ggml" ]; then model_name_or_path="EleutherAI/gpt-j-6b" - approach="weight_only" extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length" extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K" - elif [ "${topology}" = "gpt_j_ipex" ]; then - model_name_or_path="EleutherAI/gpt-j-6b" - extra_cmd=$extra_cmd" --ipex" - elif [ "${topology}" = "gpt_j_ipex_sq" ]; then - model_name_or_path="EleutherAI/gpt-j-6b" - extra_cmd=$extra_cmd" --ipex --sq --alpha 1.0" fi python -u run_clm_no_trainer.py \ --model ${model_name_or_path} \ - --approach ${approach} \ --output_dir ${tuned_checkpoint} \ --task ${task} \ --batch_size ${batch_size} \ diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py similarity index 64% rename from examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py rename to examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py index c586f8d765e..8655c47a8da 100644 ---
a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py @@ -35,10 +35,7 @@ '--seed', type=int, default=42, help='Seed for sampling the calibration data.' ) -parser.add_argument("--approach", type=str, default='static', - help="Select from ['dynamic', 'static', 'weight-only']") parser.add_argument("--int8", action="store_true") -parser.add_argument("--ipex", action="store_true", help="Use intel extension for pytorch.") parser.add_argument("--accuracy", action="store_true") parser.add_argument("--performance", action="store_true") parser.add_argument("--iters", default=100, type=int, @@ -54,9 +51,6 @@ parser.add_argument("--tasks", default="lambada_openai,hellaswag,winogrande,piqa,wikitext", type=str, help="tasks for accuracy validation") parser.add_argument("--peft_model_id", type=str, default=None, help="model_name_or_path of peft model") -# ============SmoothQuant configs============== -parser.add_argument("--sq", action="store_true") -parser.add_argument("--alpha", default="auto", help="Smooth quant parameter.") # ============WeightOnly configs=============== parser.add_argument("--woq_algo", default="RTN", choices=['RTN', 'AWQ', 'TEQ', 'GPTQ'], help="Weight-only parameter.") @@ -109,8 +103,6 @@ # ======================================= args = parser.parse_args() -if args.ipex: - import intel_extension_for_pytorch as ipex calib_size = 1 @@ -193,7 +185,7 @@ def evaluate(self, model): def get_user_model(): torchscript = False - if args.sq or args.ipex or args.woq_algo in ['AWQ', 'TEQ']: + if args.woq_algo in ['AWQ', 'TEQ']: torchscript = True user_model = AutoModelForCausalLM.from_pretrained( args.model, @@ -202,8 +194,7 @@ def get_user_model(): revision=args.revision, ) tokenizer = AutoTokenizer.from_pretrained(args.model) - if args.approach == 'weight_only': - user_model = user_model.float() + user_model = user_model.float() # Set model's seq_len when GPTQ calibration is enabled. if args.woq_algo == 'GPTQ': @@ -234,120 +225,52 @@ def get_user_model(): ) # 3.x api - if args.approach == 'weight_only': - from neural_compressor.torch.quantization import RTNConfig, GPTQConfig, prepare, convert, quantize - from neural_compressor.torch.utils import get_double_quant_config_dict - weight_sym = True if args.woq_scheme == "sym" else False + from neural_compressor.torch.quantization import RTNConfig, GPTQConfig, prepare, convert, quantize + from neural_compressor.torch.utils import get_double_quant_config_dict + weight_sym = True if args.woq_scheme == "sym" else False + if args.double_quant_type is not None: + double_quant_config_dict = get_double_quant_config_dict(args.double_quant_type) + + if args.woq_algo == "RTN": if args.double_quant_type is not None: - double_quant_config_dict = get_double_quant_config_dict(args.double_quant_type) - - if args.woq_algo == "RTN": - if args.double_quant_type is not None: - double_quant_config_dict.update( - { - # TODO: add group_dim into double quant config? 
- "use_full_range": args.woq_use_full_range, - "use_mse_search": args.woq_use_mse_search, - } - ) - quant_config = RTNConfig.from_dict(double_quant_config_dict) - else: - quant_config = RTNConfig( - dtype=args.woq_dtype, - bits=args.woq_bits, - use_sym=weight_sym, - group_size=args.woq_group_size, - group_dim=args.woq_group_dim, - use_full_range=args.woq_use_full_range, - use_mse_search=args.woq_use_mse_search, - use_double_quant=False, - double_quant_bits=args.double_quant_bits, - double_quant_dtype=args.double_quant_dtype, - double_quant_use_sym=args.double_quant_use_sym, - double_quant_group_size=args.double_quant_group_size, - ) - quant_config.set_local("lm_head", RTNConfig(dtype="fp32")) - user_model = prepare(model=user_model, quant_config=quant_config) - user_model = convert(model=user_model) - elif args.woq_algo == "GPTQ": - from utils import DataloaderPreprocessor - dataloaderPreprocessor = DataloaderPreprocessor( - dataloader_original=calib_dataloader, - use_max_length=args.gptq_use_max_length, - max_seq_length=args.gptq_max_seq_length, + double_quant_config_dict.update( + { + # TODO: add group_dim into double quant config? + "use_full_range": args.woq_use_full_range, + "use_mse_search": args.woq_use_mse_search, + } ) - dataloader_for_calibration = dataloaderPreprocessor.get_prepared_dataloader() - from neural_compressor.torch.algorithms.weight_only.utility import move_input_to_device - from tqdm import tqdm - def run_fn_for_gptq(model, dataloader_for_calibration, *args): - for batch in tqdm(dataloader_for_calibration): - batch = move_input_to_device(batch, device=None) - try: - if isinstance(batch, tuple) or isinstance(batch, list): - model(batch[0]) - elif isinstance(batch, dict): - model(**batch) - else: - model(batch) - except ValueError: - pass - return - if args.double_quant_type is not None: - double_quant_config_dict.update( - { - "use_mse_search": args.woq_use_mse_search, - "percdamp": args.gptq_percdamp, - "act_order": args.gptq_actorder, - "block_size": args.gptq_block_size, - "static_groups": args.gptq_static_groups, - } - ) - quant_config = GPTQConfig.from_dict(double_quant_config_dict) - else: - quant_config = GPTQConfig( - dtype=args.woq_dtype, - bits=args.woq_bits, - use_sym=weight_sym, - group_size=args.woq_group_size, - use_mse_search=args.woq_use_mse_search, - percdamp=args.gptq_percdamp, - act_order=args.gptq_actorder, - block_size=args.gptq_block_size, - static_groups=args.gptq_static_groups, - use_double_quant=False, - double_quant_bits=args.double_quant_bits, - double_quant_dtype=args.double_quant_dtype, - double_quant_use_sym=args.double_quant_use_sym, - double_quant_group_size=args.double_quant_group_size, - ) - quant_config.set_local("lm_head", GPTQConfig(dtype="fp32")) - user_model = prepare(model=user_model, quant_config=quant_config) - run_fn_for_gptq(user_model, dataloader_for_calibration) - user_model = convert(user_model) - else: - if args.sq: - from neural_compressor.torch.quantization import SmoothQuantConfig - - # alpha can be a float number of a list of float number. 
- args.alpha = args.alpha if args.alpha == "auto" else eval(args.alpha) - if re.search("falcon", user_model.config.model_type): - quant_config = SmoothQuantConfig(alpha=args.alpha, folding=False) - else: - quant_config = SmoothQuantConfig(alpha=args.alpha, folding=True) - - if re.search("gpt", user_model.config.model_type): - quant_config.set_local(torch.add, SmoothQuantConfig(w_dtype="fp32", act_dtype="fp32")) + quant_config = RTNConfig.from_dict(double_quant_config_dict) else: - from neural_compressor.torch.quantization import get_default_static_config, StaticQuantConfig - - quant_config = get_default_static_config() - if re.search("gpt", user_model.config.model_type): - quant_config.set_local(torch.add, StaticQuantConfig(w_dtype="fp32", act_dtype="fp32")) - - from neural_compressor.torch.algorithms.smooth_quant import move_input_to_device + quant_config = RTNConfig( + dtype=args.woq_dtype, + bits=args.woq_bits, + use_sym=weight_sym, + group_size=args.woq_group_size, + group_dim=args.woq_group_dim, + use_full_range=args.woq_use_full_range, + use_mse_search=args.woq_use_mse_search, + use_double_quant=False, + double_quant_bits=args.double_quant_bits, + double_quant_dtype=args.double_quant_dtype, + double_quant_use_sym=args.double_quant_use_sym, + double_quant_group_size=args.double_quant_group_size, + ) + quant_config.set_local("lm_head", RTNConfig(dtype="fp32")) + user_model = prepare(model=user_model, quant_config=quant_config) + user_model = convert(model=user_model) + elif args.woq_algo == "GPTQ": + from utils import DataloaderPreprocessor + dataloaderPreprocessor = DataloaderPreprocessor( + dataloader_original=calib_dataloader, + use_max_length=args.gptq_use_max_length, + max_seq_length=args.gptq_max_seq_length, + ) + dataloader_for_calibration = dataloaderPreprocessor.get_prepared_dataloader() + from neural_compressor.torch.algorithms.weight_only.utility import move_input_to_device from tqdm import tqdm - def run_fn(model): - for batch in tqdm(calib_dataloader): + def run_fn_for_gptq(model, dataloader_for_calibration, *args): + for batch in tqdm(dataloader_for_calibration): batch = move_input_to_device(batch, device=None) try: if isinstance(batch, tuple) or isinstance(batch, list): @@ -359,13 +282,37 @@ def run_fn(model): except ValueError: pass return - - from utils import get_example_inputs - example_inputs = get_example_inputs(user_model, calib_dataloader) - - from neural_compressor.torch.quantization import prepare, convert - user_model = prepare(model=user_model, quant_config=quant_config, example_inputs=example_inputs) - run_fn(user_model) + if args.double_quant_type is not None: + double_quant_config_dict.update( + { + "use_mse_search": args.woq_use_mse_search, + "percdamp": args.gptq_percdamp, + "act_order": args.gptq_actorder, + "block_size": args.gptq_block_size, + "static_groups": args.gptq_static_groups, + } + ) + quant_config = GPTQConfig.from_dict(double_quant_config_dict) + else: + quant_config = GPTQConfig( + dtype=args.woq_dtype, + bits=args.woq_bits, + use_sym=weight_sym, + group_size=args.woq_group_size, + use_mse_search=args.woq_use_mse_search, + percdamp=args.gptq_percdamp, + act_order=args.gptq_actorder, + block_size=args.gptq_block_size, + static_groups=args.gptq_static_groups, + use_double_quant=False, + double_quant_bits=args.double_quant_bits, + double_quant_dtype=args.double_quant_dtype, + double_quant_use_sym=args.double_quant_use_sym, + double_quant_group_size=args.double_quant_group_size, + ) + quant_config.set_local("lm_head", GPTQConfig(dtype="fp32")) 
+ user_model = prepare(model=user_model, quant_config=quant_config) + run_fn_for_gptq(user_model, dataloader_for_calibration) user_model = convert(user_model) user_model.save(args.output_dir) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_quant.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh similarity index 77% rename from examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_quant.sh rename to examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh index 3f95f44946e..079a1d28406 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_quant.sh +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh @@ -38,91 +38,59 @@ function init_params { function run_tuning { extra_cmd='' batch_size=8 - approach='static' DATASET_NAME="NeelNanda/pile-10k" tuned_checkpoint="saved_results" if [ "${topology}" = "opt_125m_woq_gptq_int4" ]; then model_name_or_path="facebook/opt-125m" - approach="weight_only" extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length" elif [ "${topology}" = "opt_125m_woq_gptq_int4_dq_bnb" ]; then model_name_or_path="facebook/opt-125m" - approach="weight_only" extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length" extra_cmd=$extra_cmd" --double_quant_type BNB_NF4" elif [ "${topology}" = "opt_125m_woq_gptq_int4_dq_ggml" ]; then model_name_or_path="facebook/opt-125m" - approach="weight_only" extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length --gptq_percdamp 0.1 --gptq_actorder" extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K" - elif [ "${topology}" = "opt_125m_ipex" ]; then - model_name_or_path="facebook/opt-125m" - extra_cmd=$extra_cmd" --ipex" - elif [ "${topology}" = "opt_125m_ipex_sq" ]; then - model_name_or_path="facebook/opt-125m" - extra_cmd=$extra_cmd" --ipex --sq --alpha 0.5" elif [ "${topology}" = "llama2_7b_gptq_int4" ]; then model_name_or_path="meta-llama/Llama-2-7b-hf" - approach="weight_only" extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length" elif [ "${topology}" = "llama2_7b_gptq_int4_dq_bnb" ]; then model_name_or_path="meta-llama/Llama-2-7b-hf" - approach="weight_only" extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length" extra_cmd=$extra_cmd" --double_quant_type BNB_NF4" elif [ "${topology}" = "llama2_7b_gptq_int4_dq_ggml" ]; then model_name_or_path="meta-llama/Llama-2-7b-hf" - approach="weight_only" extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length" extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K" - elif [ "${topology}" = "llama2_7b_ipex" ]; then - model_name_or_path="meta-llama/Llama-2-7b-hf" - extra_cmd=$extra_cmd" --ipex" - elif [ "${topology}" = "llama2_7b_ipex_sq" ]; then - model_name_or_path="meta-llama/Llama-2-7b-hf" - extra_cmd=$extra_cmd" --ipex --sq --alpha 0.8" elif [ "${topology}" = "gpt_j_woq_rtn_int4" ]; then model_name_or_path="EleutherAI/gpt-j-6b" - 
approach="weight_only" extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search" elif [ "${topology}" = "gpt_j_woq_rtn_int4_dq_bnb" ]; then model_name_or_path="EleutherAI/gpt-j-6b" - approach="weight_only" extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search" extra_cmd=$extra_cmd" --double_quant_type BNB_NF4" elif [ "${topology}" = "gpt_j_woq_rtn_int4_dq_ggml" ]; then model_name_or_path="EleutherAI/gpt-j-6b" - approach="weight_only" extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search" extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K" elif [ "${topology}" = "gpt_j_woq_gptq_int4" ]; then model_name_or_path="EleutherAI/gpt-j-6b" - approach="weight_only" extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length" elif [ "${topology}" = "gpt_j_woq_gptq_int4_dq_bnb" ]; then model_name_or_path="EleutherAI/gpt-j-6b" - approach="weight_only" extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length" extra_cmd=$extra_cmd" --double_quant_type BNB_NF4" elif [ "${topology}" = "gpt_j_woq_gptq_int4_dq_ggml" ]; then model_name_or_path="EleutherAI/gpt-j-6b" - approach="weight_only" extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length" extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K" - elif [ "${topology}" = "gpt_j_ipex" ]; then - model_name_or_path="EleutherAI/gpt-j-6b" - extra_cmd=$extra_cmd" --ipex" - elif [ "${topology}" = "gpt_j_ipex_sq" ]; then - model_name_or_path="EleutherAI/gpt-j-6b" - extra_cmd=$extra_cmd" --ipex --sq --alpha 1.0" fi python -u run_clm_no_trainer.py \ --model ${model_name_or_path} \ --dataset ${DATASET_NAME} \ --quantize \ - --approach ${approach} \ --output_dir ${tuned_checkpoint} \ --tasks "lambada_openai" \ --batch_size ${batch_size} \ diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/utils.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/utils.py similarity index 100% rename from examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/utils.py rename to examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/utils.py diff --git a/neural_compressor/torch/algorithms/weight_only/modules.py b/neural_compressor/torch/algorithms/weight_only/modules.py index 702a22b49ad..18cf6e46e55 100644 --- a/neural_compressor/torch/algorithms/weight_only/modules.py +++ b/neural_compressor/torch/algorithms/weight_only/modules.py @@ -303,7 +303,6 @@ def unpack_tensor_with_torch(self, packed_tensor): def pack_tensor_with_numpy(self, raw_tensor): raw_array = raw_tensor.cpu().numpy() target_len = np.ceil(raw_array.shape[1] / self.n_pack).astype(int) - torch.int32 target_dtype = torch.tensor(0, dtype=self.compression_dtype).numpy().dtype packed_array = np.zeros((raw_array.shape[0], target_len), dtype=target_dtype) mask = np.uint8(2**self.bits - 1) diff --git a/neural_compressor/torch/algorithms/weight_only/save_load.py b/neural_compressor/torch/algorithms/weight_only/save_load.py index 7494dac86f9..231502c32c6 100644 --- a/neural_compressor/torch/algorithms/weight_only/save_load.py +++ 
b/neural_compressor/torch/algorithms/weight_only/save_load.py @@ -41,7 +41,7 @@ def save(model, output_dir="./saved_results"): del model.save torch.save(model.state_dict(), qmodel_weight_file_path) - logger.info("Save quantized model weight to {}.".format(qmodel_weight_file_path)) + logger.info("Save quantized model to {}.".format(qmodel_weight_file_path)) logger.info("Save configuration of quantized model to {}.".format(qconfig_file_path))
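
For reviewers who want the end-to-end shape of the 3.x weight-only API that the reworked `run_clm_no_trainer.py` above now calls directly, a minimal sketch follows. It is not part of the diff; the model name, RTN settings, and output directory are illustrative, and it only uses the `neural_compressor.torch.quantization` entry points visible in the changed files.

```python
# Minimal sketch of the 3.x weight-only RTN flow used by run_clm_no_trainer.py.
# Model name, RTN settings, and output directory are illustrative choices.
from transformers import AutoModelForCausalLM
from neural_compressor.torch.quantization import RTNConfig, prepare, convert

user_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
user_model = user_model.float()  # the example casts weights to fp32 before weight-only quantization

# 4-bit asymmetric RTN with group size 128; keep lm_head in fp32, as the example does
quant_config = RTNConfig(bits=4, use_sym=False, group_size=128)
quant_config.set_local("lm_head", RTNConfig(dtype="fp32"))

user_model = prepare(model=user_model, quant_config=quant_config)
user_model = convert(model=user_model)

# convert() attaches a save() helper (see save_load.py); it writes the quantized
# state_dict and the quantization configuration under the output directory
user_model.save("saved_results")
```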