intel · chensuyue · Jun 14, 2024 · Jun 14, 2024 · Jun 14, 2024 · Jun 14, 2024
diff --git a/.azure-pipelines/scripts/models/run_pytorch_models_trigger.sh b/.azure-pipelines/scripts/models/run_pytorch_models_trigger.sh
@@ -53,15 +53,15 @@ elif [ "${model}" == "resnet18_fx" ]; then
     tuning_cmd="bash run_quant.sh --topology=resnet18 --dataset_location=${dataset_location} --input_model=${input_model}"
     benchmark_cmd="bash run_benchmark.sh --topology=resnet18 --dataset_location=${dataset_location} --mode=performance --batch_size=${batch_size} --iters=500"
 elif [ "${model}" == "opt_125m_woq_gptq_int4" ]; then
-    model_src_dir="nlp/huggingface_models/language-modeling/quantization/llm"
+    model_src_dir="nlp/huggingface_models/language-modeling/quantization/weight_only"
     inc_new_api=3x_pt
     tuning_cmd="bash run_quant.sh --topology=opt_125m_woq_gptq_int4"
 elif [ "${model}" == "opt_125m_woq_gptq_int4_dq_bnb" ]; then
-    model_src_dir="nlp/huggingface_models/language-modeling/quantization/llm"
+    model_src_dir="nlp/huggingface_models/language-modeling/quantization/weight_only"
     inc_new_api=3x_pt
     tuning_cmd="bash run_quant.sh --topology=opt_125m_woq_gptq_int4_dq_bnb"
 elif [ "${model}" == "opt_125m_woq_gptq_int4_dq_ggml" ]; then
-    model_src_dir="nlp/huggingface_models/language-modeling/quantization/llm"
+    model_src_dir="nlp/huggingface_models/language-modeling/quantization/weight_only"
     inc_new_api=3x_pt
     tuning_cmd="bash run_quant.sh --topology=opt_125m_woq_gptq_int4_dq_ggml"
 fi

diff --git a/docs/3x/PT_WeightOnlyQuant.md b/docs/3x/PT_WeightOnlyQuant.md
@@ -258,7 +258,7 @@ loaded_model = load(
 
 ## Examples
 
-Users can also refer to [examples](https://github.com/intel/neural-compressor/blob/master/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm) on how to quantize a  model with WeightOnlyQuant.
+Users can also refer to [examples](https://github.com/intel/neural-compressor/blob/master/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only) on how to quantize a model with WeightOnlyQuant.
 
 ## Reference
 

diff --git a/examples/.config/model_params_pytorch_3x.json b/examples/.config/model_params_pytorch_3x.json
@@ -1,67 +1,151 @@
-{
-    "pytorch": {
-      "gpt_j_ipex":{
-        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant",
-        "dataset_location": "",
-        "input_model": "",
-        "main_script": "run_clm_no_trainer.py",
-        "batch_size": 1
-      },
-      "gpt_j_ipex_sq":{
-        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant",
-        "dataset_location": "",
-        "input_model": "",
-        "main_script": "run_clm_no_trainer.py",
-        "batch_size": 1
-      },
-      "llama2_7b_ipex":{
-        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant",
-        "dataset_location": "",
-        "input_model": "",
-        "main_script": "run_clm_no_trainer.py",
-        "batch_size": 1
-      },
-      "llama2_7b_ipex_sq":{
-        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant",
-        "dataset_location": "",
-        "input_model": "",
-        "main_script": "run_clm_no_trainer.py",
-        "batch_size": 1
-      },
-      "opt_125m_ipex":{
-        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant",
-        "dataset_location": "",
-        "input_model": "",
-        "main_script": "run_clm_no_trainer.py",
-        "batch_size": 8
-      },
-      "opt_125m_ipex_sq":{
-        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant",
-        "dataset_location": "",
-        "input_model": "",
-        "main_script": "run_clm_no_trainer.py",
-        "batch_size": 8
-      },
-      "dlrm_ipex": {
-        "model_src_dir": "recommendation/dlrm/static_quant/ipex",
-        "dataset_location": "/mnt/local_disk3/dataset/dlrm/dlrm/input",
-        "input_model": "/mnt/local_disk3/dataset/dlrm/dlrm/dlrm_weight/tb00_40M.pt",
-        "main_script": "dlrm_s_pytorch.py",
-        "batch_size": 16384
-      },
-      "resnet18_pt2e_static":{
-        "model_src_dir": "cv/static_quant",
-        "dataset_location": "/tf_dataset/pytorch/ImageNet/raw",
-        "input_model": "",
-        "main_script": "main.py",
-        "batch_size": 1
-      },
-      "opt_125m_pt2e_static":{
-        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e",
-        "dataset_location": "",
-        "input_model": "",
-        "main_script": "run_clm_no_trainer.py",
-        "batch_size": 1
-      }
-    }
-}
+{
+    "pytorch": {
+      "opt_125m_woq_gptq_int4":{
+        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+        "dataset_location": "",
+        "input_model": "",
+        "main_script": "run_clm_no_trainer.py",
+        "batch_size": 1
+      },
+      "opt_125m_woq_gptq_int4_dq_bnb":{
+        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+        "dataset_location": "",
+        "input_model": "",
+        "main_script": "run_clm_no_trainer.py",
+        "batch_size": 1
+      },
+      "opt_125m_woq_gptq_int4_dq_ggml":{
+        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+        "dataset_location": "",
+        "input_model": "",
+        "main_script": "run_clm_no_trainer.py",
+        "batch_size": 8
+      },
+      "llama2_7b_gptq_int4":{
+        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+        "dataset_location": "",
+        "input_model": "",
+        "main_script": "run_clm_no_trainer.py",
+        "batch_size": 8
+      },
+      "llama2_7b_gptq_int4_dq_bnb":{
+        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+        "dataset_location": "",
+        "input_model": "",
+        "main_script": "run_clm_no_trainer.py",
+        "batch_size": 8
+      },
+      "llama2_7b_gptq_int4_dq_ggml":{
+        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+        "dataset_location": "",
+        "input_model": "",
+        "main_script": "run_clm_no_trainer.py",
+        "batch_size": 8
+      },
+      "gpt_j_woq_rtn_int4":{
+        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+        "dataset_location": "",
+        "input_model": "",
+        "main_script": "run_clm_no_trainer.py",
+        "batch_size": 8
+      },
+      "gpt_j_woq_rtn_int4_dq_bnb":{
+        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+        "dataset_location": "",
+        "input_model": "",
+        "main_script": "run_clm_no_trainer.py",
+        "batch_size": 8
+      },
+      "gpt_j_woq_rtn_int4_dq_ggml":{
+        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+        "dataset_location": "",
+        "input_model": "",
+        "main_script": "run_clm_no_trainer.py",
+        "batch_size": 8
+      },
+      "gpt_j_woq_gptq_int4":{
+        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+        "dataset_location": "",
+        "input_model": "",
+        "main_script": "run_clm_no_trainer.py",
+        "batch_size": 8
+      },
+      "gpt_j_woq_gptq_int4_dq_bnb":{
+        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+        "dataset_location": "",
+        "input_model": "",
+        "main_script": "run_clm_no_trainer.py",
+        "batch_size": 8
+      },
+      "gpt_j_woq_gptq_int4_dq_ggml":{
+        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+        "dataset_location": "",
+        "input_model": "",
+        "main_script": "run_clm_no_trainer.py",
+        "batch_size": 8
+      },
+      "gpt_j_ipex":{
+        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant",
+        "dataset_location": "",
+        "input_model": "",
+        "main_script": "run_clm_no_trainer.py",
+        "batch_size": 1
+      },
+      "gpt_j_ipex_sq":{
+        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant",
+        "dataset_location": "",
+        "input_model": "",
+        "main_script": "run_clm_no_trainer.py",
+        "batch_size": 1
+      },
+      "llama2_7b_ipex":{
+        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant",
+        "dataset_location": "",
+        "input_model": "",
+        "main_script": "run_clm_no_trainer.py",
+        "batch_size": 1
+      },
+      "llama2_7b_ipex_sq":{
+        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant",
+        "dataset_location": "",
+        "input_model": "",
+        "main_script": "run_clm_no_trainer.py",
+        "batch_size": 1
+      },
+      "opt_125m_ipex":{
+        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant",
+        "dataset_location": "",
+        "input_model": "",
+        "main_script": "run_clm_no_trainer.py",
+        "batch_size": 8
+      },
+      "opt_125m_ipex_sq":{
+        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant",
+        "dataset_location": "",
+        "input_model": "",
+        "main_script": "run_clm_no_trainer.py",
+        "batch_size": 8
+      },
+      "dlrm_ipex": {
+        "model_src_dir": "recommendation/dlrm/static_quant/ipex",
+        "dataset_location": "/mnt/local_disk3/dataset/dlrm/dlrm/input",
+        "input_model": "/mnt/local_disk3/dataset/dlrm/dlrm/dlrm_weight/tb00_40M.pt",
+        "main_script": "dlrm_s_pytorch.py",
+        "batch_size": 16384
+      },
+      "resnet18_pt2e_static":{
+        "model_src_dir": "cv/static_quant",
+        "dataset_location": "/tf_dataset/pytorch/ImageNet/raw",
+        "input_model": "",
+        "main_script": "main.py",
+        "batch_size": 1
+      },
+      "opt_125m_pt2e_static":{
+        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e",
+        "dataset_location": "",
+        "input_model": "",
+        "main_script": "run_clm_no_trainer.py",
+        "batch_size": 1
+      }
+    }
+}
diff --git a/...guage-modeling/quantization/llm/README.md → ...deling/quantization/weight_only/README.md b/...guage-modeling/quantization/llm/README.md → ...deling/quantization/weight_only/README.md
@@ -21,27 +21,14 @@ Here is how to run the scripts:
 ### GPT-J-6b
 
 #### Quantization
-```bash
-# "--sq" is used to enable smooth quant
-python run_clm_no_trainer.py \
-    --model EleutherAI/gpt-j-6B \
-    --quantize \
-    --sq \
-    --alpha 1.0 \
-    --ipex \
-    --output_dir "saved_results"
-```
-**Notes**: Smooth quantization here is based on torch.jit. Without past key value in example_inputs, the quantized model cannot be used for text-generation.
 
 ```bash
-# "--approach weight_only" is used to enable weight only quantization.
 # "--woq_algo GPTQ" is used to enable GPTQ algorithms
 # "--double_quant_type BNB_NF4" is used to enable double quant algorithms
 python run_clm_no_trainer.py \
     --model EleutherAI/gpt-j-6B \
     --dataset NeelNanda/pile-10k \
     --quantize \
-    --approach weight_only \
     --woq_algo GPTQ \
     --woq_bits 4 \
     --woq_scheme asym \
@@ -57,7 +44,6 @@ python run_clm_no_trainer.py \
     --model EleutherAI/gpt-j-6B \
     --dataset NeelNanda/pile-10k \
     --quantize \
-    --approach weight_only \
     --woq_algo RTN \
     --woq_bits 4 \
     --woq_scheme asym \
@@ -74,23 +60,12 @@ python run_clm_no_trainer.py \
 #### Quantization
 
 ```bash
-# "--sq" is used to enable smooth quant
-python run_clm_no_trainer.py \
-    --model facebook/opt-125m \
-    --quantize \
-    --sq \
-    --alpha 0.5 \
-    --ipex \
-    --output_dir "saved_results"
-
-# "--approach weight_only" is used to enable weight only quantization.
 # "--woq_algo GPTQ" is used to enable GPTQ algorithms
 # "--double_quant_type BNB_NF4" is used to enable double quant algorithms
 python run_clm_no_trainer.py \
     --model facebook/opt-125m \
     --dataset NeelNanda/pile-10k \
     --quantize \
-    --approach weight_only \
     --woq_algo GPTQ \
     --woq_bits 4 \
     --woq_scheme asym \
@@ -106,7 +81,6 @@ python run_clm_no_trainer.py \
     --model facebook/opt-125m \
     --dataset NeelNanda/pile-10k \
     --quantize \
-    --approach weight_only \
     --woq_algo RTN \
     --woq_bits 4 \
     --woq_scheme asym \
@@ -121,23 +95,12 @@ python run_clm_no_trainer.py \
 #### Quantization
 
 ```bash
-# "--sq" is used to enable smooth quant
-python run_clm_no_trainer.py \
-    --model meta-llama/Llama-2-7b-hf \
-    --quantize \
-    --sq \
-    --alpha 0.8 \
-    --ipex \
-    --output_dir "saved_results"
-
-# "--approach weight_only" is used to enable weight only quantization.
 # "--double_quant_type BNB_NF4" is used to enable double quant algorithms
 # "--woq_algo GPTQ" is used to enable GPTQ algorithms
 python run_clm_no_trainer.py \
     --model meta-llama/Llama-2-7b-hf \
     --dataset NeelNanda/pile-10k \
     --quantize \
-    --approach weight_only \
     --woq_algo GPTQ \
     --woq_bits 4 \
     --woq_scheme asym \
@@ -153,7 +116,6 @@ python run_clm_no_trainer.py \
     --model meta-llama/Llama-2-7b-hf \
     --dataset NeelNanda/pile-10k \
     --quantize \
-    --approach weight_only \
     --woq_algo RTN \
     --woq_bits 4 \
     --woq_scheme asym \

diff --git a/...odeling/quantization/llm/requirements.txt → ...quantization/weight_only/requirements.txt b/...odeling/quantization/llm/requirements.txt → ...quantization/weight_only/requirements.txt