From 354791d92cfcce0d576b45dc067d5d0d30961b99 Mon Sep 17 00:00:00 2001
From: Yiyang Cai <49231152+YIYANGCAI@users.noreply.github.com>
Date: Fri, 23 Feb 2024 14:44:18 +0800
Subject: [PATCH] Add compressed-model export to the GPT-J MLPerf INT4 example (#1602)

Signed-off-by: YIYANGCAI <yiyang.cai@intel.com>
---
 .../quantization/llm/run_gptj_mlperf_int4.py         |  8 +++++---
 .../quantization/llm/run_gptj_mlperf_int4.sh         | 12 ++++++------
 2 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_gptj_mlperf_int4.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_gptj_mlperf_int4.py
index e2172f54a56..8217ce13c68 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_gptj_mlperf_int4.py
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_gptj_mlperf_int4.py
@@ -260,6 +260,7 @@ def forward(self, *inp, **kwargs):
     parser.add_argument('--use_max_length', action='store_true', 
         help='Only select data whose length equals or more than model.seqlen, please refer to GPTQ original implementation'
     )
+    parser.add_argument('--benchmark', action='store_true', help='Whether to do benchmark on CNN datasets.')
 
     # load the gptj model
     args = parser.parse_args()
@@ -324,12 +325,13 @@ def forward(self, *inp, **kwargs):
 
     q_model = quantization.fit(model, conf, calib_dataloader=dataloader,)
 
-    q_model.save("./gptj-gptq-gs128-calib128-calibration-fp16/")
+    # q_model.save("./gptj-gptq-gs128-calib128-calibration-fp16/")
     # q_model.float()
     # q_model.save("./gptj-gptq-gs128-calib128-calibration-fp32/")
+    compressed_model = q_model.export_compressed_model()
+    torch.save(compressed_model.state_dict(), "gptj_w3g128_compressed_model.pt")
     # benchmarking first 100 examples
-    # if args.benchmark:
-    if True:
+    if args.benchmark:
         # use half to accerlerate inference
         model.half()
         model = model.to(DEV)
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_gptj_mlperf_int4.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_gptj_mlperf_int4.sh
index 6ea2c6bdaa4..1f63e0661e7 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_gptj_mlperf_int4.sh
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_gptj_mlperf_int4.sh
@@ -2,15 +2,15 @@ CALIBRATION_DATA=/your/data/calibration-data/cnn_dailymail_calibration.json
 VALIDATION_DATA=/your/data/validation-data/cnn_dailymail_validation.json
 MODEL_DIR=/your/gptj/
 
-python -u examples/pytorch/nlp/huggingface_models/language-modeling/quantization/ptq_weight_only/run_gptj_mlperf_int4.py \
+python -u examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_gptj_mlperf_int4.py \
     --model_name_or_path ${MODEL_DIR} \
-    --wbits 4 \
+    --wbits 3 \
     --sym \
-    --group_size -1 \
-    --nsamples 128 \
+    --group_size 128 \
+    --nsamples 256 \
     --calib-data-path ${CALIBRATION_DATA} \
     --val-data-path ${VALIDATION_DATA} \
-    --calib-iters 128 \
+    --calib-iters 256 \
     --use_max_length \
     --pad_max_length 2048 \
-    --use_gpu
\ No newline at end of file
+    --use_gpu