From 354791d92cfcce0d576b45dc067d5d0d30961b99 Mon Sep 17 00:00:00 2001 From: Yiyang Cai <49231152+YIYANGCAI@users.noreply.github.com> Date: Fri, 23 Feb 2024 14:44:18 +0800 Subject: [PATCH] Add the export model process in mlperf codes (#1602) Signed-off-by: YIYANGCAI --- .../quantization/llm/run_gptj_mlperf_int4.py | 8 +++++--- .../quantization/llm/run_gptj_mlperf_int4.sh | 12 ++++++------ 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_gptj_mlperf_int4.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_gptj_mlperf_int4.py index e2172f54a56..8217ce13c68 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_gptj_mlperf_int4.py +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_gptj_mlperf_int4.py @@ -260,6 +260,7 @@ def forward(self, *inp, **kwargs): parser.add_argument('--use_max_length', action='store_true', help='Only select data whose length equals or more than model.seqlen, please refer to GPTQ original implementation' ) + parser.add_argument('--benchmark', action='store_true', help='Whether to do benchmark on CNN datasets.') # load the gptj model args = parser.parse_args() @@ -324,12 +325,13 @@ def forward(self, *inp, **kwargs): q_model = quantization.fit(model, conf, calib_dataloader=dataloader,) - q_model.save("./gptj-gptq-gs128-calib128-calibration-fp16/") + # q_model.save("./gptj-gptq-gs128-calib128-calibration-fp16/") # q_model.float() # q_model.save("./gptj-gptq-gs128-calib128-calibration-fp32/") + compressed_model = q_model.export_compressed_model() + torch.save(compressed_model.state_dict(), "gptj_w3g128_compressed_model.pt") # benchmarking first 100 examples - # if args.benchmark: - if True: + if args.benchmark: # use half to accerlerate inference model.half() model = model.to(DEV) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_gptj_mlperf_int4.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_gptj_mlperf_int4.sh index 6ea2c6bdaa4..1f63e0661e7 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_gptj_mlperf_int4.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_gptj_mlperf_int4.sh @@ -2,15 +2,15 @@ CALIBRATION_DATA=/your/data/calibration-data/cnn_dailymail_calibration.json VALIDATION_DATA=/your/data/validation-data/cnn_dailymail_validation.json MODEL_DIR=/your/gptj/ -python -u examples/pytorch/nlp/huggingface_models/language-modeling/quantization/ptq_weight_only/run_gptj_mlperf_int4.py \ +python -u examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_gptj_mlperf_int4.py \ --model_name_or_path ${MODEL_DIR} \ - --wbits 4 \ + --wbits 3 \ --sym \ - --group_size -1 \ - --nsamples 128 \ + --group_size 128 \ + --nsamples 256 \ --calib-data-path ${CALIBRATION_DATA} \ --val-data-path ${VALIDATION_DATA} \ - --calib-iters 128 \ + --calib-iters 256 \ --use_max_length \ --pad_max_length 2048 \ - --use_gpu \ No newline at end of file + --use_gpu