Support save/load API for WOQ (#1786)
Signed-off-by: Kaihui-intel <[email protected]>
Signed-off-by: chensuyue <[email protected]>
1 parent ca9f8eb · commit bacc164
Showing 19 changed files with 462 additions and 232 deletions.
....x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_benchmark.sh (159 additions, 0 deletions)
@@ -0,0 +1,159 @@
#!/bin/bash
set -x
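
# Example invocation (values are illustrative; init_params below lists every
# accepted flag, and run_benchmark lists the supported --topology names):
#   bash run_benchmark.sh --topology=opt_125m_woq_gptq_int4 --mode=performance \
#       --batch_size=16 --iters=100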

function main {

    init_params "$@"
    run_benchmark

}

# init params
function init_params {
    iters=100
    batch_size=16
    approach=static
    tuned_checkpoint=saved_results
    task=lambada_openai
    echo ${max_eval_samples}
    for var in "$@"
    do
        case $var in
            --topology=*)
                topology=$(echo $var | cut -f2 -d=)
                ;;
            --dataset_location=*)
                dataset_location=$(echo $var | cut -f2 -d=)
                ;;
            --input_model=*)
                input_model=$(echo $var | cut -f2 -d=)
                ;;
            --mode=*)
                mode=$(echo $var | cut -f2 -d=)
                ;;
            --batch_size=*)
                batch_size=$(echo $var | cut -f2 -d=)
                ;;
            --iters=*)
                iters=$(echo ${var} | cut -f2 -d=)
                ;;
            --int8=*)
                int8=$(echo ${var} | cut -f2 -d=)
                ;;
            --config=*)
                tuned_checkpoint=$(echo $var | cut -f2 -d=)
                ;;
            *)
                echo "Error: No such parameter: ${var}"
                exit 1
                ;;
        esac
    done

}


# run_benchmark
function run_benchmark {
    extra_cmd=''

    if [[ ${mode} == "accuracy" ]]; then
        mode_cmd=" --accuracy "
    elif [[ ${mode} == "performance" ]]; then
        mode_cmd=" --performance --iters "${iters}
    else
        echo "Error: No such mode: ${mode}"
        exit 1
    fi

    if [[ ${int8} == "true" ]]; then
        extra_cmd=$extra_cmd" --int8"
    fi
    echo $extra_cmd

    if [ "${topology}" = "opt_125m_woq_gptq_int4" ]; then
        model_name_or_path="facebook/opt-125m"
        approach="weight_only"
        extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
    elif [ "${topology}" = "opt_125m_woq_gptq_int4_dq_bnb" ]; then
        model_name_or_path="facebook/opt-125m"
        approach="weight_only"
        extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
        extra_cmd=$extra_cmd" --double_quant_type BNB_NF4"
    elif [ "${topology}" = "opt_125m_woq_gptq_int4_dq_ggml" ]; then
        model_name_or_path="facebook/opt-125m"
        approach="weight_only"
        extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length --gptq_percdamp 0.1 --gptq_actorder"
        extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K"
    elif [ "${topology}" = "opt_125m_ipex" ]; then
        model_name_or_path="facebook/opt-125m"
        extra_cmd=$extra_cmd" --ipex"
    elif [ "${topology}" = "opt_125m_ipex_sq" ]; then
        model_name_or_path="facebook/opt-125m"
        extra_cmd=$extra_cmd" --ipex --sq --alpha 0.5"
    elif [ "${topology}" = "llama2_7b_gptq_int4" ]; then
        model_name_or_path="meta-llama/Llama-2-7b-hf"
        approach="weight_only"
        extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
    elif [ "${topology}" = "llama2_7b_gptq_int4_dq_bnb" ]; then
        model_name_or_path="meta-llama/Llama-2-7b-hf"
        approach="weight_only"
        extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
        extra_cmd=$extra_cmd" --double_quant_type BNB_NF4"
    elif [ "${topology}" = "llama2_7b_gptq_int4_dq_ggml" ]; then
        model_name_or_path="meta-llama/Llama-2-7b-hf"
        approach="weight_only"
        extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
        extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K"
    elif [ "${topology}" = "llama2_7b_ipex" ]; then
        model_name_or_path="meta-llama/Llama-2-7b-hf"
        extra_cmd=$extra_cmd" --ipex"
    elif [ "${topology}" = "llama2_7b_ipex_sq" ]; then
        model_name_or_path="meta-llama/Llama-2-7b-hf"
        extra_cmd=$extra_cmd" --ipex --sq --alpha 0.8"
    elif [ "${topology}" = "gpt_j_woq_rtn_int4" ]; then
        model_name_or_path="EleutherAI/gpt-j-6b"
        approach="weight_only"
        extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search"
    elif [ "${topology}" = "gpt_j_woq_rtn_int4_dq_bnb" ]; then
        model_name_or_path="EleutherAI/gpt-j-6b"
        approach="weight_only"
        extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search"
        extra_cmd=$extra_cmd" --double_quant_type BNB_NF4"
    elif [ "${topology}" = "gpt_j_woq_rtn_int4_dq_ggml" ]; then
        model_name_or_path="EleutherAI/gpt-j-6b"
        approach="weight_only"
        extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search"
        extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K"
    elif [ "${topology}" = "gpt_j_woq_gptq_int4" ]; then
        model_name_or_path="EleutherAI/gpt-j-6b"
        approach="weight_only"
        extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
    elif [ "${topology}" = "gpt_j_woq_gptq_int4_dq_bnb" ]; then
        model_name_or_path="EleutherAI/gpt-j-6b"
        approach="weight_only"
        extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
        extra_cmd=$extra_cmd" --double_quant_type BNB_NF4"
    elif [ "${topology}" = "gpt_j_woq_gptq_int4_dq_ggml" ]; then
        model_name_or_path="EleutherAI/gpt-j-6b"
        approach="weight_only"
        extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
        extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K"
    elif [ "${topology}" = "gpt_j_ipex" ]; then
        model_name_or_path="EleutherAI/gpt-j-6b"
        extra_cmd=$extra_cmd" --ipex"
    elif [ "${topology}" = "gpt_j_ipex_sq" ]; then
        model_name_or_path="EleutherAI/gpt-j-6b"
        extra_cmd=$extra_cmd" --ipex --sq --alpha 1.0"
    fi

    python -u run_clm_no_trainer.py \
        --model ${model_name_or_path} \
        --approach ${approach} \
        --output_dir ${tuned_checkpoint} \
        --task ${task} \
        --batch_size ${batch_size} \
        ${extra_cmd} ${mode_cmd}
}

main "$@"
neural_compressor/torch/algorithms/weight_only/save_load.py (51 additions, 0 deletions)
@@ -0,0 +1,51 @@
# Copyright (c) 2024 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# pylint:disable=import-error

import json
import os

import torch

from neural_compressor.common.utils import load_config_mapping, save_config_mapping
from neural_compressor.torch.utils import QCONFIG_NAME, WEIGHT_NAME, logger


def save(model, output_dir="./saved_results"):
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    qmodel_file_path = os.path.join(os.path.abspath(os.path.expanduser(output_dir)), WEIGHT_NAME)
    qconfig_file_path = os.path.join(os.path.abspath(os.path.expanduser(output_dir)), QCONFIG_NAME)
    # saving process
    save_config_mapping(model.qconfig, qconfig_file_path)

    if hasattr(model, "gptq_config") and model.gptq_config:
        gptq_config_path = os.path.join(os.path.abspath(os.path.expanduser(output_dir)), "gptq_config.json")
        with open(gptq_config_path, "w") as f:
            json.dump(model.gptq_config, f, indent=4)

    # 'save' is attached to the model as a bound MethodType and cannot be
    # pickled along with it, so drop it before serialization.
    del model.save
    torch.save(model, qmodel_file_path)

    logger.info("Save quantized model to {}.".format(qmodel_file_path))
    logger.info("Save configuration of quantized model to {}.".format(qconfig_file_path))


def load(output_dir="./saved_results"):
    qmodel_file_path = os.path.join(os.path.abspath(os.path.expanduser(output_dir)), WEIGHT_NAME)
    model = torch.load(qmodel_file_path)
    logger.info("Quantized model loading successful.")
    return model
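
For orientation, a minimal round-trip sketch of the new API follows. It assumes the quantization flow attaches `save` to the quantized model as a bound method (the `del model.save` above implies this); the plain `torch.nn.Linear` and empty `qconfig` are hypothetical stand-ins for a real weight-only quantized model, and the sketch assumes `save_config_mapping` accepts an empty mapping. Note also that `load` relies on full-module pickling, the default `torch.load` behavior in the torch releases current when this commit landed.

import types

import torch

from neural_compressor.torch.algorithms.weight_only.save_load import load, save

# Hypothetical stand-in: a real WOQ model comes out of the quantizer with a
# populated qconfig mapping, not an empty dict.
model = torch.nn.Linear(8, 8)
model.qconfig = {}
model.save = types.MethodType(save, model)  # assumed binding; mirrors `del model.save`

model.save(output_dir="./saved_results")  # pickles the whole module and writes the qconfig JSON
restored = load("./saved_results")        # plain torch.load of the pickled module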