Support save/load API for WOQ (#1786)
Signed-off-by: Kaihui-intel <[email protected]>
Signed-off-by: chensuyue <[email protected]>
Kaihui-intel authored May 17, 2024
1 parent ca9f8eb commit bacc164
Showing 19 changed files with 462 additions and 232 deletions.
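
For orientation, the end-to-end flow this commit enables (quantize with a weight-only config, save the result, then reload it without re-quantizing) looks roughly like the sketch below. It is a minimal illustration assembled from the changed example code, not an excerpt from the diff; the model name, RTN settings, and output directory are placeholders.

import os

from transformers import AutoModelForCausalLM
from neural_compressor.torch.quantization import RTNConfig, convert, load, prepare

# placeholder model; the changed examples use facebook/opt-125m among others
fp32_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")

# illustrative 4-bit weight-only RTN settings
quant_config = RTNConfig(bits=4, group_size=128)
q_model = prepare(model=fp32_model, quant_config=quant_config)
q_model = convert(q_model)

# new in this commit: save the weight-only quantized model ...
output_dir = "./saved_results"
q_model.save(output_dir)

# ... and reload it later, as run_clm_no_trainer.py now does for accuracy runs
q_model = load(os.path.abspath(os.path.expanduser(output_dir)))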
5 changes: 5 additions & 0 deletions .azure-pipelines/scripts/models/run_model_trigger_common.sh
@@ -88,6 +88,11 @@ elif [ "${mode}" == "tuning" ]; then
[[ ${output_model} ]] && tuning_cmd="${tuning_cmd} --output_model=${output_model}"

cd ${WORK_SOURCE_DIR}/${model_src_dir}
# for int4 models, append "--accuracy --int8" so accuracy evaluation runs after quantization
if [[ "${model}" == *"int4"* ]]; then
sed -i "s|--quantize|--quantize --accuracy --int8|g" run_quant.sh
fi

$BOLD_YELLOW && echo "workspace ${WORK_SOURCE_DIR}/${model_src_dir}" && $RESET
$BOLD_YELLOW && echo "tuning_cmd is === ${tuning_cmd}" && $RESET
$BOLD_YELLOW && echo "======== run tuning ========" && $RESET
@@ -0,0 +1,159 @@
#!/bin/bash
set -x

function main {

init_params "$@"
run_benchmark

}

# init params
function init_params {
iters=100
batch_size=16
approach=static
tuned_checkpoint=saved_results
task=lambada_openai
echo ${max_eval_samples}
for var in "$@"
do
case $var in
--topology=*)
topology=$(echo $var |cut -f2 -d=)
;;
--dataset_location=*)
dataset_location=$(echo $var |cut -f2 -d=)
;;
--input_model=*)
input_model=$(echo $var |cut -f2 -d=)
;;
--mode=*)
mode=$(echo $var |cut -f2 -d=)
;;
--batch_size=*)
batch_size=$(echo $var |cut -f2 -d=)
;;
--iters=*)
iters=$(echo ${var} |cut -f2 -d=)
;;
--int8=*)
int8=$(echo ${var} |cut -f2 -d=)
;;
--config=*)
tuned_checkpoint=$(echo $var |cut -f2 -d=)
;;
*)
echo "Error: No such parameter: ${var}"
exit 1
;;
esac
done

}


# run_benchmark
function run_benchmark {
extra_cmd=''

if [[ ${mode} == "accuracy" ]]; then
mode_cmd=" --accuracy "
elif [[ ${mode} == "performance" ]]; then
mode_cmd=" --performance --iters "${iters}
else
echo "Error: No such mode: ${mode}"
exit 1
fi

if [[ ${int8} == "true" ]]; then
extra_cmd=$extra_cmd" --int8"
fi
echo $extra_cmd

if [ "${topology}" = "opt_125m_woq_gptq_int4" ]; then
model_name_or_path="facebook/opt-125m"
approach="weight_only"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
elif [ "${topology}" = "opt_125m_woq_gptq_int4_dq_bnb" ]; then
model_name_or_path="facebook/opt-125m"
approach="weight_only"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
extra_cmd=$extra_cmd" --double_quant_type BNB_NF4"
elif [ "${topology}" = "opt_125m_woq_gptq_int4_dq_ggml" ]; then
model_name_or_path="facebook/opt-125m"
approach="weight_only"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length --gptq_percdamp 0.1 --gptq_actorder"
extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K"
elif [ "${topology}" = "opt_125m_ipex" ]; then
model_name_or_path="facebook/opt-125m"
extra_cmd=$extra_cmd" --ipex"
elif [ "${topology}" = "opt_125m_ipex_sq" ]; then
model_name_or_path="facebook/opt-125m"
extra_cmd=$extra_cmd" --ipex --sq --alpha 0.5"
elif [ "${topology}" = "llama2_7b_gptq_int4" ]; then
model_name_or_path="meta-llama/Llama-2-7b-hf"
approach="weight_only"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
elif [ "${topology}" = "llama2_7b_gptq_int4_dq_bnb" ]; then
model_name_or_path="meta-llama/Llama-2-7b-hf"
approach="weight_only"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
extra_cmd=$extra_cmd" --double_quant_type BNB_NF4"
elif [ "${topology}" = "llama2_7b_gptq_int4_dq_ggml" ]; then
model_name_or_path="meta-llama/Llama-2-7b-hf"
approach="weight_only"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K"
elif [ "${topology}" = "llama2_7b_ipex" ]; then
model_name_or_path="meta-llama/Llama-2-7b-hf"
extra_cmd=$extra_cmd" --ipex"
elif [ "${topology}" = "llama2_7b_ipex_sq" ]; then
model_name_or_path="meta-llama/Llama-2-7b-hf"
extra_cmd=$extra_cmd" --ipex --sq --alpha 0.8"
elif [ "${topology}" = "gpt_j_woq_rtn_int4" ]; then
model_name_or_path="EleutherAI/gpt-j-6b"
approach="weight_only"
extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search"
elif [ "${topology}" = "gpt_j_woq_rtn_int4_dq_bnb" ]; then
model_name_or_path="EleutherAI/gpt-j-6b"
approach="weight_only"
extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search"
extra_cmd=$extra_cmd" --double_quant_type BNB_NF4"
elif [ "${topology}" = "gpt_j_woq_rtn_int4_dq_ggml" ]; then
model_name_or_path="EleutherAI/gpt-j-6b"
approach="weight_only"
extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search"
extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K"
elif [ "${topology}" = "gpt_j_woq_gptq_int4" ]; then
model_name_or_path="EleutherAI/gpt-j-6b"
approach="weight_only"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
elif [ "${topology}" = "gpt_j_woq_gptq_int4_dq_bnb" ]; then
model_name_or_path="EleutherAI/gpt-j-6b"
approach="weight_only"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
extra_cmd=$extra_cmd" --double_quant_type BNB_NF4"
elif [ "${topology}" = "gpt_j_woq_gptq_int4_dq_ggml" ]; then
model_name_or_path="EleutherAI/gpt-j-6b"
approach="weight_only"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K"
elif [ "${topology}" = "gpt_j_ipex" ]; then
model_name_or_path="EleutherAI/gpt-j-6b"
extra_cmd=$extra_cmd" --ipex"
elif [ "${topology}" = "gpt_j_ipex_sq" ]; then
model_name_or_path="EleutherAI/gpt-j-6b"
extra_cmd=$extra_cmd" --ipex --sq --alpha 1.0"
fi

python -u run_clm_no_trainer.py \
--model ${model_name_or_path} \
--approach ${approach} \
--output_dir ${tuned_checkpoint} \
--task ${task} \
--batch_size ${batch_size} \
${extra_cmd} ${mode_cmd}
}

main "$@"
@@ -11,6 +11,7 @@
import datasets
from torch.nn.functional import pad
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM, AutoModel, AutoTokenizer

parser = argparse.ArgumentParser()
parser.add_argument(
@@ -66,7 +67,6 @@
parser.add_argument("--woq_scheme", default="sym")
parser.add_argument("--woq_use_mse_search", action="store_true")
parser.add_argument("--woq_use_full_range", action="store_true")
parser.add_argument("--woq_export_compressed_model", action="store_true")
# =============GPTQ configs====================
parser.add_argument("--gptq_actorder", action="store_true",
help="Whether to apply the activation order GPTQ heuristic.")
@@ -192,7 +192,6 @@ def evaluate(self, model):


def get_user_model():
from transformers import AutoModelForCausalLM, AutoModel, AutoTokenizer
torchscript = False
if args.sq or args.ipex or args.woq_algo in ['AWQ', 'TEQ']:
torchscript = True
@@ -248,7 +247,6 @@ def get_user_model():
# TODO: add group_dim into double quant config?
"use_full_range": args.woq_use_full_range,
"use_mse_search": args.woq_use_mse_search,
"export_compressed_model": args.woq_export_compressed_model,
}
)
quant_config = RTNConfig.from_dict(double_quant_config_dict)
@@ -261,7 +259,6 @@
group_dim=args.woq_group_dim,
use_full_range=args.woq_use_full_range,
use_mse_search=args.woq_use_mse_search,
export_compressed_model=args.woq_export_compressed_model,
use_double_quant=False,
double_quant_bits=args.double_quant_bits,
double_quant_dtype=args.double_quant_dtype,
@@ -298,7 +295,6 @@ def run_fn_for_gptq(model, dataloader_for_calibration, *args):
double_quant_config_dict.update(
{
"use_mse_search": args.woq_use_mse_search,
"export_compressed_model": args.woq_export_compressed_model,
"percdamp": args.gptq_percdamp,
"act_order": args.gptq_actorder,
"block_size": args.gptq_block_size,
@@ -313,7 +309,6 @@ def run_fn_for_gptq(model, dataloader_for_calibration, *args):
use_sym=weight_sym,
group_size=args.woq_group_size,
use_mse_search=args.woq_use_mse_search,
export_compressed_model=args.woq_export_compressed_model,
percdamp=args.gptq_percdamp,
act_order=args.gptq_actorder,
block_size=args.gptq_block_size,
@@ -380,24 +375,19 @@ def run_fn(model):
user_model = prepare(model=user_model, quant_config=quant_config, example_inputs=example_inputs)
run_fn(user_model)
user_model = convert(user_model)
user_model.save(args.output_dir)
user_model.save(args.output_dir)


# TODO: we need run_benchmark.sh for loading and remove --accuracy in run_quant.sh, currently run_quant.sh will get fp32 result
# if args.int8 or args.int8_bf16_mixed:
# print("load int8 model")

# # TODO: from neural_compressor.torch.quantization import load
# from neural_compressor.torch.algorithms.static_quant import load

# if args.ipex:
# user_model = load(os.path.abspath(os.path.expanduser(args.output_dir)))
# else:
# # TODO: WOQ save&load
# print("Int8 model loading does not support WeightOnlyQuant now.")
# pass
# else:
user_model, _ = get_user_model()

if args.int8 or args.int8_bf16_mixed:
print("load int8 model")

from neural_compressor.torch.quantization import load
tokenizer = AutoTokenizer.from_pretrained(args.model)
user_model = load(os.path.abspath(os.path.expanduser(args.output_dir)))
else:
user_model, tokenizer = get_user_model()


if args.accuracy:
@@ -122,7 +122,6 @@ function run_tuning {
--model ${model_name_or_path} \
--dataset ${DATASET_NAME} \
--quantize \
--accuracy \
--approach ${approach} \
--output_dir ${tuned_checkpoint} \
--tasks "lambada_openai" \
2 changes: 1 addition & 1 deletion neural_compressor/torch/algorithms/weight_only/gptq.py
@@ -1146,7 +1146,7 @@ def prepare(
max_seq_length=2048,
use_max_length=True,
device=None,
export_compressed_model=False,
export_compressed_model=True,
use_layer_wise=False,
model_path=None,
*args,
2 changes: 1 addition & 1 deletion neural_compressor/torch/algorithms/weight_only/modules.py
@@ -194,7 +194,7 @@ def pack(self, int_weight, scale, zp, bias, g_idx=None):
invperm = torch.argsort(self.g_idx)
self.g_idx = invperm // self.group_size
self.g_idx = self.g_idx.type(torch.int32).to(self.device)
assert scale.shape == self.scales.shape, "Scale shape is mismatched."
assert scale.shape == self.scales.shape, f"{scale.shape} != {self.scales.shape} Scale shape is mismatched."
self.scales = scale.type(self.float_type).to(self.device)
if not self.use_optimum_format and self.compression_dim == 0:
int_weight = int_weight.t_().contiguous()
3 changes: 1 addition & 2 deletions neural_compressor/torch/algorithms/weight_only/rtn.py
@@ -59,7 +59,7 @@ def convert(
group_size=32,
group_dim=1,
quantile=1.0,
export_compressed_model=False,
export_compressed_model=True,
use_full_range=False,
use_mse_search=False,
*args,
@@ -128,7 +128,6 @@ def convert(
use_full_range = weight_config[name]["use_full_range"]
use_mse_search = weight_config[name]["use_mse_search"]
use_layer_wise = weight_config[name]["use_layer_wise"]
export_compressed_model = weight_config[name]["export_compressed_model"]
if export_compressed_model:
use_optimum_format = kwargs.get("use_optimum_format", True)
# double quant config
51 changes: 51 additions & 0 deletions neural_compressor/torch/algorithms/weight_only/save_load.py
@@ -0,0 +1,51 @@
# Copyright (c) 2024 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# pylint:disable=import-error

import json
import os

import torch

from neural_compressor.common.utils import load_config_mapping, save_config_mapping
from neural_compressor.torch.utils import QCONFIG_NAME, WEIGHT_NAME, logger


def save(model, output_dir="./saved_results"):
if not os.path.exists(output_dir):
os.mkdir(output_dir)
qmodel_file_path = os.path.join(os.path.abspath(os.path.expanduser(output_dir)), WEIGHT_NAME)
qconfig_file_path = os.path.join(os.path.abspath(os.path.expanduser(output_dir)), QCONFIG_NAME)
# saving process
save_config_mapping(model.qconfig, qconfig_file_path)

if hasattr(model, "gptq_config") and model.gptq_config:
gptq_config_path = os.path.join(os.path.abspath(os.path.expanduser(output_dir)), "gptq_config.json")
with open(gptq_config_path, "w") as f:
json.dump(model.gptq_config, f, indent=4)

# drop the bound 'save' method (a MethodType) so it is not serialized together with the model
del model.save
torch.save(model, qmodel_file_path)

logger.info("Save quantized model to {}.".format(qmodel_file_path))
logger.info("Save configuration of quantized model to {}.".format(qconfig_file_path))


def load(output_dir="./saved_results"):
qmodel_file_path = os.path.join(os.path.abspath(os.path.expanduser(output_dir)), WEIGHT_NAME)
model = torch.load(qmodel_file_path)
logger.info("Quantized model loading successful.")
return model
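
As the MethodType comment and the `del model.save` above imply, the quantization flow attaches `save` to the converted model as a bound method (which is why the changed example can call `user_model.save(args.output_dir)`), and that helper is removed again before the whole model object is pickled. A toy, self-contained sketch of the same attach-then-detach pattern, with hypothetical names and paths, is:

import torch
from types import MethodType

def toy_save(model, path):
    # mirror the save() above: drop the attached helper, then pickle the model
    del model.save
    torch.save(model, path)

model = torch.nn.Linear(4, 4)
model.save = MethodType(toy_save, model)  # expose model.save(path) to callers
model.save("toy_model.pt")

# mirror the load() above (newer torch versions may need weights_only=False)
restored = torch.load("toy_model.pt")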
3 changes: 2 additions & 1 deletion neural_compressor/torch/algorithms/weight_only/utility.py
@@ -356,7 +356,8 @@ def quant_tensor(
scale_bits = kwargs.get("double_quant_bits", 8)
scale_scheme = kwargs.get("double_quant_scheme", "asym")
scale_group_size = kwargs.get("double_quant_group_size", 256)
scale_return_int = kwargs.get("double_quant_return_int", return_int)
# TODO: kwargs.get("double_quant_return_int", return_int)
scale_return_int = kwargs.get("double_quant_return_int", False)
orig_scale_shape = scale.shape
scale = scale.reshape(1, -1)
# pre-process: scale_mean
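
For readers unfamiliar with the double-quant parameters read here, double quantization applies a second, coarser quantization step to the per-group scales produced by the first pass. A toy, library-independent illustration (values are made up):

import torch

# per-group scales from a first 4-bit weight quantization pass (made-up values)
scales = torch.tensor([0.012, 0.015, 0.011, 0.020])

# "double quant": quantize the scales themselves, here with 8-bit symmetric rounding
scale_of_scales = scales.abs().max() / 127
int_scales = torch.clamp(torch.round(scales / scale_of_scales), -128, 127)

# at dequantization time the effective scales are reconstructed from both levels
restored_scales = int_scales * scale_of_scales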