Support save/load API for WOQ (#1786)
Signed-off-by: Kaihui-intel <[email protected]>
Signed-off-by: chensuyue <[email protected]>
Kaihui-intel authored May 17, 2024
1 parent ca9f8eb commit bacc164
Showing 19 changed files with 462 additions and 232 deletions.
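
For orientation, the end-to-end flow this commit enables (quantize with a weight-only config, save the result, then reload it without re-quantizing) looks roughly like the sketch below. It is a minimal illustration assembled from the changed example code, not an excerpt from the diff; the model name, RTN settings, and output directory are placeholders.

import os

from transformers import AutoModelForCausalLM
from neural_compressor.torch.quantization import RTNConfig, convert, load, prepare

# placeholder model; the changed examples use facebook/opt-125m among others
fp32_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")

# illustrative 4-bit weight-only RTN settings
quant_config = RTNConfig(bits=4, group_size=128)
q_model = prepare(model=fp32_model, quant_config=quant_config)
q_model = convert(q_model)

# new in this commit: save the weight-only quantized model ...
output_dir = "./saved_results"
q_model.save(output_dir)

# ... and reload it later, as run_clm_no_trainer.py now does for accuracy runs
q_model = load(os.path.abspath(os.path.expanduser(output_dir)))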
5 changes: 5 additions & 0 deletions .azure-pipelines/scripts/models/run_model_trigger_common.sh
@@ -88,6 +88,11 @@ elif [ "${mode}" == "tuning" ]; then
[[ ${output_model} ]] && tuning_cmd="${tuning_cmd} --output_model=${output_model}"

cd ${WORK_SOURCE_DIR}/${model_src_dir}
# for int4 models, append "--accuracy --int8" so accuracy evaluation runs after quantization
if [[ "${model}" == *"int4"* ]]; then
sed -i "s|--quantize|--quantize --accuracy --int8|g" run_quant.sh
fi

$BOLD_YELLOW && echo "workspace ${WORK_SOURCE_DIR}/${model_src_dir}" && $RESET
$BOLD_YELLOW && echo "tuning_cmd is === ${tuning_cmd}" && $RESET
$BOLD_YELLOW && echo "======== run tuning ========" && $RESET
@@ -0,0 +1,159 @@
#!/bin/bash
set -x

function main {

init_params "$@"
run_benchmark

}

# init params
function init_params {
iters=100
batch_size=16
approach=static
tuned_checkpoint=saved_results
task=lambada_openai
echo ${max_eval_samples}
for var in "$@"
do
case $var in
--topology=*)
topology=$(echo $var |cut -f2 -d=)
;;
--dataset_location=*)
dataset_location=$(echo $var |cut -f2 -d=)
;;
--input_model=*)
input_model=$(echo $var |cut -f2 -d=)
;;
--mode=*)
mode=$(echo $var |cut -f2 -d=)
;;
--batch_size=*)
batch_size=$(echo $var |cut -f2 -d=)
;;
--iters=*)
iters=$(echo ${var} |cut -f2 -d=)
;;
--int8=*)
int8=$(echo ${var} |cut -f2 -d=)
;;
--config=*)
tuned_checkpoint=$(echo $var |cut -f2 -d=)
;;
*)
echo "Error: No such parameter: ${var}"
exit 1
;;
esac
done

}


# run_benchmark
function run_benchmark {
extra_cmd=''

if [[ ${mode} == "accuracy" ]]; then
mode_cmd=" --accuracy "
elif [[ ${mode} == "performance" ]]; then
mode_cmd=" --performance --iters "${iters}
else
echo "Error: No such mode: ${mode}"
exit 1
fi

if [[ ${int8} == "true" ]]; then
extra_cmd=$extra_cmd" --int8"
fi
echo $extra_cmd

if [ "${topology}" = "opt_125m_woq_gptq_int4" ]; then
model_name_or_path="facebook/opt-125m"
approach="weight_only"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
elif [ "${topology}" = "opt_125m_woq_gptq_int4_dq_bnb" ]; then
model_name_or_path="facebook/opt-125m"
approach="weight_only"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
extra_cmd=$extra_cmd" --double_quant_type BNB_NF4"
elif [ "${topology}" = "opt_125m_woq_gptq_int4_dq_ggml" ]; then
model_name_or_path="facebook/opt-125m"
approach="weight_only"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length --gptq_percdamp 0.1 --gptq_actorder"
extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K"
elif [ "${topology}" = "opt_125m_ipex" ]; then
model_name_or_path="facebook/opt-125m"
extra_cmd=$extra_cmd" --ipex"
elif [ "${topology}" = "opt_125m_ipex_sq" ]; then
model_name_or_path="facebook/opt-125m"
extra_cmd=$extra_cmd" --ipex --sq --alpha 0.5"
elif [ "${topology}" = "llama2_7b_gptq_int4" ]; then
model_name_or_path="meta-llama/Llama-2-7b-hf"
approach="weight_only"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
elif [ "${topology}" = "llama2_7b_gptq_int4_dq_bnb" ]; then
model_name_or_path="meta-llama/Llama-2-7b-hf"
approach="weight_only"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
extra_cmd=$extra_cmd" --double_quant_type BNB_NF4"
elif [ "${topology}" = "llama2_7b_gptq_int4_dq_ggml" ]; then
model_name_or_path="meta-llama/Llama-2-7b-hf"
approach="weight_only"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K"
elif [ "${topology}" = "llama2_7b_ipex" ]; then
model_name_or_path="meta-llama/Llama-2-7b-hf"
extra_cmd=$extra_cmd" --ipex"
elif [ "${topology}" = "llama2_7b_ipex_sq" ]; then
model_name_or_path="meta-llama/Llama-2-7b-hf"
extra_cmd=$extra_cmd" --ipex --sq --alpha 0.8"
elif [ "${topology}" = "gpt_j_woq_rtn_int4" ]; then
model_name_or_path="EleutherAI/gpt-j-6b"
approach="weight_only"
extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search"
elif [ "${topology}" = "gpt_j_woq_rtn_int4_dq_bnb" ]; then
model_name_or_path="EleutherAI/gpt-j-6b"
approach="weight_only"
extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search"
extra_cmd=$extra_cmd" --double_quant_type BNB_NF4"
elif [ "${topology}" = "gpt_j_woq_rtn_int4_dq_ggml" ]; then
model_name_or_path="EleutherAI/gpt-j-6b"
approach="weight_only"
extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search"
extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K"
elif [ "${topology}" = "gpt_j_woq_gptq_int4" ]; then
model_name_or_path="EleutherAI/gpt-j-6b"
approach="weight_only"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
elif [ "${topology}" = "gpt_j_woq_gptq_int4_dq_bnb" ]; then
model_name_or_path="EleutherAI/gpt-j-6b"
approach="weight_only"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
extra_cmd=$extra_cmd" --double_quant_type BNB_NF4"
elif [ "${topology}" = "gpt_j_woq_gptq_int4_dq_ggml" ]; then
model_name_or_path="EleutherAI/gpt-j-6b"
approach="weight_only"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K"
elif [ "${topology}" = "gpt_j_ipex" ]; then
model_name_or_path="EleutherAI/gpt-j-6b"
extra_cmd=$extra_cmd" --ipex"
elif [ "${topology}" = "gpt_j_ipex_sq" ]; then
model_name_or_path="EleutherAI/gpt-j-6b"
extra_cmd=$extra_cmd" --ipex --sq --alpha 1.0"
fi

python -u run_clm_no_trainer.py \
--model ${model_name_or_path} \
--approach ${approach} \
--output_dir ${tuned_checkpoint} \
--task ${task} \
--batch_size ${batch_size} \
${extra_cmd} ${mode_cmd}
}

main "$@"
@@ -11,6 +11,7 @@
import datasets
from torch.nn.functional import pad
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM, AutoModel, AutoTokenizer

parser = argparse.ArgumentParser()
parser.add_argument(
@@ -66,7 +67,6 @@
parser.add_argument("--woq_scheme", default="sym")
parser.add_argument("--woq_use_mse_search", action="store_true")
parser.add_argument("--woq_use_full_range", action="store_true")
parser.add_argument("--woq_export_compressed_model", action="store_true")
# =============GPTQ configs====================
parser.add_argument("--gptq_actorder", action="store_true",
help="Whether to apply the activation order GPTQ heuristic.")
@@ -192,7 +192,6 @@ def evaluate(self, model):


def get_user_model():
from transformers import AutoModelForCausalLM, AutoModel, AutoTokenizer
torchscript = False
if args.sq or args.ipex or args.woq_algo in ['AWQ', 'TEQ']:
torchscript = True
@@ -248,7 +247,6 @@ def get_user_model():
# TODO: add group_dim into double quant config?
"use_full_range": args.woq_use_full_range,
"use_mse_search": args.woq_use_mse_search,
"export_compressed_model": args.woq_export_compressed_model,
}
)
quant_config = RTNConfig.from_dict(double_quant_config_dict)
@@ -261,7 +259,6 @@
group_dim=args.woq_group_dim,
use_full_range=args.woq_use_full_range,
use_mse_search=args.woq_use_mse_search,
export_compressed_model=args.woq_export_compressed_model,
use_double_quant=False,
double_quant_bits=args.double_quant_bits,
double_quant_dtype=args.double_quant_dtype,
@@ -298,7 +295,6 @@ def run_fn_for_gptq(model, dataloader_for_calibration, *args):
double_quant_config_dict.update(
{
"use_mse_search": args.woq_use_mse_search,
"export_compressed_model": args.woq_export_compressed_model,
"percdamp": args.gptq_percdamp,
"act_order": args.gptq_actorder,
"block_size": args.gptq_block_size,
@@ -313,7 +309,6 @@ def run_fn_for_gptq(model, dataloader_for_calibration, *args):
use_sym=weight_sym,
group_size=args.woq_group_size,
use_mse_search=args.woq_use_mse_search,
export_compressed_model=args.woq_export_compressed_model,
percdamp=args.gptq_percdamp,
act_order=args.gptq_actorder,
block_size=args.gptq_block_size,
@@ -380,24 +375,19 @@ def run_fn(model):
user_model = prepare(model=user_model, quant_config=quant_config, example_inputs=example_inputs)
run_fn(user_model)
user_model = convert(user_model)
user_model.save(args.output_dir)
user_model.save(args.output_dir)


# TODO: we need run_benchmark.sh for loading and remove --accuracy in run_quant.sh, currently run_quant.sh will get fp32 result
# if args.int8 or args.int8_bf16_mixed:
# print("load int8 model")

# # TODO: from neural_compressor.torch.quantization import load
# from neural_compressor.torch.algorithms.static_quant import load

# if args.ipex:
# user_model = load(os.path.abspath(os.path.expanduser(args.output_dir)))
# else:
# # TODO: WOQ save&load
# print("Int8 model loading does not support WeightOnlyQuant now.")
# pass
# else:
user_model, _ = get_user_model()

if args.int8 or args.int8_bf16_mixed:
print("load int8 model")

from neural_compressor.torch.quantization import load
tokenizer = AutoTokenizer.from_pretrained(args.model)
user_model = load(os.path.abspath(os.path.expanduser(args.output_dir)))
else:
user_model, tokenizer = get_user_model()


if args.accuracy:
@@ -122,7 +122,6 @@ function run_tuning {
--model ${model_name_or_path} \
--dataset ${DATASET_NAME} \
--quantize \
--accuracy \
--approach ${approach} \
--output_dir ${tuned_checkpoint} \
--tasks "lambada_openai" \
2 changes: 1 addition & 1 deletion neural_compressor/torch/algorithms/weight_only/gptq.py
@@ -1146,7 +1146,7 @@ def prepare(
max_seq_length=2048,
use_max_length=True,
device=None,
export_compressed_model=False,
export_compressed_model=True,
use_layer_wise=False,
model_path=None,
*args,
2 changes: 1 addition & 1 deletion neural_compressor/torch/algorithms/weight_only/modules.py
@@ -194,7 +194,7 @@ def pack(self, int_weight, scale, zp, bias, g_idx=None):
invperm = torch.argsort(self.g_idx)
self.g_idx = invperm // self.group_size
self.g_idx = self.g_idx.type(torch.int32).to(self.device)
assert scale.shape == self.scales.shape, "Scale shape is mismatched."
assert scale.shape == self.scales.shape, f"{scale.shape} != {self.scales.shape} Scale shape is mismatched."
self.scales = scale.type(self.float_type).to(self.device)
if not self.use_optimum_format and self.compression_dim == 0:
int_weight = int_weight.t_().contiguous()
3 changes: 1 addition & 2 deletions neural_compressor/torch/algorithms/weight_only/rtn.py
@@ -59,7 +59,7 @@ def convert(
group_size=32,
group_dim=1,
quantile=1.0,
export_compressed_model=False,
export_compressed_model=True,
use_full_range=False,
use_mse_search=False,
*args,
@@ -128,7 +128,6 @@ def convert(
use_full_range = weight_config[name]["use_full_range"]
use_mse_search = weight_config[name]["use_mse_search"]
use_layer_wise = weight_config[name]["use_layer_wise"]
export_compressed_model = weight_config[name]["export_compressed_model"]
if export_compressed_model:
use_optimum_format = kwargs.get("use_optimum_format", True)
# double quant config
51 changes: 51 additions & 0 deletions neural_compressor/torch/algorithms/weight_only/save_load.py
@@ -0,0 +1,51 @@
# Copyright (c) 2024 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# pylint:disable=import-error

import json
import os

import torch

from neural_compressor.common.utils import load_config_mapping, save_config_mapping
from neural_compressor.torch.utils import QCONFIG_NAME, WEIGHT_NAME, logger


def save(model, output_dir="./saved_results"):
if not os.path.exists(output_dir):
os.mkdir(output_dir)
qmodel_file_path = os.path.join(os.path.abspath(os.path.expanduser(output_dir)), WEIGHT_NAME)
qconfig_file_path = os.path.join(os.path.abspath(os.path.expanduser(output_dir)), QCONFIG_NAME)
# saving process
save_config_mapping(model.qconfig, qconfig_file_path)

if hasattr(model, "gptq_config") and model.gptq_config:
gptq_config_path = os.path.join(os.path.abspath(os.path.expanduser(output_dir)), "gptq_config.json")
with open(gptq_config_path, "w") as f:
json.dump(model.gptq_config, f, indent=4)

# drop the bound 'save' method (a MethodType) so it is not serialized together with the model
del model.save
torch.save(model, qmodel_file_path)

logger.info("Save quantized model to {}.".format(qmodel_file_path))
logger.info("Save configuration of quantized model to {}.".format(qconfig_file_path))


def load(output_dir="./saved_results"):
qmodel_file_path = os.path.join(os.path.abspath(os.path.expanduser(output_dir)), WEIGHT_NAME)
model = torch.load(qmodel_file_path)
logger.info("Quantized model loading successful.")
return model
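
As the MethodType comment and the `del model.save` above imply, the quantization flow attaches `save` to the converted model as a bound method (which is why the changed example can call `user_model.save(args.output_dir)`), and that helper is removed again before the whole model object is pickled. A toy, self-contained sketch of the same attach-then-detach pattern, with hypothetical names and paths, is:

import torch
from types import MethodType

def toy_save(model, path):
    # mirror the save() above: drop the attached helper, then pickle the model
    del model.save
    torch.save(model, path)

model = torch.nn.Linear(4, 4)
model.save = MethodType(toy_save, model)  # expose model.save(path) to callers
model.save("toy_model.pt")

# mirror the load() above (newer torch versions may need weights_only=False)
restored = torch.load("toy_model.pt")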
3 changes: 2 additions & 1 deletion neural_compressor/torch/algorithms/weight_only/utility.py
@@ -356,7 +356,8 @@ def quant_tensor(
scale_bits = kwargs.get("double_quant_bits", 8)
scale_scheme = kwargs.get("double_quant_scheme", "asym")
scale_group_size = kwargs.get("double_quant_group_size", 256)
scale_return_int = kwargs.get("double_quant_return_int", return_int)
# TODO: kwargs.get("double_quant_return_int", return_int)
scale_return_int = kwargs.get("double_quant_return_int", False)
orig_scale_shape = scale.shape
scale = scale.reshape(1, -1)
# pre-process: scale_mean
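
For readers unfamiliar with the double-quant parameters read here, double quantization applies a second, coarser quantization step to the per-group scales produced by the first pass. A toy, library-independent illustration (values are made up):

import torch

# per-group scales from a first 4-bit weight quantization pass (made-up values)
scales = torch.tensor([0.012, 0.015, 0.011, 0.020])

# "double quant": quantize the scales themselves, here with 8-bit symmetric rounding
scale_of_scales = scales.abs().max() / 127
int_scales = torch.clamp(torch.round(scales / scale_of_scales), -128, 127)

# at dequantization time the effective scales are reconstructed from both levels
restored_scales = int_scales * scale_of_scales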