diff --git a/docs/3x/PT_SmoothQuant.md b/docs/3x/PT_SmoothQuant.md index 9e4ae3eb62f..e3a7262dcde 100644 --- a/docs/3x/PT_SmoothQuant.md +++ b/docs/3x/PT_SmoothQuant.md @@ -46,7 +46,7 @@ run_fn(prepared_model) q_model = convert(prepared_model) ``` -To get more information, please refer to [examples](https://github.com/intel/neural-compressor/blob/master/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm). +To get more information, please refer to [examples](https://github.com/intel/neural-compressor/blob/master/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant). ## Validated Models diff --git a/docs/3x/PT_StaticQuant.md b/docs/3x/PT_StaticQuant.md index ec967a780d4..7d56f817296 100644 --- a/docs/3x/PT_StaticQuant.md +++ b/docs/3x/PT_StaticQuant.md @@ -68,7 +68,7 @@ q_model = convert(prepared_model) #### Model Examples -Users could refer to [examples](https://github.com/intel/neural-compressor/blob/master/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm) on how to quantize a new model. +Users could refer to [examples](https://github.com/intel/neural-compressor/blob/master/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex) on how to quantize a new model. ### Static Quantization with PT2E Backend diff --git a/examples/.config/model_params_pytorch_3x.json b/examples/.config/model_params_pytorch_3x.json new file mode 100644 index 00000000000..8520a9545b0 --- /dev/null +++ b/examples/.config/model_params_pytorch_3x.json @@ -0,0 +1,46 @@ +{ + "pytorch": { + "gpt_j_ipex":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, + "gpt_j_ipex_sq":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, + "llama2_7b_ipex":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, + "llama2_7b_ipex_sq":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, + "opt_125m_ipex":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 8 + }, + "opt_125m_ipex_sq":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 8 + } + } +} \ No newline at end of file diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/README.md b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/README.md new file mode 100644 index 00000000000..8900ea9fd9b --- /dev/null +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/README.md @@ -0,0 +1,64 @@ +Step-by-Step +============ +This document describes the step-by-step instructions to run large language models (LLMs) using Smooth Quantization on 4th 
Gen Intel® Xeon® Scalable Processor (codenamed Sapphire Rapids) with PyTorch and Intel® Extension for PyTorch.
+
+The script `run_clm_no_trainer.py` supports quantization of `GPTJ`, `OPT`, `LLaMA2`, `BLOOM` and `Falcon`, and validates last-word-prediction accuracy with [lm_eval](https://github.com/EleutherAI/lm-evaluation-harness.git); support for more models is being added.
+
+# Prerequisite
+## 1. Create Environment
+```
+# Installation
+pip install -r requirements.txt
+```
+
+# Run
+
+Here is how to run the scripts:
+
+**Causal Language Modeling (CLM)**
+
+`run_clm_no_trainer.py` quantizes large language models using the [NeelNanda/pile-10k](https://huggingface.co/datasets/NeelNanda/pile-10k) dataset for calibration and validates accuracy on `lambada_openai`, `piqa`, `winogrande`, `hellaswag` and other datasets provided by lm_eval. Example commands are given below.
+### GPT-J-6b
+
+#### Quantization
+```bash
+# "--sq" is used to enable smooth quant
+python run_clm_no_trainer.py \
+  --model EleutherAI/gpt-j-6B \
+  --quantize \
+  --sq \
+  --alpha 1.0 \
+  --ipex \
+  --output_dir "saved_results"
+```
+**Note**: Smooth quantization here is based on torch.jit. Because past key values are not part of `example_inputs`, the quantized model cannot be used for text generation.
+
+### OPT-125m
+
+#### Quantization
+
+```bash
+# "--sq" is used to enable smooth quant
+python run_clm_no_trainer.py \
+  --model facebook/opt-125m \
+  --quantize \
+  --sq \
+  --alpha 0.5 \
+  --ipex \
+  --output_dir "saved_results"
+```
+
+### LLAMA2-7b/13b/70b
+>Note: LLaMA2 requires IPEX >= 2.1 for better accuracy.
+#### Quantization
+
+```bash
+# "--sq" is used to enable smooth quant
+python run_clm_no_trainer.py \
+  --model meta-llama/Llama-2-7b-hf \
+  --quantize \
+  --sq \
+  --alpha 0.8 \
+  --ipex \
+  --output_dir "saved_results"
+```
\ No newline at end of file
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/requirements.txt b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/requirements.txt
new file mode 100644
index 00000000000..f0b56e558d3
--- /dev/null
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/requirements.txt
@@ -0,0 +1,13 @@
+accelerate
+protobuf
+sentencepiece != 0.1.92
+datasets >= 1.1.3
+torch >= 1.10
+transformers
+pytest
+wandb
+einops
+neural-compressor
+intel-extension-for-transformers
+lm_eval==0.4.2
+peft
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_benchmark.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_benchmark.sh
new file mode 100644
index 00000000000..61c50611090
--- /dev/null
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_benchmark.sh
@@ -0,0 +1,96 @@
+#!/bin/bash
+set -x
+
+function main {
+
+  init_params "$@"
+  run_benchmark
+
+}
+
+# init params
+function init_params {
+  iters=100
+  batch_size=16
+  approach=static
+  tuned_checkpoint=saved_results
+  task=lambada_openai
+  echo ${max_eval_samples}
+  for var in "$@"
+  do
+    case $var in
+      --topology=*)
+          topology=$(echo $var |cut -f2 -d=)
+      ;;
+      --dataset_location=*)
+          dataset_location=$(echo $var |cut -f2 -d=)
+      ;;
+      --input_model=*)
+          input_model=$(echo $var |cut -f2 -d=)
+      ;;
+      --mode=*)
+          mode=$(echo $var |cut -f2 -d=)
+      ;;
+      --batch_size=*)
+          batch_size=$(echo $var |cut -f2 -d=)
+      ;;
+      --iters=*)
+          iters=$(echo ${var} |cut -f2 -d=)
+ ;; + --int8=*) + int8=$(echo ${var} |cut -f2 -d=) + ;; + --config=*) + tuned_checkpoint=$(echo $var |cut -f2 -d=) + ;; + *) + echo "Error: No such parameter: ${var}" + exit 1 + ;; + esac + done + +} + + +# run_benchmark +function run_benchmark { + extra_cmd='' + + if [[ ${mode} == "accuracy" ]]; then + mode_cmd=" --accuracy " + extra_cmd=$extra_cmd" --load" + elif [[ ${mode} == "performance" ]]; then + mode_cmd=" --performance --iters "${iters} + extra_cmd=$extra_cmd" --load" + else + echo "Error: No such mode: ${mode}" + exit 1 + fi + + if [[ ${int8} == "true" ]]; then + extra_cmd=$extra_cmd" --int8" + fi + echo $extra_cmd + + if [ "${topology}" = "opt_125m_ipex_sq" ]; then + model_name_or_path="facebook/opt-125m" + extra_cmd=$extra_cmd" --ipex --sq --alpha 0.5" + elif [ "${topology}" = "llama2_7b_ipex_sq" ]; then + model_name_or_path="meta-llama/Llama-2-7b-hf" + extra_cmd=$extra_cmd" --ipex --sq --alpha 0.8" + elif [ "${topology}" = "gpt_j_ipex_sq" ]; then + model_name_or_path="EleutherAI/gpt-j-6b" + extra_cmd=$extra_cmd" --ipex --sq --alpha 1.0" + fi + + python -u run_clm_no_trainer.py \ + --model ${model_name_or_path} \ + --approach ${approach} \ + --output_dir ${tuned_checkpoint} \ + --task ${task} \ + --batch_size ${batch_size} \ + ${extra_cmd} ${mode_cmd} +} + +main "$@" diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py new file mode 100644 index 00000000000..ef0590e2982 --- /dev/null +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py @@ -0,0 +1,264 @@ +import argparse +import os +import sys + +sys.path.append('./') +import time +import re +import torch +from datasets import load_dataset +from torch.nn.functional import pad +from torch.utils.data import DataLoader +from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer + +parser = argparse.ArgumentParser() +parser.add_argument( + "--model", nargs="?", default="EleutherAI/gpt-j-6b" +) +parser.add_argument( + "--trust_remote_code", default=True, + help="Transformers parameter: use the external repo") +parser.add_argument( + "--revision", default=None, + help="Transformers parameter: set the model hub commit number") +parser.add_argument("--dataset", nargs="?", default="NeelNanda/pile-10k", const="NeelNanda/pile-10k") +parser.add_argument("--output_dir", nargs="?", default="./saved_results") +parser.add_argument("--quantize", action="store_true") +parser.add_argument( + "--int8_bf16_mixed", + action="store_true", + help="By default it is int8-fp32 mixed, to enable int8 mixed amp bf16 (work on platforms like SPR)", +) +parser.add_argument( + '--seed', + type=int, default=42, help='Seed for sampling the calibration data.' 
+) +parser.add_argument("--approach", type=str, default='static', + help="Select from ['dynamic', 'static', 'weight-only']") +parser.add_argument("--int8", action="store_true") +parser.add_argument("--ipex", action="store_true", help="Use intel extension for pytorch.") +parser.add_argument("--load", action="store_true", help="Load quantized model.") +parser.add_argument("--accuracy", action="store_true") +parser.add_argument("--performance", action="store_true") +parser.add_argument("--iters", default=100, type=int, + help="For accuracy measurement only.") +parser.add_argument("--batch_size", default=1, type=int, + help="For accuracy measurement only.") +parser.add_argument("--save_accuracy_path", default=None, + help="Save accuracy results path.") +parser.add_argument("--pad_max_length", default=512, type=int, + help="Pad input ids to max length.") +parser.add_argument("--calib_iters", default=512, type=int, + help="calibration iters.") +parser.add_argument("--tasks", default="lambada_openai,hellaswag,winogrande,piqa,wikitext", + type=str, help="tasks for accuracy validation") +parser.add_argument("--peft_model_id", type=str, default=None, help="model_name_or_path of peft model") +# ============SmoothQuant configs============== +parser.add_argument("--sq", action="store_true") +parser.add_argument("--alpha", default="auto", help="Smooth quant parameter.") + +args = parser.parse_args() +if args.ipex: + import intel_extension_for_pytorch as ipex +calib_size = 1 + + +class Evaluator: + def __init__(self, dataset, tokenizer, batch_size=8, pad_val=1, pad_max=196, is_calib=False): + self.dataset = dataset + self.tokenizer = tokenizer + self.batch_size = batch_size + self.pad_val = pad_val + self.pad_max = pad_max + self.is_calib = is_calib + + # tokenize the dataset + self.dataset = self.dataset.map(self.tokenize_function, batched=True) + self.dataset.set_format(type="torch", columns=["input_ids"]) + + @torch.no_grad() + def tokenize_function(self, examples): + return self.tokenizer(examples["text"]) + + @torch.no_grad() + def collate_batch(self, batch): + + input_ids_padded = [] + last_ind = [] + + for text in batch: + input_ids = text["input_ids"] + pad_len = self.pad_max - input_ids.shape[0] + last_ind.append(input_ids.shape[0] - 1) + if self.is_calib: + input_ids = input_ids[:self.pad_max] if len(input_ids) > self.pad_max else input_ids + else: + input_ids = pad(input_ids, (0, pad_len), value=self.pad_val) + input_ids_padded.append(input_ids) + + return (torch.vstack(input_ids_padded), torch.tensor(last_ind)) + + @torch.no_grad() + def evaluate(self, model): + model.eval() + # The task is to predict the last word of the input. 
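+        # Each padded sequence has its true last token replaced with pad_val and the
+        # original token kept as the label; the logits that predict that token come from
+        # the preceding position, i.e. index -2 - pad_len counted from the padded end,
+        # and the argmax there is compared with the label to accumulate accuracy.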
+ total, hit = 0, 0 + latency = 0 + test_dataloader = DataLoader( + self.dataset, + batch_size=self.batch_size, + shuffle=False, + collate_fn=self.collate_batch, + ) + for i, (input_ids, last_ind) in enumerate(test_dataloader): + label = input_ids[torch.arange(len(last_ind)), last_ind] + input_ids[torch.arange(len(last_ind)), last_ind] = self.pad_val + pad_len = self.pad_max - last_ind - 1 + + start = time.time() + outputs = model(input_ids) + latency += time.time() - start + + last_token_logits = outputs[0][torch.arange(len(last_ind)), -2 - pad_len, :] + pred = last_token_logits.argmax(dim=-1) + total += label.size(0) + hit += (pred == label).sum().item() + if (i + 1) % 50 == 0: + print(hit / total) + print("Processed minibatch:", i) + + acc = hit / total + print("Accuracy: ", acc) + print("Latency: ", latency) + return acc + + +def get_user_model(): + user_model = AutoModelForCausalLM.from_pretrained( + args.model, + torchscript=True, # torchscript will force `return_dict=False` to avoid jit errors + trust_remote_code=args.trust_remote_code, + revision=args.revision, + ) + tokenizer = AutoTokenizer.from_pretrained(args.model) + + if args.peft_model_id is not None: + from peft import PeftModel + user_model = PeftModel.from_pretrained(user_model, args.peft_model_id) + + # to channels last + user_model = user_model.to(memory_format=torch.channels_last) + user_model.eval() + return user_model, tokenizer + + +if args.quantize: + # dataset + user_model, tokenizer = get_user_model() + calib_dataset = load_dataset(args.dataset, split="train") + # calib_dataset = datasets.load_from_disk('/your/local/dataset/pile-10k/') # use this if trouble with connecting to HF + calib_dataset = calib_dataset.shuffle(seed=args.seed) + calib_evaluator = Evaluator(calib_dataset, tokenizer, args.batch_size, pad_max=args.pad_max_length, is_calib=True) + calib_dataloader = DataLoader( + calib_evaluator.dataset, + batch_size=calib_size, + shuffle=False, + collate_fn=calib_evaluator.collate_batch, + ) + + from neural_compressor.torch.quantization import SmoothQuantConfig + args.alpha = eval(args.alpha) + excluded_precisions = [] if args.int8_bf16_mixed else ["bf16"] + quant_config = SmoothQuantConfig(alpha=args.alpha, folding=False, excluded_precisions=excluded_precisions) + + if re.search("gpt", user_model.config.model_type): + quant_config.set_local(torch.add, SmoothQuantConfig(w_dtype="fp32", act_dtype="fp32")) + + from neural_compressor.torch.algorithms.smooth_quant import move_input_to_device + from tqdm import tqdm + def run_fn(model): + calib_iter = 0 + for batch in tqdm(calib_dataloader, total=args.calib_iters): + batch = move_input_to_device(batch, device=None) + if isinstance(batch, tuple) or isinstance(batch, list): + model(batch[0]) + elif isinstance(batch, dict): + model(**batch) + else: + model(batch) + + calib_iter += 1 + if calib_iter >= args.calib_iters: + break + return + + from utils import get_example_inputs + example_inputs = get_example_inputs(user_model, calib_dataloader) + + from neural_compressor.torch.quantization import prepare, convert + user_model = prepare(model=user_model, quant_config=quant_config, example_inputs=example_inputs) + run_fn(user_model) + user_model = convert(user_model) + user_model.save(args.output_dir) + + +if args.load: + # TODO: we need run_benchmark.sh for loading and remove --accuracy in run_quant.sh, currently run_quant.sh will get fp32 result + if args.int8 or args.int8_bf16_mixed: + print("load int8 model") + from neural_compressor.torch.quantization import load + 
tokenizer = AutoTokenizer.from_pretrained(args.model) + config = AutoConfig.from_pretrained(args.model) + user_model = load(os.path.abspath(os.path.expanduser(args.output_dir))) + setattr(user_model, "config", config) + else: + user_model, tokenizer = get_user_model() + + +if args.accuracy: + user_model.eval() + from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser + eval_args = LMEvalParser( + model="hf", + user_model=user_model, + tokenizer=tokenizer, + batch_size=args.batch_size, + tasks=args.tasks, + device="cpu", + ) + results = evaluate(eval_args) + for task_name in args.tasks.split(","): + if task_name == "wikitext": + acc = results["results"][task_name]["word_perplexity,none"] + else: + acc = results["results"][task_name]["acc,none"] + print("Accuracy: %.5f" % acc) + print('Batch size = %d' % args.batch_size) + +if args.performance: + user_model.eval() + from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser + import time + + samples = args.iters * args.batch_size + eval_args = LMEvalParser( + model="hf", + user_model=user_model, + tokenizer=tokenizer, + batch_size=args.batch_size, + tasks=args.tasks, + limit=samples, + device="cpu", + ) + start = time.time() + results = evaluate(eval_args) + end = time.time() + for task_name in args.tasks.split(","): + if task_name == "wikitext": + acc = results["results"][task_name]["word_perplexity,none"] + else: + acc = results["results"][task_name]["acc,none"] + print("Accuracy: %.5f" % acc) + print('Throughput: %.3f samples/sec' % (samples / (end - start))) + print('Latency: %.3f ms' % ((end - start) * 1000 / samples)) + print('Batch size = %d' % args.batch_size) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_quant.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_quant.sh new file mode 100644 index 00000000000..774bb73b6f1 --- /dev/null +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_quant.sh @@ -0,0 +1,67 @@ +#!/bin/bash +set -x + +function main { + + init_params "$@" + run_tuning + +} + +# init params +function init_params { + for var in "$@" + do + case $var in + --topology=*) + topology=$(echo $var |cut -f2 -d=) + ;; + --dataset_location=*) + dataset_location=$(echo $var |cut -f2 -d=) + ;; + --input_model=*) + input_model=$(echo $var |cut -f2 -d=) + ;; + --output_model=*) + tuned_checkpoint=$(echo $var |cut -f2 -d=) + ;; + *) + echo "Error: No such parameter: ${var}" + exit 1 + ;; + esac + done + +} + +# run_tuning +function run_tuning { + extra_cmd='' + batch_size=8 + approach='static' + DATASET_NAME="NeelNanda/pile-10k" + tuned_checkpoint="saved_results" + + if [ "${topology}" = "opt_125m_ipex_sq" ]; then + model_name_or_path="facebook/opt-125m" + extra_cmd=$extra_cmd" --ipex --sq --alpha 0.5" + elif [ "${topology}" = "llama2_7b_ipex_sq" ]; then + model_name_or_path="meta-llama/Llama-2-7b-hf" + extra_cmd=$extra_cmd" --ipex --sq --alpha 0.8" + elif [ "${topology}" = "gpt_j_ipex_sq" ]; then + model_name_or_path="EleutherAI/gpt-j-6b" + extra_cmd=$extra_cmd" --ipex --sq --alpha 1.0" + fi + + python -u run_clm_no_trainer.py \ + --model ${model_name_or_path} \ + --dataset ${DATASET_NAME} \ + --quantize \ + --approach ${approach} \ + --output_dir ${tuned_checkpoint} \ + --tasks "lambada_openai" \ + --batch_size ${batch_size} \ + ${extra_cmd} +} + +main "$@" diff --git 
a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/utils.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/utils.py new file mode 100644 index 00000000000..76117f8b0b5 --- /dev/null +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/utils.py @@ -0,0 +1,47 @@ +import torch +from collections import UserDict +from packaging.version import Version +from neural_compressor.torch.utils import get_torch_version + +def get_example_inputs(model, dataloader): + version = get_torch_version() + from neural_compressor.torch.algorithms.smooth_quant import move_input_to_device + + # Suggest set dataloader like calib_dataloader + if dataloader is None: + return None + device = next(model.parameters()).device + try: + for idx, (input, label) in enumerate(dataloader): + input = move_input_to_device(input, device) + if isinstance(input, (dict, UserDict)): # pragma: no cover + assert version.release >= Version("1.12.0").release, "INC support IPEX version >= 1.12.0" + if "label" in input.keys(): + input.pop("label") + if version.release <= Version("2.0.1").release: + return tuple(input.values()) + else: + return dict(input) + if isinstance(input, (list, tuple)): + return tuple(input) + if isinstance(input, torch.Tensor): + return input + break + except Exception as e: # pragma: no cover + for idx, input in enumerate(dataloader): + input = move_input_to_device(input, device) + if isinstance(input, (dict, UserDict)): # pragma: no cover + assert version.release >= Version("1.12.0").release, "INC support IPEX version >= 1.12.0" + if "label" in input.keys(): + input.pop("label") + if version.release <= Version("2.0.1").release: + return tuple(input.values()) + else: + return dict(input) + if isinstance(input, list) or isinstance(input, tuple): + return tuple(input) + if isinstance(input, torch.Tensor): + return input + break + if idx == 0: + assert False, "Please checkout the example_inputs format." diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/README.md b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/README.md new file mode 100644 index 00000000000..8ecdc6c5110 --- /dev/null +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/README.md @@ -0,0 +1,57 @@ +Step-by-Step +============ +This document describes the step-by-step instructions to run large language models (LLMs) using Static Quantization on 4th Gen Intel® Xeon® Scalable Processor (codenamed Sapphire Rapids) with PyTorch and Intel® Extension for PyTorch. + +The script `run_clm_no_trainer.py` supports `GPTJ`, `OPT`, `LLaMA2`, `BLOOM` and `Falcon` quantization and validates last word prediction accuracy with [lm_eval](https://github.com/EleutherAI/lm-evaluation-harness.git) now, and we are adding more models. + +# Prerequisite +## 1. Create Environment +``` +# Installation +pip install -r requirements.txt +``` + +# Run + +Here is how to run the scripts: + +**Causal Language Modeling (CLM)** + +`run_clm_no_trainer.py` quantizes the large language models using the dataset [NeelNanda/pile-10k](https://huggingface.co/datasets/NeelNanda/pile-10k) calibration and validates `lambada_openai`, `piqa`, `winogrande`, `hellaswag` and other datasets accuracy provided by lm_eval, an example command is as follows. 
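+
+A generic form of the command is sketched below; `<model_name_or_path>` is a placeholder, and the model-specific sections that follow give the per-model commands validated for this example.
+
+```bash
+# Illustrative template only; see the per-model sections below for validated settings.
+python run_clm_no_trainer.py \
+  --model <model_name_or_path> \
+  --quantize \
+  --ipex \
+  --output_dir "saved_results"
+```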
+### GPT-J-6b
+
+#### Quantization
+```bash
+python run_clm_no_trainer.py \
+  --model EleutherAI/gpt-j-6B \
+  --quantize \
+  --ipex \
+  --output_dir "saved_results"
+```
+
+### OPT-125m
+
+#### Quantization
+
+```bash
+python run_clm_no_trainer.py \
+  --model facebook/opt-125m \
+  --quantize \
+  --ipex \
+  --output_dir "saved_results"
+```
+
+### LLAMA2-7b/13b/70b
+>Note: LLaMA2 requires IPEX >= 2.1 for better accuracy.
+#### Quantization
+
+```bash
+python run_clm_no_trainer.py \
+  --model meta-llama/Llama-2-7b-hf \
+  --quantize \
+  --ipex \
+  --output_dir "saved_results"
+```
\ No newline at end of file
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/requirements.txt b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/requirements.txt
new file mode 100644
index 00000000000..f0b56e558d3
--- /dev/null
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/requirements.txt
@@ -0,0 +1,13 @@
+accelerate
+protobuf
+sentencepiece != 0.1.92
+datasets >= 1.1.3
+torch >= 1.10
+transformers
+pytest
+wandb
+einops
+neural-compressor
+intel-extension-for-transformers
+lm_eval==0.4.2
+peft
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_benchmark.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_benchmark.sh
new file mode 100644
index 00000000000..b62a6381b20
--- /dev/null
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_benchmark.sh
@@ -0,0 +1,96 @@
+#!/bin/bash
+set -x
+
+function main {
+
+  init_params "$@"
+  run_benchmark
+
+}
+
+# init params
+function init_params {
+  iters=100
+  batch_size=16
+  approach=static
+  tuned_checkpoint=saved_results
+  task=lambada_openai
+  echo ${max_eval_samples}
+  for var in "$@"
+  do
+    case $var in
+      --topology=*)
+          topology=$(echo $var |cut -f2 -d=)
+      ;;
+      --dataset_location=*)
+          dataset_location=$(echo $var |cut -f2 -d=)
+      ;;
+      --input_model=*)
+          input_model=$(echo $var |cut -f2 -d=)
+      ;;
+      --mode=*)
+          mode=$(echo $var |cut -f2 -d=)
+      ;;
+      --batch_size=*)
+          batch_size=$(echo $var |cut -f2 -d=)
+      ;;
+      --iters=*)
+          iters=$(echo ${var} |cut -f2 -d=)
+      ;;
+      --int8=*)
+          int8=$(echo ${var} |cut -f2 -d=)
+      ;;
+      --config=*)
+          tuned_checkpoint=$(echo $var |cut -f2 -d=)
+      ;;
+      *)
+          echo "Error: No such parameter: ${var}"
+          exit 1
+      ;;
+    esac
+  done
+
+}
+
+
+# run_benchmark
+function run_benchmark {
+    extra_cmd=''
+
+    if [[ ${mode} == "accuracy" ]]; then
+        mode_cmd=" --accuracy "
+        extra_cmd=$extra_cmd" --load"
+    elif [[ ${mode} == "performance" ]]; then
+        mode_cmd=" --performance --iters "${iters}
+        extra_cmd=$extra_cmd" --load"
+    else
+        echo "Error: No such mode: ${mode}"
+        exit 1
+    fi
+
+    if [[ ${int8} == "true" ]]; then
+        extra_cmd=$extra_cmd" --int8"
+    fi
+    echo $extra_cmd
+
+    if [ "${topology}" = "opt_125m_ipex" ]; then
+        model_name_or_path="facebook/opt-125m"
+        extra_cmd=$extra_cmd" --ipex"
+    elif [ "${topology}" = "llama2_7b_ipex" ]; then
+        model_name_or_path="meta-llama/Llama-2-7b-hf"
+        extra_cmd=$extra_cmd" --ipex"
+    elif [ "${topology}" = "gpt_j_ipex" ]; then
+        model_name_or_path="EleutherAI/gpt-j-6b"
+        extra_cmd=$extra_cmd" --ipex"
+    fi
+
+    python -u run_clm_no_trainer.py \
+        --model ${model_name_or_path} \
+        --approach ${approach} \
+        --output_dir ${tuned_checkpoint} \
+ --task ${task} \ + --batch_size ${batch_size} \ + ${extra_cmd} ${mode_cmd} +} + +main "$@" diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_clm_no_trainer.py new file mode 100644 index 00000000000..0ccb2093537 --- /dev/null +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_clm_no_trainer.py @@ -0,0 +1,259 @@ +import argparse +import os +import sys + +sys.path.append('./') +import time +import re +import torch +from datasets import load_dataset +from torch.nn.functional import pad +from torch.utils.data import DataLoader +from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer + +parser = argparse.ArgumentParser() +parser.add_argument( + "--model", nargs="?", default="EleutherAI/gpt-j-6b" +) +parser.add_argument( + "--trust_remote_code", default=True, + help="Transformers parameter: use the external repo") +parser.add_argument( + "--revision", default=None, + help="Transformers parameter: set the model hub commit number") +parser.add_argument("--dataset", nargs="?", default="NeelNanda/pile-10k", const="NeelNanda/pile-10k") +parser.add_argument("--output_dir", nargs="?", default="./saved_results") +parser.add_argument("--quantize", action="store_true") +parser.add_argument( + "--int8_bf16_mixed", + action="store_true", + help="By default it is int8-fp32 mixed, to enable int8 mixed amp bf16 (work on platforms like SPR)", +) +parser.add_argument( + '--seed', + type=int, default=42, help='Seed for sampling the calibration data.' +) +parser.add_argument("--approach", type=str, default='static', + help="Select from ['dynamic', 'static', 'weight-only']") +parser.add_argument("--int8", action="store_true") +parser.add_argument("--ipex", action="store_true", help="Use intel extension for pytorch.") +parser.add_argument("--load", action="store_true", help="Load quantized model.") +parser.add_argument("--accuracy", action="store_true") +parser.add_argument("--performance", action="store_true") +parser.add_argument("--iters", default=100, type=int, + help="For accuracy measurement only.") +parser.add_argument("--batch_size", default=1, type=int, + help="For accuracy measurement only.") +parser.add_argument("--save_accuracy_path", default=None, + help="Save accuracy results path.") +parser.add_argument("--pad_max_length", default=512, type=int, + help="Pad input ids to max length.") +parser.add_argument("--calib_iters", default=512, type=int, + help="calibration iters.") +parser.add_argument("--tasks", default="lambada_openai,hellaswag,winogrande,piqa,wikitext", + type=str, help="tasks for accuracy validation") +parser.add_argument("--peft_model_id", type=str, default=None, help="model_name_or_path of peft model") + +args = parser.parse_args() +if args.ipex: + import intel_extension_for_pytorch as ipex +calib_size = 1 + + +class Evaluator: + def __init__(self, dataset, tokenizer, batch_size=8, pad_val=1, pad_max=196, is_calib=False): + self.dataset = dataset + self.tokenizer = tokenizer + self.batch_size = batch_size + self.pad_val = pad_val + self.pad_max = pad_max + self.is_calib = is_calib + + # tokenize the dataset + self.dataset = self.dataset.map(self.tokenize_function, batched=True) + self.dataset.set_format(type="torch", columns=["input_ids"]) + + @torch.no_grad() + def tokenize_function(self, examples): + return 
self.tokenizer(examples["text"]) + + @torch.no_grad() + def collate_batch(self, batch): + + input_ids_padded = [] + last_ind = [] + + for text in batch: + input_ids = text["input_ids"] + pad_len = self.pad_max - input_ids.shape[0] + last_ind.append(input_ids.shape[0] - 1) + if self.is_calib: + input_ids = input_ids[:self.pad_max] if len(input_ids) > self.pad_max else input_ids + else: + input_ids = pad(input_ids, (0, pad_len), value=self.pad_val) + input_ids_padded.append(input_ids) + + return (torch.vstack(input_ids_padded), torch.tensor(last_ind)) + + @torch.no_grad() + def evaluate(self, model): + model.eval() + # The task is to predict the last word of the input. + total, hit = 0, 0 + latency = 0 + test_dataloader = DataLoader( + self.dataset, + batch_size=self.batch_size, + shuffle=False, + collate_fn=self.collate_batch, + ) + for i, (input_ids, last_ind) in enumerate(test_dataloader): + label = input_ids[torch.arange(len(last_ind)), last_ind] + input_ids[torch.arange(len(last_ind)), last_ind] = self.pad_val + pad_len = self.pad_max - last_ind - 1 + + start = time.time() + outputs = model(input_ids) + latency += time.time() - start + + last_token_logits = outputs[0][torch.arange(len(last_ind)), -2 - pad_len, :] + pred = last_token_logits.argmax(dim=-1) + total += label.size(0) + hit += (pred == label).sum().item() + if (i + 1) % 50 == 0: + print(hit / total) + print("Processed minibatch:", i) + + acc = hit / total + print("Accuracy: ", acc) + print("Latency: ", latency) + return acc + + +def get_user_model(): + user_model = AutoModelForCausalLM.from_pretrained( + args.model, + torchscript=True, # torchscript will force `return_dict=False` to avoid jit errors + trust_remote_code=args.trust_remote_code, + revision=args.revision, + ) + tokenizer = AutoTokenizer.from_pretrained(args.model) + + if args.peft_model_id is not None: + from peft import PeftModel + user_model = PeftModel.from_pretrained(user_model, args.peft_model_id) + + # to channels last + user_model = user_model.to(memory_format=torch.channels_last) + user_model.eval() + return user_model, tokenizer + + +if args.quantize: + # dataset + user_model, tokenizer = get_user_model() + calib_dataset = load_dataset(args.dataset, split="train") + # calib_dataset = datasets.load_from_disk('/your/local/dataset/pile-10k/') # use this if trouble with connecting to HF + calib_dataset = calib_dataset.shuffle(seed=args.seed) + calib_evaluator = Evaluator(calib_dataset, tokenizer, args.batch_size, pad_max=args.pad_max_length, is_calib=True) + calib_dataloader = DataLoader( + calib_evaluator.dataset, + batch_size=calib_size, + shuffle=False, + collate_fn=calib_evaluator.collate_batch, + ) + + + from neural_compressor.torch.quantization import get_default_static_config, StaticQuantConfig + quant_config = get_default_static_config() + quant_config.excluded_precisions = [] if args.int8_bf16_mixed else ["bf16"] + if re.search("gpt", user_model.config.model_type): + quant_config.set_local("add", StaticQuantConfig(w_dtype="fp32", act_dtype="fp32")) + + from neural_compressor.torch.algorithms.smooth_quant import move_input_to_device + from tqdm import tqdm + def run_fn(model): + calib_iter = 0 + for batch in tqdm(calib_dataloader, total=args.calib_iters): + batch = move_input_to_device(batch, device=None) + if isinstance(batch, tuple) or isinstance(batch, list): + model(batch[0]) + elif isinstance(batch, dict): + model(**batch) + else: + model(batch) + + calib_iter += 1 + if calib_iter >= args.calib_iters: + break + return + + from utils import 
get_example_inputs + example_inputs = get_example_inputs(user_model, calib_dataloader) + + from neural_compressor.torch.quantization import prepare, convert + user_model = prepare(model=user_model, quant_config=quant_config, example_inputs=example_inputs) + run_fn(user_model) + user_model = convert(user_model) + user_model.save(args.output_dir) + +if args.load: + # TODO: we need run_benchmark.sh for loading and remove --accuracy in run_quant.sh, currently run_quant.sh will get fp32 result + if args.int8 or args.int8_bf16_mixed: + print("load int8 model") + from neural_compressor.torch.quantization import load + tokenizer = AutoTokenizer.from_pretrained(args.model) + config = AutoConfig.from_pretrained(args.model) + user_model = load(os.path.abspath(os.path.expanduser(args.output_dir))) + setattr(user_model, "config", config) + else: + user_model, tokenizer = get_user_model() + + +if args.accuracy: + user_model.eval() + from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser + eval_args = LMEvalParser( + model="hf", + user_model=user_model, + tokenizer=tokenizer, + batch_size=args.batch_size, + tasks=args.tasks, + device="cpu", + ) + results = evaluate(eval_args) + for task_name in args.tasks.split(","): + if task_name == "wikitext": + acc = results["results"][task_name]["word_perplexity,none"] + else: + acc = results["results"][task_name]["acc,none"] + print("Accuracy: %.5f" % acc) + print('Batch size = %d' % args.batch_size) + +if args.performance: + user_model.eval() + from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser + import time + + samples = args.iters * args.batch_size + eval_args = LMEvalParser( + model="hf", + user_model=user_model, + tokenizer=tokenizer, + batch_size=args.batch_size, + tasks=args.tasks, + limit=samples, + device="cpu", + ) + start = time.time() + results = evaluate(eval_args) + end = time.time() + for task_name in args.tasks.split(","): + if task_name == "wikitext": + acc = results["results"][task_name]["word_perplexity,none"] + else: + acc = results["results"][task_name]["acc,none"] + print("Accuracy: %.5f" % acc) + print('Throughput: %.3f samples/sec' % (samples / (end - start))) + print('Latency: %.3f ms' % ((end - start) * 1000 / samples)) + print('Batch size = %d' % args.batch_size) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_quant.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_quant.sh new file mode 100644 index 00000000000..a93d8220d64 --- /dev/null +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_quant.sh @@ -0,0 +1,67 @@ +#!/bin/bash +set -x + +function main { + + init_params "$@" + run_tuning + +} + +# init params +function init_params { + for var in "$@" + do + case $var in + --topology=*) + topology=$(echo $var |cut -f2 -d=) + ;; + --dataset_location=*) + dataset_location=$(echo $var |cut -f2 -d=) + ;; + --input_model=*) + input_model=$(echo $var |cut -f2 -d=) + ;; + --output_model=*) + tuned_checkpoint=$(echo $var |cut -f2 -d=) + ;; + *) + echo "Error: No such parameter: ${var}" + exit 1 + ;; + esac + done + +} + +# run_tuning +function run_tuning { + extra_cmd='' + batch_size=8 + approach='static' + DATASET_NAME="NeelNanda/pile-10k" + tuned_checkpoint="saved_results" + + if [ "${topology}" = "opt_125m_ipex" ]; then + model_name_or_path="facebook/opt-125m" + 
extra_cmd=$extra_cmd" --ipex" + elif [ "${topology}" = "llama2_7b_ipex" ]; then + model_name_or_path="meta-llama/Llama-2-7b-hf" + extra_cmd=$extra_cmd" --ipex" + elif [ "${topology}" = "gpt_j_ipex" ]; then + model_name_or_path="EleutherAI/gpt-j-6b" + extra_cmd=$extra_cmd" --ipex" + fi + + python -u run_clm_no_trainer.py \ + --model ${model_name_or_path} \ + --dataset ${DATASET_NAME} \ + --quantize \ + --approach ${approach} \ + --output_dir ${tuned_checkpoint} \ + --tasks "lambada_openai" \ + --batch_size ${batch_size} \ + ${extra_cmd} +} + +main "$@" diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/utils.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/utils.py new file mode 100644 index 00000000000..76117f8b0b5 --- /dev/null +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/utils.py @@ -0,0 +1,47 @@ +import torch +from collections import UserDict +from packaging.version import Version +from neural_compressor.torch.utils import get_torch_version + +def get_example_inputs(model, dataloader): + version = get_torch_version() + from neural_compressor.torch.algorithms.smooth_quant import move_input_to_device + + # Suggest set dataloader like calib_dataloader + if dataloader is None: + return None + device = next(model.parameters()).device + try: + for idx, (input, label) in enumerate(dataloader): + input = move_input_to_device(input, device) + if isinstance(input, (dict, UserDict)): # pragma: no cover + assert version.release >= Version("1.12.0").release, "INC support IPEX version >= 1.12.0" + if "label" in input.keys(): + input.pop("label") + if version.release <= Version("2.0.1").release: + return tuple(input.values()) + else: + return dict(input) + if isinstance(input, (list, tuple)): + return tuple(input) + if isinstance(input, torch.Tensor): + return input + break + except Exception as e: # pragma: no cover + for idx, input in enumerate(dataloader): + input = move_input_to_device(input, device) + if isinstance(input, (dict, UserDict)): # pragma: no cover + assert version.release >= Version("1.12.0").release, "INC support IPEX version >= 1.12.0" + if "label" in input.keys(): + input.pop("label") + if version.release <= Version("2.0.1").release: + return tuple(input.values()) + else: + return dict(input) + if isinstance(input, list) or isinstance(input, tuple): + return tuple(input) + if isinstance(input, torch.Tensor): + return input + break + if idx == 0: + assert False, "Please checkout the example_inputs format." 
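For orientation, the sketch below shows how `utils.get_example_inputs` is consumed together with the 3.x `prepare`/`convert` API. It mirrors the `run_clm_no_trainer.py` flow added above and is illustrative only, not an additional file in this patch; `user_model`, `calib_dataloader`, and `run_fn` are the objects defined in that script.

```python
# Illustrative sketch (not part of the patch): wiring get_example_inputs into the
# neural_compressor 3.x static-quant flow used by run_clm_no_trainer.py.
from neural_compressor.torch.quantization import get_default_static_config, prepare, convert
from utils import get_example_inputs

quant_config = get_default_static_config()
# Returns a tensor, tuple, or dict depending on what the calibration dataloader yields.
example_inputs = get_example_inputs(user_model, calib_dataloader)

prepared_model = prepare(model=user_model, quant_config=quant_config, example_inputs=example_inputs)
run_fn(prepared_model)             # calibration loop defined in run_clm_no_trainer.py
q_model = convert(prepared_model)
q_model.save("saved_results")
```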
diff --git a/neural_compressor/torch/algorithms/smooth_quant/smooth_quant.py b/neural_compressor/torch/algorithms/smooth_quant/smooth_quant.py index cd2686d0b0e..fdfb51640ac 100644 --- a/neural_compressor/torch/algorithms/smooth_quant/smooth_quant.py +++ b/neural_compressor/torch/algorithms/smooth_quant/smooth_quant.py @@ -82,13 +82,14 @@ def prepare(self, model, example_inputs, inplace=True, *args, **kwargs): model.output_tensor_id_op_name, ) - # Update json file in ipex_config_path - cfg_to_qconfig(self.quant_config, cfgs, op_infos_from_cfgs, output_tensor_id_op_name) - model.eval() - # check smoothquant alpha and act_algo value recipe_cfgs = self.quant_config.get("recipe_cfgs", None) alpha = recipe_cfgs["smooth_quant_args"]["alpha"] + + # Update json file in ipex_config_path + cfg_to_qconfig(self.quant_config, cfgs, op_infos_from_cfgs, output_tensor_id_op_name, alpha, smooth_quant=True) + model.eval() + for op, _ in self.quant_config["op"].items(): act_algo = self.quant_config["op"][op]["activation"]["algorithm"] @@ -120,7 +121,6 @@ def prepare(self, model, example_inputs, inplace=True, *args, **kwargs): else: model = ipex.quantization.prepare(model, static_qconfig, example_inputs=example_inputs, inplace=inplace) - cfg_to_qconfig(self.quant_config, cfgs, op_infos_from_cfgs, output_tensor_id_op_name, smooth_quant=True) model.load_qconf_summary(qconf_summary=ipex_config_path) return model diff --git a/neural_compressor/torch/algorithms/smooth_quant/utility.py b/neural_compressor/torch/algorithms/smooth_quant/utility.py index e2ce9c97f5b..b8a5b9669ff 100644 --- a/neural_compressor/torch/algorithms/smooth_quant/utility.py +++ b/neural_compressor/torch/algorithms/smooth_quant/utility.py @@ -164,7 +164,7 @@ def get_quantizable_ops_recursively(model, example_inputs, alpha, act_algo, inpl def check_cfg_and_qconfig( - tune_cfg, cfgs, op_infos_from_cfgs, output_tensor_ids_op_name, smooth_quant=False + tune_cfg, cfgs, op_infos_from_cfgs, output_tensor_ids_op_name, alpha=0.5, smooth_quant=True ): # pragma: no cover """Check configs and quantization configs. @@ -205,7 +205,7 @@ def check_cfg_and_qconfig( else: smooth_quant_enable = False activation_observer = generate_activation_observer( - inc_scheme, inc_algorithm, smooth_quant, smooth_quant_enable + inc_scheme, inc_algorithm, smooth_quant, smooth_quant_enable, alpha ) if not smooth_quant: if inc_scheme == "sym": @@ -241,11 +241,11 @@ def check_cfg_and_qconfig( def cfg_to_qconfig( - tune_cfg, cfgs, op_infos_from_cfgs, output_tensor_id_op_name, smooth_quant=False + tune_cfg, cfgs, op_infos_from_cfgs, output_tensor_id_op_name, alpha=0.5, smooth_quant=True ): # pragma: no cover assert cfgs is not None, "No configure for IPEX int8 model..." 
op_infos = copy.deepcopy(op_infos_from_cfgs) - cfgs = check_cfg_and_qconfig(tune_cfg["op"], cfgs, op_infos, output_tensor_id_op_name, smooth_quant) + cfgs = check_cfg_and_qconfig(tune_cfg["op"], cfgs, op_infos, output_tensor_id_op_name, alpha, smooth_quant) with open(ipex_config_path, "w") as write_f: json.dump(cfgs, write_f, indent=4) return None diff --git a/neural_compressor/torch/algorithms/static_quant/static_quant.py b/neural_compressor/torch/algorithms/static_quant/static_quant.py index e2eac7f236d..efd1880666c 100644 --- a/neural_compressor/torch/algorithms/static_quant/static_quant.py +++ b/neural_compressor/torch/algorithms/static_quant/static_quant.py @@ -85,7 +85,15 @@ def prepare(self, model, example_inputs, inplace=True, *args, **kwargs): from torch.ao.quantization import MinMaxObserver, PerChannelMinMaxObserver, QConfig if ipex_ver.release >= Version("2.1").release: - static_qconfig = ipex.quantization.default_static_qconfig_mapping + # HistogramObserver will cause a performance issue. + # static_qconfig = ipex.quantization.default_static_qconfig_mapping + qconfig = QConfig( + activation=MinMaxObserver.with_args(qscheme=torch.per_tensor_affine, dtype=torch.quint8), + weight=PerChannelMinMaxObserver.with_args(dtype=torch.qint8, qscheme=torch.per_channel_symmetric), + ) + from torch.ao.quantization import QConfigMapping + + static_qconfig = QConfigMapping().set_global(qconfig) else: static_qconfig = QConfig( activation=MinMaxObserver.with_args(qscheme=torch.per_tensor_affine, dtype=torch.quint8), diff --git a/neural_compressor/torch/algorithms/static_quant/utility.py b/neural_compressor/torch/algorithms/static_quant/utility.py index 81133557b3e..64fec8de785 100644 --- a/neural_compressor/torch/algorithms/static_quant/utility.py +++ b/neural_compressor/torch/algorithms/static_quant/utility.py @@ -164,7 +164,9 @@ def check_cfg_and_qconfig(user_cfg, cfgs, op_infos_from_cfgs, output_tensor_ids_ return cfgs, ori_user_cfg -def generate_activation_observer(scheme, algorithm, smooth_quant=False, smooth_quant_enable=False): # pragma: no cover +def generate_activation_observer( + scheme, algorithm, smooth_quant=False, smooth_quant_enable=False, alpha=0.5 +): # pragma: no cover """This is a helper method to generate an activation observer. Args: @@ -200,7 +202,7 @@ def generate_activation_observer(scheme, algorithm, smooth_quant=False, smooth_q "reduce_range": False, "quant_min": 0, "quant_max": 255, - "alpha": 0.5, + "alpha": 0.5 if alpha == "auto" else alpha, "act_observer": kl_activation_observer, "act_ic_observer": { "name": "PerChannelMinMaxObserver", @@ -220,7 +222,7 @@ def generate_activation_observer(scheme, algorithm, smooth_quant=False, smooth_q "reduce_range": False, "quant_min": 0, "quant_max": 255, - "alpha": 0.5, + "alpha": 0.5 if alpha == "auto" else alpha, "act_observer": minmax_activation_observer, "act_ic_observer": { "name": "PerChannelMinMaxObserver", diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py index a8bab76b972..27a056d3284 100644 --- a/neural_compressor/torch/quantization/config.py +++ b/neural_compressor/torch/quantization/config.py @@ -980,7 +980,7 @@ def __init__( act_dtype: str = "uint8", act_sym: bool = False, act_granularity: str = "per_tensor", - act_algo: str = "kl", + act_algo: str = "minmax", excluded_precisions: list = [], white_list: Optional[List[OP_NAME_OR_MODULE_TYPE]] = DEFAULT_WHITE_LIST, ):
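The final hunk above switches the default activation observer algorithm for PyTorch static quantization from "kl" to "minmax". A minimal sketch of how a caller can pin the algorithm explicitly, assuming only the `StaticQuantConfig` fields and `prepare`/`convert` entry points visible in this patch, is:

```python
# Minimal sketch, assuming only the API surface shown in this patch.
from neural_compressor.torch.quantization import StaticQuantConfig, prepare, convert

quant_config = StaticQuantConfig(act_algo="minmax")   # new default after this change
# quant_config = StaticQuantConfig(act_algo="kl")     # restores the previous default
prepared_model = prepare(model=fp32_model, quant_config=quant_config, example_inputs=example_inputs)
run_fn(prepared_model)   # user-supplied calibration function
q_model = convert(prepared_model)
```

Here `fp32_model`, `example_inputs`, and `run_fn` follow the usage shown in the PT_StaticQuant.md snippet at the top of this diff.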