From 0c5f7bc2e01938b5221c881a0a7b5f1cf66cf7f6 Mon Sep 17 00:00:00 2001
From: yuwenzho
Date: Mon, 17 Jul 2023 15:23:01 +0800
Subject: [PATCH] Add a BERT example using Intel Neural Compressor SmoothQuant (#411)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Describe your changes

Add a BERT example using Intel® Neural Compressor SmoothQuant.
Add an introduction to Intel® Neural Compressor SmoothQuant in the README.

## Checklist before requesting a review

- [ ] Add unit tests for this change.
- [ ] Make sure all tests can pass.
- [x] Update documents if necessary.
- [x] Format your code by running `pre-commit run --all-files`
- [ ] Is this a user-facing change? If yes, give a description of this change to be included in the release notes.

## (Optional) Issue link

---------

Signed-off-by: yuwenzho
---
 examples/bert/README.md                       | 26 ++++++
 examples/bert/bert_inc_ptq_cpu.json           |  1 +
 .../bert/bert_inc_smoothquant_ptq_cpu.json    | 92 +++++++++++++++++++
 3 files changed, 119 insertions(+)
 create mode 100644 examples/bert/bert_inc_smoothquant_ptq_cpu.json

diff --git a/examples/bert/README.md b/examples/bert/README.md
index ca96fb41c..17d958d85 100644
--- a/examples/bert/README.md
+++ b/examples/bert/README.md
@@ -57,6 +57,32 @@ The workflow in [bert_inc_static_ptq_cpu.json](bert_inc_static_ptq_cpu.json) is
 #### Dynamic Quantization
 The workflow in [bert_inc_dynamic_ptq_cpu.json](bert_inc_dynamic_ptq_cpu.json) is similar to the above workflow, but uses dynamic quantization instead of static quantization.
+#### Run with SmoothQuant
+
+Quantizing activations in large language models (LLMs) can be challenging due to the presence of outliers. The SmoothQuant method, introduced in this [paper](https://arxiv.org/abs/2211.10438), addresses this issue by transferring the quantization difficulty from activations to weights through a mathematically equivalent transformation that uses a fixed value $\alpha$ for the entire model. However, the distributions of activation outliers vary not only across models but also across layers within a model. To address this, Intel® Neural Compressor provides automatic tuning of layer-wise optimal $\alpha$ values. Please refer to this [link](https://github.com/intel/neural-compressor/blob/master/docs/source/smooth_quant.md) for more algorithm details.
+
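+Concretely, for a linear layer $Y = XW$, the transformation (sketched here following the paper's formulation) divides each activation input channel $j$ by a smoothing factor $s_j$ and scales the corresponding weight row to compensate:
+
+$$\hat{X} = X\,\mathrm{diag}(s)^{-1}, \qquad \hat{W} = \mathrm{diag}(s)\,W, \qquad s_j = \frac{\max(|X_j|)^{\alpha}}{\max(|W_j|)^{1-\alpha}}$$
+
+Since $\hat{X}\hat{W} = XW$, the model output is unchanged, while activation outliers are partially migrated into the weights; $\alpha$ controls how much of the quantization difficulty is shifted.
+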
+Users can enable SmoothQuant by setting `smooth_quant` in `recipes`, as shown below. Refer to [bert_inc_smoothquant_ptq_cpu.json](bert_inc_smoothquant_ptq_cpu.json) for a complete example.
+
+```json
+"passes": {
+    "quantization": {
+        "type": "IncStaticQuantization",
+        "config": {
+            "recipes": {
+                "smooth_quant": true,
+                "smooth_quant_args": {"alpha": 0.5}
+            }
+        }
+    }
+}
+```
+
 ### BERT optimization with QAT Customized Training Loop on CPU
 This workflow performs BERT optimization on CPU with QAT Customized Training Loop.
 It performs the optimization pipeline:
 - *PyTorch Model -> PyTorch Model after QAT -> Onnx Model -> Transformers Optimized Onnx Model -> ONNX Runtime performance tuning*
diff --git a/examples/bert/bert_inc_ptq_cpu.json b/examples/bert/bert_inc_ptq_cpu.json
index dde9407fb..180b2ad38 100644
--- a/examples/bert/bert_inc_ptq_cpu.json
+++ b/examples/bert/bert_inc_ptq_cpu.json
@@ -56,6 +56,7 @@
         },
         "quantization": {
             "type": "IncQuantization",
+            "disable_search": true,
             "config": {
                 "approach": "SEARCHABLE_VALUES",
                 "user_script": "user_script.py",
diff --git a/examples/bert/bert_inc_smoothquant_ptq_cpu.json b/examples/bert/bert_inc_smoothquant_ptq_cpu.json
new file mode 100644
index 000000000..0df1ff8b5
--- /dev/null
+++ b/examples/bert/bert_inc_smoothquant_ptq_cpu.json
@@ -0,0 +1,92 @@
+{
+    "input_model": {
+        "type": "PyTorchModel",
+        "config": {
+            "model_loader": "load_pytorch_origin_model",
+            "model_script": "user_script.py",
+            "io_config": {
+                "input_names": ["input_ids", "attention_mask", "token_type_ids"],
+                "input_shapes": [[1, 128], [1, 128], [1, 128]],
+                "input_types": ["int64", "int64", "int64"],
+                "output_names": ["output"]
+            }
+        }
+    },
+    "evaluators": {
+        "common_evaluator": {
+            "metrics": [
+                {
+                    "name": "accuracy",
+                    "type": "accuracy",
+                    "sub_types": [
+                        {"name": "accuracy_score", "priority": 1, "goal": {"type": "percent-max-degradation", "value": 2}}
+                    ],
+                    "user_config": {
+                        "post_processing_func": "post_process",
+                        "user_script": "user_script.py",
+                        "dataloader_func": "create_dataloader",
+                        "batch_size": 1
+                    }
+                },
+                {
+                    "name": "latency",
+                    "type": "latency",
+                    "sub_types": [
+                        {"name": "avg", "priority": 2, "goal": {"type": "percent-min-improvement", "value": 20}}
+                    ],
+                    "user_config": {
+                        "user_script": "user_script.py",
+                        "dataloader_func": "create_dataloader",
+                        "batch_size": 1
+                    }
+                }
+            ]
+        }
+    },
+    "passes": {
+        "conversion": {
+            "type": "OnnxConversion",
+            "config": {
+                "target_opset": 13
+            }
+        },
+        "transformers_optimization": {
+            "type": "OrtTransformersOptimization",
+            "config": {"model_type": "bert"}
+        },
+        "quantization": {
+            "type": "IncStaticQuantization",
+            "disable_search": true,
+            "config": {
+                "quant_format": "QOperator",
+                "user_script": "user_script.py",
+                "dataloader_func": "inc_glue_calibration_reader",
+                "recipes": {
+                    "smooth_quant": true,
+                    "smooth_quant_args": {"alpha": 0.7}
+                },
+                "metric": {
+                    "name": "accuracy",
+                    "type": "custom",
+                    "sub_types": [
+                        {"name": "accuracy_custom", "priority": 1, "higher_is_better": true, "goal": {"type": "percent-max-degradation", "value": 2}}
+                    ],
+                    "user_config": {
+                        "user_script": "user_script.py",
+                        "evaluate_func": "eval_accuracy",
+                        "batch_size": 1
+                    }
+                }
+            }
+        }
+    },
+    "engine": {
+        "search_strategy": {
+            "execution_order": "joint",
+            "search_algorithm": "exhaustive"
+        },
+        "evaluator": "common_evaluator",
+        "cache_dir": "cache",
+        "output_dir": "models/bert_inc_smoothquant_ptq_cpu"
+    }
+}
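Note: the configuration above delegates calibration to `inc_glue_calibration_reader` in `user_script.py` via `dataloader_func`. As a rough sketch only (not the example's actual code; the tokenizer checkpoint, the GLUE MRPC split, and the `(inputs, label)` yield convention are assumptions for illustration), such a calibration reader might look like this:

```python
# Hypothetical sketch of the calibration reader referenced by
# "dataloader_func": "inc_glue_calibration_reader" in the config above.
# The example's real implementation lives in user_script.py; this version
# only illustrates the general shape: an iterable with a batch_size
# attribute that yields (model_inputs, label) pairs whose keys match the
# io_config input_names.
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer


class GlueCalibrationReader:
    def __init__(self, data_dir, batch_size):
        self.batch_size = batch_size
        # Checkpoint and dataset are assumptions for illustration only.
        tokenizer = AutoTokenizer.from_pretrained("Intel/bert-base-uncased-mrpc")
        dataset = load_dataset("glue", "mrpc", split="validation")
        self.samples = []
        for example in dataset:
            encoded = tokenizer(
                example["sentence1"],
                example["sentence2"],
                padding="max_length",
                max_length=128,  # matches io_config input_shapes [[1, 128], ...]
                truncation=True,
                return_tensors="np",
            )
            inputs = {
                "input_ids": encoded["input_ids"].astype(np.int64),
                "attention_mask": encoded["attention_mask"].astype(np.int64),
                "token_type_ids": encoded["token_type_ids"].astype(np.int64),
            }
            self.samples.append((inputs, example["label"]))

    def __iter__(self):
        yield from self.samples


def inc_glue_calibration_reader(data_dir, batch_size, *args, **kwargs):
    return GlueCalibrationReader(data_dir, batch_size)
```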