From 0937e496330a131ea013883ec118713106e5f422 Mon Sep 17 00:00:00 2001 From: Mengni Wang Date: Tue, 23 May 2023 17:27:42 +0800 Subject: [PATCH 01/21] Add dnnl ep Signed-off-by: Mengni Wang --- .../mix_precision/README.md | 77 ++++ .../text_classification/mix_precision/eval.sh | 128 ++++++ .../mix_precision/export.py | 74 ++++ .../text_classification/mix_precision/main.py | 405 +++++++++++++++++ .../mix_precision/prepare_data.sh | 34 ++ .../mix_precision/requirements.txt | 8 + .../text_classification/mix_precision/run.sh | 118 +++++ neural_compressor/adaptor/onnxrt.py | 94 ++-- neural_compressor/adaptor/onnxrt_dnnl.yaml | 411 ++++++++++++++++++ neural_compressor/adaptor/ox_utils/util.py | 4 +- neural_compressor/config.py | 20 +- neural_compressor/strategy/strategy.py | 5 + 12 files changed, 1327 insertions(+), 51 deletions(-) create mode 100644 examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/README.md create mode 100644 examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/eval.sh create mode 100644 examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/export.py create mode 100644 examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/main.py create mode 100644 examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/prepare_data.sh create mode 100644 examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/requirements.txt create mode 100644 examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/run.sh create mode 100644 neural_compressor/adaptor/onnxrt_dnnl.yaml diff --git a/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/README.md b/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/README.md new file mode 100644 index 00000000000..f5d976123a3 --- /dev/null +++ b/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/README.md @@ -0,0 +1,77 @@ +Step-by-Step +============ + +This example load a language translation model and confirm its accuracy and speed based on [GLUE data](https://gluebenchmark.com/). + +# Prerequisite + +## 1. Environment +```shell +git clone -b dnnl_ep --depth 1 https://github.com/intel/neural-compressor.git +cd neural-compressor +pip install -e ./ + +cd examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/ +pip install -r requirements.txt +``` +> Note: Validated ONNX Runtime [Version](/docs/source/installation_guide.md#validated-software-environment). + +## 2. Prepare Model + +Supported model identifier from [huggingface.co](https://huggingface.co/): + +| Model Identifier | +|:-----------------------------------------------:| +| Intel/bert-base-uncased-mrpc | +| Intel/roberta-base-mrpc | +| Intel/xlm-roberta-base-mrpc | +| Intel/camembert-base-mrpc | +| distilbert-base-uncased-finetuned-sst-2-english | +| Alireza1044/albert-base-v2-sst2 | +| Intel/MiniLM-L12-H384-uncased-mrpc | +| philschmid/MiniLM-L6-H384-uncased-sst2 | +| bert-base-cased-finetuned-mrpc | +| Intel/electra-small-discriminator-mrpc | +| M-FAC/bert-mini-finetuned-mrpc | +| Intel/xlnet-base-cased-mrpc | +| Intel/bart-large-mrpc | + +```bash +python export.py --model_name_or_path=Intel/bert-base-uncased-mrpc # or other supported model identifier +``` + +## 3. Prepare Dataset +Download the GLUE data with `prepare_data.sh` script. 
+ +```shell +export GLUE_DIR=/path/to/glue_data +export TASK_NAME=MRPC # or SST + +bash prepare_data.sh --data_dir=$GLUE_DIR --task_name=$TASK_NAME +``` + +# Run + +If the hardware doesn't support bf16 instruction, please set flag as below to force bf16 conversion (this way will be deperecated): + +```shell +export FORCE_BF16=1 +``` + +## 1. Only mixed precision conversion + +```bash +bash run.sh --input_model=path/to/model \ # model path as *.onnx + --output_model=path/to/model_tune \ # model path as *.onnx +``` + +## 2. Mixed precision conversion + accuracy evaluation + +Please make sure DnnlExecutionProvider is in available providers list to execute evaluation. + +```bash +bash eval.sh --input_model=path/to/model \ # model path as *.onnx + --output_model=path/to/model_tune \ # model path as *.onnx + --dataset_location=path/to/glue/data \ + --batch_size=batch_size \ # optional +``` diff --git a/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/eval.sh b/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/eval.sh new file mode 100644 index 00000000000..9cc04b05e1e --- /dev/null +++ b/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/eval.sh @@ -0,0 +1,128 @@ +#!/bin/bash +set -x + +function main { + init_params "$@" + run_tuning +} + +# init params +function init_params { + for var in "$@" + do + case $var in + --input_model=*) + input_model=$(echo $var |cut -f2 -d=) + ;; + --output_model=*) + output_model=$(echo $var |cut -f2 -d=) + ;; + --dataset_location=*) + dataset_location=$(echo $var |cut -f2 -d=) + ;; + --batch_size=*) + batch_size=$(echo $var |cut -f2 -d=) + ;; + esac + done + +} + +# run_tuning +function run_tuning { + + if [[ "${input_model}" =~ "bert-base-uncased" ]]; then + model_name_or_path="Intel/bert-base-uncased-mrpc" + TASK_NAME='mrpc' + num_heads=12 + hidden_size=768 + fi + if [[ "${input_model}" =~ "roberta-base" ]]; then + model_name_or_path="Intel/roberta-base-mrpc" + TASK_NAME='mrpc' + num_heads=12 + hidden_size=768 + fi + if [[ "${input_model}" =~ "xlm-roberta-base" ]]; then + model_name_or_path="Intel/xlm-roberta-base-mrpc" + TASK_NAME='mrpc' + num_heads=12 + hidden_size=768 + fi + if [[ "${input_model}" =~ "camembert-base" ]]; then + model_name_or_path="Intel/camembert-base-mrpc" + TASK_NAME='mrpc' + num_heads=12 + hidden_size=768 + fi + if [[ "${input_model}" =~ "distilbert-base" ]]; then + model_name_or_path="distilbert-base-uncased-finetuned-sst-2-english" + TASK_NAME='sst-2' + num_heads=12 + hidden_size=768 + fi + if [[ "${input_model}" =~ "albert-base" ]]; then + model_name_or_path="Alireza1044/albert-base-v2-sst2" + TASK_NAME='sst-2' + num_heads=12 + hidden_size=768 + fi + if [[ "${input_model}" =~ "MiniLM-L6" ]]; then + model_name_or_path="philschmid/MiniLM-L6-H384-uncased-sst2" + TASK_NAME='sst-2' + num_heads=12 + hidden_size=384 + fi + if [[ "${input_model}" =~ "MiniLM-L12" ]]; then + model_name_or_path="Intel/MiniLM-L12-H384-uncased-mrpc" + TASK_NAME='mrpc' + num_heads=12 + hidden_size=384 + fi + if [[ "${input_model}" =~ "bert-base-cased" ]]; then + model_name_or_path="bert-base-cased-finetuned-mrpc" + TASK_NAME='mrpc' + num_heads=12 + hidden_size=384 + fi + if [[ "${input_model}" =~ "xlnet-base-cased" ]]; then + model_name_or_path="Intel/xlnet-base-cased-mrpc" + TASK_NAME='mrpc' + num_heads=12 + hidden_size=768 + fi + if [[ "${input_model}" =~ "bert-mini" ]]; then + model_name_or_path="M-FAC/bert-mini-finetuned-mrpc" + TASK_NAME='mrpc' + num_heads=4 + hidden_size=256 + fi + if [[ 
"${input_model}" =~ "electra-small-discriminator" ]]; then + model_name_or_path="Intel/electra-small-discriminator-mrpc" + TASK_NAME='mrpc' + num_heads=4 + hidden_size=256 + fi + if [[ "${input_model}" =~ "bart" ]]; then + model_name_or_path="Intel/bart-large-mrpc" + TASK_NAME='mrpc' + num_heads=16 + hidden_size=4096 + fi + + python main.py \ + --model_name_or_path ${model_name_or_path} \ + --model_path ${input_model} \ + --output_model ${output_model} \ + --data_path ${dataset_location} \ + --batch_size ${batch_size-1} \ + --task ${TASK_NAME} \ + --num_heads ${num_heads} \ + --hidden_size ${hidden_size} \ + --do_eval +} + +main "$@" + + + diff --git a/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/export.py b/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/export.py new file mode 100644 index 00000000000..589fe3a345e --- /dev/null +++ b/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/export.py @@ -0,0 +1,74 @@ +import argparse + +import torch +from transformers import AutoConfig, AutoModelForSequenceClassification + +def export_onnx_model(args, model): + with torch.no_grad(): + symbolic_names = {0: 'batch_size', 1: 'max_seq_len'} + if args.model_name_or_path in ['Intel/roberta-base-mrpc', + 'Intel/xlm-roberta-base-mrpc', + 'Intel/camembert-base-mrpc', + 'distilbert-base-uncased-finetuned-sst-2-english']: + inputs = {'input_ids': torch.ones(1, args.max_len, dtype=torch.int64), + 'attention_mask': torch.ones(1, args.max_len, dtype=torch.int64)} + torch.onnx.export(model, # model being run + (inputs['input_ids'], # model input (or a tuple for multiple inputs) + inputs['attention_mask']), + args.output_model, # where to save the model (can be a file or file-like object) + opset_version=14, # the ONNX version to export the model + do_constant_folding=True, # whether to execute constant folding + input_names=['input_ids', # the model's input names + 'attention_mask'], + output_names=['logits'], + dynamic_axes={'input_ids': symbolic_names, # variable length axes + 'attention_mask' : symbolic_names}) + else: + inputs = {'input_ids': torch.ones(1, args.max_len, dtype=torch.int64), + 'attention_mask': torch.ones(1, args.max_len, dtype=torch.int64), + 'token_type_ids': torch.ones(1, args.max_len, dtype=torch.int64)} + torch.onnx.export(model, # model being run + (inputs['input_ids'], # model input (or a tuple for multiple inputs) + inputs['attention_mask'], + inputs['token_type_ids']), + args.output_model, # where to save the model (can be a file or file-like object) + opset_version=14, # the ONNX version to export the model + do_constant_folding=True, # whether to execute constant folding + input_names=['input_ids', # the model's input names + 'attention_mask', + 'token_type_ids'], + output_names=['logits'], + dynamic_axes={'input_ids': symbolic_names, # variable length axes + 'attention_mask' : symbolic_names, + 'token_type_ids' : symbolic_names}) + print("ONNX Model exported to {0}".format(args.output_model)) + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description='Export huggingface onnx model', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument( + '--model_name_or_path', + type=str, + choices=['Intel/bert-base-uncased-mrpc', + 'Intel/roberta-base-mrpc', + 'Intel/xlm-roberta-base-mrpc', + 'Intel/camembert-base-mrpc', + 'distilbert-base-uncased-finetuned-sst-2-english', + 'Alireza1044/albert-base-v2-sst2', + 'philschmid/MiniLM-L6-H384-uncased-sst2', + 
'Intel/MiniLM-L12-H384-uncased-mrpc'], + help='pretrained model name or path') + parser.add_argument( + '--max_len', + type=int, + default=128, + help='Maximum length of the sentence pairs') + args = parser.parse_args() + args.output_model = args.model_name_or_path.split('/')[-1] + '.onnx' + + model = AutoModelForSequenceClassification.from_pretrained( + args.model_name_or_path, + config=AutoConfig.from_pretrained(args.model_name_or_path)) + + export_onnx_model(args, model) \ No newline at end of file diff --git a/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/main.py b/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/main.py new file mode 100644 index 00000000000..fa5bd52f578 --- /dev/null +++ b/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/main.py @@ -0,0 +1,405 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint:disable=redefined-outer-name,logging-format-interpolation + +import logging +import argparse +import onnx +import onnxruntime as ort +import transformers +import os +import torch +import numpy as np +from dataclasses import dataclass +from typing import List, Optional, Union +from neural_compressor.data.dataloaders.onnxrt_dataloader import DefaultDataLoader +from neural_compressor.data.datasets.dummy_dataset import DummyDataset + + +class ONNXRTBertDataset: + """Dataset used for model Bert. + Args: data_dir (str): The input data dir. + model_name_or_path (str): Path to pre-trained student model or shortcut name, + selected in the list: + max_seq_length (int, default=128): The maximum length after tokenization. + Sequences longer than this will be truncated, + sequences shorter will be padded. + do_lower_case (bool, default=True): Whether to lowercase the input when tokenizing. + task (str, default=mrpc): The name of the task to fine-tune. + Choices include mrpc, qqp, qnli, rte, + sts-b, cola, mnli, wnli. + model_type (str, default='bert'): model type, support 'distilbert', 'bert', + 'mobilebert', 'roberta'. + dynamic_length (bool, default=False): Whether to use fixed sequence length. + evaluate (bool, default=True): Whether do evaluation or training. + transform (transform object, default=None): transform to process input data. + filter (Filter objects, default=None): filter out examples according + to specific conditions. 
+ """ + def __init__(self, model, data_dir, model_name_or_path, max_seq_length=128,\ + do_lower_case=True, task='mrpc', model_type='bert', dynamic_length=False,\ + evaluate=True, transform=None, filter=None): + self.inputs = [inp.name for inp in onnx.load(model).graph.input] + task = task.lower() + model_type = model_type.lower() + assert task in ['mrpc', 'qqp', 'qnli', 'rte', 'sts-b', 'cola', \ + 'mnli', 'wnli', 'sst-2'], 'Unsupported task type' + assert model_type in ['distilbert', 'bert', 'mobilebert', 'roberta'], 'Unsupported \ + model type' + self.dynamic_length = dynamic_length + self.model_type = model_type + self.max_seq_length = max_seq_length + tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path, + do_lower_case=do_lower_case) + self.dataset = load_and_cache_examples(data_dir, model_name_or_path, \ + max_seq_length, task, model_type, tokenizer, evaluate) + + def __len__(self): + return len(self.dataset) + + def __getitem__(self, index): + batch = tuple(t.detach().cpu().numpy() if not isinstance(t, np.ndarray) else t for t in self.dataset[index]) + return batch[:len(self.inputs)], batch[-1] + +def load_and_cache_examples(data_dir, model_name_or_path, max_seq_length, task, \ + model_type, tokenizer, evaluate): + from torch.utils.data import TensorDataset + + processor = transformers.glue_processors[task]() + output_mode = transformers.glue_output_modes[task] + # Load data features from cache or dataset file + if not os.path.exists("./dataset_cached"): + os.makedirs("./dataset_cached") + cached_features_file = os.path.join("./dataset_cached", 'cached_{}_{}_{}_{}'.format( + 'dev' if evaluate else 'train', + list(filter(None, model_name_or_path.split('/'))).pop(), + str(max_seq_length), + str(task))) + if os.path.exists(cached_features_file): + logger.info("Load features from cached file {}.".format(cached_features_file)) + features = torch.load(cached_features_file) + else: + logger.info("Create features from dataset file at {}.".format(data_dir)) + label_list = processor.get_labels() + examples = processor.get_dev_examples(data_dir) if evaluate else \ + processor.get_train_examples(data_dir) + features = convert_examples_to_features(examples, + tokenizer, + task=task, + label_list=label_list, + max_length=max_seq_length, + output_mode=output_mode, + ) + logger.info("Save features into cached file {}.".format(cached_features_file)) + torch.save(features, cached_features_file) + # Convert to Tensors and build dataset + all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) + all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long) + all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long) + all_seq_lengths = torch.tensor([f.seq_length for f in features], dtype=torch.long) + if output_mode == "classification": + all_labels = torch.tensor([f.label for f in features], dtype=torch.long) + elif output_mode == "regression": + all_labels = torch.tensor([f.label for f in features], dtype=torch.float) + dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, \ + all_seq_lengths, all_labels) + return dataset + +def convert_examples_to_features( + examples, + tokenizer, + max_length=128, + task=None, + label_list=None, + output_mode="classification", + pad_token=0, + pad_token_segment_id=0, + mask_padding_with_zero=True, +): + processor = transformers.glue_processors[task]() + if label_list is None: + label_list = processor.get_labels() + logger.info("Use label 
list {} for task {}.".format(label_list, task)) + label_map = {label: i for i, label in enumerate(label_list)} + features = [] + for (ex_index, example) in enumerate(examples): + inputs = tokenizer.encode_plus( + example.text_a, + example.text_b, + add_special_tokens=True, + max_length=max_length, + return_token_type_ids=True, + truncation=True, + ) + input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"] + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) + + # Zero-pad up to the sequence length. + seq_length = len(input_ids) + padding_length = max_length - len(input_ids) + + input_ids = input_ids + ([pad_token] * padding_length) + attention_mask = attention_mask + \ + ([0 if mask_padding_with_zero else 1] * padding_length) + token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length) + + assert len(input_ids) == max_length, \ + "Error with input_ids length {} vs {}".format( + len(input_ids), max_length) + assert len(attention_mask) == max_length, \ + "Error with attention_mask length {} vs {}".format( + len(attention_mask), max_length + ) + assert len(token_type_ids) == max_length, \ + "Error with token_type_ids length {} vs {}".format( + len(token_type_ids), max_length + ) + if output_mode == "classification": + label = label_map[example.label] + elif output_mode == "regression": + label = float(example.label) + else: + raise KeyError(output_mode) + + feats = InputFeatures( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + label=label, + seq_length=seq_length, + ) + features.append(feats) + return features + +@dataclass(frozen=True) +class InputFeatures: + """ + A single set of features of data. + Property names are the same names as the corresponding inputs to a model. + Args: + input_ids: Indices of input sequence tokens in the vocabulary. + attention_mask: Mask to avoid performing attention on padding token indices. + Mask values selected in ``[0, 1]``: Usually ``1`` for tokens that are NOT MASKED, + ``0`` for MASKED (padded) tokens. + token_type_ids: (Optional) Segment token indices to indicate first and second + portions of the inputs. Only some models use them. + label: (Optional) Label corresponding to the input. Int for classification problems, + float for regression problems. + seq_length: (Optional) The length of input sequence before padding. + """ + + input_ids: List[int] + attention_mask: Optional[List[int]] = None + token_type_ids: Optional[List[int]] = None + label: Optional[Union[int, float]] = None + seq_length: Optional[List[int]] = None + +class ONNXRTGLUE: + """Computes GLUE score. + + Args: + task (str, default=mrpc): The name of the task. + Choices include mrpc, qqp, qnli, rte, + sts-b, cola, mnli, wnli. 
+ + """ + def __init__(self, task='mrpc'): + assert task in ['mrpc', 'qqp', 'qnli', 'rte', 'sts-b', 'cola', \ + 'mnli', 'wnli', 'sst-2'], 'Unsupported task type' + self.pred_list = None + self.label_list = None + self.task = task + self.return_key = { + "cola": "mcc", + "mrpc": "f1", + "sts-b": "corr", + "qqp": "acc", + "mnli": "mnli/acc", + "qnli": "acc", + "rte": "acc", + "wnli": "acc", + "sst-2": "acc" + } + + def update(self, preds, labels): + """add preds and labels to storage""" + if isinstance(preds, list) and len(preds) == 1: + preds = preds[0] + if isinstance(labels, list) and len(labels) == 1: + labels = labels[0] + if self.pred_list is None: + self.pred_list = preds + self.label_list = labels + else: + self.pred_list = np.append(self.pred_list, preds, axis=0) + self.label_list = np.append(self.label_list, labels, axis=0) + + def reset(self): + """clear preds and labels storage""" + self.pred_list = None + self.label_list = None + + def result(self): + """calculate metric""" + output_mode = transformers.glue_output_modes[self.task] + + if output_mode == "classification": + processed_preds = np.argmax(self.pred_list, axis=1) + elif output_mode == "regression": + processed_preds = np.squeeze(self.pred_list) + result = transformers.glue_compute_metrics(\ + self.task, processed_preds, self.label_list) + return result[self.return_key[self.task]] + +logger = logging.getLogger(__name__) +logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', + datefmt = '%m/%d/%Y %H:%M:%S', + level = logging.WARN) + +if __name__ == "__main__": + logger.info('Evaluating ONNXRuntime full precision accuracy and performance:') + parser = argparse.ArgumentParser( + description='BERT fine-tune examples for classification/regression tasks.', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument( + '--model_path', + type=str, + help="Pre-trained resnet50 model on onnx file" + ) + parser.add_argument( + '--do_eval', + action='store_true', \ + default=False + ) + parser.add_argument( + '--output_model', + type=str, + default=None, + help="output model path" + ) + parser.add_argument( + '--data_path', + type=str, + default=None, + help="input data path" + ) + parser.add_argument( + '--batch_size', + default=8, + type=int, + ) + parser.add_argument( + '--model_name_or_path', + type=str, + choices=['Intel/bert-base-uncased-mrpc', + 'Intel/roberta-base-mrpc', + 'Intel/xlm-roberta-base-mrpc', + 'Intel/camembert-base-mrpc', + 'distilbert-base-uncased-finetuned-sst-2-english', + 'Alireza1044/albert-base-v2-sst2', + 'philschmid/MiniLM-L6-H384-uncased-sst2', + 'Intel/MiniLM-L12-H384-uncased-mrpc', + 'bert-base-cased-finetuned-mrpc', + 'Intel/electra-small-discriminator-mrpc', + 'M-FAC/bert-mini-finetuned-mrpc', + 'Intel/xlnet-base-cased-mrpc', + 'Intel/bart-large-mrpc'], + help="pretrained model name or path" + ) + parser.add_argument( + '--task', + type=str, + choices=['mrpc', 'qqp', 'qnli', 'rte', 'sts-b', 'cola', \ + 'mnli', 'wnli', 'sst-2'], + help="GLUE task name" + ) + parser.add_argument( + '--num_heads', + default=12, + type=int, + ) + parser.add_argument( + '--hidden_size', + default=768, + type=int, + ) + + args = parser.parse_args() + + if ort.__version__ <= '1.13.1': + from onnxruntime.transformers import optimizer + from onnxruntime.transformers.fusion_options import FusionOptions + model_type = 'bart' if args.model_name_or_path == 'Intel/bart-large-mrpc' else 'bert' + opt_options = FusionOptions(model_type) + opt_options.enable_embed_layer_norm = False 
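+        # For older onnxruntime releases, run the ORT transformer fusion passes first
+        # (with EmbedLayerNormalization fusion disabled) so that the fused graph is the
+        # one handed to the mixed-precision conversion below; newer releases load the
+        # exported model as-is.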
+ + model_optimizer = optimizer.optimize_model( + args.model_path, + model_type, + num_heads=args.num_heads, + hidden_size=args.hidden_size, + optimization_options=opt_options) + model = model_optimizer.model + else: + model = onnx.load(args.model_path) + + from neural_compressor import MixedPrecisionConfig + from neural_compressor.mix_precision import fit + config = MixedPrecisionConfig(backend='onnxrt_dnnl_ep', precision='bf16') + converted_model = fit(model, config) + if any([i.domain in ['', 'ai.onnx'] and i.version < 15 for i in converted_model.model.opset_import]): + from onnx import version_converter + try: + new = version_converter.convert_version(converted_model.model, 15) + onnx.save(new, args.output_model) + except: + logging.warning("Fail to upgrade opset_import to > 15, " + "please upgrate it manually to run with bf16 data type") + else: + converted_model.save(args.output_model) + + if args.do_eval: + dataset = ONNXRTBertDataset(args.model_path, + data_dir=args.data_path, + model_name_or_path=args.model_name_or_path, + task=args.task) + dataloader = DefaultDataLoader(dataset, args.batch_size) + metric = ONNXRTGLUE(args.task) + + def eval_func(model, *args): + metric.reset() + session = ort.InferenceSession(model.SerializeToString(), + providers=ort.get_available_providers()) + ort_inputs = {} + len_inputs = len(session.get_inputs()) + inputs_names = [session.get_inputs()[i].name for i in range(len_inputs)] + for idx, (inputs, labels) in enumerate(dataloader): + if not isinstance(labels, list): + labels = [labels] + inputs = inputs[:len_inputs] + for i in range(len_inputs): + ort_inputs.update({inputs_names[i]: inputs[i]}) + predictions = session.run(None, ort_inputs) + metric.update(predictions[0], labels) + return metric.result() + + model = onnx.load(args.output_model) + acc_result = eval_func(model) + print("Batch size = %d" % args.batch_size) + print("Accuracy: %.5f" % acc_result) diff --git a/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/prepare_data.sh b/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/prepare_data.sh new file mode 100644 index 00000000000..8e434a5c521 --- /dev/null +++ b/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/prepare_data.sh @@ -0,0 +1,34 @@ +#!/bin/bash +set -x + +function main { + init_params "$@" + download_data + +} + +# init params +function init_params { + + for var in "$@" + do + case $var in + --data_dir=*) + data_dir=$(echo $var |cut -f2 -d=) + ;; + --task_name=*) + task_name=$(echo $var |cut -f2 -d=) + ;; + esac + done + +} + +# run_tuning +function download_data { + wget https://raw.githubusercontent.com/huggingface/transformers/f98ef14d161d7bcdc9808b5ec399981481411cc1/utils/download_glue_data.py + python download_glue_data.py --data_dir=${data_dir} --tasks=${task_name} +} + +main "$@" + diff --git a/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/requirements.txt b/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/requirements.txt new file mode 100644 index 00000000000..eb99b8d165d --- /dev/null +++ b/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/requirements.txt @@ -0,0 +1,8 @@ +torch +transformers +onnx +onnxruntime >= 1.14.0 +coloredlogs +sympy +onnxruntime-extensions; python_version < '3.10' +numpy diff --git a/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/run.sh b/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/run.sh new file mode 
100644 index 00000000000..033907a1e3c --- /dev/null +++ b/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/run.sh @@ -0,0 +1,118 @@ +#!/bin/bash +set -x + +function main { + init_params "$@" + run_tuning +} + +# init params +function init_params { + for var in "$@" + do + case $var in + --input_model=*) + input_model=$(echo $var |cut -f2 -d=) + ;; + --output_model=*) + output_model=$(echo $var |cut -f2 -d=) + ;; + esac + done + +} + +# run_tuning +function run_tuning { + + if [[ "${input_model}" =~ "bert-base-uncased" ]]; then + model_name_or_path="Intel/bert-base-uncased-mrpc" + TASK_NAME='mrpc' + num_heads=12 + hidden_size=768 + fi + if [[ "${input_model}" =~ "roberta-base" ]]; then + model_name_or_path="Intel/roberta-base-mrpc" + TASK_NAME='mrpc' + num_heads=12 + hidden_size=768 + fi + if [[ "${input_model}" =~ "xlm-roberta-base" ]]; then + model_name_or_path="Intel/xlm-roberta-base-mrpc" + TASK_NAME='mrpc' + num_heads=12 + hidden_size=768 + fi + if [[ "${input_model}" =~ "camembert-base" ]]; then + model_name_or_path="Intel/camembert-base-mrpc" + TASK_NAME='mrpc' + num_heads=12 + hidden_size=768 + fi + if [[ "${input_model}" =~ "distilbert-base" ]]; then + model_name_or_path="distilbert-base-uncased-finetuned-sst-2-english" + TASK_NAME='sst-2' + num_heads=12 + hidden_size=768 + fi + if [[ "${input_model}" =~ "albert-base" ]]; then + model_name_or_path="Alireza1044/albert-base-v2-sst2" + TASK_NAME='sst-2' + num_heads=12 + hidden_size=768 + fi + if [[ "${input_model}" =~ "MiniLM-L6" ]]; then + model_name_or_path="philschmid/MiniLM-L6-H384-uncased-sst2" + TASK_NAME='sst-2' + num_heads=12 + hidden_size=384 + fi + if [[ "${input_model}" =~ "MiniLM-L12" ]]; then + model_name_or_path="Intel/MiniLM-L12-H384-uncased-mrpc" + TASK_NAME='mrpc' + num_heads=12 + hidden_size=384 + fi + if [[ "${input_model}" =~ "bert-base-cased" ]]; then + model_name_or_path="bert-base-cased-finetuned-mrpc" + TASK_NAME='mrpc' + num_heads=12 + hidden_size=384 + fi + if [[ "${input_model}" =~ "xlnet-base-cased" ]]; then + model_name_or_path="Intel/xlnet-base-cased-mrpc" + TASK_NAME='mrpc' + num_heads=12 + hidden_size=768 + fi + if [[ "${input_model}" =~ "bert-mini" ]]; then + model_name_or_path="M-FAC/bert-mini-finetuned-mrpc" + TASK_NAME='mrpc' + num_heads=4 + hidden_size=256 + fi + if [[ "${input_model}" =~ "electra-small-discriminator" ]]; then + model_name_or_path="Intel/electra-small-discriminator-mrpc" + TASK_NAME='mrpc' + num_heads=4 + hidden_size=256 + fi + if [[ "${input_model}" =~ "bart" ]]; then + model_name_or_path="Intel/bart-large-mrpc" + TASK_NAME='mrpc' + num_heads=16 + hidden_size=4096 + fi + + python main.py \ + --model_name_or_path ${model_name_or_path} \ + --model_path ${input_model} \ + --output_model ${output_model} \ + --num_heads ${num_heads} \ + --hidden_size ${hidden_size} +} + +main "$@" + + + diff --git a/neural_compressor/adaptor/onnxrt.py b/neural_compressor/adaptor/onnxrt.py index 0f62b88a5b4..c0c6bcfc31d 100644 --- a/neural_compressor/adaptor/onnxrt.py +++ b/neural_compressor/adaptor/onnxrt.py @@ -65,6 +65,8 @@ def __init__(self, framework_specific_info): self.recipes = framework_specific_info.get("recipes", {}) self.backend = PROVIDERS[framework_specific_info["backend"]] self.performance_only = framework_specific_info.get("performance_only", False) + self.use_bf16 = framework_specific_info.get("use_bf16", False) + self.use_fp16 = framework_specific_info.get("use_fp16", False) if self.backend not in ort.get_all_providers(): logger.warning("{} backend is not 
supported in current environment, " @@ -93,6 +95,8 @@ def __init__(self, framework_specific_info): config_file = 'onnxrt_trt.yaml' elif self.backend == 'CUDAExecutionProvider': config_file = 'onnxrt_cuda.yaml' + elif self.backend == 'DnnlExecutionProvider': + config_file = 'onnxrt_dnnl.yaml' else: # pragma: no cover assert False, "{} provider is not supported in current environment, " \ "supported providers: {}".format(self.backend, @@ -590,9 +594,9 @@ def _detect_domain(self, model): # typically, NLP models have multiple inputs, # and the dimension of each input is usually 2 (batch_size, max_seq_len) if not model.is_large_model: - sess = ort.InferenceSession(model.model.SerializeToString(), providers=[self.backend]) + sess = ort.InferenceSession(model.model.SerializeToString(), providers=ort.get_available_providers()) elif model.model_path is not None: # pragma: no cover - sess = ort.InferenceSession(model.model_path, providers=[self.backend]) + sess = ort.InferenceSession(model.model_path, providers=ort.get_available_providers()) else: # pragma: no cover assert False, "Please use model path instead of onnx model object to quantize." input_shape_lens = [len(input.shape) for input in sess.get_inputs()] @@ -619,48 +623,51 @@ def _pre_optimize(self, model, level=1): from neural_compressor.adaptor.ox_utils.util import \ remove_init_from_model_input, split_shared_bias remove_init_from_model_input(model) - sess_options = ort.SessionOptions() - optimization_levels = { + try: + sess_options = ort.SessionOptions() + optimization_levels = { 'DISABLE_ALL': ort.GraphOptimizationLevel.ORT_DISABLE_ALL, 'ENABLE_BASIC': ort.GraphOptimizationLevel.ORT_ENABLE_BASIC, 'ENABLE_EXTENDED': ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED, 'ENABLE_ALL': ort.GraphOptimizationLevel.ORT_ENABLE_ALL} - if not isinstance(self.query_handler.get_graph_optimization(), list): - level = self.query_handler.get_graph_optimization() - elif options.onnxrt.graph_optimization.level is not None: - level = options.onnxrt.graph_optimization.level - elif self.recipes.get('graph_optimization_level', None) is not None: - level = self.recipes['graph_optimization_level'] - else: - if self.domain == "auto" and self._detect_domain(model): - self.domain = 'nlp' - level = 'ENABLE_EXTENDED' if self.domain == 'nlp' else 'ENABLE_BASIC' - logger.warning("Graph optimization level is automatically set to {}. 
" - "You can use 'recipe' argument in 'PostTrainingQuantConfig'" - "to overwrite it".format(level)) - sess_options.graph_optimization_level = optimization_levels[level] - sess_options.optimized_model_filepath = os.path.join(self.work_space, \ - "Optimized_model.onnx") - if sys.version_info < (3,10) and find_spec('onnxruntime_extensions'): # pragma: no cover - from onnxruntime_extensions import get_library_path - sess_options.register_custom_ops_library(get_library_path()) - backend = self.backend if self.backend != 'TensorrtExecutionProvider' else 'CUDAExecutionProvider' - if not model.is_large_model: - ort.InferenceSession(model.model.SerializeToString(), - sess_options, - providers=[backend]) - elif model.model_path is not None: # pragma: no cover - ort.InferenceSession(model.model_path, - sess_options, - providers=[backend]) - else: # pragma: no cover - logger.warning('Please use model path instead of onnx model object to quantize') - - tmp_model = onnx.load(sess_options.optimized_model_filepath, load_external_data=False) - if model.is_large_model: # pragma: no cover - from onnx.external_data_helper import load_external_data_for_model - load_external_data_for_model(tmp_model, os.path.split(model.model_path)[0]) - model.model_path = sess_options.optimized_model_filepath + if not isinstance(self.query_handler.get_graph_optimization(), list): + level = self.query_handler.get_graph_optimization() + elif options.onnxrt.graph_optimization.level is not None: + level = options.onnxrt.graph_optimization.level + elif self.recipes.get('graph_optimization_level', None) is not None: + level = self.recipes['graph_optimization_level'] + else: + if self.domain == "auto" and self._detect_domain(model): + self.domain = 'nlp' + level = 'ENABLE_EXTENDED' if self.domain == 'nlp' else 'ENABLE_BASIC' + logger.warning("Graph optimization level is automatically set to {}. 
" + "You can use 'recipe' argument in 'PostTrainingQuantConfig'" + "to overwrite it".format(level)) + sess_options.graph_optimization_level = optimization_levels[level] + sess_options.optimized_model_filepath = os.path.join(self.work_space, \ + "Optimized_model.onnx") + if sys.version_info < (3,10) and find_spec('onnxruntime_extensions'): # pragma: no cover + from onnxruntime_extensions import get_library_path + sess_options.register_custom_ops_library(get_library_path()) + if not model.is_large_model: + ort.InferenceSession(model.model.SerializeToString(), + sess_options, + providers=[self.backend]) + elif model.model_path is not None: # pragma: no cover + ort.InferenceSession(model.model_path, + sess_options, + providers=[self.backend]) + else: # pragma: no cover + logger.warning('Please use model path instead of onnx model object to quantize') + + tmp_model = onnx.load(sess_options.optimized_model_filepath, load_external_data=False) + + if model.is_large_model: # pragma: no cover + from onnx.external_data_helper import load_external_data_for_model + load_external_data_for_model(tmp_model, os.path.split(model.model_path)[0]) + model.model_path = sess_options.optimized_model_filepath + except: + tmp_model = model model.model = self._replace_gemm_with_matmul(tmp_model).model if \ options.onnxrt.graph_optimization.gemm2matmul and self.recipes.get('gemm_to_matmul', True) else \ tmp_model @@ -859,10 +866,13 @@ def query_fw_capability(self, model): precisions = query.get_precisions() for precision in precisions: - if precision in ['fp16', 'bf16'] and (self.device == 'cpu' or self.backend != 'CUDAExecutionProvider'): + if precision == 'fp16' and \ + (not self.use_fp16 or 'CUDAExecutionProvider' not in ort.get_available_providers()): continue - elif precision == 'bf16' and 'CUDAExecutionProvider' not in ort.get_available_providers(): + if precision == 'bf16' and \ + (not self.use_bf16 or (not CpuInfo().bf16 and os.getenv('FORCE_BF16') != '1')): continue + # get supported optype for target precision optypes = query.get_op_types_by_precision(precision) if \ query.get_op_types_by_precision(precision) != ['*'] else \ diff --git a/neural_compressor/adaptor/onnxrt_dnnl.yaml b/neural_compressor/adaptor/onnxrt_dnnl.yaml new file mode 100644 index 00000000000..704d63ead93 --- /dev/null +++ b/neural_compressor/adaptor/onnxrt_dnnl.yaml @@ -0,0 +1,411 @@ +## Copyright (c) 2021 Intel Corporation +## +## Licensed under the Apache License, Version 2.0 (the "License"); +## you may not use this file except in compliance with the License. +## You may obtain a copy of the License at +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +## See the License for the specific language governing permissions and +## limitations under the License. 
+## +# + +- + version: + name: '1.6.0' + int8: &ref_1_6 { + 'static': &ref_1_6_static { + 'Conv': { + 'weight': &int8_sym_perchanneltensor_minmax { + 'dtype': ['int8'], + 'scheme': ['sym'], + 'granularity': ['per_channel', 'per_tensor'], + 'algorithm': ['minmax'] + }, + 'activation': &uint8_asym_pertensor_minmax { + 'dtype': ['uint8'], + 'scheme': ['asym'], + 'granularity': ['per_tensor'], + 'algorithm': ['minmax'] + }, + 'mode': ['QDQ', 'QLinear'] + }, + 'FusedConv': { + 'weight': *int8_sym_perchanneltensor_minmax, #'QDQ': *int8_sym_pertensor_minmax + 'activation': *uint8_asym_pertensor_minmax, + 'mode': ['QDQ', 'QLinear'] + }, + 'Gather': { + 'weight': &uint8_asym_perchanneltensor_minmax { + 'dtype': ['uint8'], + 'scheme': ['asym'], + 'granularity': ['per_channel', 'per_tensor'], + 'algorithm': ['minmax'] + }, + 'activation': *uint8_asym_pertensor_minmax, + 'mode': ['QDQ', 'QLinear'] + }, + 'MatMul': { + 'weight': &int8_sym_pertensor_minmax { + 'dtype': ['int8'], + 'scheme': ['sym'], + 'granularity': ['per_tensor'], + 'algorithm': ['minmax'] + }, + 'activation': *uint8_asym_pertensor_minmax, + 'mode': ['QDQ', 'QLinear'] + }, + 'Attention': &default_static_qlinear_qdq { + 'weight': *int8_sym_pertensor_minmax, + 'activation': *uint8_asym_pertensor_minmax, + 'mode': ['QDQ', 'QLinear'] + }, + 'Mul': &default_static_qlinear { + 'weight': *int8_sym_pertensor_minmax, + 'activation': *uint8_asym_pertensor_minmax, + 'mode': ['QLinear'] + }, + 'Relu': *default_static_qlinear_qdq, + 'Clip': *default_static_qlinear_qdq, + 'LeakyRelu': *default_static_qlinear_qdq, + 'Sigmoid': *default_static_qlinear_qdq, + 'MaxPool': *default_static_qlinear_qdq, + 'EmbedLayerNormalization': *default_static_qlinear_qdq, + 'GlobalAveragePool': *default_static_qlinear_qdq, + 'Add': *default_static_qlinear, + }, + 'dynamic': &ref_1_6_dynamic { + 'Conv': { + 'weight': *uint8_asym_perchanneltensor_minmax, + 'activation': *uint8_asym_pertensor_minmax + }, + 'FusedConv': { + 'weight': *uint8_asym_perchanneltensor_minmax, + 'activation': *uint8_asym_pertensor_minmax + }, + 'MatMul': &default_dynamic { + 'weight': *int8_sym_pertensor_minmax, + 'activation': *uint8_asym_pertensor_minmax + }, + 'Gather': *default_dynamic, + 'Attention': *default_dynamic, + 'EmbedLayerNormalization': *default_dynamic, + 'LSTM': *default_dynamic, + } + } + recipes: &default_optimization + graph_optimization: # from onnxruntime graph_optimization_level + level: ['DISABLE_ALL', 'ENABLE_BASIC', 'ENABLE_EXTENDED', 'ENABLE_ALL'] + +- + version: + name: '1.7.0' + int8: { + 'static': { + 'FusedConv': { + 'weight': *int8_sym_perchanneltensor_minmax, #'QDQ': *int8_sym_pertensor_minmax + 'activation': *uint8_asym_pertensor_minmax, + 'mode': ['QDQ', 'QLinear'] + }, + 'Conv': { + 'weight': *int8_sym_perchanneltensor_minmax, + 'activation': *uint8_asym_pertensor_minmax, + 'mode': ['QDQ', 'QLinear'] + }, + 'Gather': { + 'weight': *uint8_asym_perchanneltensor_minmax, + 'activation': *uint8_asym_pertensor_minmax, + 'mode': ['QDQ', 'QLinear'] + }, + 'MatMul': *default_static_qlinear_qdq, + 'Attention': *default_static_qlinear_qdq, + 'Mul': *default_static_qlinear, + 'Relu': *default_static_qlinear_qdq, + 'Clip': *default_static_qlinear_qdq, + 'LeakyRelu': *default_static_qlinear_qdq, + 'Sigmoid': *default_static_qlinear_qdq, + 'MaxPool': *default_static_qlinear_qdq, + 'EmbedLayerNormalization': *default_static_qlinear_qdq, + 'GlobalAveragePool': *default_static_qlinear_qdq, + 'Pad': *default_static_qlinear_qdq, + 'Split': *default_static_qlinear_qdq, + 'Add': 
*default_static_qlinear, + }, + 'dynamic': *ref_1_6_dynamic + } + recipes: + <<: *default_optimization + +- + version: + name: '1.8.0' + int8: { + 'static': { + 'FusedConv': { + 'weight': *int8_sym_perchanneltensor_minmax, # QDQ: *int8_sym_pertensor_minmax + 'activation': *uint8_asym_pertensor_minmax, + 'mode': ['QDQ', 'QLinear'] + }, + 'Conv': { + 'weight': *int8_sym_perchanneltensor_minmax, + 'activation': *uint8_asym_pertensor_minmax, + 'mode': ['QDQ', 'QLinear'] + }, + 'Gather': { + 'weight': *uint8_asym_perchanneltensor_minmax, + 'activation': *uint8_asym_pertensor_minmax, + 'mode': ['QDQ', 'QLinear'] + }, + 'MatMul': { + 'weight': *int8_sym_perchanneltensor_minmax, # QDQ: *int8_sym_pertensor_minmax + 'activation': *uint8_asym_pertensor_minmax, + 'mode': ['QDQ', 'QLinear'] + }, + 'Attention': *default_static_qlinear_qdq, + 'Mul': *default_static_qlinear, + 'Relu': *default_static_qlinear_qdq, + 'Clip': *default_static_qlinear_qdq, + 'LeakyRelu': *default_static_qlinear_qdq, + 'Sigmoid': *default_static_qlinear_qdq, + 'MaxPool': *default_static_qlinear_qdq, + 'EmbedLayerNormalization': *default_static_qlinear_qdq, + 'GlobalAveragePool': *default_static_qlinear_qdq, + 'Pad': *default_static_qlinear_qdq, + 'Split': *default_static_qlinear_qdq, + 'Add': *default_static_qlinear, + 'Squeeze': *default_static_qlinear_qdq, + 'Reshape': *default_static_qlinear_qdq, + 'Concat': *default_static_qlinear_qdq, + 'AveragePool': *default_static_qlinear_qdq, + 'Unsqueeze': *default_static_qlinear_qdq, + 'Transpose': *default_static_qlinear_qdq, + 'Resize': *default_static_qlinear_qdq, + }, + 'dynamic': { + 'Conv': { + 'weight': *uint8_asym_perchanneltensor_minmax, + 'activation': *uint8_asym_pertensor_minmax + }, + 'FusedConv': { + 'weight': *uint8_asym_perchanneltensor_minmax, + 'activation': *uint8_asym_pertensor_minmax + }, + 'MatMul': { + 'weight': *int8_sym_perchanneltensor_minmax, + 'activation': *uint8_asym_pertensor_minmax + }, + 'Gather': *default_dynamic, + 'Attention': *default_dynamic, + 'EmbedLayerNormalization': *default_dynamic, + 'LSTM': *default_dynamic, + } + } + recipes: + <<: *default_optimization + +- + version: + name: '1.9.0' + int8: { + 'static': { + 'FusedConv': { + 'weight': *int8_sym_perchanneltensor_minmax, # QDQ: *int8_sym_pertensor_minmax + 'activation': *uint8_asym_pertensor_minmax, + 'mode': ['QDQ', 'QLinear'] + }, + 'Conv': { + 'weight': *int8_sym_perchanneltensor_minmax, + 'activation': *uint8_asym_pertensor_minmax, + 'mode': ['QDQ', 'QLinear'] + }, + 'Gather': { + 'weight': *uint8_asym_perchanneltensor_minmax, + 'activation': *uint8_asym_pertensor_minmax, + 'mode': ['QDQ', 'QLinear'] + }, + 'MatMul': { + 'weight': *int8_sym_perchanneltensor_minmax, # QDQ: *int8_sym_pertensor_minmax + 'activation': *uint8_asym_pertensor_minmax, + 'mode': ['QDQ', 'QLinear'] + }, + 'EmbedLayerNormalization': { + 'weight': *uint8_asym_pertensor_minmax, # QDQ: *int8_sym_pertensor_minmax + 'activation': *uint8_asym_pertensor_minmax, + 'mode': ['QDQ', 'QLinear'] + }, + 'Attention': *default_static_qlinear_qdq, + 'Mul': *default_static_qlinear, + 'Relu': *default_static_qlinear_qdq, + 'Clip': *default_static_qlinear_qdq, + 'LeakyRelu': *default_static_qlinear_qdq, + 'Sigmoid': *default_static_qlinear_qdq, + 'MaxPool': *default_static_qlinear_qdq, + 'GlobalAveragePool': *default_static_qlinear_qdq, + 'Pad': *default_static_qlinear_qdq, + 'Split': *default_static_qlinear_qdq, + 'Add': *default_static_qlinear, + 'Squeeze': *default_static_qlinear_qdq, + 'Reshape': *default_static_qlinear_qdq, + 
'Concat': *default_static_qlinear_qdq, + 'AveragePool': *default_static_qlinear_qdq, + 'Unsqueeze': *default_static_qlinear_qdq, + 'Transpose': *default_static_qlinear_qdq, + 'Resize': *default_static_qlinear_qdq, + }, + 'dynamic': &ref_1_9_dynamic { + 'Conv': { + 'weight': *uint8_asym_pertensor_minmax, + 'activation': *uint8_asym_pertensor_minmax + }, + 'FusedConv': { + 'weight': *uint8_asym_pertensor_minmax, + 'activation': *uint8_asym_pertensor_minmax + }, + 'MatMul': { + 'weight': *int8_sym_perchanneltensor_minmax, + 'activation': *uint8_asym_pertensor_minmax + }, + 'EmbedLayerNormalization': { + 'weight': *uint8_asym_pertensor_minmax, + 'activation': *uint8_asym_pertensor_minmax + }, + 'Gather': *default_dynamic, + 'Attention': *default_dynamic, + 'LSTM': *default_dynamic, + } + } + recipes: + <<: *default_optimization + +- + version: + name: '1.10.0' + int8: { + 'static': { + 'FusedConv': { + 'weight': *int8_sym_perchanneltensor_minmax, # QDQ: *int8_sym_pertensor_minmax + 'activation': *uint8_asym_pertensor_minmax, + 'mode': ['QDQ', 'QLinear'] + }, + 'Conv': { + 'weight': *int8_sym_perchanneltensor_minmax, + 'activation': *uint8_asym_pertensor_minmax, + 'mode': ['QDQ', 'QLinear'] + }, + 'Gather': { + 'weight': *uint8_asym_perchanneltensor_minmax, + 'activation': *uint8_asym_pertensor_minmax, + 'mode': ['QDQ', 'QLinear'] + }, + 'MatMul': { + 'weight': *int8_sym_perchanneltensor_minmax, + 'activation': *uint8_asym_pertensor_minmax, + 'mode': ['QDQ', 'QLinear'] + }, + 'EmbedLayerNormalization': { + 'weight': *uint8_asym_pertensor_minmax, # QDQ: *int8_sym_pertensor_minmax + 'activation': *uint8_asym_pertensor_minmax, + 'mode': ['QDQ', 'QLinear'] + }, + 'Attention': *default_static_qlinear_qdq, + 'Mul': *default_static_qlinear, + 'Relu': *default_static_qlinear_qdq, + 'Clip': *default_static_qlinear_qdq, + 'LeakyRelu': *default_static_qlinear_qdq, + 'Sigmoid': *default_static_qlinear_qdq, + 'MaxPool': *default_static_qlinear_qdq, + 'GlobalAveragePool': *default_static_qlinear_qdq, + 'Pad': *default_static_qlinear_qdq, + 'Split': *default_static_qlinear_qdq, + 'Add': *default_static_qlinear, + 'Squeeze': *default_static_qlinear_qdq, + 'Reshape': *default_static_qlinear_qdq, + 'Concat': *default_static_qlinear_qdq, + 'AveragePool': *default_static_qlinear_qdq, + 'Unsqueeze': *default_static_qlinear_qdq, + 'Transpose': *default_static_qlinear_qdq, + 'Resize': *default_static_qlinear_qdq, + }, + 'dynamic': *ref_1_9_dynamic + } + recipes: + <<: *default_optimization + +- + version: + name: '1.11.0' + int8: &ref_1_11 { + 'static': { + 'FusedConv': { + 'weight': *int8_sym_perchanneltensor_minmax, # QDQ: *int8_sym_pertensor_minmax + 'activation': *uint8_asym_pertensor_minmax, + 'mode': ['QDQ', 'QLinear'] + }, + 'Conv': { + 'weight': *int8_sym_perchanneltensor_minmax, + 'activation': *uint8_asym_pertensor_minmax, + 'mode': ['QDQ', 'QLinear'] + }, + 'Gather': { + 'weight': *uint8_asym_perchanneltensor_minmax, + 'activation': *uint8_asym_pertensor_minmax, + 'mode': ['QDQ', 'QLinear'] + }, + 'MatMul': { + 'weight': *int8_sym_perchanneltensor_minmax, + 'activation': *uint8_asym_pertensor_minmax, + 'mode': ['QDQ', 'QLinear'] + }, + 'Gemm': { + 'weight': *int8_sym_perchanneltensor_minmax, + 'activation': *uint8_asym_pertensor_minmax, + 'mode': ['QDQ', 'QLinear'] + }, + 'EmbedLayerNormalization': { + 'weight': *uint8_asym_pertensor_minmax, # QDQ: *int8_sym_pertensor_minmax + 'activation': *uint8_asym_pertensor_minmax, + 'mode': ['QDQ', 'QLinear'] + }, + 'Attention': *default_static_qlinear_qdq, + 'Mul': 
*default_static_qlinear, + 'Relu': *default_static_qlinear_qdq, + 'Clip': *default_static_qlinear_qdq, + 'LeakyRelu': *default_static_qlinear_qdq, + 'Sigmoid': *default_static_qlinear_qdq, + 'MaxPool': *default_static_qlinear_qdq, + 'GlobalAveragePool': *default_static_qlinear_qdq, + 'Pad': *default_static_qlinear_qdq, + 'Split': *default_static_qlinear_qdq, + 'Add': *default_static_qlinear, + 'Squeeze': *default_static_qlinear_qdq, + 'Reshape': *default_static_qlinear_qdq, + 'Concat': *default_static_qlinear_qdq, + 'AveragePool': *default_static_qlinear_qdq, + 'Unsqueeze': *default_static_qlinear_qdq, + 'Transpose': *default_static_qlinear_qdq, + 'ArgMax': *default_static_qlinear, + 'Resize': *default_static_qlinear_qdq, + }, + 'dynamic': *ref_1_9_dynamic + } + recipes: + <<: *default_optimization + +- + version: + name: '1.14.0' + int8: *ref_1_11 + bf16: &common_bf16 ['MatMul', 'Gemm'] + recipes: + <<: *default_optimization + +- + version: + name: 'default' + int8: *ref_1_6 + recipes: + <<: *default_optimization diff --git a/neural_compressor/adaptor/ox_utils/util.py b/neural_compressor/adaptor/ox_utils/util.py index de369ca33a0..0040db9f21e 100644 --- a/neural_compressor/adaptor/ox_utils/util.py +++ b/neural_compressor/adaptor/ox_utils/util.py @@ -70,13 +70,15 @@ PROVIDERS = { 'default': 'CPUExecutionProvider', 'onnxrt_trt_ep': 'TensorrtExecutionProvider', + 'onnxrt_dnnl_ep': 'DnnlExecutionProvider', 'onnxrt_cuda_ep': 'CUDAExecutionProvider', } ONNXRT_BACKENDS = { 'CPUExecutionProvider': 'default', 'TensorrtExecutionProvider': 'onnxrt_trt_ep', - 'CUDAExecutionProvider': 'onnxrt_cuda_ep' + 'CUDAExecutionProvider': 'onnxrt_cuda_ep', + 'DnnlExecutionProvider': 'onnxrt_dnnl_ep' } def dtype_to_name(dtype_mapping, dtype): diff --git a/neural_compressor/config.py b/neural_compressor/config.py index 4528926ea71..a4a661cf0a6 100644 --- a/neural_compressor/config.py +++ b/neural_compressor/config.py @@ -259,7 +259,8 @@ class BenchmarkConfig: inputs (list, optional): A list of strings containing the inputs of model. Default is an empty list. outputs (list, optional): A list of strings containing the outputs of model. Default is an empty list. backend (str, optional): Backend name for model execution. Supported values include: 'default', 'itex', - 'ipex', 'onnxrt_trt_ep', 'onnxrt_cuda_ep'. Default value is 'default'. + 'ipex', 'onnxrt_trt_ep', 'onnxrt_cuda_ep', 'onnxrt_dnnl_ep'. + Default value is 'default'. warmup (int, optional): The number of iterations to perform warmup before running performance tests. Default value is 5. iteration (int, optional): The number of iterations to run performance tests. Default is -1. @@ -327,7 +328,7 @@ def backend(self): def backend(self, backend): """Set backend.""" if _check_value('backend', backend, str, [ - 'default', 'itex', 'ipex', 'onnxrt_trt_ep', 'onnxrt_cuda_ep']): + 'default', 'itex', 'ipex', 'onnxrt_trt_ep', 'onnxrt_cuda_ep', 'onnxrt_dnnl_ep']): self._backend = backend @property @@ -672,7 +673,8 @@ class _BaseQuantizationConfig: Args: inputs: Inputs of model, only required in tensorflow. outputs: Outputs of model, only required in tensorflow. - backend: Backend for model execution. Support 'default', 'itex', 'ipex', 'onnxrt_trt_ep', 'onnxrt_cuda_ep' + backend: Backend for model execution. + Support 'default', 'itex', 'ipex', 'onnxrt_trt_ep', 'onnxrt_cuda_ep', 'onnxrt_dnnl_ep' domain: Model domain. Support 'auto', 'cv', 'object_detection', 'nlp' and 'recommendation_system'. 
Adaptor will use specific quantization settings for different domains automatically, and explicitly specified quantization settings will override the automatic setting. @@ -1038,7 +1040,7 @@ def backend(self): @backend.setter def backend(self, backend): if _check_value('backend', backend, str, [ - 'default', 'itex', 'ipex', 'onnxrt_trt_ep', 'onnxrt_cuda_ep']): + 'default', 'itex', 'ipex', 'onnxrt_trt_ep', 'onnxrt_cuda_ep', 'onnxrt_dnnl_ep']): self._backend = backend @property @@ -1083,7 +1085,8 @@ class PostTrainingQuantConfig(_BaseQuantizationConfig): Args: device: Support 'cpu' and 'gpu'. - backend: Backend for model execution. Support 'default', 'itex', 'ipex', 'onnxrt_trt_ep', 'onnxrt_cuda_ep' + backend: Backend for model execution. + Support 'default', 'itex', 'ipex', 'onnxrt_trt_ep', 'onnxrt_cuda_ep', 'onnxrt_dnnl_ep' domain: Model domain. Support 'auto', 'cv', 'object_detection', 'nlp' and 'recommendation_system'. Adaptor will use specific quantization settings for different domains automatically, and explicitly specified quantization settings will override the automatic setting. @@ -1242,7 +1245,8 @@ class QuantizationAwareTrainingConfig(_BaseQuantizationConfig): Args: device: Support 'cpu' and 'gpu'. - backend: Backend for model execution. Support 'default', 'itex', 'ipex', 'onnxrt_trt_ep', 'onnxrt_cuda_ep' + backend: Backend for model execution. + Support 'default', 'itex', 'ipex', 'onnxrt_trt_ep', 'onnxrt_cuda_ep', 'onnxrt_dnnl_ep' inputs: Inputs of model, only required in tensorflow. outputs: Outputs of model, only required in tensorflow. op_type_dict: Tuning constraints on optype-wise for advance user to reduce tuning space. @@ -1684,8 +1688,8 @@ class MixedPrecisionConfig(object): device (str, optional): Device for execution. Support 'cpu' and 'gpu', default is 'cpu'. backend (str, optional): Backend for model execution. - Support 'default', 'itex', 'ipex', 'onnxrt_trt_ep', 'onnxrt_cuda_ep', - default is 'default', 'ipex' doesn't support tune. + Support 'default', 'itex', 'ipex', 'onnxrt_trt_ep', 'onnxrt_cuda_ep', 'onnxrt_dnnl_ep' + default is 'default'. precisions ([str, list], optional): Target precision for mix precision conversion. Support 'bf16' and 'fp16', default is 'bf16'. model_name (str, optional): The name of the model. Default value is empty. 
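For reference, the new `onnxrt_dnnl_ep` backend is driven through the same mixed-precision API as the other ONNX Runtime backends. Below is a minimal sketch that mirrors the example script added in this patch (model and output paths are placeholders):

```python
import onnx

from neural_compressor import MixedPrecisionConfig
from neural_compressor.mix_precision import fit

# Load the fp32 ONNX model produced by export.py (placeholder path).
model = onnx.load("bert-base-uncased-mrpc.onnx")

# Select the DNNL execution provider backend and request bf16 conversion,
# as the mix_precision example main.py in this patch does.
config = MixedPrecisionConfig(backend="onnxrt_dnnl_ep", precision="bf16")
converted_model = fit(model, config)

# Save the converted model; DnnlExecutionProvider must be available at inference time.
converted_model.save("bert-base-uncased-mrpc-bf16.onnx")
```

Running the converted model then only requires that `DnnlExecutionProvider` appears in `onnxruntime.get_available_providers()`, as noted in the example README.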
diff --git a/neural_compressor/strategy/strategy.py b/neural_compressor/strategy/strategy.py index 3624b342286..e5ac78fadd1 100644 --- a/neural_compressor/strategy/strategy.py +++ b/neural_compressor/strategy/strategy.py @@ -1250,6 +1250,11 @@ def _set_framework_info(self, q_dataloader, q_func=None): framework_specific_info['backend'] == 'onnxrt_trt_ep': framework_specific_info.update({'format': 'QDQ'}) framework = 'onnxrt_qdq' + if framework_specific_info['backend'] == 'onnxrt_cuda_ep' and self.cfg.device =='gpu': + framework_specific_info['use_fp16'] = True + framework_specific_info['use_bf16'] = True + if framework_specific_info['backend'] == 'onnxrt_dnnl_ep' and self.cfg.device == 'cpu': + framework_specific_info['use_bf16'] = True if framework == 'pytorch_ipex' or framework == 'pytorch' or framework == 'pytorch_fx': if self.config.backend == 'ipex': framework = 'pytorch_ipex' From 60831abff3052f051ac626b4bdad86e42b892d5e Mon Sep 17 00:00:00 2001 From: Mengni Wang Date: Sat, 27 May 2023 21:31:52 +0800 Subject: [PATCH 02/21] add op and fix bug Signed-off-by: Mengni Wang --- .../scripts/codeScan/pyspelling/inc_dict.txt | 1 + .../mix_precision/README.md | 2 +- neural_compressor/adaptor/onnxrt_dnnl.yaml | 8 +- neural_compressor/strategy/strategy.py | 4 +- .../onnxrt_adaptor/test_onnxrt_operators.py | 171 ++++++++++++++++++ 5 files changed, 182 insertions(+), 4 deletions(-) diff --git a/.azure-pipelines/scripts/codeScan/pyspelling/inc_dict.txt b/.azure-pipelines/scripts/codeScan/pyspelling/inc_dict.txt index 1ba7cd7a55e..af758fcce72 100644 --- a/.azure-pipelines/scripts/codeScan/pyspelling/inc_dict.txt +++ b/.azure-pipelines/scripts/codeScan/pyspelling/inc_dict.txt @@ -495,6 +495,7 @@ dnf dnn dnnl DNNL +DnnlExecutionProvider Dockerfile doclist docstrings diff --git a/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/README.md b/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/README.md index f5d976123a3..69e02a5059a 100644 --- a/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/README.md +++ b/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/README.md @@ -52,7 +52,7 @@ bash prepare_data.sh --data_dir=$GLUE_DIR --task_name=$TASK_NAME # Run -If the hardware doesn't support bf16 instruction, please set flag as below to force bf16 conversion (this way will be deperecated): +If the hardware doesn't support bf16 instruction, please set flag as below to force bf16 conversion (this way will be deprecated): ```shell export FORCE_BF16=1 diff --git a/neural_compressor/adaptor/onnxrt_dnnl.yaml b/neural_compressor/adaptor/onnxrt_dnnl.yaml index 704d63ead93..2d2d718130c 100644 --- a/neural_compressor/adaptor/onnxrt_dnnl.yaml +++ b/neural_compressor/adaptor/onnxrt_dnnl.yaml @@ -399,7 +399,13 @@ version: name: '1.14.0' int8: *ref_1_11 - bf16: &common_bf16 ['MatMul', 'Gemm'] + bf16: &common_bf16 ['MatMul', 'Gemm', 'BatchNormalization', 'Softmax', 'Sum', + 'Abs', 'BiasGelu', 'Exp', 'FastGelu', 'Gelu', 'Log', 'Relu', 'Round', 'Sigmoid', + 'Sqrt', 'Tanh', 'Add', 'Sub', 'Mul', 'Div', 'Pow', 'ReduceMean', 'Equal', + 'FusedMatMul', 'Greater', 'GreaterOrEqual', 'LeakyRelu', 'Less', 'LessOrEqual', + 'Reshape', 'Squeeze', 'Transpose', 'Unsqueeze', 'ReduceL1', 'ReduceL2', 'ReduceLogSum', + 'ReduceLogSumExp', 'ReduceMax', 'ReduceProd', 'ReduceSum', 'ReduceSumSquare', + 'LayerNormalization', 'Concat'] recipes: <<: *default_optimization diff --git a/neural_compressor/strategy/strategy.py b/neural_compressor/strategy/strategy.py 
index e5ac78fadd1..512c3a5d20b 100644 --- a/neural_compressor/strategy/strategy.py +++ b/neural_compressor/strategy/strategy.py @@ -1250,10 +1250,10 @@ def _set_framework_info(self, q_dataloader, q_func=None): framework_specific_info['backend'] == 'onnxrt_trt_ep': framework_specific_info.update({'format': 'QDQ'}) framework = 'onnxrt_qdq' - if framework_specific_info['backend'] == 'onnxrt_cuda_ep' and self.cfg.device =='gpu': + if framework_specific_info['backend'] == 'onnxrt_cuda_ep' and self.config.device =='gpu': framework_specific_info['use_fp16'] = True framework_specific_info['use_bf16'] = True - if framework_specific_info['backend'] == 'onnxrt_dnnl_ep' and self.cfg.device == 'cpu': + if framework_specific_info['backend'] == 'onnxrt_dnnl_ep' and self.config.device == 'cpu': framework_specific_info['use_bf16'] = True if framework == 'pytorch_ipex' or framework == 'pytorch' or framework == 'pytorch_fx': if self.config.backend == 'ipex': diff --git a/test/adaptor/onnxrt_adaptor/test_onnxrt_operators.py b/test/adaptor/onnxrt_adaptor/test_onnxrt_operators.py index fc29326a3ca..4b99baf1e7f 100644 --- a/test/adaptor/onnxrt_adaptor/test_onnxrt_operators.py +++ b/test/adaptor/onnxrt_adaptor/test_onnxrt_operators.py @@ -10,6 +10,7 @@ from neural_compressor.adaptor.ox_utils.util import QuantizedInitializer, QuantizedValue, QuantizationMode import onnxruntime as ort from neural_compressor.config import ONNXQlinear2QDQConfig +from neural_compressor.utils.utility import CpuInfo def build_model(): initializers = [] @@ -1174,6 +1175,176 @@ def test_fp16(self): session = ort.InferenceSession(convert_model.model.SerializeToString(), providers=['CUDAExecutionProvider']) outputs = session.run(None, input_data) + def get_bf16_mixed_precision_model(self, model): + from neural_compressor import MixedPrecisionConfig + from neural_compressor.mix_precision import fit + config = MixedPrecisionConfig(backend='onnxrt_dnnl_ep', precision='bf16') + converted_model = fit(model, config) + return converted_model + + @unittest.skipIf(not CpuInfo().bf16 or 'DnnlExecutionProvider' not in ort.get_all_providers(), + "skip since DnnlExecutionProvider is not supported") + def test_bf16(self): + optypes = ['Sum', 'Sub', 'Div', 'Pow', 'Add'] + for optype in optypes: + inps = [['input1', TensorProto.FLOAT, (1,2)]] + outs = [['output', TensorProto.FLOAT, (1,2)]] + weights = [['input2', TensorProto.FLOAT, (1,2), np.random.random((2))]] + node_infos = [['test', ['input1', 'input2'], ['output'], optype]] + model = self.build_model(inps, outs, weights, node_infos) + input_data = self.build_test_data(['input1'], [(1,2)], ['float32']) + convert_model = self.get_bf16_mixed_precision_model(model) + self.assertTrue('Cast' in set([i.op_type for i in convert_model.nodes()])) + self.assertTrue(16 in set([i.attribute[0].i for i in convert_model.nodes() if i.op_type == 'Cast'])) + session = ort.InferenceSession(convert_model.model.SerializeToString(), providers=['DnnlExecutionProvider']) + outputs = session.run(None, input_data) + + optypes = ['Equal', 'Greater', 'GreaterOrEqual', 'Less', 'LessOrEqual'] + for optype in optypes: + inps = [['input1', TensorProto.FLOAT, (1,2)]] + outs = [['output', TensorProto.BOOL, (1,2)]] + weights = [['input2', TensorProto.FLOAT, (1,2), np.random.random((2))]] + node_infos = [['test', ['input1', 'input2'], ['output'], optype]] + model = self.build_model(inps, outs, weights, node_infos) + input_data = self.build_test_data(['input1'], [(1,2)], ['float32']) + convert_model = 
self.get_bf16_mixed_precision_model(model) + self.assertTrue('Cast' in set([i.op_type for i in convert_model.nodes()])) + self.assertTrue(16 in set([i.attribute[0].i for i in convert_model.nodes() if i.op_type == 'Cast'])) + session = ort.InferenceSession(convert_model.model.SerializeToString(), providers=['DnnlExecutionProvider']) + outputs = session.run(None, input_data) + + optypes = ['Abs', 'Exp', 'Log', 'Round', 'Sqrt', 'Softmax', 'Exp', 'Tanh', 'Sigmoid', 'LeakyRelu', 'Round'] + for optype in optypes: + inps = [['input1', TensorProto.FLOAT, (1,2)]] + outs = [['output', TensorProto.FLOAT, (1,2)]] + node_infos = [['test', ['input1'], ['output'], optype]] + model = self.build_model(inps, outs, [], node_infos) + input_data = self.build_test_data(['input1'], [(1,2)], ['float32']) + convert_model = self.get_bf16_mixed_precision_model(model) + self.assertTrue('Cast' in set([i.op_type for i in convert_model.nodes()])) + self.assertTrue(16 in set([i.attribute[0].i for i in convert_model.nodes() if i.op_type == 'Cast'])) + session = ort.InferenceSession(convert_model.model.SerializeToString(), providers=['DnnlExecutionProvider']) + outputs = session.run(None, input_data) + + optypes = ['ReduceMean', 'ReduceL1', 'ReduceL2', 'ReduceLogSum', 'ReduceLogSumExp', 'ReduceMax', 'ReduceProd', \ + 'ReduceSum', 'ReduceSumSquare'] + for optype in optypes: + inps = [['input1', TensorProto.FLOAT, (1,2)]] + outs = [['output', TensorProto.FLOAT, (1,1)]] + node_infos = [['test', ['input1'], ['output'], optype]] + model = self.build_model(inps, outs, [], node_infos) + input_data = self.build_test_data(['input1'], [(1,2)], ['float32']) + convert_model = self.get_bf16_mixed_precision_model(model) + self.assertTrue('Cast' in set([i.op_type for i in convert_model.nodes()])) + self.assertTrue(16 in set([i.attribute[0].i for i in convert_model.nodes() if i.op_type == 'Cast'])) + session = ort.InferenceSession(convert_model.model.SerializeToString(), providers=['DnnlExecutionProvider']) + outputs = session.run(None, input_data) + + optypes = ['Gelu'] + for optype in optypes: + inps = [['input1', TensorProto.FLOAT, (1,2)]] + outs = [['output', TensorProto.FLOAT, (1,2)]] + node_infos = [['test', ['input1'], ['output'], optype, 'com.microsoft']] + model = self.build_model(inps, outs, [], node_infos) + input_data = self.build_test_data(['input1'], [(1,2)], ['float32']) + convert_model = self.get_bf16_mixed_precision_model(model) + self.assertTrue('Cast' in set([i.op_type for i in convert_model.nodes()])) + self.assertTrue(16 in set([i.attribute[0].i for i in convert_model.nodes() if i.op_type == 'Cast'])) + session = ort.InferenceSession(convert_model.model.SerializeToString(), providers=['DnnlExecutionProvider']) + outputs = session.run(None, input_data) + + optypes = ['BiasGelu', 'FastGelu'] + for optype in optypes: + inps = [['input1', TensorProto.FLOAT, [2]]] + outs = [['output', TensorProto.FLOAT, [2]]] + weights = [['input2', TensorProto.FLOAT, [2], np.random.random((2))]] + node_infos = [['test', ['input1', 'input2'], ['output'], optype, 'com.microsoft']] + model = self.build_model(inps, outs, weights, node_infos) + input_data = self.build_test_data(['input1'], [(2)], ['float32']) + convert_model = self.get_bf16_mixed_precision_model(model) + self.assertTrue('Cast' in set([i.op_type for i in convert_model.nodes()])) + self.assertTrue(16 in set([i.attribute[0].i for i in convert_model.nodes() if i.op_type == 'Cast'])) + session = ort.InferenceSession(convert_model.model.SerializeToString(), 
providers=['DnnlExecutionProvider']) + outputs = session.run(None, input_data) + + + optypes = ['MatMul'] + for optype in optypes: + inps = [['input1', TensorProto.FLOAT, (1,2)]] + outs = [['output', TensorProto.FLOAT, (1,1)]] + weights = [['input2', TensorProto.FLOAT, (2,1), np.random.random((2))]] + node_infos = [['test', ['input1', 'input2'], ['output'], optype]] + model = self.build_model(inps, outs, weights, node_infos) + input_data = self.build_test_data(['input1'], [(1,2)], ['float32']) + convert_model = self.get_bf16_mixed_precision_model(model) + self.assertTrue('Cast' in set([i.op_type for i in convert_model.nodes()])) + self.assertTrue(16 in set([i.attribute[0].i for i in convert_model.nodes() if i.op_type == 'Cast'])) + session = ort.InferenceSession(convert_model.model.SerializeToString(), providers=['DnnlExecutionProvider']) + outputs = session.run(None, input_data) + + optypes = ['FusedMatMul'] + for optype in optypes: + inps = [['input1', TensorProto.FLOAT, (1,2)]] + outs = [['output', TensorProto.FLOAT, (1,1)]] + weights = [['input2', TensorProto.FLOAT, (2,1), np.random.random((2))]] + node_infos = [['test', ['input1', 'input2'], ['output'], optype, 'com.microsoft']] + model = self.build_model(inps, outs, weights, node_infos) + ort.InferenceSession(model.SerializeToString()) + input_data = self.build_test_data(['input1'], [(1,2)], ['float32']) + convert_model = self.get_bf16_mixed_precision_model(model) + self.assertTrue('Cast' in set([i.op_type for i in convert_model.nodes()])) + self.assertTrue(16 in set([i.attribute[0].i for i in convert_model.nodes() if i.op_type == 'Cast'])) + session = ort.InferenceSession(convert_model.model.SerializeToString(), providers=['DnnlExecutionProvider']) + outputs = session.run(None, input_data) + + optypes = ['Gemm'] + for optype in optypes: + inps = [['input1', TensorProto.FLOAT, (1,2)]] + outs = [['output', TensorProto.FLOAT, (1,2)]] + weights = [['input2', TensorProto.FLOAT, (2,1), np.random.random((2))], + ['input3', TensorProto.FLOAT, [], np.random.random((1))]] + node_infos = [['test', ['input1', 'input2', 'input3'], ['output'], optype]] + model = self.build_model(inps, outs, weights, node_infos) + input_data = self.build_test_data(['input1'], [(1,2)], ['float32']) + convert_model = self.get_bf16_mixed_precision_model(model) + self.assertTrue('Cast' in set([i.op_type for i in convert_model.nodes()])) + self.assertTrue(16 in set([i.attribute[0].i for i in convert_model.nodes() if i.op_type == 'Cast'])) + session = ort.InferenceSession(convert_model.model.SerializeToString(), providers=['DnnlExecutionProvider']) + outputs = session.run(None, input_data) + + optypes = ['LayerNormalization'] + for optype in optypes: + inps = [['input1', TensorProto.FLOAT, (1,2)]] + outs = [['output1', TensorProto.FLOAT, (1,2)], ['output2', TensorProto.FLOAT, (1,2)], ['output3', TensorProto.FLOAT, (1,2)]] + weights = [['input2', TensorProto.FLOAT, (2,1), np.random.random((2))], + ['input3', TensorProto.FLOAT, (2,1), np.random.random((2))]] + node_infos = [['test', ['input1', 'input2', 'input3'], ['output1', 'output2', 'output3'], optype]] + model = self.build_model(inps, outs, weights, node_infos) + input_data = self.build_test_data(['input1'], [(1,2)], ['float32']) + convert_model = self.get_bf16_mixed_precision_model(model) + self.assertTrue('Cast' in set([i.op_type for i in convert_model.nodes()])) + self.assertTrue(16 in set([i.attribute[0].i for i in convert_model.nodes() if i.op_type == 'Cast'])) + session = 
ort.InferenceSession(convert_model.model.SerializeToString(), providers=['DnnlExecutionProvider']) + outputs = session.run(None, input_data) + + optypes = ['BatchNormalization'] + for optype in optypes: + inps = [['input1', TensorProto.FLOAT, [1, 2]]] + outs = [['output1', TensorProto.FLOAT, [1, 2]]] + weights = [['input2', TensorProto.FLOAT, [2], np.random.random((2))], + ['input3', TensorProto.FLOAT, [2], np.random.random((2))], + ['input4', TensorProto.FLOAT, [2], np.random.random((2))], + ['input5', TensorProto.FLOAT, [2], np.random.random((2))],] + node_infos = [['test', ['input1', 'input2', 'input3', 'input4', 'input5'], ['output1'], optype]] + model = self.build_model(inps, outs, weights, node_infos) + ort.InferenceSession(model.SerializeToString()) + input_data = self.build_test_data(['input1'], [(1,2)], ['float32']) + convert_model = self.get_bf16_mixed_precision_model(model) + self.assertTrue('Cast' in set([i.op_type for i in convert_model.nodes()])) + self.assertTrue(16 in set([i.attribute[0].i for i in convert_model.nodes() if i.op_type == 'Cast'])) + session = ort.InferenceSession(convert_model.model.SerializeToString(), providers=['DnnlExecutionProvider']) + outputs = session.run(None, input_data) + if __name__ == "__main__": unittest.main() From b5f2cc884d1aefde933a48535130f8bbf870ff14 Mon Sep 17 00:00:00 2001 From: Mengni Wang Date: Sat, 27 May 2023 23:29:40 +0800 Subject: [PATCH 03/21] fix pre optimize Signed-off-by: Mengni Wang --- neural_compressor/adaptor/onnxrt.py | 87 ++++++++++++++--------------- 1 file changed, 42 insertions(+), 45 deletions(-) diff --git a/neural_compressor/adaptor/onnxrt.py b/neural_compressor/adaptor/onnxrt.py index c0c6bcfc31d..9b3fcd5e15c 100644 --- a/neural_compressor/adaptor/onnxrt.py +++ b/neural_compressor/adaptor/onnxrt.py @@ -623,51 +623,48 @@ def _pre_optimize(self, model, level=1): from neural_compressor.adaptor.ox_utils.util import \ remove_init_from_model_input, split_shared_bias remove_init_from_model_input(model) - try: - sess_options = ort.SessionOptions() - optimization_levels = { - 'DISABLE_ALL': ort.GraphOptimizationLevel.ORT_DISABLE_ALL, - 'ENABLE_BASIC': ort.GraphOptimizationLevel.ORT_ENABLE_BASIC, - 'ENABLE_EXTENDED': ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED, - 'ENABLE_ALL': ort.GraphOptimizationLevel.ORT_ENABLE_ALL} - if not isinstance(self.query_handler.get_graph_optimization(), list): - level = self.query_handler.get_graph_optimization() - elif options.onnxrt.graph_optimization.level is not None: - level = options.onnxrt.graph_optimization.level - elif self.recipes.get('graph_optimization_level', None) is not None: - level = self.recipes['graph_optimization_level'] - else: - if self.domain == "auto" and self._detect_domain(model): - self.domain = 'nlp' - level = 'ENABLE_EXTENDED' if self.domain == 'nlp' else 'ENABLE_BASIC' - logger.warning("Graph optimization level is automatically set to {}. 
" - "You can use 'recipe' argument in 'PostTrainingQuantConfig'" - "to overwrite it".format(level)) - sess_options.graph_optimization_level = optimization_levels[level] - sess_options.optimized_model_filepath = os.path.join(self.work_space, \ - "Optimized_model.onnx") - if sys.version_info < (3,10) and find_spec('onnxruntime_extensions'): # pragma: no cover - from onnxruntime_extensions import get_library_path - sess_options.register_custom_ops_library(get_library_path()) - if not model.is_large_model: - ort.InferenceSession(model.model.SerializeToString(), - sess_options, - providers=[self.backend]) - elif model.model_path is not None: # pragma: no cover - ort.InferenceSession(model.model_path, - sess_options, - providers=[self.backend]) - else: # pragma: no cover - logger.warning('Please use model path instead of onnx model object to quantize') - - tmp_model = onnx.load(sess_options.optimized_model_filepath, load_external_data=False) - - if model.is_large_model: # pragma: no cover - from onnx.external_data_helper import load_external_data_for_model - load_external_data_for_model(tmp_model, os.path.split(model.model_path)[0]) - model.model_path = sess_options.optimized_model_filepath - except: - tmp_model = model + sess_options = ort.SessionOptions() + optimization_levels = { + 'DISABLE_ALL': ort.GraphOptimizationLevel.ORT_DISABLE_ALL, + 'ENABLE_BASIC': ort.GraphOptimizationLevel.ORT_ENABLE_BASIC, + 'ENABLE_EXTENDED': ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED, + 'ENABLE_ALL': ort.GraphOptimizationLevel.ORT_ENABLE_ALL} + if not isinstance(self.query_handler.get_graph_optimization(), list): + level = self.query_handler.get_graph_optimization() + elif options.onnxrt.graph_optimization.level is not None: + level = options.onnxrt.graph_optimization.level + elif self.recipes.get('graph_optimization_level', None) is not None: + level = self.recipes['graph_optimization_level'] + else: + if self.domain == "auto" and self._detect_domain(model): + self.domain = 'nlp' + level = 'ENABLE_EXTENDED' if self.domain == 'nlp' else 'ENABLE_BASIC' + logger.warning("Graph optimization level is automatically set to {}. 
" + "You can use 'recipe' argument in 'PostTrainingQuantConfig'" + "to overwrite it".format(level)) + sess_options.graph_optimization_level = optimization_levels[level] + sess_options.optimized_model_filepath = os.path.join(self.work_space, \ + "Optimized_model.onnx") + if sys.version_info < (3,10) and find_spec('onnxruntime_extensions'): # pragma: no cover + from onnxruntime_extensions import get_library_path + sess_options.register_custom_ops_library(get_library_path()) + if not model.is_large_model: + ort.InferenceSession(model.model.SerializeToString(), + sess_options, + providers=['CPUExecutionProvider']) + elif model.model_path is not None: # pragma: no cover + ort.InferenceSession(model.model_path, + sess_options, + providers=['CPUExecutionProvider']) + else: # pragma: no cover + logger.warning('Please use model path instead of onnx model object to quantize') + + tmp_model = onnx.load(sess_options.optimized_model_filepath, load_external_data=False) + + if model.is_large_model: # pragma: no cover + from onnx.external_data_helper import load_external_data_for_model + load_external_data_for_model(tmp_model, os.path.split(model.model_path)[0]) + model.model_path = sess_options.optimized_model_filepath model.model = self._replace_gemm_with_matmul(tmp_model).model if \ options.onnxrt.graph_optimization.gemm2matmul and self.recipes.get('gemm_to_matmul', True) else \ tmp_model From b21aa0d734477c3d5f354a4249cf4ff84efccba4 Mon Sep 17 00:00:00 2001 From: Mengni Wang Date: Sun, 28 May 2023 21:06:12 +0800 Subject: [PATCH 04/21] fix ut Signed-off-by: Mengni Wang --- neural_compressor/adaptor/onnxrt.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/neural_compressor/adaptor/onnxrt.py b/neural_compressor/adaptor/onnxrt.py index 9b3fcd5e15c..3bb698d893c 100644 --- a/neural_compressor/adaptor/onnxrt.py +++ b/neural_compressor/adaptor/onnxrt.py @@ -65,7 +65,8 @@ def __init__(self, framework_specific_info): self.recipes = framework_specific_info.get("recipes", {}) self.backend = PROVIDERS[framework_specific_info["backend"]] self.performance_only = framework_specific_info.get("performance_only", False) - self.use_bf16 = framework_specific_info.get("use_bf16", False) + self.use_bf16 = framework_specific_info.get("use_bf16", False) and \ + self.backend in ort.get_available_providers() self.use_fp16 = framework_specific_info.get("use_fp16", False) if self.backend not in ort.get_all_providers(): @@ -863,8 +864,7 @@ def query_fw_capability(self, model): precisions = query.get_precisions() for precision in precisions: - if precision == 'fp16' and \ - (not self.use_fp16 or 'CUDAExecutionProvider' not in ort.get_available_providers()): + if precision == 'fp16' and not self.use_fp16: continue if precision == 'bf16' and \ (not self.use_bf16 or (not CpuInfo().bf16 and os.getenv('FORCE_BF16') != '1')): From b2d4b4008b3d7a5aa4bba6fe838fe81af0952b78 Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Sun, 4 Jun 2023 12:27:05 +0800 Subject: [PATCH 05/21] Update config.py --- neural_compressor/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neural_compressor/config.py b/neural_compressor/config.py index a4a661cf0a6..58ec22ebae9 100644 --- a/neural_compressor/config.py +++ b/neural_compressor/config.py @@ -1834,7 +1834,7 @@ def backend(self): def backend(self, backend): """Set backend.""" if _check_value('backend', backend, str, [ - 'default', 'itex', 'ipex', 'onnxrt_trt_ep', 'onnxrt_cuda_ep']): + 'default', 'itex', 'ipex', 'onnxrt_trt_ep', 'onnxrt_cuda_ep', 
'onnxrt_dnnl_ep']): self._backend = backend @property From 0fc1ab9874833fffae5949bd0accc5db7168c86a Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Tue, 6 Jun 2023 10:26:54 +0800 Subject: [PATCH 06/21] Update mix_precision.py --- neural_compressor/mix_precision.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/neural_compressor/mix_precision.py b/neural_compressor/mix_precision.py index 057344591ab..2444466f691 100644 --- a/neural_compressor/mix_precision.py +++ b/neural_compressor/mix_precision.py @@ -98,11 +98,14 @@ def fit(model, precisions = list(set(conf.precisions) - set(conf.excluded_precisions)) if ('bf16' in precisions or 'fp16' in precisions) and conf.framework == "onnxruntime": # pragma: no cover - if conf.device == "cpu": - logger.warning("Mix precision exits due to device isn't gpu for onnx models.") + if 'fp16' in precisions and not (conf.device == "gpu" and conf.backend == "onnxrt_cuda_ep"): + logger.warning("Mix precision exits due to fp16 for onnx models" \ + "needs 'gpu' device and 'onnxrt_cuda_ep' backend.") sys.exit(0) - elif conf.backend != "onnxrt_cuda_ep": - logger.warning("Mix precision exits due to backend isn't onnxrt_cuda_ep for onnx models.") + elif 'bf16' in precisions and (not (conf.backend == "onnxrt_cuda_ep" and conf.device == "gpu") and \ + not (conf.backend == "onnxrt_dnnl_ep" and conf.device == "cpu")): + logger.warning("Mix precision exits due to bf16 for onnx models needs " \ + "'gpu' device and 'onnxrt_cuda_ep' backend, or 'cpu' device and 'onnxrt_dnnl_ep' backend.") sys.exit(0) elif 'bf16' in precisions and not CpuInfo().bf16 and conf.framework != "onnxruntime": # pragma: no cover if os.getenv('FORCE_BF16') == '1': From 82853cc578727cec86857e25b47fa600087bb3dd Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Tue, 6 Jun 2023 10:30:02 +0800 Subject: [PATCH 07/21] Update quantization.md --- docs/source/quantization.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/source/quantization.md b/docs/source/quantization.md index e52e5ee702f..30ef8931ebd 100644 --- a/docs/source/quantization.md +++ b/docs/source/quantization.md @@ -470,6 +470,12 @@ Intel(R) Neural Compressor support multi-framework: PyTorch, Tensorflow, ONNX Ru "onnxrt_cuda_ep" gpu + + DnnlExecutionProvider + OneDNN + "onnxrt_dnnl_ep" + cpu + Tensorflow Tensorflow From 08fe94b68762c5044842f69a75df6ce69c933ca9 Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Tue, 6 Jun 2023 10:31:04 +0800 Subject: [PATCH 08/21] Update quantization.md --- docs/source/quantization.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/quantization.md b/docs/source/quantization.md index 30ef8931ebd..a9c770e729b 100644 --- a/docs/source/quantization.md +++ b/docs/source/quantization.md @@ -452,7 +452,7 @@ Intel(R) Neural Compressor support multi-framework: PyTorch, Tensorflow, ONNX Ru cpu - ONNX Runtime + ONNX Runtime CPUExecutionProvider MLAS "default" From bd7aa586594b7ca46780c8adef10d9424298f205 Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Tue, 6 Jun 2023 12:31:05 +0800 Subject: [PATCH 09/21] Update mixed_precision.md --- docs/source/mixed_precision.md | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/docs/source/mixed_precision.md b/docs/source/mixed_precision.md index 560bcc41e33..579ac765125 100644 --- a/docs/source/mixed_precision.md +++ b/docs/source/mixed_precision.md @@ -17,6 +17,7 @@ The recently launched 3rd Gen Intel® Xeon® Scalable processor (codenamed Coope

## Mixed Precision Support Matrix + @@ -48,7 +49,7 @@ The recently launched 3rd Gen Intel® Xeon® Scalable processor (codenamed Coope - + @@ -72,6 +73,14 @@ The recently launched 3rd Gen Intel® Xeon® Scalable processor (codenamed Coope + + + + + + + + @@ -162,4 +171,4 @@ converted_model.save('./path/to/save/') - Quick started with [helloworld example](/examples/helloworld/tf_example3) - PyTorch [ResNet18](/examples/pytorch/image_recognition/torchvision_models/mixed_precision/resnet18) - IPEX [DistilBERT base](/examples/pytorch/nlp/huggingface_models/question-answering/mixed_precision/ipex) -- Tensorflow [ResNet50](/examples/tensorflow/image_recognition/tensorflow_models/resnet50_v1/mixed_precision) \ No newline at end of file +- Tensorflow [ResNet50](/examples/tensorflow/image_recognition/tensorflow_models/resnet50_v1/mixed_precision) From fec595c5817139cf25dc0985fb536811338c1baf Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Tue, 6 Jun 2023 13:14:44 +0800 Subject: [PATCH 10/21] Update inc_dict.txt --- .azure-pipelines/scripts/codeScan/pyspelling/inc_dict.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/.azure-pipelines/scripts/codeScan/pyspelling/inc_dict.txt b/.azure-pipelines/scripts/codeScan/pyspelling/inc_dict.txt index af758fcce72..548cdaebfc0 100644 --- a/.azure-pipelines/scripts/codeScan/pyspelling/inc_dict.txt +++ b/.azure-pipelines/scripts/codeScan/pyspelling/inc_dict.txt @@ -564,6 +564,7 @@ enum env environ ep +eps eq erf Erf From 381f1bd61c374d85e19c5ab1b2d403b2e0047dae Mon Sep 17 00:00:00 2001 From: Mengni Wang Date: Tue, 4 Jul 2023 10:23:42 +0800 Subject: [PATCH 11/21] fix bug Signed-off-by: Mengni Wang --- neural_compressor/adaptor/onnxrt.py | 10 ++++++++++ neural_compressor/adaptor/ox_utils/calibration.py | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/neural_compressor/adaptor/onnxrt.py b/neural_compressor/adaptor/onnxrt.py index 3bb698d893c..9930b768873 100644 --- a/neural_compressor/adaptor/onnxrt.py +++ b/neural_compressor/adaptor/onnxrt.py @@ -212,6 +212,16 @@ def quantize(self, tune_cfg, model, data_loader, q_func=None): return model if model.model.opset_import[0].version < 11: # pragma: no cover logger.warning("Quantize input needs model opset 11 or newer.") + if self.backend == 'DnnlExecutionProvider' and \ + any([i.domain in ['', 'ai.onnx'] and i.version < 15 for i in model.model.opset_import]): + from onnx import version_converter + try: + model.model = version_converter.convert_version(model.model, 15) + except: + logging.warning("Fail to upgrade model opset_import to >= 15, "\ + "please upgrate it manually to run with bf16 data type") + exit(0) + from neural_compressor.adaptor.ox_utils.util import QuantizationMode if self.format == "qlinearops": format = QuantizationMode.QLinearOps diff --git a/neural_compressor/adaptor/ox_utils/calibration.py b/neural_compressor/adaptor/ox_utils/calibration.py index bb6060096cd..37befd20044 100644 --- a/neural_compressor/adaptor/ox_utils/calibration.py +++ b/neural_compressor/adaptor/ox_utils/calibration.py @@ -467,7 +467,7 @@ def calculate_quantization_params(self, q_config, quantization_thresholds): if tensor_name in output_name_to_nodes: parent = output_name_to_nodes[tensor_name] if parent and parent.name in q_config and \ - q_config[parent.name] not in ['fp32', 'fp16']: + q_config[parent.name] not in ['fp32', 'fp16', 'bf16']: scheme = q_config[parent.name]['activation']['scheme'] qType = q_config[parent.name]['activation']['dtype'] elif self.backend in ['TensorrtExecutionProvider']: From 
1d6d4df940175dc10ca985f708f4e308844cc8dc Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Wed, 5 Jul 2023 19:45:01 +0800 Subject: [PATCH 12/21] Update quantizer.py --- neural_compressor/adaptor/ox_utils/quantizer.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/neural_compressor/adaptor/ox_utils/quantizer.py b/neural_compressor/adaptor/ox_utils/quantizer.py index 04558648fe3..35421ee03bf 100644 --- a/neural_compressor/adaptor/ox_utils/quantizer.py +++ b/neural_compressor/adaptor/ox_utils/quantizer.py @@ -246,8 +246,11 @@ def merge_dedicated_qdq_pair(self): def should_cast(self, node): """Check if node should be casted.""" if node.name in self.config and self.config[node.name] != 'fp32': # pragma: no cover - return True - else: + parent = self.model.get_parent(node, 0) + if parent is not None and (parent.op_type != 'Cast' or parent.attribute[0].i in [1, 10, 16]): + return True + elif parent is None and node.input[0] in self.model.input(): + return True return False def insert_qdq(self): From 20fc55d4b98a4fe7e212a9bf88b386d23532dd25 Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Wed, 5 Jul 2023 19:45:51 +0800 Subject: [PATCH 13/21] Update onnxrt.py --- neural_compressor/adaptor/onnxrt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neural_compressor/adaptor/onnxrt.py b/neural_compressor/adaptor/onnxrt.py index 9930b768873..1e6cf9d06a0 100644 --- a/neural_compressor/adaptor/onnxrt.py +++ b/neural_compressor/adaptor/onnxrt.py @@ -216,7 +216,7 @@ def quantize(self, tune_cfg, model, data_loader, q_func=None): any([i.domain in ['', 'ai.onnx'] and i.version < 15 for i in model.model.opset_import]): from onnx import version_converter try: - model.model = version_converter.convert_version(model.model, 15) + model.model = self._rename_node(version_converter.convert_version(model.model, 15)) except: logging.warning("Fail to upgrade model opset_import to >= 15, "\ "please upgrate it manually to run with bf16 data type") From e1c0dbeddf4c38527b70c4658f6dbbf4a52f7d5b Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Thu, 6 Jul 2023 10:10:04 +0800 Subject: [PATCH 14/21] Update model.py --- neural_compressor/model/model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/neural_compressor/model/model.py b/neural_compressor/model/model.py index e73efbb4feb..66632ece602 100644 --- a/neural_compressor/model/model.py +++ b/neural_compressor/model/model.py @@ -79,9 +79,9 @@ def _is_onnxruntime(model): from onnxruntime_extensions import get_library_path so.register_custom_ops_library(get_library_path()) if isinstance(model, str): - ort.InferenceSession(model, so, providers=['CPUExecutionProvider']) + ort.InferenceSession(model, so, providers=ort.get_available_providers()) else: - ort.InferenceSession(model.SerializeToString(), so, providers=['CPUExecutionProvider']) + ort.InferenceSession(model.SerializeToString(), so, providers=ort.get_available_providers()) except Exception as e: # pragma: no cover if 'Message onnx.ModelProto exceeds maximum protobuf size of 2GB' in str(e): logger.warning('Please use model path instead of onnx model object to quantize') From f38b29751a468dd17cc2c1dcf868d70580bf7500 Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Thu, 6 Jul 2023 10:55:44 +0800 Subject: [PATCH 15/21] Update onnxrt_dnnl.yaml --- neural_compressor/adaptor/onnxrt_dnnl.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neural_compressor/adaptor/onnxrt_dnnl.yaml b/neural_compressor/adaptor/onnxrt_dnnl.yaml index 
2d2d718130c..847331b346d 100644 --- a/neural_compressor/adaptor/onnxrt_dnnl.yaml +++ b/neural_compressor/adaptor/onnxrt_dnnl.yaml @@ -400,7 +400,7 @@ name: '1.14.0' int8: *ref_1_11 bf16: &common_bf16 ['MatMul', 'Gemm', 'BatchNormalization', 'Softmax', 'Sum', - 'Abs', 'BiasGelu', 'Exp', 'FastGelu', 'Gelu', 'Log', 'Relu', 'Round', 'Sigmoid', + 'Abs', 'Exp', 'FastGelu', 'Gelu', 'Log', 'Relu', 'Round', 'Sigmoid', 'Sqrt', 'Tanh', 'Add', 'Sub', 'Mul', 'Div', 'Pow', 'ReduceMean', 'Equal', 'FusedMatMul', 'Greater', 'GreaterOrEqual', 'LeakyRelu', 'Less', 'LessOrEqual', 'Reshape', 'Squeeze', 'Transpose', 'Unsqueeze', 'ReduceL1', 'ReduceL2', 'ReduceLogSum', From 65c41de0efcf46fc570dbe0e138cf512f916c735 Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Thu, 6 Jul 2023 17:02:51 +0800 Subject: [PATCH 16/21] Update onnxrt_dnnl.yaml --- neural_compressor/adaptor/onnxrt_dnnl.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neural_compressor/adaptor/onnxrt_dnnl.yaml b/neural_compressor/adaptor/onnxrt_dnnl.yaml index 847331b346d..2d2d718130c 100644 --- a/neural_compressor/adaptor/onnxrt_dnnl.yaml +++ b/neural_compressor/adaptor/onnxrt_dnnl.yaml @@ -400,7 +400,7 @@ name: '1.14.0' int8: *ref_1_11 bf16: &common_bf16 ['MatMul', 'Gemm', 'BatchNormalization', 'Softmax', 'Sum', - 'Abs', 'Exp', 'FastGelu', 'Gelu', 'Log', 'Relu', 'Round', 'Sigmoid', + 'Abs', 'BiasGelu', 'Exp', 'FastGelu', 'Gelu', 'Log', 'Relu', 'Round', 'Sigmoid', 'Sqrt', 'Tanh', 'Add', 'Sub', 'Mul', 'Div', 'Pow', 'ReduceMean', 'Equal', 'FusedMatMul', 'Greater', 'GreaterOrEqual', 'LeakyRelu', 'Less', 'LessOrEqual', 'Reshape', 'Squeeze', 'Transpose', 'Unsqueeze', 'ReduceL1', 'ReduceL2', 'ReduceLogSum', From d6c34a58bbe3f13fe6098707ff169e85999e7324 Mon Sep 17 00:00:00 2001 From: yuwenzho Date: Fri, 14 Jul 2023 15:05:10 +0800 Subject: [PATCH 17/21] fix keyerror bug in _optypewise_filter_for_qdq Signed-off-by: yuwenzho --- neural_compressor/adaptor/onnxrt.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/neural_compressor/adaptor/onnxrt.py b/neural_compressor/adaptor/onnxrt.py index 1e6cf9d06a0..ae0173979d4 100644 --- a/neural_compressor/adaptor/onnxrt.py +++ b/neural_compressor/adaptor/onnxrt.py @@ -1081,6 +1081,8 @@ def _optypewise_filter_for_qdq(self, optype_wise): '1.11.0': ['Conv', 'Gather', 'MatMul', 'Gemm'], '1.12.0': ['Conv', 'Gather', 'MatMul', 'Gemm']} specific_cfg_version = self.query_handler.get_specific_cfg_version() + if Version(specific_cfg_version) > ONNXRT112_VERSION: + specific_cfg_version = '1.12.0' for optype, caps in optype_wise.items(): if optype not in supported_perchannel_optypes[specific_cfg_version]: for cap in caps: From 389ff129eeb4145c22fae3444e845afa16bd6247 Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Mon, 17 Jul 2023 14:05:52 +0800 Subject: [PATCH 18/21] Update mixed_precision.md --- docs/source/mixed_precision.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/source/mixed_precision.md b/docs/source/mixed_precision.md index 579ac765125..70ea2d98a89 100644 --- a/docs/source/mixed_precision.md +++ b/docs/source/mixed_precision.md @@ -171,4 +171,5 @@ converted_model.save('./path/to/save/') - Quick started with [helloworld example](/examples/helloworld/tf_example3) - PyTorch [ResNet18](/examples/pytorch/image_recognition/torchvision_models/mixed_precision/resnet18) - IPEX [DistilBERT base](/examples/pytorch/nlp/huggingface_models/question-answering/mixed_precision/ipex) -- Tensorflow 
[ResNet50](/examples/tensorflow/image_recognition/tensorflow_models/resnet50_v1/mixed_precision) +- Tensorflow [ResNet50](/examples/tensorflow/image_recognition/tensorflow_models/resnet50_v1/mixed_precision) +- ONNX Runtime [Bert base](/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision) From 52dc9b7b4b68a339bbaacb1eaf31f798680cc0a8 Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Mon, 17 Jul 2023 15:38:05 +0800 Subject: [PATCH 19/21] Update README.md --- .../text_classification/mix_precision/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/README.md b/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/README.md index 69e02a5059a..25857c31adc 100644 --- a/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/README.md +++ b/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/README.md @@ -37,7 +37,7 @@ Supported model identifier from [huggingface.co](https://huggingface.co/): | Intel/bart-large-mrpc | ```bash -python export.py --model_name_or_path=Intel/bert-base-uncased-mrpc # or other supported model identifier +optimum-cli export onnx --model Intel/bert-base-uncased-mrpc --task text-classification ``` ## 3. Prepare Dataset From f496568abdb5f071d776220c612d51876dd2d41b Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Mon, 17 Jul 2023 15:38:31 +0800 Subject: [PATCH 20/21] Update requirements.txt --- .../text_classification/mix_precision/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/requirements.txt b/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/requirements.txt index eb99b8d165d..89803cfb0d2 100644 --- a/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/requirements.txt +++ b/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/requirements.txt @@ -6,3 +6,4 @@ coloredlogs sympy onnxruntime-extensions; python_version < '3.10' numpy +optimum[exporters] From e782acc0db57cd0fd18a0736457404bda828425e Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Mon, 17 Jul 2023 15:38:46 +0800 Subject: [PATCH 21/21] Delete export.py --- .../mix_precision/export.py | 74 ------------------- 1 file changed, 74 deletions(-) delete mode 100644 examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/export.py diff --git a/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/export.py b/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/export.py deleted file mode 100644 index 589fe3a345e..00000000000 --- a/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/export.py +++ /dev/null @@ -1,74 +0,0 @@ -import argparse - -import torch -from transformers import AutoConfig, AutoModelForSequenceClassification - -def export_onnx_model(args, model): - with torch.no_grad(): - symbolic_names = {0: 'batch_size', 1: 'max_seq_len'} - if args.model_name_or_path in ['Intel/roberta-base-mrpc', - 'Intel/xlm-roberta-base-mrpc', - 'Intel/camembert-base-mrpc', - 'distilbert-base-uncased-finetuned-sst-2-english']: - inputs = {'input_ids': torch.ones(1, args.max_len, dtype=torch.int64), - 'attention_mask': torch.ones(1, args.max_len, dtype=torch.int64)} - torch.onnx.export(model, # model being run - (inputs['input_ids'], # model input (or a tuple for multiple inputs) - inputs['attention_mask']), - args.output_model, # 
where to save the model (can be a file or file-like object) - opset_version=14, # the ONNX version to export the model - do_constant_folding=True, # whether to execute constant folding - input_names=['input_ids', # the model's input names - 'attention_mask'], - output_names=['logits'], - dynamic_axes={'input_ids': symbolic_names, # variable length axes - 'attention_mask' : symbolic_names}) - else: - inputs = {'input_ids': torch.ones(1, args.max_len, dtype=torch.int64), - 'attention_mask': torch.ones(1, args.max_len, dtype=torch.int64), - 'token_type_ids': torch.ones(1, args.max_len, dtype=torch.int64)} - torch.onnx.export(model, # model being run - (inputs['input_ids'], # model input (or a tuple for multiple inputs) - inputs['attention_mask'], - inputs['token_type_ids']), - args.output_model, # where to save the model (can be a file or file-like object) - opset_version=14, # the ONNX version to export the model - do_constant_folding=True, # whether to execute constant folding - input_names=['input_ids', # the model's input names - 'attention_mask', - 'token_type_ids'], - output_names=['logits'], - dynamic_axes={'input_ids': symbolic_names, # variable length axes - 'attention_mask' : symbolic_names, - 'token_type_ids' : symbolic_names}) - print("ONNX Model exported to {0}".format(args.output_model)) - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description='Export huggingface onnx model', - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument( - '--model_name_or_path', - type=str, - choices=['Intel/bert-base-uncased-mrpc', - 'Intel/roberta-base-mrpc', - 'Intel/xlm-roberta-base-mrpc', - 'Intel/camembert-base-mrpc', - 'distilbert-base-uncased-finetuned-sst-2-english', - 'Alireza1044/albert-base-v2-sst2', - 'philschmid/MiniLM-L6-H384-uncased-sst2', - 'Intel/MiniLM-L12-H384-uncased-mrpc'], - help='pretrained model name or path') - parser.add_argument( - '--max_len', - type=int, - default=128, - help='Maximum length of the sentence pairs') - args = parser.parse_args() - args.output_model = args.model_name_or_path.split('/')[-1] + '.onnx' - - model = AutoModelForSequenceClassification.from_pretrained( - args.model_name_or_path, - config=AutoConfig.from_pretrained(args.model_name_or_path)) - - export_onnx_model(args, model) \ No newline at end of file
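One practical note on the opset check added to onnxrt.py by patch 11/21: when the automatic upgrade fails, the log asks the user to upgrade the model manually before running the bf16 path. A minimal sketch of doing that by hand with the same onnx.version_converter API the patch calls (file names are placeholders):

```python
import onnx
from onnx import version_converter

# Placeholder paths; the DNNL bf16 path requires ai.onnx opset >= 15.
model = onnx.load("model.onnx")

# Find the default-domain opset version and upgrade it to 15 if it is older.
opset = next((op.version for op in model.opset_import if op.domain in ("", "ai.onnx")), None)
if opset is not None and opset < 15:
    model = version_converter.convert_version(model, 15)

onnx.save(model, "model_opset15.onnx")
```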
[Remaining fragments here are displaced HTML table cells from the docs diffs above; the recoverable rows list ONNX Runtime with CPUExecutionProvider (MLAS, "default") and with DnnlExecutionProvider (OneDNN, "onnxrt_dnnl_ep", cpu).]
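Finally, because the evaluation flow depends on DnnlExecutionProvider actually being registered in the installed onnxruntime, a quick pre-flight check along these lines (illustrative only; the model path is a placeholder) confirms which provider a session picks up:

```python
import onnxruntime as ort

# Illustrative check; "model.onnx" stands in for the exported/converted model.
available = ort.get_available_providers()
print("available providers:", available)

# Prefer the DNNL provider when present, otherwise fall back to the default CPU provider.
providers = ["DnnlExecutionProvider"] if "DnnlExecutionProvider" in available else ["CPUExecutionProvider"]
session = ort.InferenceSession("model.onnx", providers=providers)
print("providers used by the session:", session.get_providers())
```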