From 0937e496330a131ea013883ec118713106e5f422 Mon Sep 17 00:00:00 2001 From: Mengni Wang Date: Tue, 23 May 2023 17:27:42 +0800 Subject: [PATCH 01/21] Add dnnl ep Signed-off-by: Mengni Wang --- .../mix_precision/README.md | 77 ++++ .../text_classification/mix_precision/eval.sh | 128 ++++++ .../mix_precision/export.py | 74 ++++ .../text_classification/mix_precision/main.py | 405 +++++++++++++++++ .../mix_precision/prepare_data.sh | 34 ++ .../mix_precision/requirements.txt | 8 + .../text_classification/mix_precision/run.sh | 118 +++++ neural_compressor/adaptor/onnxrt.py | 94 ++-- neural_compressor/adaptor/onnxrt_dnnl.yaml | 411 ++++++++++++++++++ neural_compressor/adaptor/ox_utils/util.py | 4 +- neural_compressor/config.py | 20 +- neural_compressor/strategy/strategy.py | 5 + 12 files changed, 1327 insertions(+), 51 deletions(-) create mode 100644 examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/README.md create mode 100644 examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/eval.sh create mode 100644 examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/export.py create mode 100644 examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/main.py create mode 100644 examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/prepare_data.sh create mode 100644 examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/requirements.txt create mode 100644 examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/run.sh create mode 100644 neural_compressor/adaptor/onnxrt_dnnl.yaml diff --git a/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/README.md b/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/README.md new file mode 100644 index 00000000000..f5d976123a3 --- /dev/null +++ b/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/README.md @@ -0,0 +1,77 @@ +Step-by-Step +============ + +This example load a language translation model and confirm its accuracy and speed based on [GLUE data](https://gluebenchmark.com/). + +# Prerequisite + +## 1. Environment +```shell +git clone -b dnnl_ep --depth 1 https://github.com/intel/neural-compressor.git +cd neural-compressor +pip install -e ./ + +cd examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/ +pip install -r requirements.txt +``` +> Note: Validated ONNX Runtime [Version](/docs/source/installation_guide.md#validated-software-environment). + +## 2. Prepare Model + +Supported model identifier from [huggingface.co](https://huggingface.co/): + +| Model Identifier | +|:-----------------------------------------------:| +| Intel/bert-base-uncased-mrpc | +| Intel/roberta-base-mrpc | +| Intel/xlm-roberta-base-mrpc | +| Intel/camembert-base-mrpc | +| distilbert-base-uncased-finetuned-sst-2-english | +| Alireza1044/albert-base-v2-sst2 | +| Intel/MiniLM-L12-H384-uncased-mrpc | +| philschmid/MiniLM-L6-H384-uncased-sst2 | +| bert-base-cased-finetuned-mrpc | +| Intel/electra-small-discriminator-mrpc | +| M-FAC/bert-mini-finetuned-mrpc | +| Intel/xlnet-base-cased-mrpc | +| Intel/bart-large-mrpc | + +```bash +python export.py --model_name_or_path=Intel/bert-base-uncased-mrpc # or other supported model identifier +``` + +## 3. Prepare Dataset +Download the GLUE data with `prepare_data.sh` script. 
+ +```shell +export GLUE_DIR=/path/to/glue_data +export TASK_NAME=MRPC # or SST + +bash prepare_data.sh --data_dir=$GLUE_DIR --task_name=$TASK_NAME +``` + +# Run + +If the hardware doesn't support bf16 instruction, please set flag as below to force bf16 conversion (this way will be deperecated): + +```shell +export FORCE_BF16=1 +``` + +## 1. Only mixed precision conversion + +```bash +bash run.sh --input_model=path/to/model \ # model path as *.onnx + --output_model=path/to/model_tune \ # model path as *.onnx +``` + +## 2. Mixed precision conversion + accuracy evaluation + +Please make sure DnnlExecutionProvider is in available providers list to execute evaluation. + +```bash +bash eval.sh --input_model=path/to/model \ # model path as *.onnx + --output_model=path/to/model_tune \ # model path as *.onnx + --dataset_location=path/to/glue/data \ + --batch_size=batch_size \ # optional +``` diff --git a/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/eval.sh b/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/eval.sh new file mode 100644 index 00000000000..9cc04b05e1e --- /dev/null +++ b/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/eval.sh @@ -0,0 +1,128 @@ +#!/bin/bash +set -x + +function main { + init_params "$@" + run_tuning +} + +# init params +function init_params { + for var in "$@" + do + case $var in + --input_model=*) + input_model=$(echo $var |cut -f2 -d=) + ;; + --output_model=*) + output_model=$(echo $var |cut -f2 -d=) + ;; + --dataset_location=*) + dataset_location=$(echo $var |cut -f2 -d=) + ;; + --batch_size=*) + batch_size=$(echo $var |cut -f2 -d=) + ;; + esac + done + +} + +# run_tuning +function run_tuning { + + if [[ "${input_model}" =~ "bert-base-uncased" ]]; then + model_name_or_path="Intel/bert-base-uncased-mrpc" + TASK_NAME='mrpc' + num_heads=12 + hidden_size=768 + fi + if [[ "${input_model}" =~ "roberta-base" ]]; then + model_name_or_path="Intel/roberta-base-mrpc" + TASK_NAME='mrpc' + num_heads=12 + hidden_size=768 + fi + if [[ "${input_model}" =~ "xlm-roberta-base" ]]; then + model_name_or_path="Intel/xlm-roberta-base-mrpc" + TASK_NAME='mrpc' + num_heads=12 + hidden_size=768 + fi + if [[ "${input_model}" =~ "camembert-base" ]]; then + model_name_or_path="Intel/camembert-base-mrpc" + TASK_NAME='mrpc' + num_heads=12 + hidden_size=768 + fi + if [[ "${input_model}" =~ "distilbert-base" ]]; then + model_name_or_path="distilbert-base-uncased-finetuned-sst-2-english" + TASK_NAME='sst-2' + num_heads=12 + hidden_size=768 + fi + if [[ "${input_model}" =~ "albert-base" ]]; then + model_name_or_path="Alireza1044/albert-base-v2-sst2" + TASK_NAME='sst-2' + num_heads=12 + hidden_size=768 + fi + if [[ "${input_model}" =~ "MiniLM-L6" ]]; then + model_name_or_path="philschmid/MiniLM-L6-H384-uncased-sst2" + TASK_NAME='sst-2' + num_heads=12 + hidden_size=384 + fi + if [[ "${input_model}" =~ "MiniLM-L12" ]]; then + model_name_or_path="Intel/MiniLM-L12-H384-uncased-mrpc" + TASK_NAME='mrpc' + num_heads=12 + hidden_size=384 + fi + if [[ "${input_model}" =~ "bert-base-cased" ]]; then + model_name_or_path="bert-base-cased-finetuned-mrpc" + TASK_NAME='mrpc' + num_heads=12 + hidden_size=384 + fi + if [[ "${input_model}" =~ "xlnet-base-cased" ]]; then + model_name_or_path="Intel/xlnet-base-cased-mrpc" + TASK_NAME='mrpc' + num_heads=12 + hidden_size=768 + fi + if [[ "${input_model}" =~ "bert-mini" ]]; then + model_name_or_path="M-FAC/bert-mini-finetuned-mrpc" + TASK_NAME='mrpc' + num_heads=4 + hidden_size=256 + fi + if [[ 
"${input_model}" =~ "electra-small-discriminator" ]]; then + model_name_or_path="Intel/electra-small-discriminator-mrpc" + TASK_NAME='mrpc' + num_heads=4 + hidden_size=256 + fi + if [[ "${input_model}" =~ "bart" ]]; then + model_name_or_path="Intel/bart-large-mrpc" + TASK_NAME='mrpc' + num_heads=16 + hidden_size=4096 + fi + + python main.py \ + --model_name_or_path ${model_name_or_path} \ + --model_path ${input_model} \ + --output_model ${output_model} \ + --data_path ${dataset_location} \ + --batch_size ${batch_size-1} \ + --task ${TASK_NAME} \ + --num_heads ${num_heads} \ + --hidden_size ${hidden_size} \ + --do_eval +} + +main "$@" + + + diff --git a/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/export.py b/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/export.py new file mode 100644 index 00000000000..589fe3a345e --- /dev/null +++ b/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/export.py @@ -0,0 +1,74 @@ +import argparse + +import torch +from transformers import AutoConfig, AutoModelForSequenceClassification + +def export_onnx_model(args, model): + with torch.no_grad(): + symbolic_names = {0: 'batch_size', 1: 'max_seq_len'} + if args.model_name_or_path in ['Intel/roberta-base-mrpc', + 'Intel/xlm-roberta-base-mrpc', + 'Intel/camembert-base-mrpc', + 'distilbert-base-uncased-finetuned-sst-2-english']: + inputs = {'input_ids': torch.ones(1, args.max_len, dtype=torch.int64), + 'attention_mask': torch.ones(1, args.max_len, dtype=torch.int64)} + torch.onnx.export(model, # model being run + (inputs['input_ids'], # model input (or a tuple for multiple inputs) + inputs['attention_mask']), + args.output_model, # where to save the model (can be a file or file-like object) + opset_version=14, # the ONNX version to export the model + do_constant_folding=True, # whether to execute constant folding + input_names=['input_ids', # the model's input names + 'attention_mask'], + output_names=['logits'], + dynamic_axes={'input_ids': symbolic_names, # variable length axes + 'attention_mask' : symbolic_names}) + else: + inputs = {'input_ids': torch.ones(1, args.max_len, dtype=torch.int64), + 'attention_mask': torch.ones(1, args.max_len, dtype=torch.int64), + 'token_type_ids': torch.ones(1, args.max_len, dtype=torch.int64)} + torch.onnx.export(model, # model being run + (inputs['input_ids'], # model input (or a tuple for multiple inputs) + inputs['attention_mask'], + inputs['token_type_ids']), + args.output_model, # where to save the model (can be a file or file-like object) + opset_version=14, # the ONNX version to export the model + do_constant_folding=True, # whether to execute constant folding + input_names=['input_ids', # the model's input names + 'attention_mask', + 'token_type_ids'], + output_names=['logits'], + dynamic_axes={'input_ids': symbolic_names, # variable length axes + 'attention_mask' : symbolic_names, + 'token_type_ids' : symbolic_names}) + print("ONNX Model exported to {0}".format(args.output_model)) + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description='Export huggingface onnx model', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument( + '--model_name_or_path', + type=str, + choices=['Intel/bert-base-uncased-mrpc', + 'Intel/roberta-base-mrpc', + 'Intel/xlm-roberta-base-mrpc', + 'Intel/camembert-base-mrpc', + 'distilbert-base-uncased-finetuned-sst-2-english', + 'Alireza1044/albert-base-v2-sst2', + 'philschmid/MiniLM-L6-H384-uncased-sst2', + 
'Intel/MiniLM-L12-H384-uncased-mrpc'], + help='pretrained model name or path') + parser.add_argument( + '--max_len', + type=int, + default=128, + help='Maximum length of the sentence pairs') + args = parser.parse_args() + args.output_model = args.model_name_or_path.split('/')[-1] + '.onnx' + + model = AutoModelForSequenceClassification.from_pretrained( + args.model_name_or_path, + config=AutoConfig.from_pretrained(args.model_name_or_path)) + + export_onnx_model(args, model) \ No newline at end of file diff --git a/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/main.py b/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/main.py new file mode 100644 index 00000000000..fa5bd52f578 --- /dev/null +++ b/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/main.py @@ -0,0 +1,405 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint:disable=redefined-outer-name,logging-format-interpolation + +import logging +import argparse +import onnx +import onnxruntime as ort +import transformers +import os +import torch +import numpy as np +from dataclasses import dataclass +from typing import List, Optional, Union +from neural_compressor.data.dataloaders.onnxrt_dataloader import DefaultDataLoader +from neural_compressor.data.datasets.dummy_dataset import DummyDataset + + +class ONNXRTBertDataset: + """Dataset used for model Bert. + Args: data_dir (str): The input data dir. + model_name_or_path (str): Path to pre-trained student model or shortcut name, + selected in the list: + max_seq_length (int, default=128): The maximum length after tokenization. + Sequences longer than this will be truncated, + sequences shorter will be padded. + do_lower_case (bool, default=True): Whether to lowercase the input when tokenizing. + task (str, default=mrpc): The name of the task to fine-tune. + Choices include mrpc, qqp, qnli, rte, + sts-b, cola, mnli, wnli. + model_type (str, default='bert'): model type, support 'distilbert', 'bert', + 'mobilebert', 'roberta'. + dynamic_length (bool, default=False): Whether to use fixed sequence length. + evaluate (bool, default=True): Whether do evaluation or training. + transform (transform object, default=None): transform to process input data. + filter (Filter objects, default=None): filter out examples according + to specific conditions. 
+ """ + def __init__(self, model, data_dir, model_name_or_path, max_seq_length=128,\ + do_lower_case=True, task='mrpc', model_type='bert', dynamic_length=False,\ + evaluate=True, transform=None, filter=None): + self.inputs = [inp.name for inp in onnx.load(model).graph.input] + task = task.lower() + model_type = model_type.lower() + assert task in ['mrpc', 'qqp', 'qnli', 'rte', 'sts-b', 'cola', \ + 'mnli', 'wnli', 'sst-2'], 'Unsupported task type' + assert model_type in ['distilbert', 'bert', 'mobilebert', 'roberta'], 'Unsupported \ + model type' + self.dynamic_length = dynamic_length + self.model_type = model_type + self.max_seq_length = max_seq_length + tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path, + do_lower_case=do_lower_case) + self.dataset = load_and_cache_examples(data_dir, model_name_or_path, \ + max_seq_length, task, model_type, tokenizer, evaluate) + + def __len__(self): + return len(self.dataset) + + def __getitem__(self, index): + batch = tuple(t.detach().cpu().numpy() if not isinstance(t, np.ndarray) else t for t in self.dataset[index]) + return batch[:len(self.inputs)], batch[-1] + +def load_and_cache_examples(data_dir, model_name_or_path, max_seq_length, task, \ + model_type, tokenizer, evaluate): + from torch.utils.data import TensorDataset + + processor = transformers.glue_processors[task]() + output_mode = transformers.glue_output_modes[task] + # Load data features from cache or dataset file + if not os.path.exists("./dataset_cached"): + os.makedirs("./dataset_cached") + cached_features_file = os.path.join("./dataset_cached", 'cached_{}_{}_{}_{}'.format( + 'dev' if evaluate else 'train', + list(filter(None, model_name_or_path.split('/'))).pop(), + str(max_seq_length), + str(task))) + if os.path.exists(cached_features_file): + logger.info("Load features from cached file {}.".format(cached_features_file)) + features = torch.load(cached_features_file) + else: + logger.info("Create features from dataset file at {}.".format(data_dir)) + label_list = processor.get_labels() + examples = processor.get_dev_examples(data_dir) if evaluate else \ + processor.get_train_examples(data_dir) + features = convert_examples_to_features(examples, + tokenizer, + task=task, + label_list=label_list, + max_length=max_seq_length, + output_mode=output_mode, + ) + logger.info("Save features into cached file {}.".format(cached_features_file)) + torch.save(features, cached_features_file) + # Convert to Tensors and build dataset + all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) + all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long) + all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long) + all_seq_lengths = torch.tensor([f.seq_length for f in features], dtype=torch.long) + if output_mode == "classification": + all_labels = torch.tensor([f.label for f in features], dtype=torch.long) + elif output_mode == "regression": + all_labels = torch.tensor([f.label for f in features], dtype=torch.float) + dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, \ + all_seq_lengths, all_labels) + return dataset + +def convert_examples_to_features( + examples, + tokenizer, + max_length=128, + task=None, + label_list=None, + output_mode="classification", + pad_token=0, + pad_token_segment_id=0, + mask_padding_with_zero=True, +): + processor = transformers.glue_processors[task]() + if label_list is None: + label_list = processor.get_labels() + logger.info("Use label 
list {} for task {}.".format(label_list, task)) + label_map = {label: i for i, label in enumerate(label_list)} + features = [] + for (ex_index, example) in enumerate(examples): + inputs = tokenizer.encode_plus( + example.text_a, + example.text_b, + add_special_tokens=True, + max_length=max_length, + return_token_type_ids=True, + truncation=True, + ) + input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"] + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) + + # Zero-pad up to the sequence length. + seq_length = len(input_ids) + padding_length = max_length - len(input_ids) + + input_ids = input_ids + ([pad_token] * padding_length) + attention_mask = attention_mask + \ + ([0 if mask_padding_with_zero else 1] * padding_length) + token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length) + + assert len(input_ids) == max_length, \ + "Error with input_ids length {} vs {}".format( + len(input_ids), max_length) + assert len(attention_mask) == max_length, \ + "Error with attention_mask length {} vs {}".format( + len(attention_mask), max_length + ) + assert len(token_type_ids) == max_length, \ + "Error with token_type_ids length {} vs {}".format( + len(token_type_ids), max_length + ) + if output_mode == "classification": + label = label_map[example.label] + elif output_mode == "regression": + label = float(example.label) + else: + raise KeyError(output_mode) + + feats = InputFeatures( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + label=label, + seq_length=seq_length, + ) + features.append(feats) + return features + +@dataclass(frozen=True) +class InputFeatures: + """ + A single set of features of data. + Property names are the same names as the corresponding inputs to a model. + Args: + input_ids: Indices of input sequence tokens in the vocabulary. + attention_mask: Mask to avoid performing attention on padding token indices. + Mask values selected in ``[0, 1]``: Usually ``1`` for tokens that are NOT MASKED, + ``0`` for MASKED (padded) tokens. + token_type_ids: (Optional) Segment token indices to indicate first and second + portions of the inputs. Only some models use them. + label: (Optional) Label corresponding to the input. Int for classification problems, + float for regression problems. + seq_length: (Optional) The length of input sequence before padding. + """ + + input_ids: List[int] + attention_mask: Optional[List[int]] = None + token_type_ids: Optional[List[int]] = None + label: Optional[Union[int, float]] = None + seq_length: Optional[List[int]] = None + +class ONNXRTGLUE: + """Computes GLUE score. + + Args: + task (str, default=mrpc): The name of the task. + Choices include mrpc, qqp, qnli, rte, + sts-b, cola, mnli, wnli. 
+ + """ + def __init__(self, task='mrpc'): + assert task in ['mrpc', 'qqp', 'qnli', 'rte', 'sts-b', 'cola', \ + 'mnli', 'wnli', 'sst-2'], 'Unsupported task type' + self.pred_list = None + self.label_list = None + self.task = task + self.return_key = { + "cola": "mcc", + "mrpc": "f1", + "sts-b": "corr", + "qqp": "acc", + "mnli": "mnli/acc", + "qnli": "acc", + "rte": "acc", + "wnli": "acc", + "sst-2": "acc" + } + + def update(self, preds, labels): + """add preds and labels to storage""" + if isinstance(preds, list) and len(preds) == 1: + preds = preds[0] + if isinstance(labels, list) and len(labels) == 1: + labels = labels[0] + if self.pred_list is None: + self.pred_list = preds + self.label_list = labels + else: + self.pred_list = np.append(self.pred_list, preds, axis=0) + self.label_list = np.append(self.label_list, labels, axis=0) + + def reset(self): + """clear preds and labels storage""" + self.pred_list = None + self.label_list = None + + def result(self): + """calculate metric""" + output_mode = transformers.glue_output_modes[self.task] + + if output_mode == "classification": + processed_preds = np.argmax(self.pred_list, axis=1) + elif output_mode == "regression": + processed_preds = np.squeeze(self.pred_list) + result = transformers.glue_compute_metrics(\ + self.task, processed_preds, self.label_list) + return result[self.return_key[self.task]] + +logger = logging.getLogger(__name__) +logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', + datefmt = '%m/%d/%Y %H:%M:%S', + level = logging.WARN) + +if __name__ == "__main__": + logger.info('Evaluating ONNXRuntime full precision accuracy and performance:') + parser = argparse.ArgumentParser( + description='BERT fine-tune examples for classification/regression tasks.', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument( + '--model_path', + type=str, + help="Pre-trained resnet50 model on onnx file" + ) + parser.add_argument( + '--do_eval', + action='store_true', \ + default=False + ) + parser.add_argument( + '--output_model', + type=str, + default=None, + help="output model path" + ) + parser.add_argument( + '--data_path', + type=str, + default=None, + help="input data path" + ) + parser.add_argument( + '--batch_size', + default=8, + type=int, + ) + parser.add_argument( + '--model_name_or_path', + type=str, + choices=['Intel/bert-base-uncased-mrpc', + 'Intel/roberta-base-mrpc', + 'Intel/xlm-roberta-base-mrpc', + 'Intel/camembert-base-mrpc', + 'distilbert-base-uncased-finetuned-sst-2-english', + 'Alireza1044/albert-base-v2-sst2', + 'philschmid/MiniLM-L6-H384-uncased-sst2', + 'Intel/MiniLM-L12-H384-uncased-mrpc', + 'bert-base-cased-finetuned-mrpc', + 'Intel/electra-small-discriminator-mrpc', + 'M-FAC/bert-mini-finetuned-mrpc', + 'Intel/xlnet-base-cased-mrpc', + 'Intel/bart-large-mrpc'], + help="pretrained model name or path" + ) + parser.add_argument( + '--task', + type=str, + choices=['mrpc', 'qqp', 'qnli', 'rte', 'sts-b', 'cola', \ + 'mnli', 'wnli', 'sst-2'], + help="GLUE task name" + ) + parser.add_argument( + '--num_heads', + default=12, + type=int, + ) + parser.add_argument( + '--hidden_size', + default=768, + type=int, + ) + + args = parser.parse_args() + + if ort.__version__ <= '1.13.1': + from onnxruntime.transformers import optimizer + from onnxruntime.transformers.fusion_options import FusionOptions + model_type = 'bart' if args.model_name_or_path == 'Intel/bart-large-mrpc' else 'bert' + opt_options = FusionOptions(model_type) + opt_options.enable_embed_layer_norm = False 
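+        # For older onnxruntime releases, run the ORT transformer fusion passes first
+        # (with EmbedLayerNormalization fusion disabled) so that the fused graph is the
+        # one handed to the mixed-precision conversion below; newer releases load the
+        # exported model as-is.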
+ + model_optimizer = optimizer.optimize_model( + args.model_path, + model_type, + num_heads=args.num_heads, + hidden_size=args.hidden_size, + optimization_options=opt_options) + model = model_optimizer.model + else: + model = onnx.load(args.model_path) + + from neural_compressor import MixedPrecisionConfig + from neural_compressor.mix_precision import fit + config = MixedPrecisionConfig(backend='onnxrt_dnnl_ep', precision='bf16') + converted_model = fit(model, config) + if any([i.domain in ['', 'ai.onnx'] and i.version < 15 for i in converted_model.model.opset_import]): + from onnx import version_converter + try: + new = version_converter.convert_version(converted_model.model, 15) + onnx.save(new, args.output_model) + except: + logging.warning("Fail to upgrade opset_import to > 15, " + "please upgrate it manually to run with bf16 data type") + else: + converted_model.save(args.output_model) + + if args.do_eval: + dataset = ONNXRTBertDataset(args.model_path, + data_dir=args.data_path, + model_name_or_path=args.model_name_or_path, + task=args.task) + dataloader = DefaultDataLoader(dataset, args.batch_size) + metric = ONNXRTGLUE(args.task) + + def eval_func(model, *args): + metric.reset() + session = ort.InferenceSession(model.SerializeToString(), + providers=ort.get_available_providers()) + ort_inputs = {} + len_inputs = len(session.get_inputs()) + inputs_names = [session.get_inputs()[i].name for i in range(len_inputs)] + for idx, (inputs, labels) in enumerate(dataloader): + if not isinstance(labels, list): + labels = [labels] + inputs = inputs[:len_inputs] + for i in range(len_inputs): + ort_inputs.update({inputs_names[i]: inputs[i]}) + predictions = session.run(None, ort_inputs) + metric.update(predictions[0], labels) + return metric.result() + + model = onnx.load(args.output_model) + acc_result = eval_func(model) + print("Batch size = %d" % args.batch_size) + print("Accuracy: %.5f" % acc_result) diff --git a/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/prepare_data.sh b/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/prepare_data.sh new file mode 100644 index 00000000000..8e434a5c521 --- /dev/null +++ b/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/prepare_data.sh @@ -0,0 +1,34 @@ +#!/bin/bash +set -x + +function main { + init_params "$@" + download_data + +} + +# init params +function init_params { + + for var in "$@" + do + case $var in + --data_dir=*) + data_dir=$(echo $var |cut -f2 -d=) + ;; + --task_name=*) + task_name=$(echo $var |cut -f2 -d=) + ;; + esac + done + +} + +# run_tuning +function download_data { + wget https://raw.githubusercontent.com/huggingface/transformers/f98ef14d161d7bcdc9808b5ec399981481411cc1/utils/download_glue_data.py + python download_glue_data.py --data_dir=${data_dir} --tasks=${task_name} +} + +main "$@" + diff --git a/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/requirements.txt b/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/requirements.txt new file mode 100644 index 00000000000..eb99b8d165d --- /dev/null +++ b/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/requirements.txt @@ -0,0 +1,8 @@ +torch +transformers +onnx +onnxruntime >= 1.14.0 +coloredlogs +sympy +onnxruntime-extensions; python_version < '3.10' +numpy diff --git a/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/run.sh b/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/run.sh new file mode 
100644 index 00000000000..033907a1e3c --- /dev/null +++ b/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/run.sh @@ -0,0 +1,118 @@ +#!/bin/bash +set -x + +function main { + init_params "$@" + run_tuning +} + +# init params +function init_params { + for var in "$@" + do + case $var in + --input_model=*) + input_model=$(echo $var |cut -f2 -d=) + ;; + --output_model=*) + output_model=$(echo $var |cut -f2 -d=) + ;; + esac + done + +} + +# run_tuning +function run_tuning { + + if [[ "${input_model}" =~ "bert-base-uncased" ]]; then + model_name_or_path="Intel/bert-base-uncased-mrpc" + TASK_NAME='mrpc' + num_heads=12 + hidden_size=768 + fi + if [[ "${input_model}" =~ "roberta-base" ]]; then + model_name_or_path="Intel/roberta-base-mrpc" + TASK_NAME='mrpc' + num_heads=12 + hidden_size=768 + fi + if [[ "${input_model}" =~ "xlm-roberta-base" ]]; then + model_name_or_path="Intel/xlm-roberta-base-mrpc" + TASK_NAME='mrpc' + num_heads=12 + hidden_size=768 + fi + if [[ "${input_model}" =~ "camembert-base" ]]; then + model_name_or_path="Intel/camembert-base-mrpc" + TASK_NAME='mrpc' + num_heads=12 + hidden_size=768 + fi + if [[ "${input_model}" =~ "distilbert-base" ]]; then + model_name_or_path="distilbert-base-uncased-finetuned-sst-2-english" + TASK_NAME='sst-2' + num_heads=12 + hidden_size=768 + fi + if [[ "${input_model}" =~ "albert-base" ]]; then + model_name_or_path="Alireza1044/albert-base-v2-sst2" + TASK_NAME='sst-2' + num_heads=12 + hidden_size=768 + fi + if [[ "${input_model}" =~ "MiniLM-L6" ]]; then + model_name_or_path="philschmid/MiniLM-L6-H384-uncased-sst2" + TASK_NAME='sst-2' + num_heads=12 + hidden_size=384 + fi + if [[ "${input_model}" =~ "MiniLM-L12" ]]; then + model_name_or_path="Intel/MiniLM-L12-H384-uncased-mrpc" + TASK_NAME='mrpc' + num_heads=12 + hidden_size=384 + fi + if [[ "${input_model}" =~ "bert-base-cased" ]]; then + model_name_or_path="bert-base-cased-finetuned-mrpc" + TASK_NAME='mrpc' + num_heads=12 + hidden_size=384 + fi + if [[ "${input_model}" =~ "xlnet-base-cased" ]]; then + model_name_or_path="Intel/xlnet-base-cased-mrpc" + TASK_NAME='mrpc' + num_heads=12 + hidden_size=768 + fi + if [[ "${input_model}" =~ "bert-mini" ]]; then + model_name_or_path="M-FAC/bert-mini-finetuned-mrpc" + TASK_NAME='mrpc' + num_heads=4 + hidden_size=256 + fi + if [[ "${input_model}" =~ "electra-small-discriminator" ]]; then + model_name_or_path="Intel/electra-small-discriminator-mrpc" + TASK_NAME='mrpc' + num_heads=4 + hidden_size=256 + fi + if [[ "${input_model}" =~ "bart" ]]; then + model_name_or_path="Intel/bart-large-mrpc" + TASK_NAME='mrpc' + num_heads=16 + hidden_size=4096 + fi + + python main.py \ + --model_name_or_path ${model_name_or_path} \ + --model_path ${input_model} \ + --output_model ${output_model} \ + --num_heads ${num_heads} \ + --hidden_size ${hidden_size} +} + +main "$@" + + + diff --git a/neural_compressor/adaptor/onnxrt.py b/neural_compressor/adaptor/onnxrt.py index 0f62b88a5b4..c0c6bcfc31d 100644 --- a/neural_compressor/adaptor/onnxrt.py +++ b/neural_compressor/adaptor/onnxrt.py @@ -65,6 +65,8 @@ def __init__(self, framework_specific_info): self.recipes = framework_specific_info.get("recipes", {}) self.backend = PROVIDERS[framework_specific_info["backend"]] self.performance_only = framework_specific_info.get("performance_only", False) + self.use_bf16 = framework_specific_info.get("use_bf16", False) + self.use_fp16 = framework_specific_info.get("use_fp16", False) if self.backend not in ort.get_all_providers(): logger.warning("{} backend is not 
supported in current environment, " @@ -93,6 +95,8 @@ def __init__(self, framework_specific_info): config_file = 'onnxrt_trt.yaml' elif self.backend == 'CUDAExecutionProvider': config_file = 'onnxrt_cuda.yaml' + elif self.backend == 'DnnlExecutionProvider': + config_file = 'onnxrt_dnnl.yaml' else: # pragma: no cover assert False, "{} provider is not supported in current environment, " \ "supported providers: {}".format(self.backend, @@ -590,9 +594,9 @@ def _detect_domain(self, model): # typically, NLP models have multiple inputs, # and the dimension of each input is usually 2 (batch_size, max_seq_len) if not model.is_large_model: - sess = ort.InferenceSession(model.model.SerializeToString(), providers=[self.backend]) + sess = ort.InferenceSession(model.model.SerializeToString(), providers=ort.get_available_providers()) elif model.model_path is not None: # pragma: no cover - sess = ort.InferenceSession(model.model_path, providers=[self.backend]) + sess = ort.InferenceSession(model.model_path, providers=ort.get_available_providers()) else: # pragma: no cover assert False, "Please use model path instead of onnx model object to quantize." input_shape_lens = [len(input.shape) for input in sess.get_inputs()] @@ -619,48 +623,51 @@ def _pre_optimize(self, model, level=1): from neural_compressor.adaptor.ox_utils.util import \ remove_init_from_model_input, split_shared_bias remove_init_from_model_input(model) - sess_options = ort.SessionOptions() - optimization_levels = { + try: + sess_options = ort.SessionOptions() + optimization_levels = { 'DISABLE_ALL': ort.GraphOptimizationLevel.ORT_DISABLE_ALL, 'ENABLE_BASIC': ort.GraphOptimizationLevel.ORT_ENABLE_BASIC, 'ENABLE_EXTENDED': ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED, 'ENABLE_ALL': ort.GraphOptimizationLevel.ORT_ENABLE_ALL} - if not isinstance(self.query_handler.get_graph_optimization(), list): - level = self.query_handler.get_graph_optimization() - elif options.onnxrt.graph_optimization.level is not None: - level = options.onnxrt.graph_optimization.level - elif self.recipes.get('graph_optimization_level', None) is not None: - level = self.recipes['graph_optimization_level'] - else: - if self.domain == "auto" and self._detect_domain(model): - self.domain = 'nlp' - level = 'ENABLE_EXTENDED' if self.domain == 'nlp' else 'ENABLE_BASIC' - logger.warning("Graph optimization level is automatically set to {}. 
" - "You can use 'recipe' argument in 'PostTrainingQuantConfig'" - "to overwrite it".format(level)) - sess_options.graph_optimization_level = optimization_levels[level] - sess_options.optimized_model_filepath = os.path.join(self.work_space, \ - "Optimized_model.onnx") - if sys.version_info < (3,10) and find_spec('onnxruntime_extensions'): # pragma: no cover - from onnxruntime_extensions import get_library_path - sess_options.register_custom_ops_library(get_library_path()) - backend = self.backend if self.backend != 'TensorrtExecutionProvider' else 'CUDAExecutionProvider' - if not model.is_large_model: - ort.InferenceSession(model.model.SerializeToString(), - sess_options, - providers=[backend]) - elif model.model_path is not None: # pragma: no cover - ort.InferenceSession(model.model_path, - sess_options, - providers=[backend]) - else: # pragma: no cover - logger.warning('Please use model path instead of onnx model object to quantize') - - tmp_model = onnx.load(sess_options.optimized_model_filepath, load_external_data=False) - if model.is_large_model: # pragma: no cover - from onnx.external_data_helper import load_external_data_for_model - load_external_data_for_model(tmp_model, os.path.split(model.model_path)[0]) - model.model_path = sess_options.optimized_model_filepath + if not isinstance(self.query_handler.get_graph_optimization(), list): + level = self.query_handler.get_graph_optimization() + elif options.onnxrt.graph_optimization.level is not None: + level = options.onnxrt.graph_optimization.level + elif self.recipes.get('graph_optimization_level', None) is not None: + level = self.recipes['graph_optimization_level'] + else: + if self.domain == "auto" and self._detect_domain(model): + self.domain = 'nlp' + level = 'ENABLE_EXTENDED' if self.domain == 'nlp' else 'ENABLE_BASIC' + logger.warning("Graph optimization level is automatically set to {}. 
" + "You can use 'recipe' argument in 'PostTrainingQuantConfig'" + "to overwrite it".format(level)) + sess_options.graph_optimization_level = optimization_levels[level] + sess_options.optimized_model_filepath = os.path.join(self.work_space, \ + "Optimized_model.onnx") + if sys.version_info < (3,10) and find_spec('onnxruntime_extensions'): # pragma: no cover + from onnxruntime_extensions import get_library_path + sess_options.register_custom_ops_library(get_library_path()) + if not model.is_large_model: + ort.InferenceSession(model.model.SerializeToString(), + sess_options, + providers=[self.backend]) + elif model.model_path is not None: # pragma: no cover + ort.InferenceSession(model.model_path, + sess_options, + providers=[self.backend]) + else: # pragma: no cover + logger.warning('Please use model path instead of onnx model object to quantize') + + tmp_model = onnx.load(sess_options.optimized_model_filepath, load_external_data=False) + + if model.is_large_model: # pragma: no cover + from onnx.external_data_helper import load_external_data_for_model + load_external_data_for_model(tmp_model, os.path.split(model.model_path)[0]) + model.model_path = sess_options.optimized_model_filepath + except: + tmp_model = model model.model = self._replace_gemm_with_matmul(tmp_model).model if \ options.onnxrt.graph_optimization.gemm2matmul and self.recipes.get('gemm_to_matmul', True) else \ tmp_model @@ -859,10 +866,13 @@ def query_fw_capability(self, model): precisions = query.get_precisions() for precision in precisions: - if precision in ['fp16', 'bf16'] and (self.device == 'cpu' or self.backend != 'CUDAExecutionProvider'): + if precision == 'fp16' and \ + (not self.use_fp16 or 'CUDAExecutionProvider' not in ort.get_available_providers()): continue - elif precision == 'bf16' and 'CUDAExecutionProvider' not in ort.get_available_providers(): + if precision == 'bf16' and \ + (not self.use_bf16 or (not CpuInfo().bf16 and os.getenv('FORCE_BF16') != '1')): continue + # get supported optype for target precision optypes = query.get_op_types_by_precision(precision) if \ query.get_op_types_by_precision(precision) != ['*'] else \ diff --git a/neural_compressor/adaptor/onnxrt_dnnl.yaml b/neural_compressor/adaptor/onnxrt_dnnl.yaml new file mode 100644 index 00000000000..704d63ead93 --- /dev/null +++ b/neural_compressor/adaptor/onnxrt_dnnl.yaml @@ -0,0 +1,411 @@ +## Copyright (c) 2021 Intel Corporation +## +## Licensed under the Apache License, Version 2.0 (the "License"); +## you may not use this file except in compliance with the License. +## You may obtain a copy of the License at +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +## See the License for the specific language governing permissions and +## limitations under the License. 
+## +# + +- + version: + name: '1.6.0' + int8: &ref_1_6 { + 'static': &ref_1_6_static { + 'Conv': { + 'weight': &int8_sym_perchanneltensor_minmax { + 'dtype': ['int8'], + 'scheme': ['sym'], + 'granularity': ['per_channel', 'per_tensor'], + 'algorithm': ['minmax'] + }, + 'activation': &uint8_asym_pertensor_minmax { + 'dtype': ['uint8'], + 'scheme': ['asym'], + 'granularity': ['per_tensor'], + 'algorithm': ['minmax'] + }, + 'mode': ['QDQ', 'QLinear'] + }, + 'FusedConv': { + 'weight': *int8_sym_perchanneltensor_minmax, #'QDQ': *int8_sym_pertensor_minmax + 'activation': *uint8_asym_pertensor_minmax, + 'mode': ['QDQ', 'QLinear'] + }, + 'Gather': { + 'weight': &uint8_asym_perchanneltensor_minmax { + 'dtype': ['uint8'], + 'scheme': ['asym'], + 'granularity': ['per_channel', 'per_tensor'], + 'algorithm': ['minmax'] + }, + 'activation': *uint8_asym_pertensor_minmax, + 'mode': ['QDQ', 'QLinear'] + }, + 'MatMul': { + 'weight': &int8_sym_pertensor_minmax { + 'dtype': ['int8'], + 'scheme': ['sym'], + 'granularity': ['per_tensor'], + 'algorithm': ['minmax'] + }, + 'activation': *uint8_asym_pertensor_minmax, + 'mode': ['QDQ', 'QLinear'] + }, + 'Attention': &default_static_qlinear_qdq { + 'weight': *int8_sym_pertensor_minmax, + 'activation': *uint8_asym_pertensor_minmax, + 'mode': ['QDQ', 'QLinear'] + }, + 'Mul': &default_static_qlinear { + 'weight': *int8_sym_pertensor_minmax, + 'activation': *uint8_asym_pertensor_minmax, + 'mode': ['QLinear'] + }, + 'Relu': *default_static_qlinear_qdq, + 'Clip': *default_static_qlinear_qdq, + 'LeakyRelu': *default_static_qlinear_qdq, + 'Sigmoid': *default_static_qlinear_qdq, + 'MaxPool': *default_static_qlinear_qdq, + 'EmbedLayerNormalization': *default_static_qlinear_qdq, + 'GlobalAveragePool': *default_static_qlinear_qdq, + 'Add': *default_static_qlinear, + }, + 'dynamic': &ref_1_6_dynamic { + 'Conv': { + 'weight': *uint8_asym_perchanneltensor_minmax, + 'activation': *uint8_asym_pertensor_minmax + }, + 'FusedConv': { + 'weight': *uint8_asym_perchanneltensor_minmax, + 'activation': *uint8_asym_pertensor_minmax + }, + 'MatMul': &default_dynamic { + 'weight': *int8_sym_pertensor_minmax, + 'activation': *uint8_asym_pertensor_minmax + }, + 'Gather': *default_dynamic, + 'Attention': *default_dynamic, + 'EmbedLayerNormalization': *default_dynamic, + 'LSTM': *default_dynamic, + } + } + recipes: &default_optimization + graph_optimization: # from onnxruntime graph_optimization_level + level: ['DISABLE_ALL', 'ENABLE_BASIC', 'ENABLE_EXTENDED', 'ENABLE_ALL'] + +- + version: + name: '1.7.0' + int8: { + 'static': { + 'FusedConv': { + 'weight': *int8_sym_perchanneltensor_minmax, #'QDQ': *int8_sym_pertensor_minmax + 'activation': *uint8_asym_pertensor_minmax, + 'mode': ['QDQ', 'QLinear'] + }, + 'Conv': { + 'weight': *int8_sym_perchanneltensor_minmax, + 'activation': *uint8_asym_pertensor_minmax, + 'mode': ['QDQ', 'QLinear'] + }, + 'Gather': { + 'weight': *uint8_asym_perchanneltensor_minmax, + 'activation': *uint8_asym_pertensor_minmax, + 'mode': ['QDQ', 'QLinear'] + }, + 'MatMul': *default_static_qlinear_qdq, + 'Attention': *default_static_qlinear_qdq, + 'Mul': *default_static_qlinear, + 'Relu': *default_static_qlinear_qdq, + 'Clip': *default_static_qlinear_qdq, + 'LeakyRelu': *default_static_qlinear_qdq, + 'Sigmoid': *default_static_qlinear_qdq, + 'MaxPool': *default_static_qlinear_qdq, + 'EmbedLayerNormalization': *default_static_qlinear_qdq, + 'GlobalAveragePool': *default_static_qlinear_qdq, + 'Pad': *default_static_qlinear_qdq, + 'Split': *default_static_qlinear_qdq, + 'Add': 
*default_static_qlinear, + }, + 'dynamic': *ref_1_6_dynamic + } + recipes: + <<: *default_optimization + +- + version: + name: '1.8.0' + int8: { + 'static': { + 'FusedConv': { + 'weight': *int8_sym_perchanneltensor_minmax, # QDQ: *int8_sym_pertensor_minmax + 'activation': *uint8_asym_pertensor_minmax, + 'mode': ['QDQ', 'QLinear'] + }, + 'Conv': { + 'weight': *int8_sym_perchanneltensor_minmax, + 'activation': *uint8_asym_pertensor_minmax, + 'mode': ['QDQ', 'QLinear'] + }, + 'Gather': { + 'weight': *uint8_asym_perchanneltensor_minmax, + 'activation': *uint8_asym_pertensor_minmax, + 'mode': ['QDQ', 'QLinear'] + }, + 'MatMul': { + 'weight': *int8_sym_perchanneltensor_minmax, # QDQ: *int8_sym_pertensor_minmax + 'activation': *uint8_asym_pertensor_minmax, + 'mode': ['QDQ', 'QLinear'] + }, + 'Attention': *default_static_qlinear_qdq, + 'Mul': *default_static_qlinear, + 'Relu': *default_static_qlinear_qdq, + 'Clip': *default_static_qlinear_qdq, + 'LeakyRelu': *default_static_qlinear_qdq, + 'Sigmoid': *default_static_qlinear_qdq, + 'MaxPool': *default_static_qlinear_qdq, + 'EmbedLayerNormalization': *default_static_qlinear_qdq, + 'GlobalAveragePool': *default_static_qlinear_qdq, + 'Pad': *default_static_qlinear_qdq, + 'Split': *default_static_qlinear_qdq, + 'Add': *default_static_qlinear, + 'Squeeze': *default_static_qlinear_qdq, + 'Reshape': *default_static_qlinear_qdq, + 'Concat': *default_static_qlinear_qdq, + 'AveragePool': *default_static_qlinear_qdq, + 'Unsqueeze': *default_static_qlinear_qdq, + 'Transpose': *default_static_qlinear_qdq, + 'Resize': *default_static_qlinear_qdq, + }, + 'dynamic': { + 'Conv': { + 'weight': *uint8_asym_perchanneltensor_minmax, + 'activation': *uint8_asym_pertensor_minmax + }, + 'FusedConv': { + 'weight': *uint8_asym_perchanneltensor_minmax, + 'activation': *uint8_asym_pertensor_minmax + }, + 'MatMul': { + 'weight': *int8_sym_perchanneltensor_minmax, + 'activation': *uint8_asym_pertensor_minmax + }, + 'Gather': *default_dynamic, + 'Attention': *default_dynamic, + 'EmbedLayerNormalization': *default_dynamic, + 'LSTM': *default_dynamic, + } + } + recipes: + <<: *default_optimization + +- + version: + name: '1.9.0' + int8: { + 'static': { + 'FusedConv': { + 'weight': *int8_sym_perchanneltensor_minmax, # QDQ: *int8_sym_pertensor_minmax + 'activation': *uint8_asym_pertensor_minmax, + 'mode': ['QDQ', 'QLinear'] + }, + 'Conv': { + 'weight': *int8_sym_perchanneltensor_minmax, + 'activation': *uint8_asym_pertensor_minmax, + 'mode': ['QDQ', 'QLinear'] + }, + 'Gather': { + 'weight': *uint8_asym_perchanneltensor_minmax, + 'activation': *uint8_asym_pertensor_minmax, + 'mode': ['QDQ', 'QLinear'] + }, + 'MatMul': { + 'weight': *int8_sym_perchanneltensor_minmax, # QDQ: *int8_sym_pertensor_minmax + 'activation': *uint8_asym_pertensor_minmax, + 'mode': ['QDQ', 'QLinear'] + }, + 'EmbedLayerNormalization': { + 'weight': *uint8_asym_pertensor_minmax, # QDQ: *int8_sym_pertensor_minmax + 'activation': *uint8_asym_pertensor_minmax, + 'mode': ['QDQ', 'QLinear'] + }, + 'Attention': *default_static_qlinear_qdq, + 'Mul': *default_static_qlinear, + 'Relu': *default_static_qlinear_qdq, + 'Clip': *default_static_qlinear_qdq, + 'LeakyRelu': *default_static_qlinear_qdq, + 'Sigmoid': *default_static_qlinear_qdq, + 'MaxPool': *default_static_qlinear_qdq, + 'GlobalAveragePool': *default_static_qlinear_qdq, + 'Pad': *default_static_qlinear_qdq, + 'Split': *default_static_qlinear_qdq, + 'Add': *default_static_qlinear, + 'Squeeze': *default_static_qlinear_qdq, + 'Reshape': *default_static_qlinear_qdq, + 
'Concat': *default_static_qlinear_qdq, + 'AveragePool': *default_static_qlinear_qdq, + 'Unsqueeze': *default_static_qlinear_qdq, + 'Transpose': *default_static_qlinear_qdq, + 'Resize': *default_static_qlinear_qdq, + }, + 'dynamic': &ref_1_9_dynamic { + 'Conv': { + 'weight': *uint8_asym_pertensor_minmax, + 'activation': *uint8_asym_pertensor_minmax + }, + 'FusedConv': { + 'weight': *uint8_asym_pertensor_minmax, + 'activation': *uint8_asym_pertensor_minmax + }, + 'MatMul': { + 'weight': *int8_sym_perchanneltensor_minmax, + 'activation': *uint8_asym_pertensor_minmax + }, + 'EmbedLayerNormalization': { + 'weight': *uint8_asym_pertensor_minmax, + 'activation': *uint8_asym_pertensor_minmax + }, + 'Gather': *default_dynamic, + 'Attention': *default_dynamic, + 'LSTM': *default_dynamic, + } + } + recipes: + <<: *default_optimization + +- + version: + name: '1.10.0' + int8: { + 'static': { + 'FusedConv': { + 'weight': *int8_sym_perchanneltensor_minmax, # QDQ: *int8_sym_pertensor_minmax + 'activation': *uint8_asym_pertensor_minmax, + 'mode': ['QDQ', 'QLinear'] + }, + 'Conv': { + 'weight': *int8_sym_perchanneltensor_minmax, + 'activation': *uint8_asym_pertensor_minmax, + 'mode': ['QDQ', 'QLinear'] + }, + 'Gather': { + 'weight': *uint8_asym_perchanneltensor_minmax, + 'activation': *uint8_asym_pertensor_minmax, + 'mode': ['QDQ', 'QLinear'] + }, + 'MatMul': { + 'weight': *int8_sym_perchanneltensor_minmax, + 'activation': *uint8_asym_pertensor_minmax, + 'mode': ['QDQ', 'QLinear'] + }, + 'EmbedLayerNormalization': { + 'weight': *uint8_asym_pertensor_minmax, # QDQ: *int8_sym_pertensor_minmax + 'activation': *uint8_asym_pertensor_minmax, + 'mode': ['QDQ', 'QLinear'] + }, + 'Attention': *default_static_qlinear_qdq, + 'Mul': *default_static_qlinear, + 'Relu': *default_static_qlinear_qdq, + 'Clip': *default_static_qlinear_qdq, + 'LeakyRelu': *default_static_qlinear_qdq, + 'Sigmoid': *default_static_qlinear_qdq, + 'MaxPool': *default_static_qlinear_qdq, + 'GlobalAveragePool': *default_static_qlinear_qdq, + 'Pad': *default_static_qlinear_qdq, + 'Split': *default_static_qlinear_qdq, + 'Add': *default_static_qlinear, + 'Squeeze': *default_static_qlinear_qdq, + 'Reshape': *default_static_qlinear_qdq, + 'Concat': *default_static_qlinear_qdq, + 'AveragePool': *default_static_qlinear_qdq, + 'Unsqueeze': *default_static_qlinear_qdq, + 'Transpose': *default_static_qlinear_qdq, + 'Resize': *default_static_qlinear_qdq, + }, + 'dynamic': *ref_1_9_dynamic + } + recipes: + <<: *default_optimization + +- + version: + name: '1.11.0' + int8: &ref_1_11 { + 'static': { + 'FusedConv': { + 'weight': *int8_sym_perchanneltensor_minmax, # QDQ: *int8_sym_pertensor_minmax + 'activation': *uint8_asym_pertensor_minmax, + 'mode': ['QDQ', 'QLinear'] + }, + 'Conv': { + 'weight': *int8_sym_perchanneltensor_minmax, + 'activation': *uint8_asym_pertensor_minmax, + 'mode': ['QDQ', 'QLinear'] + }, + 'Gather': { + 'weight': *uint8_asym_perchanneltensor_minmax, + 'activation': *uint8_asym_pertensor_minmax, + 'mode': ['QDQ', 'QLinear'] + }, + 'MatMul': { + 'weight': *int8_sym_perchanneltensor_minmax, + 'activation': *uint8_asym_pertensor_minmax, + 'mode': ['QDQ', 'QLinear'] + }, + 'Gemm': { + 'weight': *int8_sym_perchanneltensor_minmax, + 'activation': *uint8_asym_pertensor_minmax, + 'mode': ['QDQ', 'QLinear'] + }, + 'EmbedLayerNormalization': { + 'weight': *uint8_asym_pertensor_minmax, # QDQ: *int8_sym_pertensor_minmax + 'activation': *uint8_asym_pertensor_minmax, + 'mode': ['QDQ', 'QLinear'] + }, + 'Attention': *default_static_qlinear_qdq, + 'Mul': 
*default_static_qlinear, + 'Relu': *default_static_qlinear_qdq, + 'Clip': *default_static_qlinear_qdq, + 'LeakyRelu': *default_static_qlinear_qdq, + 'Sigmoid': *default_static_qlinear_qdq, + 'MaxPool': *default_static_qlinear_qdq, + 'GlobalAveragePool': *default_static_qlinear_qdq, + 'Pad': *default_static_qlinear_qdq, + 'Split': *default_static_qlinear_qdq, + 'Add': *default_static_qlinear, + 'Squeeze': *default_static_qlinear_qdq, + 'Reshape': *default_static_qlinear_qdq, + 'Concat': *default_static_qlinear_qdq, + 'AveragePool': *default_static_qlinear_qdq, + 'Unsqueeze': *default_static_qlinear_qdq, + 'Transpose': *default_static_qlinear_qdq, + 'ArgMax': *default_static_qlinear, + 'Resize': *default_static_qlinear_qdq, + }, + 'dynamic': *ref_1_9_dynamic + } + recipes: + <<: *default_optimization + +- + version: + name: '1.14.0' + int8: *ref_1_11 + bf16: &common_bf16 ['MatMul', 'Gemm'] + recipes: + <<: *default_optimization + +- + version: + name: 'default' + int8: *ref_1_6 + recipes: + <<: *default_optimization diff --git a/neural_compressor/adaptor/ox_utils/util.py b/neural_compressor/adaptor/ox_utils/util.py index de369ca33a0..0040db9f21e 100644 --- a/neural_compressor/adaptor/ox_utils/util.py +++ b/neural_compressor/adaptor/ox_utils/util.py @@ -70,13 +70,15 @@ PROVIDERS = { 'default': 'CPUExecutionProvider', 'onnxrt_trt_ep': 'TensorrtExecutionProvider', + 'onnxrt_dnnl_ep': 'DnnlExecutionProvider', 'onnxrt_cuda_ep': 'CUDAExecutionProvider', } ONNXRT_BACKENDS = { 'CPUExecutionProvider': 'default', 'TensorrtExecutionProvider': 'onnxrt_trt_ep', - 'CUDAExecutionProvider': 'onnxrt_cuda_ep' + 'CUDAExecutionProvider': 'onnxrt_cuda_ep', + 'DnnlExecutionProvider': 'onnxrt_dnnl_ep' } def dtype_to_name(dtype_mapping, dtype): diff --git a/neural_compressor/config.py b/neural_compressor/config.py index 4528926ea71..a4a661cf0a6 100644 --- a/neural_compressor/config.py +++ b/neural_compressor/config.py @@ -259,7 +259,8 @@ class BenchmarkConfig: inputs (list, optional): A list of strings containing the inputs of model. Default is an empty list. outputs (list, optional): A list of strings containing the outputs of model. Default is an empty list. backend (str, optional): Backend name for model execution. Supported values include: 'default', 'itex', - 'ipex', 'onnxrt_trt_ep', 'onnxrt_cuda_ep'. Default value is 'default'. + 'ipex', 'onnxrt_trt_ep', 'onnxrt_cuda_ep', 'onnxrt_dnnl_ep'. + Default value is 'default'. warmup (int, optional): The number of iterations to perform warmup before running performance tests. Default value is 5. iteration (int, optional): The number of iterations to run performance tests. Default is -1. @@ -327,7 +328,7 @@ def backend(self): def backend(self, backend): """Set backend.""" if _check_value('backend', backend, str, [ - 'default', 'itex', 'ipex', 'onnxrt_trt_ep', 'onnxrt_cuda_ep']): + 'default', 'itex', 'ipex', 'onnxrt_trt_ep', 'onnxrt_cuda_ep', 'onnxrt_dnnl_ep']): self._backend = backend @property @@ -672,7 +673,8 @@ class _BaseQuantizationConfig: Args: inputs: Inputs of model, only required in tensorflow. outputs: Outputs of model, only required in tensorflow. - backend: Backend for model execution. Support 'default', 'itex', 'ipex', 'onnxrt_trt_ep', 'onnxrt_cuda_ep' + backend: Backend for model execution. + Support 'default', 'itex', 'ipex', 'onnxrt_trt_ep', 'onnxrt_cuda_ep', 'onnxrt_dnnl_ep' domain: Model domain. Support 'auto', 'cv', 'object_detection', 'nlp' and 'recommendation_system'. 
Adaptor will use specific quantization settings for different domains automatically, and explicitly specified quantization settings will override the automatic setting. @@ -1038,7 +1040,7 @@ def backend(self): @backend.setter def backend(self, backend): if _check_value('backend', backend, str, [ - 'default', 'itex', 'ipex', 'onnxrt_trt_ep', 'onnxrt_cuda_ep']): + 'default', 'itex', 'ipex', 'onnxrt_trt_ep', 'onnxrt_cuda_ep', 'onnxrt_dnnl_ep']): self._backend = backend @property @@ -1083,7 +1085,8 @@ class PostTrainingQuantConfig(_BaseQuantizationConfig): Args: device: Support 'cpu' and 'gpu'. - backend: Backend for model execution. Support 'default', 'itex', 'ipex', 'onnxrt_trt_ep', 'onnxrt_cuda_ep' + backend: Backend for model execution. + Support 'default', 'itex', 'ipex', 'onnxrt_trt_ep', 'onnxrt_cuda_ep', 'onnxrt_dnnl_ep' domain: Model domain. Support 'auto', 'cv', 'object_detection', 'nlp' and 'recommendation_system'. Adaptor will use specific quantization settings for different domains automatically, and explicitly specified quantization settings will override the automatic setting. @@ -1242,7 +1245,8 @@ class QuantizationAwareTrainingConfig(_BaseQuantizationConfig): Args: device: Support 'cpu' and 'gpu'. - backend: Backend for model execution. Support 'default', 'itex', 'ipex', 'onnxrt_trt_ep', 'onnxrt_cuda_ep' + backend: Backend for model execution. + Support 'default', 'itex', 'ipex', 'onnxrt_trt_ep', 'onnxrt_cuda_ep', 'onnxrt_dnnl_ep' inputs: Inputs of model, only required in tensorflow. outputs: Outputs of model, only required in tensorflow. op_type_dict: Tuning constraints on optype-wise for advance user to reduce tuning space. @@ -1684,8 +1688,8 @@ class MixedPrecisionConfig(object): device (str, optional): Device for execution. Support 'cpu' and 'gpu', default is 'cpu'. backend (str, optional): Backend for model execution. - Support 'default', 'itex', 'ipex', 'onnxrt_trt_ep', 'onnxrt_cuda_ep', - default is 'default', 'ipex' doesn't support tune. + Support 'default', 'itex', 'ipex', 'onnxrt_trt_ep', 'onnxrt_cuda_ep', 'onnxrt_dnnl_ep' + default is 'default'. precisions ([str, list], optional): Target precision for mix precision conversion. Support 'bf16' and 'fp16', default is 'bf16'. model_name (str, optional): The name of the model. Default value is empty. 
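For reference, the new `onnxrt_dnnl_ep` backend is driven through the same mixed-precision API as the other ONNX Runtime backends. Below is a minimal sketch that mirrors the example script added in this patch (model and output paths are placeholders):

```python
import onnx

from neural_compressor import MixedPrecisionConfig
from neural_compressor.mix_precision import fit

# Load the fp32 ONNX model produced by export.py (placeholder path).
model = onnx.load("bert-base-uncased-mrpc.onnx")

# Select the DNNL execution provider backend and request bf16 conversion,
# as the mix_precision example main.py in this patch does.
config = MixedPrecisionConfig(backend="onnxrt_dnnl_ep", precision="bf16")
converted_model = fit(model, config)

# Save the converted model; DnnlExecutionProvider must be available at inference time.
converted_model.save("bert-base-uncased-mrpc-bf16.onnx")
```

Running the converted model then only requires that `DnnlExecutionProvider` appears in `onnxruntime.get_available_providers()`, as noted in the example README.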
diff --git a/neural_compressor/strategy/strategy.py b/neural_compressor/strategy/strategy.py index 3624b342286..e5ac78fadd1 100644 --- a/neural_compressor/strategy/strategy.py +++ b/neural_compressor/strategy/strategy.py @@ -1250,6 +1250,11 @@ def _set_framework_info(self, q_dataloader, q_func=None): framework_specific_info['backend'] == 'onnxrt_trt_ep': framework_specific_info.update({'format': 'QDQ'}) framework = 'onnxrt_qdq' + if framework_specific_info['backend'] == 'onnxrt_cuda_ep' and self.cfg.device =='gpu': + framework_specific_info['use_fp16'] = True + framework_specific_info['use_bf16'] = True + if framework_specific_info['backend'] == 'onnxrt_dnnl_ep' and self.cfg.device == 'cpu': + framework_specific_info['use_bf16'] = True if framework == 'pytorch_ipex' or framework == 'pytorch' or framework == 'pytorch_fx': if self.config.backend == 'ipex': framework = 'pytorch_ipex' From 60831abff3052f051ac626b4bdad86e42b892d5e Mon Sep 17 00:00:00 2001 From: Mengni Wang Date: Sat, 27 May 2023 21:31:52 +0800 Subject: [PATCH 02/21] add op and fix bug Signed-off-by: Mengni Wang --- .../scripts/codeScan/pyspelling/inc_dict.txt | 1 + .../mix_precision/README.md | 2 +- neural_compressor/adaptor/onnxrt_dnnl.yaml | 8 +- neural_compressor/strategy/strategy.py | 4 +- .../onnxrt_adaptor/test_onnxrt_operators.py | 171 ++++++++++++++++++ 5 files changed, 182 insertions(+), 4 deletions(-) diff --git a/.azure-pipelines/scripts/codeScan/pyspelling/inc_dict.txt b/.azure-pipelines/scripts/codeScan/pyspelling/inc_dict.txt index 1ba7cd7a55e..af758fcce72 100644 --- a/.azure-pipelines/scripts/codeScan/pyspelling/inc_dict.txt +++ b/.azure-pipelines/scripts/codeScan/pyspelling/inc_dict.txt @@ -495,6 +495,7 @@ dnf dnn dnnl DNNL +DnnlExecutionProvider Dockerfile doclist docstrings diff --git a/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/README.md b/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/README.md index f5d976123a3..69e02a5059a 100644 --- a/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/README.md +++ b/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/README.md @@ -52,7 +52,7 @@ bash prepare_data.sh --data_dir=$GLUE_DIR --task_name=$TASK_NAME # Run -If the hardware doesn't support bf16 instruction, please set flag as below to force bf16 conversion (this way will be deperecated): +If the hardware doesn't support bf16 instruction, please set flag as below to force bf16 conversion (this way will be deprecated): ```shell export FORCE_BF16=1 diff --git a/neural_compressor/adaptor/onnxrt_dnnl.yaml b/neural_compressor/adaptor/onnxrt_dnnl.yaml index 704d63ead93..2d2d718130c 100644 --- a/neural_compressor/adaptor/onnxrt_dnnl.yaml +++ b/neural_compressor/adaptor/onnxrt_dnnl.yaml @@ -399,7 +399,13 @@ version: name: '1.14.0' int8: *ref_1_11 - bf16: &common_bf16 ['MatMul', 'Gemm'] + bf16: &common_bf16 ['MatMul', 'Gemm', 'BatchNormalization', 'Softmax', 'Sum', + 'Abs', 'BiasGelu', 'Exp', 'FastGelu', 'Gelu', 'Log', 'Relu', 'Round', 'Sigmoid', + 'Sqrt', 'Tanh', 'Add', 'Sub', 'Mul', 'Div', 'Pow', 'ReduceMean', 'Equal', + 'FusedMatMul', 'Greater', 'GreaterOrEqual', 'LeakyRelu', 'Less', 'LessOrEqual', + 'Reshape', 'Squeeze', 'Transpose', 'Unsqueeze', 'ReduceL1', 'ReduceL2', 'ReduceLogSum', + 'ReduceLogSumExp', 'ReduceMax', 'ReduceProd', 'ReduceSum', 'ReduceSumSquare', + 'LayerNormalization', 'Concat'] recipes: <<: *default_optimization diff --git a/neural_compressor/strategy/strategy.py b/neural_compressor/strategy/strategy.py 
index e5ac78fadd1..512c3a5d20b 100644 --- a/neural_compressor/strategy/strategy.py +++ b/neural_compressor/strategy/strategy.py @@ -1250,10 +1250,10 @@ def _set_framework_info(self, q_dataloader, q_func=None): framework_specific_info['backend'] == 'onnxrt_trt_ep': framework_specific_info.update({'format': 'QDQ'}) framework = 'onnxrt_qdq' - if framework_specific_info['backend'] == 'onnxrt_cuda_ep' and self.cfg.device =='gpu': + if framework_specific_info['backend'] == 'onnxrt_cuda_ep' and self.config.device =='gpu': framework_specific_info['use_fp16'] = True framework_specific_info['use_bf16'] = True - if framework_specific_info['backend'] == 'onnxrt_dnnl_ep' and self.cfg.device == 'cpu': + if framework_specific_info['backend'] == 'onnxrt_dnnl_ep' and self.config.device == 'cpu': framework_specific_info['use_bf16'] = True if framework == 'pytorch_ipex' or framework == 'pytorch' or framework == 'pytorch_fx': if self.config.backend == 'ipex': diff --git a/test/adaptor/onnxrt_adaptor/test_onnxrt_operators.py b/test/adaptor/onnxrt_adaptor/test_onnxrt_operators.py index fc29326a3ca..4b99baf1e7f 100644 --- a/test/adaptor/onnxrt_adaptor/test_onnxrt_operators.py +++ b/test/adaptor/onnxrt_adaptor/test_onnxrt_operators.py @@ -10,6 +10,7 @@ from neural_compressor.adaptor.ox_utils.util import QuantizedInitializer, QuantizedValue, QuantizationMode import onnxruntime as ort from neural_compressor.config import ONNXQlinear2QDQConfig +from neural_compressor.utils.utility import CpuInfo def build_model(): initializers = [] @@ -1174,6 +1175,176 @@ def test_fp16(self): session = ort.InferenceSession(convert_model.model.SerializeToString(), providers=['CUDAExecutionProvider']) outputs = session.run(None, input_data) + def get_bf16_mixed_precision_model(self, model): + from neural_compressor import MixedPrecisionConfig + from neural_compressor.mix_precision import fit + config = MixedPrecisionConfig(backend='onnxrt_dnnl_ep', precision='bf16') + converted_model = fit(model, config) + return converted_model + + @unittest.skipIf(not CpuInfo().bf16 or 'DnnlExecutionProvider' not in ort.get_all_providers(), + "skip since DnnlExecutionProvider is not supported") + def test_bf16(self): + optypes = ['Sum', 'Sub', 'Div', 'Pow', 'Add'] + for optype in optypes: + inps = [['input1', TensorProto.FLOAT, (1,2)]] + outs = [['output', TensorProto.FLOAT, (1,2)]] + weights = [['input2', TensorProto.FLOAT, (1,2), np.random.random((2))]] + node_infos = [['test', ['input1', 'input2'], ['output'], optype]] + model = self.build_model(inps, outs, weights, node_infos) + input_data = self.build_test_data(['input1'], [(1,2)], ['float32']) + convert_model = self.get_bf16_mixed_precision_model(model) + self.assertTrue('Cast' in set([i.op_type for i in convert_model.nodes()])) + self.assertTrue(16 in set([i.attribute[0].i for i in convert_model.nodes() if i.op_type == 'Cast'])) + session = ort.InferenceSession(convert_model.model.SerializeToString(), providers=['DnnlExecutionProvider']) + outputs = session.run(None, input_data) + + optypes = ['Equal', 'Greater', 'GreaterOrEqual', 'Less', 'LessOrEqual'] + for optype in optypes: + inps = [['input1', TensorProto.FLOAT, (1,2)]] + outs = [['output', TensorProto.BOOL, (1,2)]] + weights = [['input2', TensorProto.FLOAT, (1,2), np.random.random((2))]] + node_infos = [['test', ['input1', 'input2'], ['output'], optype]] + model = self.build_model(inps, outs, weights, node_infos) + input_data = self.build_test_data(['input1'], [(1,2)], ['float32']) + convert_model = 
self.get_bf16_mixed_precision_model(model) + self.assertTrue('Cast' in set([i.op_type for i in convert_model.nodes()])) + self.assertTrue(16 in set([i.attribute[0].i for i in convert_model.nodes() if i.op_type == 'Cast'])) + session = ort.InferenceSession(convert_model.model.SerializeToString(), providers=['DnnlExecutionProvider']) + outputs = session.run(None, input_data) + + optypes = ['Abs', 'Exp', 'Log', 'Round', 'Sqrt', 'Softmax', 'Exp', 'Tanh', 'Sigmoid', 'LeakyRelu', 'Round'] + for optype in optypes: + inps = [['input1', TensorProto.FLOAT, (1,2)]] + outs = [['output', TensorProto.FLOAT, (1,2)]] + node_infos = [['test', ['input1'], ['output'], optype]] + model = self.build_model(inps, outs, [], node_infos) + input_data = self.build_test_data(['input1'], [(1,2)], ['float32']) + convert_model = self.get_bf16_mixed_precision_model(model) + self.assertTrue('Cast' in set([i.op_type for i in convert_model.nodes()])) + self.assertTrue(16 in set([i.attribute[0].i for i in convert_model.nodes() if i.op_type == 'Cast'])) + session = ort.InferenceSession(convert_model.model.SerializeToString(), providers=['DnnlExecutionProvider']) + outputs = session.run(None, input_data) + + optypes = ['ReduceMean', 'ReduceL1', 'ReduceL2', 'ReduceLogSum', 'ReduceLogSumExp', 'ReduceMax', 'ReduceProd', \ + 'ReduceSum', 'ReduceSumSquare'] + for optype in optypes: + inps = [['input1', TensorProto.FLOAT, (1,2)]] + outs = [['output', TensorProto.FLOAT, (1,1)]] + node_infos = [['test', ['input1'], ['output'], optype]] + model = self.build_model(inps, outs, [], node_infos) + input_data = self.build_test_data(['input1'], [(1,2)], ['float32']) + convert_model = self.get_bf16_mixed_precision_model(model) + self.assertTrue('Cast' in set([i.op_type for i in convert_model.nodes()])) + self.assertTrue(16 in set([i.attribute[0].i for i in convert_model.nodes() if i.op_type == 'Cast'])) + session = ort.InferenceSession(convert_model.model.SerializeToString(), providers=['DnnlExecutionProvider']) + outputs = session.run(None, input_data) + + optypes = ['Gelu'] + for optype in optypes: + inps = [['input1', TensorProto.FLOAT, (1,2)]] + outs = [['output', TensorProto.FLOAT, (1,2)]] + node_infos = [['test', ['input1'], ['output'], optype, 'com.microsoft']] + model = self.build_model(inps, outs, [], node_infos) + input_data = self.build_test_data(['input1'], [(1,2)], ['float32']) + convert_model = self.get_bf16_mixed_precision_model(model) + self.assertTrue('Cast' in set([i.op_type for i in convert_model.nodes()])) + self.assertTrue(16 in set([i.attribute[0].i for i in convert_model.nodes() if i.op_type == 'Cast'])) + session = ort.InferenceSession(convert_model.model.SerializeToString(), providers=['DnnlExecutionProvider']) + outputs = session.run(None, input_data) + + optypes = ['BiasGelu', 'FastGelu'] + for optype in optypes: + inps = [['input1', TensorProto.FLOAT, [2]]] + outs = [['output', TensorProto.FLOAT, [2]]] + weights = [['input2', TensorProto.FLOAT, [2], np.random.random((2))]] + node_infos = [['test', ['input1', 'input2'], ['output'], optype, 'com.microsoft']] + model = self.build_model(inps, outs, weights, node_infos) + input_data = self.build_test_data(['input1'], [(2)], ['float32']) + convert_model = self.get_bf16_mixed_precision_model(model) + self.assertTrue('Cast' in set([i.op_type for i in convert_model.nodes()])) + self.assertTrue(16 in set([i.attribute[0].i for i in convert_model.nodes() if i.op_type == 'Cast'])) + session = ort.InferenceSession(convert_model.model.SerializeToString(), 
providers=['DnnlExecutionProvider']) + outputs = session.run(None, input_data) + + + optypes = ['MatMul'] + for optype in optypes: + inps = [['input1', TensorProto.FLOAT, (1,2)]] + outs = [['output', TensorProto.FLOAT, (1,1)]] + weights = [['input2', TensorProto.FLOAT, (2,1), np.random.random((2))]] + node_infos = [['test', ['input1', 'input2'], ['output'], optype]] + model = self.build_model(inps, outs, weights, node_infos) + input_data = self.build_test_data(['input1'], [(1,2)], ['float32']) + convert_model = self.get_bf16_mixed_precision_model(model) + self.assertTrue('Cast' in set([i.op_type for i in convert_model.nodes()])) + self.assertTrue(16 in set([i.attribute[0].i for i in convert_model.nodes() if i.op_type == 'Cast'])) + session = ort.InferenceSession(convert_model.model.SerializeToString(), providers=['DnnlExecutionProvider']) + outputs = session.run(None, input_data) + + optypes = ['FusedMatMul'] + for optype in optypes: + inps = [['input1', TensorProto.FLOAT, (1,2)]] + outs = [['output', TensorProto.FLOAT, (1,1)]] + weights = [['input2', TensorProto.FLOAT, (2,1), np.random.random((2))]] + node_infos = [['test', ['input1', 'input2'], ['output'], optype, 'com.microsoft']] + model = self.build_model(inps, outs, weights, node_infos) + ort.InferenceSession(model.SerializeToString()) + input_data = self.build_test_data(['input1'], [(1,2)], ['float32']) + convert_model = self.get_bf16_mixed_precision_model(model) + self.assertTrue('Cast' in set([i.op_type for i in convert_model.nodes()])) + self.assertTrue(16 in set([i.attribute[0].i for i in convert_model.nodes() if i.op_type == 'Cast'])) + session = ort.InferenceSession(convert_model.model.SerializeToString(), providers=['DnnlExecutionProvider']) + outputs = session.run(None, input_data) + + optypes = ['Gemm'] + for optype in optypes: + inps = [['input1', TensorProto.FLOAT, (1,2)]] + outs = [['output', TensorProto.FLOAT, (1,2)]] + weights = [['input2', TensorProto.FLOAT, (2,1), np.random.random((2))], + ['input3', TensorProto.FLOAT, [], np.random.random((1))]] + node_infos = [['test', ['input1', 'input2', 'input3'], ['output'], optype]] + model = self.build_model(inps, outs, weights, node_infos) + input_data = self.build_test_data(['input1'], [(1,2)], ['float32']) + convert_model = self.get_bf16_mixed_precision_model(model) + self.assertTrue('Cast' in set([i.op_type for i in convert_model.nodes()])) + self.assertTrue(16 in set([i.attribute[0].i for i in convert_model.nodes() if i.op_type == 'Cast'])) + session = ort.InferenceSession(convert_model.model.SerializeToString(), providers=['DnnlExecutionProvider']) + outputs = session.run(None, input_data) + + optypes = ['LayerNormalization'] + for optype in optypes: + inps = [['input1', TensorProto.FLOAT, (1,2)]] + outs = [['output1', TensorProto.FLOAT, (1,2)], ['output2', TensorProto.FLOAT, (1,2)], ['output3', TensorProto.FLOAT, (1,2)]] + weights = [['input2', TensorProto.FLOAT, (2,1), np.random.random((2))], + ['input3', TensorProto.FLOAT, (2,1), np.random.random((2))]] + node_infos = [['test', ['input1', 'input2', 'input3'], ['output1', 'output2', 'output3'], optype]] + model = self.build_model(inps, outs, weights, node_infos) + input_data = self.build_test_data(['input1'], [(1,2)], ['float32']) + convert_model = self.get_bf16_mixed_precision_model(model) + self.assertTrue('Cast' in set([i.op_type for i in convert_model.nodes()])) + self.assertTrue(16 in set([i.attribute[0].i for i in convert_model.nodes() if i.op_type == 'Cast'])) + session = 
ort.InferenceSession(convert_model.model.SerializeToString(), providers=['DnnlExecutionProvider']) + outputs = session.run(None, input_data) + + optypes = ['BatchNormalization'] + for optype in optypes: + inps = [['input1', TensorProto.FLOAT, [1, 2]]] + outs = [['output1', TensorProto.FLOAT, [1, 2]]] + weights = [['input2', TensorProto.FLOAT, [2], np.random.random((2))], + ['input3', TensorProto.FLOAT, [2], np.random.random((2))], + ['input4', TensorProto.FLOAT, [2], np.random.random((2))], + ['input5', TensorProto.FLOAT, [2], np.random.random((2))],] + node_infos = [['test', ['input1', 'input2', 'input3', 'input4', 'input5'], ['output1'], optype]] + model = self.build_model(inps, outs, weights, node_infos) + ort.InferenceSession(model.SerializeToString()) + input_data = self.build_test_data(['input1'], [(1,2)], ['float32']) + convert_model = self.get_bf16_mixed_precision_model(model) + self.assertTrue('Cast' in set([i.op_type for i in convert_model.nodes()])) + self.assertTrue(16 in set([i.attribute[0].i for i in convert_model.nodes() if i.op_type == 'Cast'])) + session = ort.InferenceSession(convert_model.model.SerializeToString(), providers=['DnnlExecutionProvider']) + outputs = session.run(None, input_data) + if __name__ == "__main__": unittest.main() From b5f2cc884d1aefde933a48535130f8bbf870ff14 Mon Sep 17 00:00:00 2001 From: Mengni Wang Date: Sat, 27 May 2023 23:29:40 +0800 Subject: [PATCH 03/21] fix pre optimize Signed-off-by: Mengni Wang --- neural_compressor/adaptor/onnxrt.py | 87 ++++++++++++++--------------- 1 file changed, 42 insertions(+), 45 deletions(-) diff --git a/neural_compressor/adaptor/onnxrt.py b/neural_compressor/adaptor/onnxrt.py index c0c6bcfc31d..9b3fcd5e15c 100644 --- a/neural_compressor/adaptor/onnxrt.py +++ b/neural_compressor/adaptor/onnxrt.py @@ -623,51 +623,48 @@ def _pre_optimize(self, model, level=1): from neural_compressor.adaptor.ox_utils.util import \ remove_init_from_model_input, split_shared_bias remove_init_from_model_input(model) - try: - sess_options = ort.SessionOptions() - optimization_levels = { - 'DISABLE_ALL': ort.GraphOptimizationLevel.ORT_DISABLE_ALL, - 'ENABLE_BASIC': ort.GraphOptimizationLevel.ORT_ENABLE_BASIC, - 'ENABLE_EXTENDED': ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED, - 'ENABLE_ALL': ort.GraphOptimizationLevel.ORT_ENABLE_ALL} - if not isinstance(self.query_handler.get_graph_optimization(), list): - level = self.query_handler.get_graph_optimization() - elif options.onnxrt.graph_optimization.level is not None: - level = options.onnxrt.graph_optimization.level - elif self.recipes.get('graph_optimization_level', None) is not None: - level = self.recipes['graph_optimization_level'] - else: - if self.domain == "auto" and self._detect_domain(model): - self.domain = 'nlp' - level = 'ENABLE_EXTENDED' if self.domain == 'nlp' else 'ENABLE_BASIC' - logger.warning("Graph optimization level is automatically set to {}. 
" - "You can use 'recipe' argument in 'PostTrainingQuantConfig'" - "to overwrite it".format(level)) - sess_options.graph_optimization_level = optimization_levels[level] - sess_options.optimized_model_filepath = os.path.join(self.work_space, \ - "Optimized_model.onnx") - if sys.version_info < (3,10) and find_spec('onnxruntime_extensions'): # pragma: no cover - from onnxruntime_extensions import get_library_path - sess_options.register_custom_ops_library(get_library_path()) - if not model.is_large_model: - ort.InferenceSession(model.model.SerializeToString(), - sess_options, - providers=[self.backend]) - elif model.model_path is not None: # pragma: no cover - ort.InferenceSession(model.model_path, - sess_options, - providers=[self.backend]) - else: # pragma: no cover - logger.warning('Please use model path instead of onnx model object to quantize') - - tmp_model = onnx.load(sess_options.optimized_model_filepath, load_external_data=False) - - if model.is_large_model: # pragma: no cover - from onnx.external_data_helper import load_external_data_for_model - load_external_data_for_model(tmp_model, os.path.split(model.model_path)[0]) - model.model_path = sess_options.optimized_model_filepath - except: - tmp_model = model + sess_options = ort.SessionOptions() + optimization_levels = { + 'DISABLE_ALL': ort.GraphOptimizationLevel.ORT_DISABLE_ALL, + 'ENABLE_BASIC': ort.GraphOptimizationLevel.ORT_ENABLE_BASIC, + 'ENABLE_EXTENDED': ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED, + 'ENABLE_ALL': ort.GraphOptimizationLevel.ORT_ENABLE_ALL} + if not isinstance(self.query_handler.get_graph_optimization(), list): + level = self.query_handler.get_graph_optimization() + elif options.onnxrt.graph_optimization.level is not None: + level = options.onnxrt.graph_optimization.level + elif self.recipes.get('graph_optimization_level', None) is not None: + level = self.recipes['graph_optimization_level'] + else: + if self.domain == "auto" and self._detect_domain(model): + self.domain = 'nlp' + level = 'ENABLE_EXTENDED' if self.domain == 'nlp' else 'ENABLE_BASIC' + logger.warning("Graph optimization level is automatically set to {}. 
" + "You can use 'recipe' argument in 'PostTrainingQuantConfig'" + "to overwrite it".format(level)) + sess_options.graph_optimization_level = optimization_levels[level] + sess_options.optimized_model_filepath = os.path.join(self.work_space, \ + "Optimized_model.onnx") + if sys.version_info < (3,10) and find_spec('onnxruntime_extensions'): # pragma: no cover + from onnxruntime_extensions import get_library_path + sess_options.register_custom_ops_library(get_library_path()) + if not model.is_large_model: + ort.InferenceSession(model.model.SerializeToString(), + sess_options, + providers=['CPUExecutionProvider']) + elif model.model_path is not None: # pragma: no cover + ort.InferenceSession(model.model_path, + sess_options, + providers=['CPUExecutionProvider']) + else: # pragma: no cover + logger.warning('Please use model path instead of onnx model object to quantize') + + tmp_model = onnx.load(sess_options.optimized_model_filepath, load_external_data=False) + + if model.is_large_model: # pragma: no cover + from onnx.external_data_helper import load_external_data_for_model + load_external_data_for_model(tmp_model, os.path.split(model.model_path)[0]) + model.model_path = sess_options.optimized_model_filepath model.model = self._replace_gemm_with_matmul(tmp_model).model if \ options.onnxrt.graph_optimization.gemm2matmul and self.recipes.get('gemm_to_matmul', True) else \ tmp_model From b21aa0d734477c3d5f354a4249cf4ff84efccba4 Mon Sep 17 00:00:00 2001 From: Mengni Wang Date: Sun, 28 May 2023 21:06:12 +0800 Subject: [PATCH 04/21] fix ut Signed-off-by: Mengni Wang --- neural_compressor/adaptor/onnxrt.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/neural_compressor/adaptor/onnxrt.py b/neural_compressor/adaptor/onnxrt.py index 9b3fcd5e15c..3bb698d893c 100644 --- a/neural_compressor/adaptor/onnxrt.py +++ b/neural_compressor/adaptor/onnxrt.py @@ -65,7 +65,8 @@ def __init__(self, framework_specific_info): self.recipes = framework_specific_info.get("recipes", {}) self.backend = PROVIDERS[framework_specific_info["backend"]] self.performance_only = framework_specific_info.get("performance_only", False) - self.use_bf16 = framework_specific_info.get("use_bf16", False) + self.use_bf16 = framework_specific_info.get("use_bf16", False) and \ + self.backend in ort.get_available_providers() self.use_fp16 = framework_specific_info.get("use_fp16", False) if self.backend not in ort.get_all_providers(): @@ -863,8 +864,7 @@ def query_fw_capability(self, model): precisions = query.get_precisions() for precision in precisions: - if precision == 'fp16' and \ - (not self.use_fp16 or 'CUDAExecutionProvider' not in ort.get_available_providers()): + if precision == 'fp16' and not self.use_fp16: continue if precision == 'bf16' and \ (not self.use_bf16 or (not CpuInfo().bf16 and os.getenv('FORCE_BF16') != '1')): From b2d4b4008b3d7a5aa4bba6fe838fe81af0952b78 Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Sun, 4 Jun 2023 12:27:05 +0800 Subject: [PATCH 05/21] Update config.py --- neural_compressor/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neural_compressor/config.py b/neural_compressor/config.py index a4a661cf0a6..58ec22ebae9 100644 --- a/neural_compressor/config.py +++ b/neural_compressor/config.py @@ -1834,7 +1834,7 @@ def backend(self): def backend(self, backend): """Set backend.""" if _check_value('backend', backend, str, [ - 'default', 'itex', 'ipex', 'onnxrt_trt_ep', 'onnxrt_cuda_ep']): + 'default', 'itex', 'ipex', 'onnxrt_trt_ep', 'onnxrt_cuda_ep', 
'onnxrt_dnnl_ep']): self._backend = backend @property From 0fc1ab9874833fffae5949bd0accc5db7168c86a Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Tue, 6 Jun 2023 10:26:54 +0800 Subject: [PATCH 06/21] Update mix_precision.py --- neural_compressor/mix_precision.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/neural_compressor/mix_precision.py b/neural_compressor/mix_precision.py index 057344591ab..2444466f691 100644 --- a/neural_compressor/mix_precision.py +++ b/neural_compressor/mix_precision.py @@ -98,11 +98,14 @@ def fit(model, precisions = list(set(conf.precisions) - set(conf.excluded_precisions)) if ('bf16' in precisions or 'fp16' in precisions) and conf.framework == "onnxruntime": # pragma: no cover - if conf.device == "cpu": - logger.warning("Mix precision exits due to device isn't gpu for onnx models.") + if 'fp16' in precisions and not (conf.device == "gpu" and conf.backend == "onnxrt_cuda_ep"): + logger.warning("Mix precision exits due to fp16 for onnx models" \ + "needs 'gpu' device and 'onnxrt_cuda_ep' backend.") sys.exit(0) - elif conf.backend != "onnxrt_cuda_ep": - logger.warning("Mix precision exits due to backend isn't onnxrt_cuda_ep for onnx models.") + elif 'bf16' in precisions and (not (conf.backend == "onnxrt_cuda_ep" and conf.device == "gpu") and \ + not (conf.backend == "onnxrt_dnnl_ep" and conf.device == "cpu")): + logger.warning("Mix precision exits due to bf16 for onnx models needs " \ + "'gpu' device and 'onnxrt_cuda_ep' backend, or 'cpu' device and 'onnxrt_dnnl_ep' backend.") sys.exit(0) elif 'bf16' in precisions and not CpuInfo().bf16 and conf.framework != "onnxruntime": # pragma: no cover if os.getenv('FORCE_BF16') == '1': From 82853cc578727cec86857e25b47fa600087bb3dd Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Tue, 6 Jun 2023 10:30:02 +0800 Subject: [PATCH 07/21] Update quantization.md --- docs/source/quantization.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/source/quantization.md b/docs/source/quantization.md index e52e5ee702f..30ef8931ebd 100644 --- a/docs/source/quantization.md +++ b/docs/source/quantization.md @@ -470,6 +470,12 @@ Intel(R) Neural Compressor support multi-framework: PyTorch, Tensorflow, ONNX Ru "onnxrt_cuda_ep" gpu + + DnnlExecutionProvider + OneDNN + "onnxrt_dnnl_ep" + cpu + Tensorflow Tensorflow From 08fe94b68762c5044842f69a75df6ce69c933ca9 Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Tue, 6 Jun 2023 10:31:04 +0800 Subject: [PATCH 08/21] Update quantization.md --- docs/source/quantization.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/quantization.md b/docs/source/quantization.md index 30ef8931ebd..a9c770e729b 100644 --- a/docs/source/quantization.md +++ b/docs/source/quantization.md @@ -452,7 +452,7 @@ Intel(R) Neural Compressor support multi-framework: PyTorch, Tensorflow, ONNX Ru cpu - ONNX Runtime + ONNX Runtime CPUExecutionProvider MLAS "default" From bd7aa586594b7ca46780c8adef10d9424298f205 Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Tue, 6 Jun 2023 12:31:05 +0800 Subject: [PATCH 09/21] Update mixed_precision.md --- docs/source/mixed_precision.md | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/docs/source/mixed_precision.md b/docs/source/mixed_precision.md index 560bcc41e33..579ac765125 100644 --- a/docs/source/mixed_precision.md +++ b/docs/source/mixed_precision.md @@ -17,6 +17,7 @@ The recently launched 3rd Gen Intel® Xeon® Scalable processor (codenamed Coope

## Mixed Precision Support Matrix + @@ -48,7 +49,7 @@ The recently launched 3rd Gen Intel® Xeon® Scalable processor (codenamed Coope - + @@ -72,6 +73,14 @@ The recently launched 3rd Gen Intel® Xeon® Scalable processor (codenamed Coope + + + + + + + + @@ -162,4 +171,4 @@ converted_model.save('./path/to/save/') - Quick started with [helloworld example](/examples/helloworld/tf_example3) - PyTorch [ResNet18](/examples/pytorch/image_recognition/torchvision_models/mixed_precision/resnet18) - IPEX [DistilBERT base](/examples/pytorch/nlp/huggingface_models/question-answering/mixed_precision/ipex) -- Tensorflow [ResNet50](/examples/tensorflow/image_recognition/tensorflow_models/resnet50_v1/mixed_precision) \ No newline at end of file +- Tensorflow [ResNet50](/examples/tensorflow/image_recognition/tensorflow_models/resnet50_v1/mixed_precision) From fec595c5817139cf25dc0985fb536811338c1baf Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Tue, 6 Jun 2023 13:14:44 +0800 Subject: [PATCH 10/21] Update inc_dict.txt --- .azure-pipelines/scripts/codeScan/pyspelling/inc_dict.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/.azure-pipelines/scripts/codeScan/pyspelling/inc_dict.txt b/.azure-pipelines/scripts/codeScan/pyspelling/inc_dict.txt index af758fcce72..548cdaebfc0 100644 --- a/.azure-pipelines/scripts/codeScan/pyspelling/inc_dict.txt +++ b/.azure-pipelines/scripts/codeScan/pyspelling/inc_dict.txt @@ -564,6 +564,7 @@ enum env environ ep +eps eq erf Erf From 381f1bd61c374d85e19c5ab1b2d403b2e0047dae Mon Sep 17 00:00:00 2001 From: Mengni Wang Date: Tue, 4 Jul 2023 10:23:42 +0800 Subject: [PATCH 11/21] fix bug Signed-off-by: Mengni Wang --- neural_compressor/adaptor/onnxrt.py | 10 ++++++++++ neural_compressor/adaptor/ox_utils/calibration.py | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/neural_compressor/adaptor/onnxrt.py b/neural_compressor/adaptor/onnxrt.py index 3bb698d893c..9930b768873 100644 --- a/neural_compressor/adaptor/onnxrt.py +++ b/neural_compressor/adaptor/onnxrt.py @@ -212,6 +212,16 @@ def quantize(self, tune_cfg, model, data_loader, q_func=None): return model if model.model.opset_import[0].version < 11: # pragma: no cover logger.warning("Quantize input needs model opset 11 or newer.") + if self.backend == 'DnnlExecutionProvider' and \ + any([i.domain in ['', 'ai.onnx'] and i.version < 15 for i in model.model.opset_import]): + from onnx import version_converter + try: + model.model = version_converter.convert_version(model.model, 15) + except: + logging.warning("Fail to upgrade model opset_import to >= 15, "\ + "please upgrate it manually to run with bf16 data type") + exit(0) + from neural_compressor.adaptor.ox_utils.util import QuantizationMode if self.format == "qlinearops": format = QuantizationMode.QLinearOps diff --git a/neural_compressor/adaptor/ox_utils/calibration.py b/neural_compressor/adaptor/ox_utils/calibration.py index bb6060096cd..37befd20044 100644 --- a/neural_compressor/adaptor/ox_utils/calibration.py +++ b/neural_compressor/adaptor/ox_utils/calibration.py @@ -467,7 +467,7 @@ def calculate_quantization_params(self, q_config, quantization_thresholds): if tensor_name in output_name_to_nodes: parent = output_name_to_nodes[tensor_name] if parent and parent.name in q_config and \ - q_config[parent.name] not in ['fp32', 'fp16']: + q_config[parent.name] not in ['fp32', 'fp16', 'bf16']: scheme = q_config[parent.name]['activation']['scheme'] qType = q_config[parent.name]['activation']['dtype'] elif self.backend in ['TensorrtExecutionProvider']: From 
1d6d4df940175dc10ca985f708f4e308844cc8dc Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Wed, 5 Jul 2023 19:45:01 +0800 Subject: [PATCH 12/21] Update quantizer.py --- neural_compressor/adaptor/ox_utils/quantizer.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/neural_compressor/adaptor/ox_utils/quantizer.py b/neural_compressor/adaptor/ox_utils/quantizer.py index 04558648fe3..35421ee03bf 100644 --- a/neural_compressor/adaptor/ox_utils/quantizer.py +++ b/neural_compressor/adaptor/ox_utils/quantizer.py @@ -246,8 +246,11 @@ def merge_dedicated_qdq_pair(self): def should_cast(self, node): """Check if node should be casted.""" if node.name in self.config and self.config[node.name] != 'fp32': # pragma: no cover - return True - else: + parent = self.model.get_parent(node, 0) + if parent is not None and (parent.op_type != 'Cast' or parent.attribute[0].i in [1, 10, 16]): + return True + elif parent is None and node.input[0] in self.model.input(): + return True return False def insert_qdq(self): From 20fc55d4b98a4fe7e212a9bf88b386d23532dd25 Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Wed, 5 Jul 2023 19:45:51 +0800 Subject: [PATCH 13/21] Update onnxrt.py --- neural_compressor/adaptor/onnxrt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neural_compressor/adaptor/onnxrt.py b/neural_compressor/adaptor/onnxrt.py index 9930b768873..1e6cf9d06a0 100644 --- a/neural_compressor/adaptor/onnxrt.py +++ b/neural_compressor/adaptor/onnxrt.py @@ -216,7 +216,7 @@ def quantize(self, tune_cfg, model, data_loader, q_func=None): any([i.domain in ['', 'ai.onnx'] and i.version < 15 for i in model.model.opset_import]): from onnx import version_converter try: - model.model = version_converter.convert_version(model.model, 15) + model.model = self._rename_node(version_converter.convert_version(model.model, 15)) except: logging.warning("Fail to upgrade model opset_import to >= 15, "\ "please upgrate it manually to run with bf16 data type") From e1c0dbeddf4c38527b70c4658f6dbbf4a52f7d5b Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Thu, 6 Jul 2023 10:10:04 +0800 Subject: [PATCH 14/21] Update model.py --- neural_compressor/model/model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/neural_compressor/model/model.py b/neural_compressor/model/model.py index e73efbb4feb..66632ece602 100644 --- a/neural_compressor/model/model.py +++ b/neural_compressor/model/model.py @@ -79,9 +79,9 @@ def _is_onnxruntime(model): from onnxruntime_extensions import get_library_path so.register_custom_ops_library(get_library_path()) if isinstance(model, str): - ort.InferenceSession(model, so, providers=['CPUExecutionProvider']) + ort.InferenceSession(model, so, providers=ort.get_available_providers()) else: - ort.InferenceSession(model.SerializeToString(), so, providers=['CPUExecutionProvider']) + ort.InferenceSession(model.SerializeToString(), so, providers=ort.get_available_providers()) except Exception as e: # pragma: no cover if 'Message onnx.ModelProto exceeds maximum protobuf size of 2GB' in str(e): logger.warning('Please use model path instead of onnx model object to quantize') From f38b29751a468dd17cc2c1dcf868d70580bf7500 Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Thu, 6 Jul 2023 10:55:44 +0800 Subject: [PATCH 15/21] Update onnxrt_dnnl.yaml --- neural_compressor/adaptor/onnxrt_dnnl.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neural_compressor/adaptor/onnxrt_dnnl.yaml b/neural_compressor/adaptor/onnxrt_dnnl.yaml index 
2d2d718130c..847331b346d 100644 --- a/neural_compressor/adaptor/onnxrt_dnnl.yaml +++ b/neural_compressor/adaptor/onnxrt_dnnl.yaml @@ -400,7 +400,7 @@ name: '1.14.0' int8: *ref_1_11 bf16: &common_bf16 ['MatMul', 'Gemm', 'BatchNormalization', 'Softmax', 'Sum', - 'Abs', 'BiasGelu', 'Exp', 'FastGelu', 'Gelu', 'Log', 'Relu', 'Round', 'Sigmoid', + 'Abs', 'Exp', 'FastGelu', 'Gelu', 'Log', 'Relu', 'Round', 'Sigmoid', 'Sqrt', 'Tanh', 'Add', 'Sub', 'Mul', 'Div', 'Pow', 'ReduceMean', 'Equal', 'FusedMatMul', 'Greater', 'GreaterOrEqual', 'LeakyRelu', 'Less', 'LessOrEqual', 'Reshape', 'Squeeze', 'Transpose', 'Unsqueeze', 'ReduceL1', 'ReduceL2', 'ReduceLogSum', From 65c41de0efcf46fc570dbe0e138cf512f916c735 Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Thu, 6 Jul 2023 17:02:51 +0800 Subject: [PATCH 16/21] Update onnxrt_dnnl.yaml --- neural_compressor/adaptor/onnxrt_dnnl.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neural_compressor/adaptor/onnxrt_dnnl.yaml b/neural_compressor/adaptor/onnxrt_dnnl.yaml index 847331b346d..2d2d718130c 100644 --- a/neural_compressor/adaptor/onnxrt_dnnl.yaml +++ b/neural_compressor/adaptor/onnxrt_dnnl.yaml @@ -400,7 +400,7 @@ name: '1.14.0' int8: *ref_1_11 bf16: &common_bf16 ['MatMul', 'Gemm', 'BatchNormalization', 'Softmax', 'Sum', - 'Abs', 'Exp', 'FastGelu', 'Gelu', 'Log', 'Relu', 'Round', 'Sigmoid', + 'Abs', 'BiasGelu', 'Exp', 'FastGelu', 'Gelu', 'Log', 'Relu', 'Round', 'Sigmoid', 'Sqrt', 'Tanh', 'Add', 'Sub', 'Mul', 'Div', 'Pow', 'ReduceMean', 'Equal', 'FusedMatMul', 'Greater', 'GreaterOrEqual', 'LeakyRelu', 'Less', 'LessOrEqual', 'Reshape', 'Squeeze', 'Transpose', 'Unsqueeze', 'ReduceL1', 'ReduceL2', 'ReduceLogSum', From d6c34a58bbe3f13fe6098707ff169e85999e7324 Mon Sep 17 00:00:00 2001 From: yuwenzho Date: Fri, 14 Jul 2023 15:05:10 +0800 Subject: [PATCH 17/21] fix keyerror bug in _optypewise_filter_for_qdq Signed-off-by: yuwenzho --- neural_compressor/adaptor/onnxrt.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/neural_compressor/adaptor/onnxrt.py b/neural_compressor/adaptor/onnxrt.py index 1e6cf9d06a0..ae0173979d4 100644 --- a/neural_compressor/adaptor/onnxrt.py +++ b/neural_compressor/adaptor/onnxrt.py @@ -1081,6 +1081,8 @@ def _optypewise_filter_for_qdq(self, optype_wise): '1.11.0': ['Conv', 'Gather', 'MatMul', 'Gemm'], '1.12.0': ['Conv', 'Gather', 'MatMul', 'Gemm']} specific_cfg_version = self.query_handler.get_specific_cfg_version() + if Version(specific_cfg_version) > ONNXRT112_VERSION: + specific_cfg_version = '1.12.0' for optype, caps in optype_wise.items(): if optype not in supported_perchannel_optypes[specific_cfg_version]: for cap in caps: From 389ff129eeb4145c22fae3444e845afa16bd6247 Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Mon, 17 Jul 2023 14:05:52 +0800 Subject: [PATCH 18/21] Update mixed_precision.md --- docs/source/mixed_precision.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/source/mixed_precision.md b/docs/source/mixed_precision.md index 579ac765125..70ea2d98a89 100644 --- a/docs/source/mixed_precision.md +++ b/docs/source/mixed_precision.md @@ -171,4 +171,5 @@ converted_model.save('./path/to/save/') - Quick started with [helloworld example](/examples/helloworld/tf_example3) - PyTorch [ResNet18](/examples/pytorch/image_recognition/torchvision_models/mixed_precision/resnet18) - IPEX [DistilBERT base](/examples/pytorch/nlp/huggingface_models/question-answering/mixed_precision/ipex) -- Tensorflow 
[ResNet50](/examples/tensorflow/image_recognition/tensorflow_models/resnet50_v1/mixed_precision) +- Tensorflow [ResNet50](/examples/tensorflow/image_recognition/tensorflow_models/resnet50_v1/mixed_precision) +- ONNX Runtime [Bert base](/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision) From 52dc9b7b4b68a339bbaacb1eaf31f798680cc0a8 Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Mon, 17 Jul 2023 15:38:05 +0800 Subject: [PATCH 19/21] Update README.md --- .../text_classification/mix_precision/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/README.md b/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/README.md index 69e02a5059a..25857c31adc 100644 --- a/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/README.md +++ b/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/README.md @@ -37,7 +37,7 @@ Supported model identifier from [huggingface.co](https://huggingface.co/): | Intel/bart-large-mrpc | ```bash -python export.py --model_name_or_path=Intel/bert-base-uncased-mrpc # or other supported model identifier +optimum-cli export onnx --model Intel/bert-base-uncased-mrpc --task text-classification ``` ## 3. Prepare Dataset From f496568abdb5f071d776220c612d51876dd2d41b Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Mon, 17 Jul 2023 15:38:31 +0800 Subject: [PATCH 20/21] Update requirements.txt --- .../text_classification/mix_precision/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/requirements.txt b/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/requirements.txt index eb99b8d165d..89803cfb0d2 100644 --- a/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/requirements.txt +++ b/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/requirements.txt @@ -6,3 +6,4 @@ coloredlogs sympy onnxruntime-extensions; python_version < '3.10' numpy +optimum[exporters] From e782acc0db57cd0fd18a0736457404bda828425e Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Mon, 17 Jul 2023 15:38:46 +0800 Subject: [PATCH 21/21] Delete export.py --- .../mix_precision/export.py | 74 ------------------- 1 file changed, 74 deletions(-) delete mode 100644 examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/export.py diff --git a/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/export.py b/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/export.py deleted file mode 100644 index 589fe3a345e..00000000000 --- a/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision/export.py +++ /dev/null @@ -1,74 +0,0 @@ -import argparse - -import torch -from transformers import AutoConfig, AutoModelForSequenceClassification - -def export_onnx_model(args, model): - with torch.no_grad(): - symbolic_names = {0: 'batch_size', 1: 'max_seq_len'} - if args.model_name_or_path in ['Intel/roberta-base-mrpc', - 'Intel/xlm-roberta-base-mrpc', - 'Intel/camembert-base-mrpc', - 'distilbert-base-uncased-finetuned-sst-2-english']: - inputs = {'input_ids': torch.ones(1, args.max_len, dtype=torch.int64), - 'attention_mask': torch.ones(1, args.max_len, dtype=torch.int64)} - torch.onnx.export(model, # model being run - (inputs['input_ids'], # model input (or a tuple for multiple inputs) - inputs['attention_mask']), - args.output_model, # 
where to save the model (can be a file or file-like object) - opset_version=14, # the ONNX version to export the model - do_constant_folding=True, # whether to execute constant folding - input_names=['input_ids', # the model's input names - 'attention_mask'], - output_names=['logits'], - dynamic_axes={'input_ids': symbolic_names, # variable length axes - 'attention_mask' : symbolic_names}) - else: - inputs = {'input_ids': torch.ones(1, args.max_len, dtype=torch.int64), - 'attention_mask': torch.ones(1, args.max_len, dtype=torch.int64), - 'token_type_ids': torch.ones(1, args.max_len, dtype=torch.int64)} - torch.onnx.export(model, # model being run - (inputs['input_ids'], # model input (or a tuple for multiple inputs) - inputs['attention_mask'], - inputs['token_type_ids']), - args.output_model, # where to save the model (can be a file or file-like object) - opset_version=14, # the ONNX version to export the model - do_constant_folding=True, # whether to execute constant folding - input_names=['input_ids', # the model's input names - 'attention_mask', - 'token_type_ids'], - output_names=['logits'], - dynamic_axes={'input_ids': symbolic_names, # variable length axes - 'attention_mask' : symbolic_names, - 'token_type_ids' : symbolic_names}) - print("ONNX Model exported to {0}".format(args.output_model)) - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description='Export huggingface onnx model', - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument( - '--model_name_or_path', - type=str, - choices=['Intel/bert-base-uncased-mrpc', - 'Intel/roberta-base-mrpc', - 'Intel/xlm-roberta-base-mrpc', - 'Intel/camembert-base-mrpc', - 'distilbert-base-uncased-finetuned-sst-2-english', - 'Alireza1044/albert-base-v2-sst2', - 'philschmid/MiniLM-L6-H384-uncased-sst2', - 'Intel/MiniLM-L12-H384-uncased-mrpc'], - help='pretrained model name or path') - parser.add_argument( - '--max_len', - type=int, - default=128, - help='Maximum length of the sentence pairs') - args = parser.parse_args() - args.output_model = args.model_name_or_path.split('/')[-1] + '.onnx' - - model = AutoModelForSequenceClassification.from_pretrained( - args.model_name_or_path, - config=AutoConfig.from_pretrained(args.model_name_or_path)) - - export_onnx_model(args, model) \ No newline at end of file
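One practical note on the opset check added to onnxrt.py by patch 11/21: when the automatic upgrade fails, the log asks the user to upgrade the model manually before running the bf16 path. A minimal sketch of doing that by hand with the same onnx.version_converter API the patch calls (file names are placeholders):

```python
import onnx
from onnx import version_converter

# Placeholder paths; the DNNL bf16 path requires ai.onnx opset >= 15.
model = onnx.load("model.onnx")

# Find the default-domain opset version and upgrade it to 15 if it is older.
opset = next((op.version for op in model.opset_import if op.domain in ("", "ai.onnx")), None)
if opset is not None and opset < 15:
    model = version_converter.convert_version(model, 15)

onnx.save(model, "model_opset15.onnx")
```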
[Remaining fragments here are displaced HTML table cells from the docs diffs above; the recoverable rows list ONNX Runtime with CPUExecutionProvider (MLAS, "default") and with DnnlExecutionProvider (OneDNN, "onnxrt_dnnl_ep", cpu).]
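Finally, because the evaluation flow depends on DnnlExecutionProvider actually being registered in the installed onnxruntime, a quick pre-flight check along these lines (illustrative only; the model path is a placeholder) confirms which provider a session picks up:

```python
import onnxruntime as ort

# Illustrative check; "model.onnx" stands in for the exported/converted model.
available = ort.get_available_providers()
print("available providers:", available)

# Prefer the DNNL provider when present, otherwise fall back to the default CPU provider.
providers = ["DnnlExecutionProvider"] if "DnnlExecutionProvider" in available else ["CPUExecutionProvider"]
session = ort.InferenceSession("model.onnx", providers=providers)
print("providers used by the session:", session.get_providers())
```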