From 6e1b1da712d20d9291e5932974bc3167b00dd214 Mon Sep 17 00:00:00 2001 From: Zixuan Cheng <110808245+violetch24@users.noreply.github.com> Date: Fri, 26 Jul 2024 15:58:00 +0800 Subject: [PATCH] add ipex xpu example to 3x API (#1948) Signed-off-by: violetch24 --- .../quantization/static_quant/ipex/README.md | 100 +++ .../quantization/static_quant/ipex/main.py | 551 +++++++++++++ .../static_quant/ipex/requirements.txt | 3 + .../static_quant/ipex/run_benchmark.sh | 89 +++ .../static_quant/ipex/run_quant.sh | 58 ++ .../quantization/static_quant/ipex/utils.py | 47 ++ .../quantization/static_quant/ipex/README.md | 57 ++ .../static_quant/ipex/requirements.txt | 5 + .../static_quant/ipex/run_benchmark.sh | 104 +++ .../quantization/static_quant/ipex/run_qa.py | 738 ++++++++++++++++++ .../static_quant/ipex/run_quant.sh | 64 ++ .../static_quant/ipex/trainer_qa.py | 105 +++ .../static_quant/ipex/utils_qa.py | 481 ++++++++++++ 13 files changed, 2402 insertions(+) create mode 100644 examples/3.x_api/pytorch/image_recognition/torchvision_models/quantization/static_quant/ipex/README.md create mode 100644 examples/3.x_api/pytorch/image_recognition/torchvision_models/quantization/static_quant/ipex/main.py create mode 100644 examples/3.x_api/pytorch/image_recognition/torchvision_models/quantization/static_quant/ipex/requirements.txt create mode 100644 examples/3.x_api/pytorch/image_recognition/torchvision_models/quantization/static_quant/ipex/run_benchmark.sh create mode 100644 examples/3.x_api/pytorch/image_recognition/torchvision_models/quantization/static_quant/ipex/run_quant.sh create mode 100644 examples/3.x_api/pytorch/image_recognition/torchvision_models/quantization/static_quant/ipex/utils.py create mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/question-answering/quantization/static_quant/ipex/README.md create mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/question-answering/quantization/static_quant/ipex/requirements.txt create mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/question-answering/quantization/static_quant/ipex/run_benchmark.sh create mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/question-answering/quantization/static_quant/ipex/run_qa.py create mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/question-answering/quantization/static_quant/ipex/run_quant.sh create mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/question-answering/quantization/static_quant/ipex/trainer_qa.py create mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/question-answering/quantization/static_quant/ipex/utils_qa.py diff --git a/examples/3.x_api/pytorch/image_recognition/torchvision_models/quantization/static_quant/ipex/README.md b/examples/3.x_api/pytorch/image_recognition/torchvision_models/quantization/static_quant/ipex/README.md new file mode 100644 index 00000000000..a17eb188d0a --- /dev/null +++ b/examples/3.x_api/pytorch/image_recognition/torchvision_models/quantization/static_quant/ipex/README.md @@ -0,0 +1,100 @@ +Step-by-Step +============ + +This document describes the step-by-step instructions for reproducing PyTorch tuning results with Intel® Neural Compressor. + +# Prerequisite + +## 1. Environment + +We verified examples with IPEX backend on Python 3.10, recommended. + +```shell +pip install -r requirements.txt +``` + +## 2. Install Intel-Pytorch-Extension + +Please refer to [intel/intel-extension-for-pytorch(github.com)](https://github.com/intel/intel-extension-for-pytorch). 
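+
+After installing either backend below, you can run a quick import check (a minimal sketch, not part of the original example) to confirm that IPEX and the Neural Compressor 3.x PyTorch API are both importable in your environment:
+
+```shell
+# Prints the installed torch / IPEX versions; fails fast if either package is missing.
+python -c "import torch, intel_extension_for_pytorch as ipex; print(torch.__version__, ipex.__version__)"
+# Verifies that the 3.x static-quantization entry points used by main.py are available.
+python -c "from neural_compressor.torch.quantization import prepare, convert; print('INC 3.x PyTorch API OK')"
+```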
+
+### Install IPEX CPU
+
+ > Note: The GCC 9 compiler is recommended.
+
+ ```shell
+ python -m pip install intel_extension_for_pytorch -f https://software.intel.com/ipex-whl-stable
+ ```
+
+### Install IPEX XPU
+Please build an IPEX docker container according to the [official guide](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=gpu&version=v2.1.30%2bxpu&os=linux%2fwsl2&package=docker).
+
+You can run a simple sanity test to confirm that the correct version is installed and that the software stack can detect the hardware on your system. The command should print the installed PyTorch and IPEX versions, as well as information about the detected GPU card(s).
+```bash
+source {DPCPPROOT}/env/vars.sh
+source {MKLROOT}/env/vars.sh
+source {CCLROOT}/env/vars.sh
+source {MPIROOT}/env/vars.sh
+python -c "import torch; import intel_extension_for_pytorch as ipex; print(torch.__version__); print(ipex.__version__); [print(f'[{i}]: {torch.xpu.get_device_properties(i)}') for i in range(torch.xpu.device_count())];"
+```
+Please also refer to this [tutorial](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=gpu&version=v2.1.30%2bxpu&os=linux%2fwsl2&package=conda) to check system requirements and install dependencies.
+
+## 3. Prepare Dataset
+
+Download the raw [ImageNet](http://www.image-net.org/) images to a directory such as /path/to/imagenet. The directory should contain the following folders:
+
+```bash
+ls /path/to/imagenet
+train val
+```
+
+# Run with CPU
+
+> Note: Any torchvision model name can be passed as long as it is included in `torchvision.models`; the commands below are examples.
+
+### 1. ResNet18 With Intel PyTorch Extension
+
+```shell
+python main.py -t -a resnet18 --ipex --pretrained /path/to/imagenet
+```
+or
+```shell
+bash run_quant.sh --input_model=resnet18 --dataset_location=/path/to/imagenet
+bash run_benchmark.sh --input_model=resnet18 --dataset_location=/path/to/imagenet --mode=performance/accuracy --int8=true/false
+```
+
+### 2. ResNet50 With Intel PyTorch Extension
+
+```shell
+python main.py -t -a resnet50 --ipex --pretrained /path/to/imagenet
+```
+or
+```shell
+bash run_quant.sh --input_model=resnet50 --dataset_location=/path/to/imagenet
+bash run_benchmark.sh --input_model=resnet50 --dataset_location=/path/to/imagenet --mode=performance/accuracy --int8=true/false
+```
+
+### 3. ResNext101_32x16d With Intel PyTorch Extension
+
+```shell
+python main.py -t -a resnext101_32x16d_wsl --hub --ipex --pretrained /path/to/imagenet
+```
+or
+```shell
+bash run_quant.sh --input_model=resnext101_32x16d_wsl --dataset_location=/path/to/imagenet
+bash run_benchmark.sh --input_model=resnext101_32x16d_wsl --dataset_location=/path/to/imagenet --mode=performance/accuracy --int8=true/false
+```
+
+# Run with XPU
+
+> Note: Any torchvision model name can be passed as long as it is included in `torchvision.models`; the commands below are examples.
+
+### 1.
ResNet18 With Intel PyTorch Extension + +```shell +python main.py -t -a resnet18 --ipex --pretrained /path/to/imagenet --xpu +``` +or +```shell +bash run_quant.sh --input_model=resnet18 --dataset_location=/path/to/imagenet +bash run_benchmark.sh --input_model=resnet18 --dataset_location=/path/to/imagenet --mode=performance/accuracy --int8=true/false --xpu=true/false +``` diff --git a/examples/3.x_api/pytorch/image_recognition/torchvision_models/quantization/static_quant/ipex/main.py b/examples/3.x_api/pytorch/image_recognition/torchvision_models/quantization/static_quant/ipex/main.py new file mode 100644 index 00000000000..a308aacad35 --- /dev/null +++ b/examples/3.x_api/pytorch/image_recognition/torchvision_models/quantization/static_quant/ipex/main.py @@ -0,0 +1,551 @@ +import argparse +import os +import random +import shutil +import time +import warnings +import sys + +import torch +import torch.nn as nn +import torch.nn.parallel +use_gpu = False +if use_gpu: + import torch.backends.cudnn as cudnn +#import torch.backends.cudnn as cudnn +import torch.distributed as dist +import torch.optim +import torch.multiprocessing as mp +import torch.utils.data +import torch.utils.data.distributed +import torchvision.transforms as transforms +import torchvision.datasets as datasets +import torchvision.models.quantization as quantize_models +import torchvision.models as models +from neural_compressor.adaptor.pytorch import get_torch_version +from packaging.version import Version +import intel_extension_for_pytorch as ipex + + +model_names = models.list_models(module=models) + +torch.hub._validate_not_a_forked_repo=lambda a,b,c: True +hub_model_names = torch.hub.list('facebookresearch/WSL-Images') +model_names += hub_model_names + +parser = argparse.ArgumentParser(description='PyTorch ImageNet Training') +parser.add_argument('data', metavar='DIR', + help='path to dataset') +parser.add_argument('--hub', action='store_true', default=False, + help='use model with torch hub') +parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet18', + choices=model_names, + help='model architecture: ' + + ' | '.join(model_names) + + ' (default: resnet18)') +parser.add_argument('-j', '--workers', default=0, type=int, metavar='N', + help='number of data loading workers (default: 4)') +parser.add_argument('--epochs', default=90, type=int, metavar='N', + help='number of total epochs to run') +parser.add_argument('--start-epoch', default=0, type=int, metavar='N', + help='manual epoch number (useful on restarts)') +parser.add_argument('-b', '--batch-size', default=256, type=int, + metavar='N', + help='mini-batch size (default: 256), this is the total ' + 'batch size of all GPUs on the current node when ' + 'using Data Parallel or Distributed Data Parallel') +parser.add_argument('--steps', default=-1, type=int, + help='steps for validation') +parser.add_argument('--lr', '--learning-rate', default=0.1, type=float, + metavar='LR', help='initial learning rate', dest='lr') +parser.add_argument('--momentum', default=0.9, type=float, metavar='M', + help='momentum') +parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float, + metavar='W', help='weight decay (default: 1e-4)', + dest='weight_decay') +parser.add_argument('-p', '--print-freq', default=10, type=int, + metavar='N', help='print frequency (default: 10)') +parser.add_argument('--resume', default='', type=str, metavar='PATH', + help='path to latest checkpoint (default: none)') +parser.add_argument('-e', '--evaluate', dest='evaluate', 
action='store_true', + help='evaluate model on validation set') +parser.add_argument('-t', '--tune', dest='tune', action='store_true', + help='tune best int8 model on calibration dataset') +parser.add_argument('--pretrained', dest='pretrained', action='store_true', + help='use pre-trained model') +parser.add_argument('--world-size', default=-1, type=int, + help='number of nodes for distributed training') +parser.add_argument('--rank', default=-1, type=int, + help='node rank for distributed training') +parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str, + help='url used to set up distributed training') +parser.add_argument('--dist-backend', default='nccl', type=str, + help='distributed backend') +parser.add_argument('--seed', default=None, type=int, + help='seed for initializing training. ') +parser.add_argument('--gpu', default=None, type=int, + help='GPU id to use.') +parser.add_argument('--ppn', default=1, type=int, + help='number of processes on each node of distributed training') +parser.add_argument('--multiprocessing-distributed', action='store_true', + help='Use multi-processing distributed training to launch ' + 'N processes per node, which has N GPUs. This is the ' + 'fastest way to use PyTorch for either single node or ' + 'multi node data parallel training') +parser.add_argument('-i', "--iter", default=0, type=int, + help='For accuracy measurement only.') +parser.add_argument('-w', "--warmup_iter", default=5, type=int, + help='For benchmark measurement only.') +parser.add_argument('--performance', dest='performance', action='store_true', + help='run benchmark') +parser.add_argument('-r', "--accuracy", dest='accuracy', action='store_true', + help='For accuracy measurement only.') +parser.add_argument("--tuned_checkpoint", default='./saved_results', type=str, metavar='PATH', + help='path to checkpoint tuned by Neural Compressor (default: ./)') +parser.add_argument('--int8', dest='int8', action='store_true', + help='run benchmark') +parser.add_argument('--ipex', dest='ipex', action='store_true', + help='tuning or benchmark with Intel PyTorch Extension') +parser.add_argument("--calib_iters", default=512, type=int, + help="calibration iters.") +parser.add_argument('--xpu', action='store_true', + help='whether use xpu') + +best_acc1 = 0 + + +def main(): + args = parser.parse_args() + print(args) + + if args.seed is not None: + random.seed(args.seed) + torch.manual_seed(args.seed) + cudnn.deterministic = True + warnings.warn('You have chosen to seed training. ' + 'This will turn on the CUDNN deterministic setting, ' + 'which can slow down your training considerably! ' + 'You may see unexpected behavior when restarting ' + 'from checkpoints.') + + if args.gpu is not None: + warnings.warn('You have chosen a specific GPU. 
This will completely ' + 'disable data parallelism.') + + if args.dist_url == "env://" and args.world_size == -1: + args.world_size = int(os.environ["WORLD_SIZE"]) + + args.distributed = args.world_size > 1 or args.ppn > 1 or args.multiprocessing_distributed + + if use_gpu: + ngpus_per_node = torch.cuda.device_count() + else: + ngpus_per_node = args.ppn + + #ngpus_per_node = torch.cuda.device_count() + if args.multiprocessing_distributed: + # Since we have ngpus_per_node processes per node, the total world_size + # needs to be adjusted accordingly + args.world_size = ngpus_per_node * args.world_size + # Use torch.multiprocessing.spawn to launch distributed processes: the + # main_worker process function + mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args)) + else: + # Simply call main_worker function + main_worker(args.gpu, ngpus_per_node, args) + + +def main_worker(gpu, ngpus_per_node, args): + global best_acc1 + pytorch_version = get_torch_version() + #args.gpu = gpu + #affinity = subprocess.check_output("lscpu | grep 'NUMA node[0-9]' | awk '{ print $4 }' | awk -F',' '{ print $1 }'", shell=True) + #os.environ['OMP_NUM_THREADS'] = '28' + #os.environ['KMP_AFFINITY'] = 'proclist=[{}],granularity=thread,explicit'.format(affinity.splitlines()[gpu].decode('utf-8')) + #print (os.environ['KMP_AFFINITY']) + + #if args.gpu is not None: + # print("Use GPU: {} for training".format(args.gpu)) + print("Use CPU: {} for training".format(gpu)) + + if args.distributed: + if args.dist_url == "env://" and args.rank == -1: + args.rank = int(os.environ["RANK"]) + if args.multiprocessing_distributed: + # For multiprocessing distributed training, rank needs to be the + # global rank among all the processes + args.rank = args.rank * ngpus_per_node + gpu + dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, + world_size=args.world_size, rank=args.rank) + + if args.hub: + torch.set_flush_denormal(True) + model = torch.hub.load('facebookresearch/WSL-Images', args.arch) + else: + # create model + if args.pretrained: + print("=> using pre-trained model '{}'".format(args.arch)) + if args.ipex or pytorch_version >= Version("1.7.0-rc1"): + model = models.__dict__[args.arch](pretrained=True) + else: + model = quantize_models.__dict__[args.arch](pretrained=True, quantize=False) + else: + print("=> creating model '{}'".format(args.arch)) + if args.ipex: + model = models.__dict__[args.arch]() + else: + model = quantize_models.__dict__[args.arch]() + + if args.ipex and not args.int8: + model = model.to(memory_format=torch.channels_last) + + if not torch.cuda.is_available(): + print('using CPU...') + elif args.distributed: + # For multiprocessing distributed, DistributedDataParallel constructor + # should always set the single device scope, otherwise, + # DistributedDataParallel will use all available devices. 
+ if args.gpu is not None: + torch.cuda.set_device(args.gpu) + model.cuda(args.gpu) + # When using a single GPU per process and per + # DistributedDataParallel, we need to divide the batch size + # ourselves based on the total number of GPUs we have + args.batch_size = int(args.batch_size / ngpus_per_node) + args.workers = int(args.workers / ngpus_per_node) + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) + else: + #model.cuda() + # DistributedDataParallel will divide and allocate batch_size to all + # available GPUs if device_ids are not set + model = torch.nn.parallel.DistributedDataParallelCPU(model) + elif args.gpu is not None: + torch.cuda.set_device(args.gpu) + model = model.cuda(args.gpu) + else: + # DataParallel will divide and allocate batch_size to all available GPUs + if args.arch.startswith('alexnet') or args.arch.startswith('vgg'): + model.features = torch.nn.DataParallel(model.features) + model.cuda() + else: + model = torch.nn.DataParallel(model) + if args.xpu: + model = model.to("xpu") + # define loss function (criterion) and optimizer + criterion = nn.CrossEntropyLoss() + #criterion = nn.CrossEntropyLoss().cuda(args.gpu) + + optimizer = torch.optim.SGD(model.parameters(), args.lr, + momentum=args.momentum, + weight_decay=args.weight_decay) + + # optionally resume from a checkpoint + if args.resume: + if os.path.isfile(args.resume): + print("=> loading checkpoint '{}'".format(args.resume)) + checkpoint = torch.load(args.resume) + args.start_epoch = checkpoint['epoch'] + best_acc1 = checkpoint['best_acc1'] + if args.gpu is not None: + # best_acc1 may be from a checkpoint from a different GPU + best_acc1 = best_acc1.to(args.gpu) + model.load_state_dict(checkpoint['state_dict']) + optimizer.load_state_dict(checkpoint['optimizer']) + print("=> loaded checkpoint '{}' (epoch {})" + .format(args.resume, checkpoint['epoch'])) + else: + print("=> no checkpoint found at '{}'".format(args.resume)) + + #cudnn.benchmark = True + + # Data loading code + traindir = os.path.join(args.data, 'train') + valdir = os.path.join(args.data, 'val') + normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) + + train_dataset = datasets.ImageFolder( + traindir, + transforms.Compose([ + transforms.RandomResizedCrop(224), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + normalize, + ])) + + if args.distributed: + train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) + else: + train_sampler = None + + train_loader = torch.utils.data.DataLoader( + train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None), + num_workers=args.workers, pin_memory=True, sampler=train_sampler) + + val_loader = torch.utils.data.DataLoader( + datasets.ImageFolder(valdir, transforms.Compose([ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + normalize, + ])), + batch_size=args.batch_size, shuffle=False, + num_workers=args.workers, pin_memory=True) + + if args.evaluate: + validate(val_loader, model, criterion, args) + + def eval_func(model): + accu = validate(val_loader, model, criterion, args) + return float(accu) + + if args.tune: + from neural_compressor.torch.quantization import get_default_static_config + quant_config = get_default_static_config() + + from neural_compressor.torch.algorithms.smooth_quant import move_input_to_device + from tqdm import tqdm + def run_fn(model): + calib_iter = 0 + for batch in tqdm(val_loader, total=args.calib_iters): + batch = 
move_input_to_device(batch, device=None) + if isinstance(batch, tuple) or isinstance(batch, list): + model(batch[0]) + elif isinstance(batch, dict): + model(**batch) + else: + model(batch) + + calib_iter += 1 + if calib_iter >= args.calib_iters: + break + return + + from utils import get_example_inputs + example_inputs = get_example_inputs(model, val_loader) + + from neural_compressor.torch.quantization import prepare, convert + model = prepare(model=model, quant_config=quant_config, example_inputs=example_inputs) + run_fn(model) + q_model = convert(model) + q_model.save(args.tuned_checkpoint) + return + + if args.performance or args.accuracy: + model.eval() + if args.int8: + print("load int8 model") + from neural_compressor.torch.quantization import load + model = load(os.path.abspath(os.path.expanduser(args.tuned_checkpoint))) + else: + from utils import get_example_inputs + example_inputs = get_example_inputs(model, val_loader) + model = ipex.optimize(model) + with torch.no_grad(): + model = torch.jit.trace(model, example_inputs) + model = torch.jit.freeze(model) + + if args.performance: + from neural_compressor.config import BenchmarkConfig + from neural_compressor import benchmark + b_conf = BenchmarkConfig(warmup=5, + iteration=args.iter, + cores_per_instance=4, + num_of_instance=1) + benchmark.fit(model, b_conf, b_dataloader=val_loader) + if args.accuracy: + validate(val_loader, model, criterion, args) + return + + for epoch in range(args.start_epoch, args.epochs): + if args.distributed: + train_sampler.set_epoch(epoch) + adjust_learning_rate(optimizer, epoch, args) + + # train for one epoch + train(train_loader, model, criterion, optimizer, epoch, args) + + # evaluate on validation set + acc1 = validate(val_loader, model, criterion, args) + + # remember best acc@1 and save checkpoint + is_best = acc1 > best_acc1 + best_acc1 = max(acc1, best_acc1) + + if not args.multiprocessing_distributed or (args.multiprocessing_distributed + and args.rank % ngpus_per_node == 0): + save_checkpoint({ + 'epoch': epoch + 1, + 'arch': args.arch, + 'state_dict': model.state_dict(), + 'best_acc1': best_acc1, + 'optimizer' : optimizer.state_dict(), + }, is_best) + +def train(train_loader, model, criterion, optimizer, epoch, args): + batch_time = AverageMeter('Time', ':6.3f') + data_time = AverageMeter('Data', ':6.3f') + losses = AverageMeter('Loss', ':.4e') + top1 = AverageMeter('Acc@1', ':6.2f') + top5 = AverageMeter('Acc@5', ':6.2f') + progress = ProgressMeter(len(train_loader), batch_time, data_time, losses, top1, + top5, prefix="Epoch: [{}]".format(epoch)) + + # switch to train mode + model.train() + + end = time.time() + for i, (input, target) in enumerate(train_loader): + # measure data loading time + data_time.update(time.time() - end) + + if args.gpu is not None: + input = input.cuda(args.gpu, non_blocking=True) + target = target.cuda(args.gpu, non_blocking=True) + + # compute output + output = model(input) + loss = criterion(output, target) + + # measure accuracy and record loss + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + losses.update(loss.item(), input.size(0)) + top1.update(acc1[0], input.size(0)) + top5.update(acc5[0], input.size(0)) + + # compute gradient and do SGD step + optimizer.zero_grad() + loss.backward() + optimizer.step() + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if i % args.print_freq == 0: + progress.print(i) + + +def validate(val_loader, model, criterion, args): + batch_time = AverageMeter('Time', ':6.3f') + losses = 
AverageMeter('Loss', ':.4e') + top1 = AverageMeter('Acc@1', ':6.2f') + top5 = AverageMeter('Acc@5', ':6.2f') + progress = ProgressMeter(len(val_loader), batch_time, losses, top1, top5, + prefix='Test: ') + + # switch to evaluate mode + with torch.no_grad(): + for i, (input, target) in enumerate(val_loader): + input = input.contiguous(memory_format=torch.channels_last) + if i >= args.warmup_iter: + start = time.time() + if args.gpu is not None: + input = input.cuda(args.gpu, non_blocking=True) + target = target.cuda(args.gpu, non_blocking=True) + if args.xpu: + input = input.to("xpu") + target = target.to("xpu") + + # compute output + output = model(input) + + # measure elapsed time + if i >= args.warmup_iter: + batch_time.update(time.time() - start) + + loss = criterion(output, target) + + # measure accuracy and record loss + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + losses.update(loss.item(), input.size(0)) + top1.update(acc1[0], input.size(0)) + top5.update(acc5[0], input.size(0)) + + + if i % args.print_freq == 0: + progress.print(i) + + if args.iter > 0 and i >= (args.warmup_iter + args.iter - 1): + break + + print('Batch size = %d' % args.batch_size) + print('Accuracy: {top1:.5f} Accuracy@5 {top5:.5f}' + .format(top1=(top1.avg / 100), top5=(top5.avg / 100))) + + return top1.avg/100 + + +def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'): + torch.save(state, filename) + if is_best: + shutil.copyfile(filename, 'model_best.pth.tar') + + +class AverageMeter(object): + """Computes and stores the average and current value""" + def __init__(self, name, fmt=':f'): + self.name = name + self.fmt = fmt + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + def __str__(self): + fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' + return fmtstr.format(**self.__dict__) + + +class ProgressMeter(object): + def __init__(self, num_batches, *meters, prefix=""): + self.batch_fmtstr = self._get_batch_fmtstr(num_batches) + self.meters = meters + self.prefix = prefix + + def print(self, batch): + entries = [self.prefix + self.batch_fmtstr.format(batch)] + entries += [str(meter) for meter in self.meters] + print('\t'.join(entries)) + + def _get_batch_fmtstr(self, num_batches): + num_digits = len(str(num_batches // 1)) + fmt = '{:' + str(num_digits) + 'd}' + return '[' + fmt + '/' + fmt.format(num_batches) + ']' + + +def adjust_learning_rate(optimizer, epoch, args): + """Sets the learning rate to the initial LR decayed by 10 every 30 epochs""" + lr = args.lr * (0.1 ** (epoch // 30)) + for param_group in optimizer.param_groups: + param_group['lr'] = lr + + +def accuracy(output, target, topk=(1,)): + """Computes the accuracy over the k top predictions for the specified values of k""" + with torch.no_grad(): + maxk = max(topk) + batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + + res = [] + for k in topk: + correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True) + res.append(correct_k.mul_(100.0 / batch_size)) + return res + + +if __name__ == '__main__': + main() diff --git a/examples/3.x_api/pytorch/image_recognition/torchvision_models/quantization/static_quant/ipex/requirements.txt b/examples/3.x_api/pytorch/image_recognition/torchvision_models/quantization/static_quant/ipex/requirements.txt new 
file mode 100644 index 00000000000..94f1a7356fe --- /dev/null +++ b/examples/3.x_api/pytorch/image_recognition/torchvision_models/quantization/static_quant/ipex/requirements.txt @@ -0,0 +1,3 @@ +neural-compressor +torch>=1.9.0 +torchvision>=0.10.0 diff --git a/examples/3.x_api/pytorch/image_recognition/torchvision_models/quantization/static_quant/ipex/run_benchmark.sh b/examples/3.x_api/pytorch/image_recognition/torchvision_models/quantization/static_quant/ipex/run_benchmark.sh new file mode 100644 index 00000000000..f5a2e251554 --- /dev/null +++ b/examples/3.x_api/pytorch/image_recognition/torchvision_models/quantization/static_quant/ipex/run_benchmark.sh @@ -0,0 +1,89 @@ +#!/bin/bash +set -x + +function main { + + init_params "$@" + run_benchmark + +} + +# init params +function init_params { + iters=100 + batch_size=32 + tuned_checkpoint=saved_results + for var in "$@" + do + case $var in + --topology=*) + topology=$(echo $var |cut -f2 -d=) + ;; + --dataset_location=*) + dataset_location=$(echo $var |cut -f2 -d=) + ;; + --input_model=*) + input_model=$(echo $var |cut -f2 -d=) + ;; + --mode=*) + mode=$(echo $var |cut -f2 -d=) + ;; + --batch_size=*) + batch_size=$(echo $var |cut -f2 -d=) + ;; + --iters=*) + iters=$(echo ${var} |cut -f2 -d=) + ;; + --int8=*) + int8=$(echo ${var} |cut -f2 -d=) + ;; + --xpu=*) + xpu=$(echo ${var} |cut -f2 -d=) + ;; + *) + echo "Error: No such parameter: ${var}" + exit 1 + ;; + esac + done + +} + + +# run_benchmark +function run_benchmark { + if [[ ${mode} == "accuracy" ]]; then + mode_cmd=" --accuracy" + elif [[ ${mode} == "performance" ]]; then + mode_cmd=" --iter ${iters} --performance " + else + echo "Error: No such mode: ${mode}" + exit 1 + fi + + extra_cmd="--ipex" + if [ "resnext101_32x16d_wsl_ipex" = "${topology}" ];then + extra_cmd=$extra_cmd" --hub" + fi + + if [[ ${int8} == "true" ]]; then + extra_cmd=$extra_cmd" --int8" + fi + + if [[ ${xpu} == "true" ]]; then + extra_cmd=$extra_cmd" --xpu" + fi + echo $extra_cmd + + + python main.py \ + --pretrained \ + --tuned_checkpoint ${tuned_checkpoint} \ + -b ${batch_size} \ + -a ${input_model} \ + ${mode_cmd} \ + ${extra_cmd} \ + ${dataset_location} +} + +main "$@" diff --git a/examples/3.x_api/pytorch/image_recognition/torchvision_models/quantization/static_quant/ipex/run_quant.sh b/examples/3.x_api/pytorch/image_recognition/torchvision_models/quantization/static_quant/ipex/run_quant.sh new file mode 100644 index 00000000000..5595b069671 --- /dev/null +++ b/examples/3.x_api/pytorch/image_recognition/torchvision_models/quantization/static_quant/ipex/run_quant.sh @@ -0,0 +1,58 @@ +#!/bin/bash +set -x + +function main { + + init_params "$@" + run_tuning + +} + +# init params +function init_params { + output_model=saved_results + for var in "$@" + do + case $var in + --topology=*) + topology=$(echo $var |cut -f2 -d=) + ;; + --dataset_location=*) + dataset_location=$(echo $var |cut -f2 -d=) + ;; + --input_model=*) + input_model=$(echo $var |cut -f2 -d=) + ;; + --output_model=*) + output_model=$(echo $var |cut -f2 -d=) + ;; + *) + echo "Error: No such parameter: ${var}" + exit 1 + ;; + esac + done + +} + +# run_tuning +function run_tuning { + extra_cmd="--ipex" + if [ -n "$output_model" ];then + extra_cmd=$extra_cmd" --tuned_checkpoint ${output_model}" + fi + if [[ "${topology}" == "resnext101_32x16d_wsl"* ]];then + extra_cmd=$extra_cmd" --hub " + fi + extra_cmd=$extra_cmd" ${dataset_location}" + + python main.py \ + --pretrained \ + -t \ + -a $input_model \ + -b 30 \ + ${extra_cmd} + +} + +main "$@" diff --git 
a/examples/3.x_api/pytorch/image_recognition/torchvision_models/quantization/static_quant/ipex/utils.py b/examples/3.x_api/pytorch/image_recognition/torchvision_models/quantization/static_quant/ipex/utils.py new file mode 100644 index 00000000000..76117f8b0b5 --- /dev/null +++ b/examples/3.x_api/pytorch/image_recognition/torchvision_models/quantization/static_quant/ipex/utils.py @@ -0,0 +1,47 @@ +import torch +from collections import UserDict +from packaging.version import Version +from neural_compressor.torch.utils import get_torch_version + +def get_example_inputs(model, dataloader): + version = get_torch_version() + from neural_compressor.torch.algorithms.smooth_quant import move_input_to_device + + # Suggest set dataloader like calib_dataloader + if dataloader is None: + return None + device = next(model.parameters()).device + try: + for idx, (input, label) in enumerate(dataloader): + input = move_input_to_device(input, device) + if isinstance(input, (dict, UserDict)): # pragma: no cover + assert version.release >= Version("1.12.0").release, "INC support IPEX version >= 1.12.0" + if "label" in input.keys(): + input.pop("label") + if version.release <= Version("2.0.1").release: + return tuple(input.values()) + else: + return dict(input) + if isinstance(input, (list, tuple)): + return tuple(input) + if isinstance(input, torch.Tensor): + return input + break + except Exception as e: # pragma: no cover + for idx, input in enumerate(dataloader): + input = move_input_to_device(input, device) + if isinstance(input, (dict, UserDict)): # pragma: no cover + assert version.release >= Version("1.12.0").release, "INC support IPEX version >= 1.12.0" + if "label" in input.keys(): + input.pop("label") + if version.release <= Version("2.0.1").release: + return tuple(input.values()) + else: + return dict(input) + if isinstance(input, list) or isinstance(input, tuple): + return tuple(input) + if isinstance(input, torch.Tensor): + return input + break + if idx == 0: + assert False, "Please checkout the example_inputs format." diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/question-answering/quantization/static_quant/ipex/README.md b/examples/3.x_api/pytorch/nlp/huggingface_models/question-answering/quantization/static_quant/ipex/README.md new file mode 100644 index 00000000000..b035249baac --- /dev/null +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/question-answering/quantization/static_quant/ipex/README.md @@ -0,0 +1,57 @@ +Step-by-Step +============ +This document describes the step-by-step instructions for reproducing Huggingface models with IPEX backend tuning results with Intel® Neural Compressor. +> Note: IPEX version >= 1.10 + +# Prerequisite + +## 1. Environment +Recommend python 3.6 or higher version. +```shell +pip install -r requirements.txt +pip install torch +pip install intel_extension_for_pytorch +``` + +# Quantization + +## 1. Quantization with CPU +If IPEX version is equal or higher than 1.12, please install transformers 4.19.0. +```shell +python run_qa.py \ + --model_name_or_path bert-large-uncased-whole-word-masking-finetuned-squad \ + --dataset_name squad \ + --do_eval \ + --max_seq_length 384 \ + --doc_stride 128 \ + --no_cuda \ + --tune \ + --output_dir ./savedresult +``` + +## 2. Quantization with XPU +### 2.1 Environment Setting +Please build an IPEX docker container according to the [official guide](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=gpu&version=v2.1.30%2bxpu&os=linux%2fwsl2&package=docker). 
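+
+For reference only, a container launched from a prebuilt IPEX XPU image typically needs the GPU device nodes exposed; the image tag below is illustrative (an assumption, not from this example), so use whatever the official guide specifies:
+
+```bash
+# Illustrative sketch -- replace the image tag with the one given in the official installation guide.
+docker run -it --rm \
+    --device /dev/dri \
+    -v /path/to/workspace:/workspace \
+    intel/intel-extension-for-pytorch:xpu \
+    bash
+```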
+ +You can run a simple sanity test to double confirm if the correct version is installed, and if the software stack can get correct hardware information onboard your system. The command should return PyTorch and IPEX versions installed, as well as GPU card(s) information detected. +```bash +source {DPCPPROOT}/env/vars.sh +source {MKLROOT}/env/vars.sh +source {CCLROOT}/env/vars.sh +source {MPIROOT}/env/vars.sh +python -c "import torch; import intel_extension_for_pytorch as ipex; print(torch.__version__); print(ipex.__version__); [print(f'[{i}]: {torch.xpu.get_device_properties(i)}') for i in range(torch.xpu.device_count())];" +``` +Please also refer to this [tutorial](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=gpu&version=v2.1.30%2bxpu&os=linux%2fwsl2&package=conda) to check system requirements and install dependencies. + +#### 2.2 Quantization Command +```shell +python run_qa.py \ + --model_name_or_path bert-large-uncased-whole-word-masking-finetuned-squad \ + --dataset_name squad \ + --do_eval \ + --max_seq_length 384 \ + --doc_stride 128 \ + --xpu \ + --tune \ + --output_dir ./savedresult +``` diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/question-answering/quantization/static_quant/ipex/requirements.txt b/examples/3.x_api/pytorch/nlp/huggingface_models/question-answering/quantization/static_quant/ipex/requirements.txt new file mode 100644 index 00000000000..2bb000d2deb --- /dev/null +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/question-answering/quantization/static_quant/ipex/requirements.txt @@ -0,0 +1,5 @@ +accelerate +datasets>=1.8.0 +transformers>=4.34.1 +tensorboard +tqdm diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/question-answering/quantization/static_quant/ipex/run_benchmark.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/question-answering/quantization/static_quant/ipex/run_benchmark.sh new file mode 100644 index 00000000000..2f646afacdb --- /dev/null +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/question-answering/quantization/static_quant/ipex/run_benchmark.sh @@ -0,0 +1,104 @@ +#!/bin/bash +set -x + +function main { + + init_params "$@" + run_benchmark + +} + +# init params +function init_params { + tuned_checkpoint=saved_results + tokenizer_name=bert-large-uncased-whole-word-masking-finetuned-squad + iters=100 + for var in "$@" + do + case $var in + --topology=*) + topology=$(echo $var |cut -f2 -d=) + ;; + --dataset_location=*) + dataset_location=$(echo $var |cut -f2 -d=) + ;; + --input_model=*) + input_model=$(echo $var |cut -f2 -d=) + ;; + --mode=*) + mode=$(echo $var |cut -f2 -d=) + ;; + --batch_size=*) + batch_size=$(echo $var |cut -f2 -d=) + ;; + --iters=*) + iters=$(echo ${var} |cut -f2 -d=) + ;; + --int8=*) + int8=$(echo ${var} |cut -f2 -d=) + ;; + --config=*) + tuned_checkpoint=$(echo $var |cut -f2 -d=) + ;; + --xpu=*) + xpu=$(echo ${var} |cut -f2 -d=) + ;; + *) + echo "Error: No such parameter: ${var}" + exit 1 + ;; + esac + done + +} + + +# run_benchmark +function run_benchmark { + if [[ ${mode} == "accuracy" ]]; then + mode_cmd=" --accuracy_only" + elif [[ ${mode} == "performance" ]]; then + mode_cmd=" --benchmark --iters "${iters} + else + echo "Error: No such mode: ${mode}" + exit 1 + fi + + extra_cmd="" + if [[ ${int8} == "true" ]]; then + extra_cmd=$extra_cmd" --int8" + fi + if [[ ${xpu} == "true" ]]; then + extra_cmd=$extra_cmd" --xpu" + fi + echo $extra_cmd + if [[ "${topology}" == "bert_large_ipex" ]]; then + 
model_name_or_path="bert-large-uncased-whole-word-masking-finetuned-squad" + python run_qa.py \ + --model_name_or_path $model_name_or_path \ + --dataset_name squad \ + --do_eval \ + --max_seq_length 384 \ + --no_cuda \ + --output_dir $tuned_checkpoint \ + --per_gpu_eval_batch_size $batch_size \ + $mode_cmd \ + ${extra_cmd} + fi + if [[ "${topology}" == "distilbert_base_ipex" ]]; then + model_name_or_path="distilbert-base-uncased-distilled-squad" + python run_qa.py \ + --model_name_or_path $model_name_or_path \ + --dataset_name squad \ + --do_eval \ + --max_seq_length 384 \ + --no_cuda \ + --output_dir $tuned_checkpoint \ + --per_gpu_eval_batch_size $batch_size \ + $mode_cmd \ + ${extra_cmd} + fi +} + + +main "$@" diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/question-answering/quantization/static_quant/ipex/run_qa.py b/examples/3.x_api/pytorch/nlp/huggingface_models/question-answering/quantization/static_quant/ipex/run_qa.py new file mode 100644 index 00000000000..079c0749994 --- /dev/null +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/question-answering/quantization/static_quant/ipex/run_qa.py @@ -0,0 +1,738 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2020 The HuggingFace Team All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Fine-tuning the library models for question answering using a slightly adapted version of the 🤗 Trainer. +""" +# You can also adapt this script on your own question answering task. Pointers for this are left as comments. + +import datasets +import logging +import os +import sys +import timeit +import transformers +from dataclasses import dataclass, field +from datasets import load_dataset, load_metric +from trainer_qa import QuestionAnsweringTrainer +from transformers import ( + AutoConfig, + AutoModelForQuestionAnswering, + AutoTokenizer, + DataCollatorWithPadding, + EvalPrediction, + HfArgumentParser, + PreTrainedTokenizerFast, + TrainingArguments, + default_data_collator, + set_seed, +) +from transformers.trainer_utils import get_last_checkpoint +from transformers.utils import check_min_version +from transformers.utils.versions import require_version +from typing import Optional +from utils_qa import postprocess_qa_predictions +from neural_compressor.utils.utility import LazyImport +try: + import intel_extension_for_pytorch as ipex + from intel_extension_for_pytorch.quantization import prepare, convert + from torch.ao.quantization import MinMaxObserver, PerChannelMinMaxObserver, QConfig +except: + assert False, "transformers 4.19.0 requests IPEX version higher or equal to 1.12" +torch = LazyImport("torch") + + +# Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
+check_min_version("4.12.0") + +require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") + +logger = logging.getLogger(__name__) + +os.environ["WANDB_DISABLED"] = "true" + + +@dataclass +class ModelArguments: + """ + Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. + """ + + model_name_or_path: str = field( + metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} + ) + config_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} + ) + tokenizer_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} + ) + cache_dir: Optional[str] = field( + default=None, + metadata={"help": "Path to directory to store the pretrained models downloaded from huggingface.co"}, + ) + model_revision: str = field( + default="main", + metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, + ) + use_auth_token: bool = field( + default=False, + metadata={ + "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "with private models)." + }, + ) + tune: bool = field( + default=False, + metadata={"help": "Whether or not to apply quantization."}, + ) + int8: bool = field( + default=False, metadata={"help": "use int8 model to get accuracy or benchmark"} + ) + benchmark: bool = field( + default=False, metadata={"help": "get benchmark instead of accuracy"} + ) + accuracy_only: bool = field( + default=False, metadata={"help": "get accuracy"} + ) + iters: int = field( + default=100, + metadata={ + "help": "The inference iterations to run for benchmark." + }, + ) + xpu: bool = field( + default=False, metadata={"help": "whether to use xpu"} + ) + calib_iters: int = field( + default=512, + metadata={ + "help": "The inference iterations to calibration." + }, + ) + + +@dataclass +class DataTrainingArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. + """ + + dataset_name: Optional[str] = field( + default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} + ) + dataset_config_name: Optional[str] = field( + default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} + ) + train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."}) + validation_file: Optional[str] = field( + default=None, + metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, + ) + test_file: Optional[str] = field( + default=None, + metadata={"help": "An optional input test data file to evaluate the perplexity on (a text file)."}, + ) + overwrite_cache: bool = field( + default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} + ) + preprocessing_num_workers: Optional[int] = field( + default=None, + metadata={"help": "The number of processes to use for the preprocessing."}, + ) + max_seq_length: int = field( + default=384, + metadata={ + "help": "The maximum total input sequence length after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded." + }, + ) + pad_to_max_length: bool = field( + default=True, + metadata={ + "help": "Whether to pad all samples to `max_seq_length`. 
" + "If False, will pad the samples dynamically when batching to the maximum length in the batch (which can " + "be faster on GPU but will be slower on TPU)." + }, + ) + max_train_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of training examples to this " + "value if set." + }, + ) + max_eval_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this " + "value if set." + }, + ) + max_predict_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this " + "value if set." + }, + ) + version_2_with_negative: bool = field( + default=False, metadata={"help": "If true, some of the examples do not have an answer."} + ) + null_score_diff_threshold: float = field( + default=0.0, + metadata={ + "help": "The threshold used to select the null answer: if the best answer has a score that is less than " + "the score of the null answer minus this threshold, the null answer is selected for this example. " + "Only useful when `version_2_with_negative=True`." + }, + ) + doc_stride: int = field( + default=128, + metadata={"help": "When splitting up a long document into chunks, how much stride to take between chunks."}, + ) + n_best_size: int = field( + default=20, + metadata={"help": "The total number of n-best predictions to generate when looking for an answer."}, + ) + max_answer_length: int = field( + default=30, + metadata={ + "help": "The maximum length of an answer that can be generated. This is needed because the start " + "and end predictions are not conditioned on one another." + }, + ) + + def __post_init__(self): + if ( + self.dataset_name is None + and self.train_file is None + and self.validation_file is None + and self.test_file is None + ): + raise ValueError("Need either a dataset name or a training/validation file/test_file.") + else: + if self.train_file is not None: + extension = self.train_file.split(".")[-1] + assert extension in ["csv", "json"], "`train_file` should be a csv or a json file." + if self.validation_file is not None: + extension = self.validation_file.split(".")[-1] + assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file." + if self.test_file is not None: + extension = self.test_file.split(".")[-1] + assert extension in ["csv", "json"], "`test_file` should be a csv or a json file." + +def main(): + # See all possible arguments in src/transformers/training_args.py + # or by passing the --help flag to this script. + # We now keep distinct sets of args, for a cleaner separation of concerns. + + parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) + if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): + # If we pass only one argument to the script and it's the path to a json file, + # let's parse it to get our arguments. 
+ model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) + else: + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + + log_level = training_args.get_process_log_level() + logger.setLevel(log_level) + datasets.utils.logging.set_verbosity(log_level) + transformers.utils.logging.set_verbosity(log_level) + transformers.utils.logging.enable_default_handler() + transformers.utils.logging.enable_explicit_format() + + # Log on each process the small summary: + logger.warning( + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + ) + logger.info(f"Training/evaluation parameters {training_args}") + + # Detecting last checkpoint. + last_checkpoint = None + if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: + last_checkpoint = get_last_checkpoint(training_args.output_dir) + if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: + raise ValueError( + f"Output directory ({training_args.output_dir}) already exists and is not empty. " + "Use --overwrite_output_dir to overcome." + ) + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: + logger.info( + f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " + "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." + ) + + # Set seed before initializing model. + set_seed(training_args.seed) + + # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) + # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ + # (the dataset will be downloaded automatically from the datasets Hub). + # + # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called + # 'text' is found. You can easily tweak this behavior (see below). + # + # In distributed training, the load_dataset function guarantee that only one local process can concurrently + # download the dataset. + if data_args.dataset_name is not None: + # Downloading and loading a dataset from the hub. + raw_datasets = load_dataset( + data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir + ) + else: + data_files = {} + if data_args.train_file is not None: + data_files["train"] = data_args.train_file + extension = data_args.train_file.split(".")[-1] + + if data_args.validation_file is not None: + data_files["validation"] = data_args.validation_file + extension = data_args.validation_file.split(".")[-1] + if data_args.test_file is not None: + data_files["test"] = data_args.test_file + extension = data_args.test_file.split(".")[-1] + raw_datasets = load_dataset(extension, data_files=data_files, field="data", cache_dir=model_args.cache_dir) + # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at + # https://huggingface.co/docs/datasets/loading_datasets.html. 
+ + # Load pretrained model and tokenizer + # + # Distributed training: + # The .from_pretrained methods guarantee that only one local process can concurrently + # download model & vocab. + config = AutoConfig.from_pretrained( + model_args.config_name if model_args.config_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + tokenizer = AutoTokenizer.from_pretrained( + model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + use_fast=True, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + + model = AutoModelForQuestionAnswering.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + + # Tokenizer check: this script requires a fast tokenizer. + if not isinstance(tokenizer, PreTrainedTokenizerFast): + raise ValueError( + "This example script only works for models that have a fast tokenizer. Checkout the big table of models " + "at https://huggingface.co/transformers/index.html#supported-frameworks to find the model types that meet this " + "requirement" + ) + + # Preprocessing the datasets. + # Preprocessing is slightly different for training and evaluation. + if training_args.do_train: + column_names = raw_datasets["train"].column_names + elif training_args.do_eval: + column_names = raw_datasets["validation"].column_names + else: + column_names = raw_datasets["test"].column_names + question_column_name = "question" if "question" in column_names else column_names[0] + context_column_name = "context" if "context" in column_names else column_names[1] + answer_column_name = "answers" if "answers" in column_names else column_names[2] + + # Padding side determines if we do (question|context) or (context|question). + pad_on_right = tokenizer.padding_side == "right" + + if data_args.max_seq_length > tokenizer.model_max_length: + logger.warning( + f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" + f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." + ) + max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) + + # Training preprocessing + def prepare_train_features(examples): + # Some of the questions have lots of whitespace on the left, which is not useful and will make the + # truncation of the context fail (the tokenized question will take a lots of space). So we remove that + # left whitespace + examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]] + + # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results + # in one example possible giving several features when a context is long, each of those features having a + # context that overlaps a bit the context of the previous feature. 
+ tokenized_examples = tokenizer( + examples[question_column_name if pad_on_right else context_column_name], + examples[context_column_name if pad_on_right else question_column_name], + truncation="only_second" if pad_on_right else "only_first", + max_length=max_seq_length, + stride=data_args.doc_stride, + return_overflowing_tokens=True, + return_offsets_mapping=True, + padding="max_length" if data_args.pad_to_max_length else False, + ) + + # Since one example might give us several features if it has a long context, we need a map from a feature to + # its corresponding example. This key gives us just that. + sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") + # The offset mappings will give us a map from token to character position in the original context. This will + # help us compute the start_positions and end_positions. + offset_mapping = tokenized_examples.pop("offset_mapping") + + # Let's label those examples! + tokenized_examples["start_positions"] = [] + tokenized_examples["end_positions"] = [] + + for i, offsets in enumerate(offset_mapping): + # We will label impossible answers with the index of the CLS token. + input_ids = tokenized_examples["input_ids"][i] + cls_index = input_ids.index(tokenizer.cls_token_id) + + # Grab the sequence corresponding to that example (to know what is the context and what is the question). + sequence_ids = tokenized_examples.sequence_ids(i) + + # One example can give several spans, this is the index of the example containing this span of text. + sample_index = sample_mapping[i] + answers = examples[answer_column_name][sample_index] + # If no answers are given, set the cls_index as answer. + if len(answers["answer_start"]) == 0: + tokenized_examples["start_positions"].append(cls_index) + tokenized_examples["end_positions"].append(cls_index) + else: + # Start/end character index of the answer in the text. + start_char = answers["answer_start"][0] + end_char = start_char + len(answers["text"][0]) + + # Start token index of the current span in the text. + token_start_index = 0 + while sequence_ids[token_start_index] != (1 if pad_on_right else 0): + token_start_index += 1 + + # End token index of the current span in the text. + token_end_index = len(input_ids) - 1 + while sequence_ids[token_end_index] != (1 if pad_on_right else 0): + token_end_index -= 1 + + # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index). + if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char): + tokenized_examples["start_positions"].append(cls_index) + tokenized_examples["end_positions"].append(cls_index) + else: + # Otherwise move the token_start_index and token_end_index to the two ends of the answer. + # Note: we could go after the last offset if the answer is the last word (edge case). 
+ while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char: + token_start_index += 1 + tokenized_examples["start_positions"].append(token_start_index - 1) + while offsets[token_end_index][1] >= end_char: + token_end_index -= 1 + tokenized_examples["end_positions"].append(token_end_index + 1) + + return tokenized_examples + + if training_args.do_train: + if "train" not in raw_datasets: + raise ValueError("--do_train requires a train dataset") + train_dataset = raw_datasets["train"] + if data_args.max_train_samples is not None: + # We will select sample from whole data if argument is specified + max_train_samples = min(len(train_dataset), data_args.max_train_samples) + train_dataset = train_dataset.select(range(max_train_samples)) + # Create train feature from dataset + with training_args.main_process_first(desc="train dataset map pre-processing"): + train_dataset = train_dataset.map( + prepare_train_features, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on train dataset", + ) + if data_args.max_train_samples is not None: + # Number of samples might increase during Feature Creation, We select only specified max samples + max_train_samples = min(len(train_dataset), data_args.max_train_samples) + train_dataset = train_dataset.select(range(max_train_samples)) + + # Validation preprocessing + def prepare_validation_features(examples): + # Some of the questions have lots of whitespace on the left, which is not useful and will make the + # truncation of the context fail (the tokenized question will take a lots of space). So we remove that + # left whitespace + examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]] + + # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results + # in one example possible giving several features when a context is long, each of those features having a + # context that overlaps a bit the context of the previous feature. + tokenized_examples = tokenizer( + examples[question_column_name if pad_on_right else context_column_name], + examples[context_column_name if pad_on_right else question_column_name], + truncation="only_second" if pad_on_right else "only_first", + max_length=max_seq_length, + stride=data_args.doc_stride, + return_overflowing_tokens=True, + return_offsets_mapping=True, + padding="max_length" if data_args.pad_to_max_length else False, + ) + + # Since one example might give us several features if it has a long context, we need a map from a feature to + # its corresponding example. This key gives us just that. + sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") + + # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the + # corresponding example_id and we will store the offset mappings. + tokenized_examples["example_id"] = [] + + for i in range(len(tokenized_examples["input_ids"])): + # Grab the sequence corresponding to that example (to know what is the context and what is the question). + sequence_ids = tokenized_examples.sequence_ids(i) + context_index = 1 if pad_on_right else 0 + + # One example can give several spans, this is the index of the example containing this span of text. 
+ sample_index = sample_mapping[i] + tokenized_examples["example_id"].append(examples["id"][sample_index]) + + # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token + # position is part of the context or not. + tokenized_examples["offset_mapping"][i] = [ + (o if sequence_ids[k] == context_index else None) + for k, o in enumerate(tokenized_examples["offset_mapping"][i]) + ] + + return tokenized_examples + + if training_args.do_eval: + if "validation" not in raw_datasets: + raise ValueError("--do_eval requires a validation dataset") + eval_examples = raw_datasets["validation"] + if data_args.max_eval_samples is not None: + # We will select sample from whole data + max_eval_samples = min(len(eval_examples), data_args.max_eval_samples) + eval_examples = eval_examples.select(range(max_eval_samples)) + # Validation Feature Creation + with training_args.main_process_first(desc="validation dataset map pre-processing"): + eval_dataset = eval_examples.map( + prepare_validation_features, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on validation dataset", + ) + if data_args.max_eval_samples is not None: + # During Feature creation dataset samples might increase, we will select required samples again + max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) + eval_dataset = eval_dataset.select(range(max_eval_samples)) + + if training_args.do_predict: + if "test" not in raw_datasets: + raise ValueError("--do_predict requires a test dataset") + predict_examples = raw_datasets["test"] + if data_args.max_predict_samples is not None: + # We will select sample from whole data + predict_examples = predict_examples.select(range(data_args.max_predict_samples)) + # Predict Feature Creation + with training_args.main_process_first(desc="prediction dataset map pre-processing"): + predict_dataset = predict_examples.map( + prepare_validation_features, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on prediction dataset", + ) + if data_args.max_predict_samples is not None: + # During Feature creation dataset samples might increase, we will select required samples again + max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples) + predict_dataset = predict_dataset.select(range(max_predict_samples)) + + # Data collator + # We have already padded to max length if the corresponding flag is True, otherwise we need to pad in the data + # collator. + data_collator = ( + default_data_collator + if data_args.pad_to_max_length + else DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None) + ) + + # Post-processing: + def post_processing_function(examples, features, predictions, stage="eval"): + # Post-processing: we match the start logits and end logits to answers in the original context. + predictions = postprocess_qa_predictions( + examples=examples, + features=features, + predictions=predictions, + version_2_with_negative=data_args.version_2_with_negative, + n_best_size=data_args.n_best_size, + max_answer_length=data_args.max_answer_length, + null_score_diff_threshold=data_args.null_score_diff_threshold, + output_dir=training_args.output_dir, + log_level=log_level, + prefix=stage, + ) + # Format the result to the format the metric expects. 
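+        # NOTE (added for clarity): the squad/squad_v2 metrics expect predictions shaped like
+        # {"id": ..., "prediction_text": ...} (squad_v2 additionally takes "no_answer_probability")
+        # and references shaped like {"id": ..., "answers": {"text": [...], "answer_start": [...]}}.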
+ if data_args.version_2_with_negative: + formatted_predictions = [ + {"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items() + ] + else: + formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()] + + references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in examples] + return EvalPrediction(predictions=formatted_predictions, label_ids=references) + + metric = load_metric("squad_v2" if data_args.version_2_with_negative else "squad", trust_remote_code=True) + + def compute_metrics(p: EvalPrediction): + return metric.compute(predictions=p.predictions, references=p.label_ids) + + # Initialize our Trainer + trainer = QuestionAnsweringTrainer( + model=model, + args=training_args, + train_dataset=train_dataset if training_args.do_train else None, + eval_dataset=eval_dataset if training_args.do_eval else None, + eval_examples=eval_examples if training_args.do_eval else None, + tokenizer=tokenizer, + data_collator=data_collator, + post_process_function=post_processing_function, + compute_metrics=compute_metrics, + ) + + eval_dataloader = trainer.get_eval_dataloader() + # transformer issue #1 + # for transformers 4.31.0: accelerate dataloader + # *** ValueError: batch_size attribute should not be set + # after DataLoaderShard is initialized + if eval_dataloader.batch_size is None: + def _build_inc_dataloader(dataloader): + class INCDataLoader: + __iter__ = dataloader.__iter__ + def __init__(self) -> None: + self.dataloader = dataloader + self.batch_size = dataloader.total_batch_size + return INCDataLoader() + eval_dataloader = _build_inc_dataloader(eval_dataloader) + batch_size = eval_dataloader.batch_size + metric_name = "eval_f1" + + def take_eval_steps(model, trainer, metric_name, save_metrics=False): + trainer.model = model + start_time = timeit.default_timer() + metrics = trainer.evaluate() + evalTime = timeit.default_timer() - start_time + max_eval_samples = data_args.max_eval_samples \ + if data_args.max_eval_samples is not None else len(eval_dataset) + eval_samples = min(max_eval_samples, len(eval_dataset)) + samples = eval_samples - (eval_samples % batch_size) \ + if training_args.dataloader_drop_last else eval_samples + if save_metrics: + trainer.save_metrics("eval", metrics) + logger.info("metrics keys: {}".format(metrics.keys())) + print('Batch size = %d' % batch_size) + print("Finally Eval {} Accuracy: {}".format(metric_name, metrics.get(metric_name))) + print("Latency: %.3f ms" % (evalTime / samples * 1000)) + print("Throughput: {} samples/sec".format(samples / evalTime)) + return metrics.get(metric_name) + + def eval_func(model): + return take_eval_steps(model, trainer, metric_name) + + if model_args.tune: + ipex.nn.utils._model_convert.replace_dropout_with_identity(model) + from neural_compressor.torch.quantization import get_default_static_config + quant_config = get_default_static_config() + dummy_input_ids = torch.ones((training_args.per_device_eval_batch_size, data_args.max_seq_length), dtype=torch.long) + dummy_token_type_ids = torch.ones((training_args.per_device_eval_batch_size, data_args.max_seq_length), dtype=torch.long) + dummy_attention_mask = torch.ones((training_args.per_device_eval_batch_size, data_args.max_seq_length), dtype=torch.long) + if model.config.model_type == "distilbert": + example_inputs = (dummy_input_ids, dummy_attention_mask) + elif model.config.model_type == "bert": + example_inputs = (dummy_input_ids, dummy_attention_mask, dummy_token_type_ids) + else: + 
example_inputs = None # please provide correct example_inputs if necessary. + + from neural_compressor.torch.algorithms.smooth_quant import move_input_to_device + from tqdm import tqdm + def run_fn(model): + calib_iter = 0 + for batch in tqdm(eval_dataloader, total=model_args.calib_iters): + batch = move_input_to_device(batch, device=None) + if isinstance(batch, tuple) or isinstance(batch, list): + model(batch[0]) + elif isinstance(batch, dict): + model(**batch) + else: + model(batch) + + calib_iter += 1 + if calib_iter >= model_args.calib_iters: + break + return + + from neural_compressor.torch.quantization import prepare, convert + model = prepare(model=model, quant_config=quant_config, example_inputs=example_inputs) + run_fn(model) + q_model = convert(model) + q_model.save(training_args.output_dir) + return + + model.eval() + if model_args.int8: + print("load int8 model") + from neural_compressor.torch.quantization import load + model = load(os.path.abspath(os.path.expanduser(training_args.output_dir))) + else: + from utils_qa import get_example_inputs + example_inputs = get_example_inputs(model, eval_dataloader) + model = ipex.optimize(model) + with torch.no_grad(): + if isinstance(example_inputs, dict): + model = torch.jit.trace(model, example_kwarg_inputs=example_inputs, strict=False) + else: + model = torch.jit.trace(model, example_inputs, strict=False) + model = torch.jit.freeze(model) + + if model_args.benchmark or model_args.accuracy_only: + if model_args.benchmark: + from neural_compressor.config import BenchmarkConfig + from neural_compressor import benchmark + b_conf = BenchmarkConfig(backend="ipex", + warmup=5, + iteration=model_args.iters, + cores_per_instance=4, + num_of_instance=1) + if model_args.xpu: + b_conf.device = "xpu" + benchmark.fit(model, b_conf, b_dataloader=eval_dataloader) + else: + eval_func(model) + +def _mp_fn(index): + # For xla_spawn (TPUs) + main() + + +if __name__ == "__main__": + main() diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/question-answering/quantization/static_quant/ipex/run_quant.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/question-answering/quantization/static_quant/ipex/run_quant.sh new file mode 100644 index 00000000000..ae49ed79f5f --- /dev/null +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/question-answering/quantization/static_quant/ipex/run_quant.sh @@ -0,0 +1,64 @@ +#!/bin/bash +set -x + +function main { + + init_params "$@" + run_tuning + +} +# init params +function init_params { + tuned_checkpoint=saved_results + tokenizer_name=bert-large-uncased-whole-word-masking-finetuned-squad + for var in "$@" + do + case $var in + --topology=*) + topology=$(echo $var |cut -f2 -d=) + ;; + --dataset_location=*) + dataset_location=$(echo $var |cut -f2 -d=) + ;; + --input_model=*) + input_model=$(echo $var |cut -f2 -d=) + ;; + --output_model=*) + tuned_checkpoint=$(echo $var |cut -f2 -d=) + ;; + *) + echo "Error: No such parameter: ${var}" + exit 1 + ;; + esac + done +} + + +# run_tuning +function run_tuning { + if [[ "${topology}" == "bert_large_ipex" ]]; then + model_name_or_path="bert-large-uncased-whole-word-masking-finetuned-squad" + python run_qa.py \ + --model_name_or_path $model_name_or_path \ + --dataset_name squad \ + --do_eval \ + --max_seq_length 384 \ + --no_cuda \ + --tune \ + --output_dir $tuned_checkpoint + fi + if [[ "${topology}" == "distilbert_base_ipex" ]]; then + model_name_or_path="distilbert-base-uncased-distilled-squad" + python run_qa.py \ + --model_name_or_path $model_name_or_path \ + 
--dataset_name squad \ + --do_eval \ + --max_seq_length 384 \ + --no_cuda \ + --tune \ + --output_dir $tuned_checkpoint + fi +} + +main "$@" diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/question-answering/quantization/static_quant/ipex/trainer_qa.py b/examples/3.x_api/pytorch/nlp/huggingface_models/question-answering/quantization/static_quant/ipex/trainer_qa.py new file mode 100644 index 00000000000..7f98eba236c --- /dev/null +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/question-answering/quantization/static_quant/ipex/trainer_qa.py @@ -0,0 +1,105 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +A subclass of `Trainer` specific to Question-Answering tasks +""" + +from transformers import Trainer, is_torch_tpu_available +from transformers.trainer_utils import PredictionOutput + + +if is_torch_tpu_available(): + import torch_xla.core.xla_model as xm + import torch_xla.debug.metrics as met + + +class QuestionAnsweringTrainer(Trainer): + def __init__(self, *args, eval_examples=None, post_process_function=None, **kwargs): + super().__init__(*args, **kwargs) + self.eval_examples = eval_examples + self.post_process_function = post_process_function + + def evaluate(self, eval_dataset=None, eval_examples=None, ignore_keys=None, metric_key_prefix: str = "eval"): + eval_dataset = self.eval_dataset if eval_dataset is None else eval_dataset + eval_dataloader = self.get_eval_dataloader(eval_dataset) + eval_examples = self.eval_examples if eval_examples is None else eval_examples + + # Temporarily disable metric computation, we will do it in the loop here. + compute_metrics = self.compute_metrics + self.compute_metrics = None + eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop + try: + output = eval_loop( + eval_dataloader, + description="Evaluation", + # No point gathering the predictions if there are no metrics, otherwise we defer to + # self.args.prediction_loss_only + prediction_loss_only=True if compute_metrics is None else None, + ignore_keys=ignore_keys, + ) + finally: + self.compute_metrics = compute_metrics + + if self.post_process_function is not None and self.compute_metrics is not None: + eval_preds = self.post_process_function(eval_examples, eval_dataset, output.predictions) + metrics = self.compute_metrics(eval_preds) + + # Prefix all keys with metric_key_prefix + '_' + for key in list(metrics.keys()): + if not key.startswith(f"{metric_key_prefix}_"): + metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) + + self.log(metrics) + else: + metrics = {} + + if self.args.tpu_metrics_debug or self.args.debug: + # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.) 
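+            # NOTE (added for clarity): xm and met come from torch_xla, imported at the top of this
+            # file when is_torch_tpu_available() is True.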
+ xm.master_print(met.metrics_report()) + + self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, metrics) + return metrics + + def predict(self, predict_dataset, predict_examples, ignore_keys=None, metric_key_prefix: str = "test"): + predict_dataloader = self.get_test_dataloader(predict_dataset) + + # Temporarily disable metric computation, we will do it in the loop here. + compute_metrics = self.compute_metrics + self.compute_metrics = None + eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop + try: + output = eval_loop( + predict_dataloader, + description="Prediction", + # No point gathering the predictions if there are no metrics, otherwise we defer to + # self.args.prediction_loss_only + prediction_loss_only=True if compute_metrics is None else None, + ignore_keys=ignore_keys, + ) + finally: + self.compute_metrics = compute_metrics + + if self.post_process_function is None or self.compute_metrics is None: + return output + + predictions = self.post_process_function(predict_examples, predict_dataset, output.predictions, "predict") + metrics = self.compute_metrics(predictions) + + # Prefix all keys with metric_key_prefix + '_' + for key in list(metrics.keys()): + if not key.startswith(f"{metric_key_prefix}_"): + metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) + + return PredictionOutput(predictions=predictions.predictions, label_ids=predictions.label_ids, metrics=metrics) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/question-answering/quantization/static_quant/ipex/utils_qa.py b/examples/3.x_api/pytorch/nlp/huggingface_models/question-answering/quantization/static_quant/ipex/utils_qa.py new file mode 100644 index 00000000000..6514e6ba7ad --- /dev/null +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/question-answering/quantization/static_quant/ipex/utils_qa.py @@ -0,0 +1,481 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Post-processing utilities for question answering. 
+""" +import collections +import json +import logging +import os +import torch +from typing import Optional, Tuple +from collections import UserDict +from packaging.version import Version +from neural_compressor.torch.utils import get_torch_version + +import numpy as np +from tqdm.auto import tqdm + + +logger = logging.getLogger(__name__) + + +def get_example_inputs(model, dataloader): + version = get_torch_version() + from neural_compressor.torch.algorithms.smooth_quant import move_input_to_device + + # Suggest set dataloader like calib_dataloader + if dataloader is None: + return None + device = next(model.parameters()).device + try: + for idx, (input, label) in enumerate(dataloader): + input = move_input_to_device(input, device) + if isinstance(input, (dict, UserDict)): # pragma: no cover + assert version.release >= Version("1.12.0").release, "INC support IPEX version >= 1.12.0" + if "label" in input.keys(): + input.pop("label") + if version.release <= Version("2.0.1").release: + return tuple(input.values()) + else: + return dict(input) + if isinstance(input, (list, tuple)): + return tuple(input) + if isinstance(input, torch.Tensor): + return input + break + except Exception as e: # pragma: no cover + for idx, input in enumerate(dataloader): + input = move_input_to_device(input, device) + if isinstance(input, (dict, UserDict)): # pragma: no cover + assert version.release >= Version("1.12.0").release, "INC support IPEX version >= 1.12.0" + if "label" in input.keys(): + input.pop("label") + if version.release <= Version("2.0.1").release: + return tuple(input.values()) + else: + return dict(input) + if isinstance(input, list) or isinstance(input, tuple): + return tuple(input) + if isinstance(input, torch.Tensor): + return input + break + if idx == 0: + assert False, "Please checkout the example_inputs format." + + +def postprocess_qa_predictions( + examples, + features, + predictions: Tuple[np.ndarray, np.ndarray], + version_2_with_negative: bool = False, + n_best_size: int = 20, + max_answer_length: int = 30, + null_score_diff_threshold: float = 0.0, + output_dir: Optional[str] = None, + prefix: Optional[str] = None, + log_level: Optional[int] = logging.WARNING, +): + """ + Post-processes the predictions of a question-answering model to convert them to answers that are substrings of the + original contexts. This is the base postprocessing functions for models that only return start and end logits. + + Args: + examples: The non-preprocessed dataset (see the main script for more information). + features: The processed dataset (see the main script for more information). + predictions (:obj:`Tuple[np.ndarray, np.ndarray]`): + The predictions of the model: two arrays containing the start logits and the end logits respectively. Its + first dimension must match the number of elements of :obj:`features`. + version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not the underlying dataset contains examples with no answers. + n_best_size (:obj:`int`, `optional`, defaults to 20): + The total number of n-best predictions to generate when looking for an answer. + max_answer_length (:obj:`int`, `optional`, defaults to 30): + The maximum length of an answer that can be generated. This is needed because the start and end predictions + are not conditioned on one another. 
+ null_score_diff_threshold (:obj:`float`, `optional`, defaults to 0): + The threshold used to select the null answer: if the best answer has a score that is less than the score of + the null answer minus this threshold, the null answer is selected for this example (note that the score of + the null answer for an example giving several features is the minimum of the scores for the null answer on + each feature: all features must be aligned on the fact they `want` to predict a null answer). + + Only useful when :obj:`version_2_with_negative` is :obj:`True`. + output_dir (:obj:`str`, `optional`): + If provided, the dictionaries of predictions, n_best predictions (with their scores and logits) and, if + :obj:`version_2_with_negative=True`, the dictionary of the scores differences between best and null + answers, are saved in `output_dir`. + prefix (:obj:`str`, `optional`): + If provided, the dictionaries mentioned above are saved with `prefix` added to their names. + log_level (:obj:`int`, `optional`, defaults to ``logging.WARNING``): + ``logging`` log level (e.g., ``logging.WARNING``) + """ + if len(predictions) != 2: + raise ValueError("`predictions` should be a tuple with two elements (start_logits, end_logits).") + all_start_logits, all_end_logits = predictions + + if len(predictions[0]) != len(features): + raise ValueError(f"Got {len(predictions[0])} predictions and {len(features)} features.") + + # Build a map example to its corresponding features. + example_id_to_index = {k: i for i, k in enumerate(examples["id"])} + features_per_example = collections.defaultdict(list) + for i, feature in enumerate(features): + features_per_example[example_id_to_index[feature["example_id"]]].append(i) + + # The dictionaries we have to fill. + all_predictions = collections.OrderedDict() + all_nbest_json = collections.OrderedDict() + if version_2_with_negative: + scores_diff_json = collections.OrderedDict() + + # Logging. + logger.setLevel(log_level) + logger.info(f"Post-processing {len(examples)} example predictions split into {len(features)} features.") + + # Let's loop over all the examples! + for example_index, example in enumerate(tqdm(examples)): + # Those are the indices of the features associated to the current example. + feature_indices = features_per_example[example_index] + + min_null_prediction = None + prelim_predictions = [] + + # Looping through all the features associated to the current example. + for feature_index in feature_indices: + # We grab the predictions of the model for this feature. + start_logits = all_start_logits[feature_index] + end_logits = all_end_logits[feature_index] + # This is what will allow us to map some the positions in our logits to span of texts in the original + # context. + offset_mapping = features[feature_index]["offset_mapping"] + # Optional `token_is_max_context`, if provided we will remove answers that do not have the maximum context + # available in the current feature. + token_is_max_context = features[feature_index].get("token_is_max_context", None) + + # Update minimum null prediction. + feature_null_score = start_logits[0] + end_logits[0] + if min_null_prediction is None or min_null_prediction["score"] > feature_null_score: + min_null_prediction = { + "offsets": (0, 0), + "score": feature_null_score, + "start_logit": start_logits[0], + "end_logit": end_logits[0], + } + + # Go through all possibilities for the `n_best_size` greater start and end logits. 
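+            # NOTE (added for clarity): np.argsort sorts ascending, so the [-1 : -n_best_size - 1 : -1]
+            # slice below yields the indices of the n_best_size largest logits, best first; e.g.
+            # np.argsort([0.1, 2.0, 0.5]) is [0, 2, 1], and with n_best_size=2 the slice gives [1, 2].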
+ start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist() + end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist() + for start_index in start_indexes: + for end_index in end_indexes: + # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond + # to part of the input_ids that are not in the context. + if ( + start_index >= len(offset_mapping) + or end_index >= len(offset_mapping) + or offset_mapping[start_index] is None + or len(offset_mapping[start_index]) < 2 + or offset_mapping[end_index] is None + or len(offset_mapping[end_index]) < 2 + ): + continue + # Don't consider answers with a length that is either < 0 or > max_answer_length. + if end_index < start_index or end_index - start_index + 1 > max_answer_length: + continue + # Don't consider answer that don't have the maximum context available (if such information is + # provided). + if token_is_max_context is not None and not token_is_max_context.get(str(start_index), False): + continue + prelim_predictions.append( + { + "offsets": (offset_mapping[start_index][0], offset_mapping[end_index][1]), + "score": start_logits[start_index] + end_logits[end_index], + "start_logit": start_logits[start_index], + "end_logit": end_logits[end_index], + } + ) + if version_2_with_negative: + # Add the minimum null prediction + prelim_predictions.append(min_null_prediction) + null_score = min_null_prediction["score"] + + # Only keep the best `n_best_size` predictions. + predictions = sorted(prelim_predictions, key=lambda x: x["score"], reverse=True)[:n_best_size] + + # Add back the minimum null prediction if it was removed because of its low score. + if version_2_with_negative and not any(p["offsets"] == (0, 0) for p in predictions): + predictions.append(min_null_prediction) + + # Use the offsets to gather the answer text in the original context. + context = example["context"] + for pred in predictions: + offsets = pred.pop("offsets") + pred["text"] = context[offsets[0] : offsets[1]] + + # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid + # failure. + if len(predictions) == 0 or (len(predictions) == 1 and predictions[0]["text"] == ""): + predictions.insert(0, {"text": "empty", "start_logit": 0.0, "end_logit": 0.0, "score": 0.0}) + + # Compute the softmax of all scores (we do it with numpy to stay independent from torch/tf in this file, using + # the LogSumExp trick). + scores = np.array([pred.pop("score") for pred in predictions]) + exp_scores = np.exp(scores - np.max(scores)) + probs = exp_scores / exp_scores.sum() + + # Include the probabilities in our predictions. + for prob, pred in zip(probs, predictions): + pred["probability"] = prob + + # Pick the best prediction. If the null answer is not possible, this is easy. + if not version_2_with_negative: + all_predictions[example["id"]] = predictions[0]["text"] + else: + # Otherwise we first need to find the best non-empty prediction. + i = 0 + while predictions[i]["text"] == "": + i += 1 + best_non_null_pred = predictions[i] + + # Then we compare to the null prediction using the threshold. + score_diff = null_score - best_non_null_pred["start_logit"] - best_non_null_pred["end_logit"] + scores_diff_json[example["id"]] = float(score_diff) # To be JSON-serializable. 
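+            # NOTE (added for clarity): a positive score_diff means the null answer outscored the best
+            # non-null span, so raising null_score_diff_threshold (default 0.0) makes predicting
+            # "no answer" less likely.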
+            if score_diff > null_score_diff_threshold:
+                all_predictions[example["id"]] = ""
+            else:
+                all_predictions[example["id"]] = best_non_null_pred["text"]
+
+        # Make `predictions` JSON-serializable by casting np.float32 back to float.
+        all_nbest_json[example["id"]] = [
+            {k: (float(v) if isinstance(v, (np.float16, np.float32, np.float64)) else v) for k, v in pred.items()}
+            for pred in predictions
+        ]
+
+    # If we have an output_dir, let's save all those dicts.
+    if output_dir is not None:
+        if not os.path.isdir(output_dir):
+            raise EnvironmentError(f"{output_dir} is not a directory.")
+
+        prediction_file = os.path.join(
+            output_dir, "predictions.json" if prefix is None else f"{prefix}_predictions.json"
+        )
+        nbest_file = os.path.join(
+            output_dir, "nbest_predictions.json" if prefix is None else f"{prefix}_nbest_predictions.json"
+        )
+        if version_2_with_negative:
+            null_odds_file = os.path.join(
+                output_dir, "null_odds.json" if prefix is None else f"{prefix}_null_odds.json"
+            )
+
+        logger.info(f"Saving predictions to {prediction_file}.")
+        with open(prediction_file, "w") as writer:
+            writer.write(json.dumps(all_predictions, indent=4) + "\n")
+        logger.info(f"Saving nbest_preds to {nbest_file}.")
+        with open(nbest_file, "w") as writer:
+            writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
+        if version_2_with_negative:
+            logger.info(f"Saving null_odds to {null_odds_file}.")
+            with open(null_odds_file, "w") as writer:
+                writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
+
+    return all_predictions
+
+
+def postprocess_qa_predictions_with_beam_search(
+    examples,
+    features,
+    predictions: Tuple[np.ndarray, np.ndarray],
+    version_2_with_negative: bool = False,
+    n_best_size: int = 20,
+    max_answer_length: int = 30,
+    start_n_top: int = 5,
+    end_n_top: int = 5,
+    output_dir: Optional[str] = None,
+    prefix: Optional[str] = None,
+    log_level: Optional[int] = logging.WARNING,
+):
+    """
+    Post-processes the predictions of a question-answering model with beam search to convert them to answers that are
+    substrings of the original contexts. This is the postprocessing function for models that return start and end
+    logits, indices, as well as cls token predictions.
+
+    Args:
+        examples: The non-preprocessed dataset (see the main script for more information).
+        features: The processed dataset (see the main script for more information).
+        predictions (:obj:`Tuple[np.ndarray, np.ndarray]`):
+            The predictions of the model: five arrays containing the start top log probabilities and indices, the end
+            top log probabilities and indices, and the cls logits. Their first dimension must match the number of
+            elements of :obj:`features`.
+        version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether or not the underlying dataset contains examples with no answers.
+        n_best_size (:obj:`int`, `optional`, defaults to 20):
+            The total number of n-best predictions to generate when looking for an answer.
+        max_answer_length (:obj:`int`, `optional`, defaults to 30):
+            The maximum length of an answer that can be generated. This is needed because the start and end predictions
+            are not conditioned on one another.
+        start_n_top (:obj:`int`, `optional`, defaults to 5):
+            The number of top start logits to keep when searching for the :obj:`n_best_size` predictions.
+        end_n_top (:obj:`int`, `optional`, defaults to 5):
+            The number of top end logits to keep when searching for the :obj:`n_best_size` predictions.
+ output_dir (:obj:`str`, `optional`): + If provided, the dictionaries of predictions, n_best predictions (with their scores and logits) and, if + :obj:`version_2_with_negative=True`, the dictionary of the scores differences between best and null + answers, are saved in `output_dir`. + prefix (:obj:`str`, `optional`): + If provided, the dictionaries mentioned above are saved with `prefix` added to their names. + log_level (:obj:`int`, `optional`, defaults to ``logging.WARNING``): + ``logging`` log level (e.g., ``logging.WARNING``) + """ + if len(predictions) != 5: + raise ValueError("`predictions` should be a tuple with five elements.") + start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits = predictions + + if len(predictions[0]) != len(features): + raise ValueError(f"Got {len(predictions[0])} predictions and {len(features)} features.") + + # Build a map example to its corresponding features. + example_id_to_index = {k: i for i, k in enumerate(examples["id"])} + features_per_example = collections.defaultdict(list) + for i, feature in enumerate(features): + features_per_example[example_id_to_index[feature["example_id"]]].append(i) + + # The dictionaries we have to fill. + all_predictions = collections.OrderedDict() + all_nbest_json = collections.OrderedDict() + scores_diff_json = collections.OrderedDict() if version_2_with_negative else None + + # Logging. + logger.setLevel(log_level) + logger.info(f"Post-processing {len(examples)} example predictions split into {len(features)} features.") + + # Let's loop over all the examples! + for example_index, example in enumerate(tqdm(examples)): + # Those are the indices of the features associated to the current example. + feature_indices = features_per_example[example_index] + + min_null_score = None + prelim_predictions = [] + + # Looping through all the features associated to the current example. + for feature_index in feature_indices: + # We grab the predictions of the model for this feature. + start_log_prob = start_top_log_probs[feature_index] + start_indexes = start_top_index[feature_index] + end_log_prob = end_top_log_probs[feature_index] + end_indexes = end_top_index[feature_index] + feature_null_score = cls_logits[feature_index] + # This is what will allow us to map some the positions in our logits to span of texts in the original + # context. + offset_mapping = features[feature_index]["offset_mapping"] + # Optional `token_is_max_context`, if provided we will remove answers that do not have the maximum context + # available in the current feature. + token_is_max_context = features[feature_index].get("token_is_max_context", None) + + # Update minimum null prediction + if min_null_score is None or feature_null_score < min_null_score: + min_null_score = feature_null_score + + # Go through all possibilities for the `n_start_top`/`n_end_top` greater start and end logits. + for i in range(start_n_top): + for j in range(end_n_top): + start_index = int(start_indexes[i]) + j_index = i * end_n_top + j + end_index = int(end_indexes[j_index]) + # Don't consider out-of-scope answers (last part of the test should be unnecessary because of the + # p_mask but let's not take any risk) + if ( + start_index >= len(offset_mapping) + or end_index >= len(offset_mapping) + or offset_mapping[start_index] is None + or offset_mapping[end_index] is None + ): + continue + # Don't consider answers with a length negative or > max_answer_length. 
+ if end_index < start_index or end_index - start_index + 1 > max_answer_length: + continue + # Don't consider answer that don't have the maximum context available (if such information is + # provided). + if token_is_max_context is not None and not token_is_max_context.get(str(start_index), False): + continue + prelim_predictions.append( + { + "offsets": (offset_mapping[start_index][0], offset_mapping[end_index][1]), + "score": start_log_prob[i] + end_log_prob[j_index], + "start_log_prob": start_log_prob[i], + "end_log_prob": end_log_prob[j_index], + } + ) + + # Only keep the best `n_best_size` predictions. + predictions = sorted(prelim_predictions, key=lambda x: x["score"], reverse=True)[:n_best_size] + + # Use the offsets to gather the answer text in the original context. + context = example["context"] + for pred in predictions: + offsets = pred.pop("offsets") + pred["text"] = context[offsets[0] : offsets[1]] + + # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid + # failure. + if len(predictions) == 0: + predictions.insert(0, {"text": "", "start_logit": -1e-6, "end_logit": -1e-6, "score": -2e-6}) + + # Compute the softmax of all scores (we do it with numpy to stay independent from torch/tf in this file, using + # the LogSumExp trick). + scores = np.array([pred.pop("score") for pred in predictions]) + exp_scores = np.exp(scores - np.max(scores)) + probs = exp_scores / exp_scores.sum() + + # Include the probabilities in our predictions. + for prob, pred in zip(probs, predictions): + pred["probability"] = prob + + # Pick the best prediction and set the probability for the null answer. + all_predictions[example["id"]] = predictions[0]["text"] + if version_2_with_negative: + scores_diff_json[example["id"]] = float(min_null_score) + + # Make `predictions` JSON-serializable by casting np.float32 back to float. + all_nbest_json[example["id"]] = [ + {k: (float(v) if isinstance(v, (np.float16, np.float32, np.float64)) else v) for k, v in pred.items()} + for pred in predictions + ] + + # If we have an output_dir, let's save all those dicts. + if output_dir is not None: + if not os.path.isdir(output_dir): + raise EnvironmentError(f"{output_dir} is not a directory.") + + prediction_file = os.path.join( + output_dir, "predictions.json" if prefix is None else f"{prefix}_predictions.json" + ) + nbest_file = os.path.join( + output_dir, "nbest_predictions.json" if prefix is None else f"{prefix}_nbest_predictions.json" + ) + if version_2_with_negative: + null_odds_file = os.path.join( + output_dir, "null_odds.json" if prefix is None else f"{prefix}_null_odds.json" + ) + + logger.info(f"Saving predictions to {prediction_file}.") + with open(prediction_file, "w") as writer: + writer.write(json.dumps(all_predictions, indent=4) + "\n") + logger.info(f"Saving nbest_preds to {nbest_file}.") + with open(nbest_file, "w") as writer: + writer.write(json.dumps(all_nbest_json, indent=4) + "\n") + if version_2_with_negative: + logger.info(f"Saving null_odds to {null_odds_file}.") + with open(null_odds_file, "w") as writer: + writer.write(json.dumps(scores_diff_json, indent=4) + "\n") + + return all_predictions, scores_diff_json