From b401b02db2cc7d7f4f8412a815fa435e66e330a0 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Fri, 14 Jun 2024 17:48:03 +0800 Subject: [PATCH] Add PT2E cv&llm example (#1853) Signed-off-by: Kaihui-intel --- docs/3x/PT_StaticQuant.md | 6 +- examples/.config/model_params_pytorch_3x.json | 120 +++-- .../3.x_api/pytorch/cv/static_quant/README.md | 27 + .../pytorch/cv/static_quant/extract_ILSVRC.sh | 80 +++ .../3.x_api/pytorch/cv/static_quant/main.py | 471 ++++++++++++++++++ .../pytorch/cv/static_quant/requirements.txt | 3 + .../pytorch/cv/static_quant/run_quant.sh | 45 ++ .../quantization/static_quant/pt2e/README.md | 27 + .../static_quant/pt2e/requirements.txt | 7 + .../static_quant/pt2e/run_clm_no_trainer.py | 148 ++++++ .../static_quant/pt2e/run_quant.sh | 46 ++ 11 files changed, 926 insertions(+), 54 deletions(-) create mode 100644 examples/3.x_api/pytorch/cv/static_quant/README.md create mode 100644 examples/3.x_api/pytorch/cv/static_quant/extract_ILSVRC.sh create mode 100644 examples/3.x_api/pytorch/cv/static_quant/main.py create mode 100644 examples/3.x_api/pytorch/cv/static_quant/requirements.txt create mode 100644 examples/3.x_api/pytorch/cv/static_quant/run_quant.sh create mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/README.md create mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/requirements.txt create mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py create mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_quant.sh diff --git a/docs/3x/PT_StaticQuant.md b/docs/3x/PT_StaticQuant.md index 7d56f817296..d687e83c1f6 100644 --- a/docs/3x/PT_StaticQuant.md +++ b/docs/3x/PT_StaticQuant.md @@ -1,6 +1,5 @@ PyTorch Static Quantization ======================================== - 1. [Introduction](#introduction) 2. [Get Started](#get-started) \ 2.1 [Static Quantization with IPEX Backend](#static-quantization-with-ipex-backend) \ @@ -9,6 +8,7 @@ PyTorch Static Quantization 2.1.3 [Model Examples](#model-examples) \ 2.2 [Static Quantization with PT2E Backend](#static-quantization-with-pt2e-backend) \ 2.2.1 [Usage Sample with PT2E](#usage-sample-with-pt2e) + 2.2.2 [Model Examples with PT2E](#model-examples-with-pt2e) ## Introduction @@ -102,3 +102,7 @@ opt_model = torch.compile(q_model) ``` > Note: The `set_local` of `StaticQuantConfig` will be supported after the torch 2.4 release. + +#### Model Examples with PT2E + +Users can refer to the [cv examples](https://github.com/intel/neural-compressor/blob/master/examples/3.x_api/pytorch/cv/static_quant) and [llm examples](https://github.com/intel/neural-compressor/blob/master/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e) for details on how to quantize a new model.
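For quick orientation, the end-to-end PT2E static quantization flow that both new examples follow condenses to the minimal sketch below. This is an illustrative aside rather than part of the patch: the toy `model` and `example_inputs` are placeholders, and the calls mirror the ones used in `main.py` and `run_clm_no_trainer.py` later in this patch.

```python
import torch

from neural_compressor.torch.export import export
from neural_compressor.torch.quantization import (
    convert,
    get_default_static_config,
    prepare,
)

# Toy float model and calibration input; stand-ins for a real workload.
model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.ReLU())
example_inputs = (torch.randn(4, 8),)

# Export the eager model into an FX graph module.
exported_model = export(model=model, example_inputs=example_inputs)

# Insert observers according to the default static quantization config.
quant_config = get_default_static_config()
prepared_model = prepare(exported_model, quant_config=quant_config)

# Calibrate with a few representative batches to collect statistics.
for _ in range(2):
    prepared_model(*example_inputs)

# Convert the calibrated model into a quantized model.
q_model = convert(prepared_model)

# Compile; with freezing enabled, Inductor folds the Q/DQ pattern into
# quantized operators.
from torch._inductor import config

config.freezing = True
opt_model = torch.compile(q_model)
opt_model(*example_inputs)
```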
diff --git a/examples/.config/model_params_pytorch_3x.json b/examples/.config/model_params_pytorch_3x.json index 3a21f42bd20..dfedb7486d3 100644 --- a/examples/.config/model_params_pytorch_3x.json +++ b/examples/.config/model_params_pytorch_3x.json @@ -1,53 +1,67 @@ -{ - "pytorch": { - "gpt_j_ipex":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 1 - }, - "gpt_j_ipex_sq":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 1 - }, - "llama2_7b_ipex":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 1 - }, - "llama2_7b_ipex_sq":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 1 - }, - "opt_125m_ipex":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 8 - }, - "opt_125m_ipex_sq":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 8 - }, - "dlrm_ipex": { - "model_src_dir": "recommendation/dlrm/static_quant/ipex", - "dataset_location": "/mnt/local_disk3/dataset/dlrm/dlrm/input", - "input_model": "/mnt/local_disk3/dataset/dlrm/dlrm/dlrm_weight/tb00_40M.pt", - "main_script": "dlrm_s_pytorch.py", - "batch_size": 16384 - } - } -} +{ + "pytorch": { + "gpt_j_ipex":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, + "gpt_j_ipex_sq":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, + "llama2_7b_ipex":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, + "llama2_7b_ipex_sq":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, + "opt_125m_ipex":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 8 + }, + "opt_125m_ipex_sq":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 8 + }, + "dlrm_ipex": { + "model_src_dir": "recommendation/dlrm/static_quant/ipex", + "dataset_location": "/mnt/local_disk3/dataset/dlrm/dlrm/input", + "input_model": "/mnt/local_disk3/dataset/dlrm/dlrm/dlrm_weight/tb00_40M.pt", + "main_script": "dlrm_s_pytorch.py", + "batch_size": 16384 + }, + 
"resnet18_pt2e_static":{ + "model_src_dir": "cv/static_quant", + "dataset_location": "/tf_dataset/pytorch/ImageNet/raw", + "input_model": "", + "main_script": "main.py", + "batch_size": 1 + }, + "opt_125m_pt2e_static":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + } + } +} diff --git a/examples/3.x_api/pytorch/cv/static_quant/README.md b/examples/3.x_api/pytorch/cv/static_quant/README.md new file mode 100644 index 00000000000..172f8b0e12f --- /dev/null +++ b/examples/3.x_api/pytorch/cv/static_quant/README.md @@ -0,0 +1,27 @@ +# ImageNet Quantization + +This implements quantization of popular model architectures, such as ResNet on the ImageNet dataset. + +## Requirements + +- Install requirements +- `pip install -r requirements.txt` +- Download the ImageNet dataset from http://www.image-net.org/ + - Then, move and extract the training and validation images to labeled subfolders, using [the following shell script](extract_ILSVRC.sh) + +## Quantizaiton + +To quant a model and validate accaracy, run `main.py` with the desired model architecture and the path to the ImageNet dataset: + +```bash +python main.py -a resnet18 [imagenet-folder with train and val folders] -q -e +``` + + +## Use Dummy Data + +ImageNet dataset is large and time-consuming to download. To get started quickly, run `main.py` using dummy data by "--dummy". Note that the loss or accuracy is useless in this case. + +```bash +python main.py -a resnet18 --dummy -q -e +``` \ No newline at end of file diff --git a/examples/3.x_api/pytorch/cv/static_quant/extract_ILSVRC.sh b/examples/3.x_api/pytorch/cv/static_quant/extract_ILSVRC.sh new file mode 100644 index 00000000000..3ec05e8f328 --- /dev/null +++ b/examples/3.x_api/pytorch/cv/static_quant/extract_ILSVRC.sh @@ -0,0 +1,80 @@ +#!/bin/bash +# +# script to extract ImageNet dataset +# ILSVRC2012_img_train.tar (about 138 GB) +# ILSVRC2012_img_val.tar (about 6.3 GB) +# make sure ILSVRC2012_img_train.tar & ILSVRC2012_img_val.tar in your current directory +# +# Adapted from: +# https://github.com/facebook/fb.resnet.torch/blob/master/INSTALL.md +# https://gist.github.com/BIGBALLON/8a71d225eff18d88e469e6ea9b39cef4 +# +# imagenet/train/ +# ├── n01440764 +# │ ├── n01440764_10026.JPEG +# │ ├── n01440764_10027.JPEG +# │ ├── ...... +# ├── ...... +# imagenet/val/ +# ├── n01440764 +# │ ├── ILSVRC2012_val_00000293.JPEG +# │ ├── ILSVRC2012_val_00002138.JPEG +# │ ├── ...... +# ├── ...... +# +# +# Make imagnet directory +# +mkdir imagenet +# +# Extract the training data: +# +# Create train directory; move .tar file; change directory +mkdir imagenet/train && mv ILSVRC2012_img_train.tar imagenet/train/ && cd imagenet/train +# Extract training set; remove compressed file +tar -xvf ILSVRC2012_img_train.tar && rm -f ILSVRC2012_img_train.tar +# +# At this stage imagenet/train will contain 1000 compressed .tar files, one for each category +# +# For each .tar file: +# 1. create directory with same name as .tar file +# 2. extract and copy contents of .tar file into directory +# 3. remove .tar file +find . -name "*.tar" | while read NAME ; do mkdir -p "${NAME%.tar}"; tar -xvf "${NAME}" -C "${NAME%.tar}"; rm -f "${NAME}"; done +# +# This results in a training directory like so: +# +# imagenet/train/ +# ├── n01440764 +# │ ├── n01440764_10026.JPEG +# │ ├── n01440764_10027.JPEG +# │ ├── ...... +# ├── ...... +# +# Change back to original directory +cd ../.. 
+# +# Extract the validation data and move images to subfolders: +# +# Create validation directory; move .tar file; change directory; extract validation .tar; remove compressed file +mkdir imagenet/val && mv ILSVRC2012_img_val.tar imagenet/val/ && cd imagenet/val && tar -xvf ILSVRC2012_img_val.tar && rm -f ILSVRC2012_img_val.tar +# Get the valprep.sh script from soumith and run it; this script creates all class directories and moves images into the corresponding directories +wget -qO- https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh | bash +# +# This results in a validation directory like so: +# +# imagenet/val/ +# ├── n01440764 +# │ ├── ILSVRC2012_val_00000293.JPEG +# │ ├── ILSVRC2012_val_00002138.JPEG +# │ ├── ...... +# ├── ...... +# +# +# Check the total number of files after extraction +# +# $ find train/ -name "*.JPEG" | wc -l +# 1281167 +# $ find val/ -name "*.JPEG" | wc -l +# 50000 +# \ No newline at end of file diff --git a/examples/3.x_api/pytorch/cv/static_quant/main.py b/examples/3.x_api/pytorch/cv/static_quant/main.py new file mode 100644 index 00000000000..3ab2d6bd6ad --- /dev/null +++ b/examples/3.x_api/pytorch/cv/static_quant/main.py @@ -0,0 +1,471 @@ +import argparse +import os +import random +import shutil +import time +import warnings +from enum import Enum + +import torch +import torch.backends.cudnn as cudnn +import torch.distributed as dist +import torch.multiprocessing as mp +import torch.nn as nn +import torch.nn.parallel +import torch.optim +import torch.utils.data +import torch.utils.data.distributed +import torchvision.datasets as datasets +import torchvision.models as models +import torchvision.transforms as transforms +from torch.optim.lr_scheduler import StepLR +from torch.utils.data import Subset + +model_names = sorted(name for name in models.__dict__ + if name.islower() and not name.startswith("__") + and callable(models.__dict__[name])) + +parser = argparse.ArgumentParser(description='PyTorch ImageNet Training') +parser.add_argument('data', metavar='DIR', nargs='?', default='imagenet', + help='path to dataset (default: imagenet)') +parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet18', + choices=model_names, + help='model architecture: ' + + ' | '.join(model_names) + + ' (default: resnet18)') +parser.add_argument('-j', '--workers', default=4, type=int, metavar='N', + help='number of data loading workers (default: 4)') +parser.add_argument('--epochs', default=90, type=int, metavar='N', + help='number of total epochs to run') +parser.add_argument('--start-epoch', default=0, type=int, metavar='N', + help='manual epoch number (useful on restarts)') +parser.add_argument('-b', '--batch-size', default=256, type=int, + metavar='N', + help='mini-batch size (default: 256), this is the total ' + 'batch size of all GPUs on the current node when ' + 'using Data Parallel or Distributed Data Parallel') +parser.add_argument('--lr', '--learning-rate', default=0.1, type=float, + metavar='LR', help='initial learning rate', dest='lr') +parser.add_argument('--momentum', default=0.9, type=float, metavar='M', + help='momentum') +parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float, + metavar='W', help='weight decay (default: 1e-4)', + dest='weight_decay') +parser.add_argument('-p', '--print-freq', default=10, type=int, + metavar='N', help='print frequency (default: 10)') +parser.add_argument('--resume', default='', type=str, metavar='PATH', + help='path to latest checkpoint (default: none)') +parser.add_argument('-e', '--evaluate', dest='evaluate',
action='store_true', + help='evaluate model on validation set') +parser.add_argument('--pretrained', dest='pretrained', action='store_true', + help='use pre-trained model') +parser.add_argument('--world-size', default=-1, type=int, + help='number of nodes for distributed training') +parser.add_argument('--rank', default=-1, type=int, + help='node rank for distributed training') +parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str, + help='url used to set up distributed training') +parser.add_argument('--dist-backend', default='nccl', type=str, + help='distributed backend') +parser.add_argument('--seed', default=None, type=int, + help='seed for initializing training. ') +parser.add_argument('--gpu', default=None, type=int, + help='GPU id to use.') +parser.add_argument('--multiprocessing-distributed', action='store_true', + help='Use multi-processing distributed training to launch ' + 'N processes per node, which has N GPUs. This is the ' + 'fastest way to use PyTorch for either single node or ' + 'multi node data parallel training') +parser.add_argument('--dummy', action='store_true', help="use fake data to benchmark") +parser.add_argument('-q', '--quantize', dest='quantize', action='store_true', + help='quantize model') +parser.add_argument("--calib_iters", default=2, type=int, + help="For calibration only.") + +best_acc1 = 0 + + +def main(): + args = parser.parse_args() + + if args.seed is not None: + random.seed(args.seed) + torch.manual_seed(args.seed) + cudnn.deterministic = True + cudnn.benchmark = False + warnings.warn('You have chosen to seed training. ' + 'This will turn on the CUDNN deterministic setting, ' + 'which can slow down your training considerably! ' + 'You may see unexpected behavior when restarting ' + 'from checkpoints.') + + if args.gpu is not None: + warnings.warn('You have chosen a specific GPU. 
This will completely ' + 'disable data parallelism.') + + if args.dist_url == "env://" and args.world_size == -1: + args.world_size = int(os.environ["WORLD_SIZE"]) + + args.distributed = args.world_size > 1 or args.multiprocessing_distributed + + if torch.cuda.is_available(): + ngpus_per_node = torch.cuda.device_count() + if ngpus_per_node == 1 and args.dist_backend == "nccl": + warnings.warn("nccl backend >=2.5 requires GPU count>1, see https://github.com/NVIDIA/nccl/issues/103 perhaps use 'gloo'") + else: + ngpus_per_node = 1 + + if args.multiprocessing_distributed: + # Since we have ngpus_per_node processes per node, the total world_size + # needs to be adjusted accordingly + args.world_size = ngpus_per_node * args.world_size + # Use torch.multiprocessing.spawn to launch distributed processes: the + # main_worker process function + mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args)) + else: + # Simply call main_worker function + main_worker(args.gpu, ngpus_per_node, args) + + +def main_worker(gpu, ngpus_per_node, args): + global best_acc1 + args.gpu = gpu + + if args.gpu is not None: + print("Use GPU: {} for training".format(args.gpu)) + + if args.distributed: + if args.dist_url == "env://" and args.rank == -1: + args.rank = int(os.environ["RANK"]) + if args.multiprocessing_distributed: + # For multiprocessing distributed training, rank needs to be the + # global rank among all the processes + args.rank = args.rank * ngpus_per_node + gpu + dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, + world_size=args.world_size, rank=args.rank) + # create model + if args.pretrained: + print("=> using pre-trained model '{}'".format(args.arch)) + model = models.__dict__[args.arch](pretrained=True) + else: + print("=> creating model '{}'".format(args.arch)) + model = models.__dict__[args.arch]() + + if not torch.cuda.is_available() and not torch.backends.mps.is_available(): + print('using CPU, this will be slow') + elif args.distributed: + # For multiprocessing distributed, DistributedDataParallel constructor + # should always set the single device scope, otherwise, + # DistributedDataParallel will use all available devices. + if torch.cuda.is_available(): + if args.gpu is not None: + torch.cuda.set_device(args.gpu) + model.cuda(args.gpu) + # When using a single GPU per process and per + # DistributedDataParallel, we need to divide the batch size + # ourselves based on the total number of GPUs of the current node. 
+ args.batch_size = int(args.batch_size / ngpus_per_node) + args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node) + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) + else: + model.cuda() + # DistributedDataParallel will divide and allocate batch_size to all + # available GPUs if device_ids are not set + model = torch.nn.parallel.DistributedDataParallel(model) + elif args.gpu is not None and torch.cuda.is_available(): + torch.cuda.set_device(args.gpu) + model = model.cuda(args.gpu) + elif torch.backends.mps.is_available(): + device = torch.device("mps") + model = model.to(device) + else: + # DataParallel will divide and allocate batch_size to all available GPUs + if args.arch.startswith('alexnet') or args.arch.startswith('vgg'): + model.features = torch.nn.DataParallel(model.features) + model.cuda() + else: + model = torch.nn.DataParallel(model).cuda() + + if torch.cuda.is_available(): + if args.gpu: + device = torch.device('cuda:{}'.format(args.gpu)) + else: + device = torch.device("cuda") + elif torch.backends.mps.is_available(): + device = torch.device("mps") + else: + device = torch.device("cpu") + # define loss function (criterion), optimizer, and learning rate scheduler + criterion = nn.CrossEntropyLoss().to(device) + + optimizer = torch.optim.SGD(model.parameters(), args.lr, + momentum=args.momentum, + weight_decay=args.weight_decay) + + """Sets the learning rate to the initial LR decayed by 10 every 30 epochs""" + scheduler = StepLR(optimizer, step_size=30, gamma=0.1) + + # optionally resume from a checkpoint + if args.resume: + if os.path.isfile(args.resume): + print("=> loading checkpoint '{}'".format(args.resume)) + if args.gpu is None: + checkpoint = torch.load(args.resume) + elif torch.cuda.is_available(): + # Map model to be loaded to specified single gpu. 
+ loc = 'cuda:{}'.format(args.gpu) + checkpoint = torch.load(args.resume, map_location=loc) + args.start_epoch = checkpoint['epoch'] + best_acc1 = checkpoint['best_acc1'] + if args.gpu is not None: + # best_acc1 may be from a checkpoint from a different GPU + best_acc1 = best_acc1.to(args.gpu) + model.load_state_dict(checkpoint['state_dict']) + optimizer.load_state_dict(checkpoint['optimizer']) + scheduler.load_state_dict(checkpoint['scheduler']) + print("=> loaded checkpoint '{}' (epoch {})" + .format(args.resume, checkpoint['epoch'])) + else: + print("=> no checkpoint found at '{}'".format(args.resume)) + + + # Data loading code + if args.dummy: + print("=> Dummy data is used!") + train_dataset = datasets.FakeData(1281167, (3, 224, 224), 1000, transforms.ToTensor()) + val_dataset = datasets.FakeData(50000, (3, 224, 224), 1000, transforms.ToTensor()) + else: + traindir = os.path.join(args.data, 'train') + valdir = os.path.join(args.data, 'val') + normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) + + train_dataset = datasets.ImageFolder( + traindir, + transforms.Compose([ + transforms.RandomResizedCrop(224), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + normalize, + ])) + + val_dataset = datasets.ImageFolder( + valdir, + transforms.Compose([ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + normalize, + ])) + + if args.distributed: + train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) + val_sampler = torch.utils.data.distributed.DistributedSampler(val_dataset, shuffle=False, drop_last=True) + else: + train_sampler = None + val_sampler = None + + train_loader = torch.utils.data.DataLoader( + train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None), + num_workers=args.workers, pin_memory=True, sampler=train_sampler) + + val_loader = torch.utils.data.DataLoader( + val_dataset, batch_size=args.batch_size, shuffle=False, + num_workers=args.workers, pin_memory=True, sampler=val_sampler) + + if args.quantize: + from neural_compressor.torch.export import export + from neural_compressor.torch.quantization import prepare, convert, get_default_static_config + + # Prepare the float model and example inputs for exporting the model + x = torch.randn(args.batch_size, 3, 224, 224).contiguous(memory_format=torch.channels_last) + example_inputs = (x,) + + # Export the eager model into an FX graph module + exported_model = export(model=model, example_inputs=example_inputs) + # Quantize the model + quant_config = get_default_static_config() + + prepared_model = prepare(exported_model, quant_config=quant_config) + # Calibrate by running a few forward passes to collect statistics + for i in range(args.calib_iters): + prepared_model(*example_inputs) + q_model = convert(prepared_model) + # Compile the quantized model and replace the Q/DQ pattern with quantized operators + from torch._inductor import config + + config.freezing = True + opt_model = torch.compile(q_model) + model = opt_model + + + if args.evaluate: + validate(val_loader, model, criterion, args) + return + + +def validate(val_loader, model, criterion, args): + + def run_validate(loader, base_progress=0): + with torch.no_grad(): + end = time.time() + for i, (images, target) in enumerate(loader): + i = base_progress + i + if args.gpu is not None and torch.cuda.is_available(): + images = images.cuda(args.gpu, non_blocking=True) + if torch.backends.mps.is_available(): + images = images.to('mps') + target = target.to('mps') + if torch.cuda.is_available(): + target = target.cuda(args.gpu,
non_blocking=True) + + # compute output + output = model(images) + loss = criterion(output, target) + + # measure accuracy and record loss + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + losses.update(loss.item(), images.size(0)) + top1.update(acc1[0], images.size(0)) + top5.update(acc5[0], images.size(0)) + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if i % args.print_freq == 0: + progress.display(i + 1) + + batch_time = AverageMeter('Time', ':6.3f', Summary.NONE) + losses = AverageMeter('Loss', ':.4e', Summary.NONE) + top1 = AverageMeter('Acc@1', ':6.2f', Summary.AVERAGE) + top5 = AverageMeter('Acc@5', ':6.2f', Summary.AVERAGE) + progress = ProgressMeter( + len(val_loader) + (args.distributed and (len(val_loader.sampler) * args.world_size < len(val_loader.dataset))), + [batch_time, losses, top1, top5], + prefix='Test: ') + + # no switch to evaluate mode here: a PT2E-exported model does not provide eval() or train() + # model.eval() + + run_validate(val_loader) + if args.distributed: + top1.all_reduce() + top5.all_reduce() + + if args.distributed and (len(val_loader.sampler) * args.world_size < len(val_loader.dataset)): + aux_val_dataset = Subset(val_loader.dataset, + range(len(val_loader.sampler) * args.world_size, len(val_loader.dataset))) + aux_val_loader = torch.utils.data.DataLoader( + aux_val_dataset, batch_size=args.batch_size, shuffle=False, + num_workers=args.workers, pin_memory=True) + run_validate(aux_val_loader, len(val_loader)) + + progress.display_summary() + + return top1.avg + + +def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'): + torch.save(state, filename) + if is_best: + shutil.copyfile(filename, 'model_best.pth.tar') + +class Summary(Enum): + NONE = 0 + AVERAGE = 1 + SUM = 2 + COUNT = 3 + +class AverageMeter(object): + """Computes and stores the average and current value""" + def __init__(self, name, fmt=':f', summary_type=Summary.AVERAGE): + self.name = name + self.fmt = fmt + self.summary_type = summary_type + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + def all_reduce(self): + if torch.cuda.is_available(): + device = torch.device("cuda") + elif torch.backends.mps.is_available(): + device = torch.device("mps") + else: + device = torch.device("cpu") + total = torch.tensor([self.sum, self.count], dtype=torch.float32, device=device) + dist.all_reduce(total, dist.ReduceOp.SUM, async_op=False) + self.sum, self.count = total.tolist() + self.avg = self.sum / self.count + + def __str__(self): + fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' + return fmtstr.format(**self.__dict__) + + def summary(self): + fmtstr = '' + if self.summary_type is Summary.NONE: + fmtstr = '' + elif self.summary_type is Summary.AVERAGE: + fmtstr = '{name} {avg:.3f}' + elif self.summary_type is Summary.SUM: + fmtstr = '{name} {sum:.3f}' + elif self.summary_type is Summary.COUNT: + fmtstr = '{name} {count:.3f}' + else: + raise ValueError('invalid summary type %r' % self.summary_type) + + return fmtstr.format(**self.__dict__) + + +class ProgressMeter(object): + def __init__(self, num_batches, meters, prefix=""): + self.batch_fmtstr = self._get_batch_fmtstr(num_batches) + self.meters = meters + self.prefix = prefix + + def display(self, batch): + entries = [self.prefix + self.batch_fmtstr.format(batch)] + entries += [str(meter) for meter in self.meters] + print('\t'.join(entries))
+ + def display_summary(self): + entries = [" *"] + entries += [meter.summary() for meter in self.meters] + print(' '.join(entries)) + + def _get_batch_fmtstr(self, num_batches): + num_digits = len(str(num_batches)) + fmt = '{:' + str(num_digits) + 'd}' + return '[' + fmt + '/' + fmt.format(num_batches) + ']' + +def accuracy(output, target, topk=(1,)): + """Computes the accuracy over the k top predictions for the specified values of k""" + with torch.no_grad(): + maxk = max(topk) + batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + + res = [] + for k in topk: + correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True) + res.append(correct_k.mul_(100.0 / batch_size)) + return res + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/examples/3.x_api/pytorch/cv/static_quant/requirements.txt b/examples/3.x_api/pytorch/cv/static_quant/requirements.txt new file mode 100644 index 00000000000..ebd3df6ae7a --- /dev/null +++ b/examples/3.x_api/pytorch/cv/static_quant/requirements.txt @@ -0,0 +1,3 @@ +torch +torchvision +neural-compressor \ No newline at end of file diff --git a/examples/3.x_api/pytorch/cv/static_quant/run_quant.sh b/examples/3.x_api/pytorch/cv/static_quant/run_quant.sh new file mode 100644 index 00000000000..ac4a5a2b668 --- /dev/null +++ b/examples/3.x_api/pytorch/cv/static_quant/run_quant.sh @@ -0,0 +1,45 @@ +#!/bin/bash +set -x + +function main { + + init_params "$@" + run_tuning + +} + +# init params +function init_params { + for var in "$@" + do + case $var in + --topology=*) + topology=$(echo $var |cut -f2 -d=) + ;; + --dataset_location=*) + dataset_location=$(echo $var |cut -f2 -d=) + ;; + --input_model=*) + input_model=$(echo $var |cut -f2 -d=) + ;; + --output_model=*) + tuned_checkpoint=$(echo $var |cut -f2 -d=) + ;; + *) + echo "Error: No such parameter: ${var}" + exit 1 + ;; + esac + done + +} + +# run_tuning +function run_tuning { + if [ "${topology}" = "resnet18_pt2e_static" ]; then + model_name_or_path="resnet18" + fi + python main.py -a ${model_name_or_path} ${dataset_location} -q -e +} + +main "$@" diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/README.md b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/README.md new file mode 100644 index 00000000000..7ad8b76bd1e --- /dev/null +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/README.md @@ -0,0 +1,27 @@ +Step-by-Step +============ +This document provides step-by-step instructions to run large language models (LLMs) on 4th Gen Intel® Xeon® Scalable Processor (codenamed Sapphire Rapids) with PyTorch 2 Export Quantization. + +Currently, users can use `run_clm_no_trainer.py` to quantize the `OPT` series models and validate the last word prediction accuracy with [lm_eval](https://github.com/EleutherAI/lm-evaluation-harness.git). We will add more models in the near future. + +# Prerequisite +## 1.
Create Environment +``` +# Installation +pip install -r requirements.txt +``` + +# Run + +Here is how to run the scripts: + +**Causal Language Modeling (CLM)** + +`run_clm_no_trainer.py` quantizes large language models using the [NeelNanda/pile-10k](https://huggingface.co/datasets/NeelNanda/pile-10k) dataset for calibration and validates accuracy on `lambada_openai`, `piqa`, `winogrande`, `hellaswag`, and other tasks provided by lm_eval. An example command is shown below. +### OPT-125m + +#### Quantization + +```bash +python run_clm_no_trainer.py --model facebook/opt-125m --quantize --accuracy +``` \ No newline at end of file diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/requirements.txt b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/requirements.txt new file mode 100644 index 00000000000..b6d9b6c55de --- /dev/null +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/requirements.txt @@ -0,0 +1,7 @@ +transformers +torch +sentencepiece +neural-compressor +intel-extension-for-transformers >= 1.4.1 +lm-eval==0.4.2 +peft \ No newline at end of file diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py new file mode 100644 index 00000000000..98d3f11a1dd --- /dev/null +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py @@ -0,0 +1,148 @@ +import argparse +import time +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer + +parser = argparse.ArgumentParser() +parser.add_argument( + "--model", nargs="?", default="facebook/opt-125m" +) +parser.add_argument( + "--trust_remote_code", default=True, + help="Transformers parameter: use the external repo") +parser.add_argument( + "--revision", default=None, + help="Transformers parameter: set the model hub commit number") +parser.add_argument("--dataset", nargs="?", default="NeelNanda/pile-10k", const="NeelNanda/pile-10k") +parser.add_argument("--output_dir", nargs="?", default="./saved_results") +parser.add_argument("--quantize", action="store_true") +parser.add_argument("--approach", type=str, default='static', + help="Select from ['dynamic', 'static', 'weight-only']") +parser.add_argument("--int8", action="store_true") +parser.add_argument("--accuracy", action="store_true") +parser.add_argument("--performance", action="store_true") +parser.add_argument("--calib_iters", default=2, type=int, + help="For calibration only.") +parser.add_argument("--iters", default=100, type=int, + help="For accuracy measurement only.") +parser.add_argument("--batch_size", default=1, type=int, + help="For accuracy measurement only.") +parser.add_argument("--tasks", default="lambada_openai,hellaswag,winogrande,piqa,wikitext", + type=str, help="tasks for accuracy validation") +parser.add_argument("--peft_model_id", type=str, default=None, help="model_name_or_path of peft model") +# ======================================= + +args = parser.parse_args() + + +def get_user_model(): + torchscript = False + user_model = AutoModelForCausalLM.from_pretrained( + args.model, + torchscript=torchscript, # torchscript will force `return_dict=False` to avoid jit errors + trust_remote_code=args.trust_remote_code, + revision=args.revision, + ) + tokenizer = 
AutoTokenizer.from_pretrained(args.model) + + if args.peft_model_id is not None: + from peft import PeftModel + user_model = PeftModel.from_pretrained(user_model, args.peft_model_id) + + # to channels last + user_model = user_model.to(memory_format=torch.channels_last) + user_model.eval() + return user_model, tokenizer + +user_model, tokenizer = get_user_model() +if args.quantize: + + from neural_compressor.torch.quantization import ( + convert, + get_default_static_config, + prepare, + ) + from neural_compressor.torch.export import export + from torch.export import Dim + def get_example_inputs(tokenizer): + text = "Hello, welcome to LLM world." + encoded_input = tokenizer(text, return_tensors="pt") + + example_inputs = encoded_input + input_ids = example_inputs["input_ids"] + input_ids_batch = torch.cat((input_ids, input_ids), dim=0) + print(f"input_ids_batch shape: {input_ids_batch.shape}") + tuple_inputs = (input_ids_batch,) + return tuple_inputs + # torch._dynamo.config.cache_size_limit = 4 # set limitation if out of memory + batch = Dim(name="batch_size") + seq_len = Dim(name="seq_len") + dynamic_shapes = {"input_ids": (batch, seq_len)} + example_inputs = get_example_inputs(tokenizer) + exported_model = export(user_model, example_inputs=example_inputs, dynamic_shapes=dynamic_shapes) + + quant_config = get_default_static_config() + # prepare + prepare_model = prepare(exported_model, quant_config) + + # calibrate + for i in range(args.calib_iters): + prepare_model(*example_inputs) + # convert + converted_model = convert(prepare_model) + # inference + from torch._inductor import config + + config.freezing = True + opt_model = torch.compile(converted_model) + + opt_model.config = user_model.config # for lm eval + user_model = opt_model + + +if args.accuracy: + from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser + eval_args = LMEvalParser( + model="hf", + user_model=user_model, + tokenizer=tokenizer, + batch_size=args.batch_size, + tasks=args.tasks, + device="cpu", + ) + results = evaluate(eval_args) + for task_name in args.tasks.split(","): + if task_name == "wikitext": + acc = results["results"][task_name]["word_perplexity,none"] + else: + acc = results["results"][task_name]["acc,none"] + print("Accuracy: %.5f" % acc) + print('Batch size = %d' % args.batch_size) + +if args.performance: + # user_model.eval() + from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser + import time + + samples = args.iters * args.batch_size + eval_args = LMEvalParser( + model="hf", + user_model=user_model, + tokenizer=tokenizer, + batch_size=args.batch_size, + tasks=args.tasks, + limit=samples, + device="cpu", + ) + start = time.time() + results = evaluate(eval_args) + end = time.time() + for task_name in args.tasks.split(","): + if task_name == "wikitext": + acc = results["results"][task_name]["word_perplexity,none"] + else: + acc = results["results"][task_name]["acc,none"] + print("Accuracy: %.5f" % acc) + print('Throughput: %.3f samples/sec' % (samples / (end - start))) + print('Latency: %.3f ms' % ((end - start) * 1000 / samples)) + print('Batch size = %d' % args.batch_size) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_quant.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_quant.sh new file mode 100644 index 00000000000..6bd599483ff --- /dev/null +++ 
b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_quant.sh @@ -0,0 +1,46 @@ +#!/bin/bash +set -x + +function main { + + init_params "$@" + run_tuning + +} + +# init params +function init_params { + for var in "$@" + do + case $var in + --topology=*) + topology=$(echo $var |cut -f2 -d=) + ;; + --dataset_location=*) + dataset_location=$(echo $var |cut -f2 -d=) + ;; + --input_model=*) + input_model=$(echo $var |cut -f2 -d=) + ;; + --output_model=*) + tuned_checkpoint=$(echo $var |cut -f2 -d=) + ;; + *) + echo "Error: No such parameter: ${var}" + exit 1 + ;; + esac + done + +} + +# run_tuning +function run_tuning { + + if [ "${topology}" = "opt_125m_pt2e_static" ]; then + model_name_or_path="facebook/opt-125m" + fi + python run_clm_no_trainer.py --model ${model_name_or_path} --quantize --accuracy --tasks "lambada_openai" +} + +main "$@"
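As a usage note, the two driver scripts added by this patch are typically invoked with the topologies registered in `examples/.config/model_params_pytorch_3x.json`; the ImageNet path below is a placeholder.

```bash
# CV example: PT2E static quantization of ResNet-18, followed by evaluation.
cd examples/3.x_api/pytorch/cv/static_quant
bash run_quant.sh --topology=resnet18_pt2e_static --dataset_location=/path/to/imagenet

# LLM example: PT2E static quantization of OPT-125m with lambada_openai accuracy.
cd examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e
bash run_quant.sh --topology=opt_125m_pt2e_static
```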