From b401b02db2cc7d7f4f8412a815fa435e66e330a0 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Fri, 14 Jun 2024 17:48:03 +0800 Subject: [PATCH] Add PT2E cv&llm example (#1853) Signed-off-by: Kaihui-intel --- docs/3x/PT_StaticQuant.md | 6 +- examples/.config/model_params_pytorch_3x.json | 120 +++-- .../3.x_api/pytorch/cv/static_quant/README.md | 27 + .../pytorch/cv/static_quant/extract_ILSVRC.sh | 80 +++ .../3.x_api/pytorch/cv/static_quant/main.py | 471 ++++++++++++++++++ .../pytorch/cv/static_quant/requirements.txt | 3 + .../pytorch/cv/static_quant/run_quant.sh | 45 ++ .../quantization/static_quant/pt2e/README.md | 27 + .../static_quant/pt2e/requirements.txt | 7 + .../static_quant/pt2e/run_clm_no_trainer.py | 148 ++++++ .../static_quant/pt2e/run_quant.sh | 46 ++ 11 files changed, 926 insertions(+), 54 deletions(-) create mode 100644 examples/3.x_api/pytorch/cv/static_quant/README.md create mode 100644 examples/3.x_api/pytorch/cv/static_quant/extract_ILSVRC.sh create mode 100644 examples/3.x_api/pytorch/cv/static_quant/main.py create mode 100644 examples/3.x_api/pytorch/cv/static_quant/requirements.txt create mode 100644 examples/3.x_api/pytorch/cv/static_quant/run_quant.sh create mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/README.md create mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/requirements.txt create mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py create mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_quant.sh diff --git a/docs/3x/PT_StaticQuant.md b/docs/3x/PT_StaticQuant.md index 7d56f817296..d687e83c1f6 100644 --- a/docs/3x/PT_StaticQuant.md +++ b/docs/3x/PT_StaticQuant.md @@ -1,6 +1,5 @@ PyTorch Static Quantization ======================================== - 1. [Introduction](#introduction) 2. [Get Started](#get-started) \ 2.1 [Static Quantization with IPEX Backend](#static-quantization-with-ipex-backend) \ @@ -9,6 +8,7 @@ PyTorch Static Quantization 2.1.3 [Model Examples](#model-examples) \ 2.2 [Static Quantization with PT2E Backend](#static-quantization-with-pt2e-backend) \ 2.2.1 [Usage Sample with PT2E](#usage-sample-with-pt2e) + 2.2.2 [Model Examples with PT2E](#model-examples-with-pt2e) ## Introduction @@ -102,3 +102,7 @@ opt_model = torch.compile(q_model) ``` > Note: The `set_local` of `StaticQuantConfig` will be supported after the torch 2.4 release. + +#### Model Examples with PT2E + +Users can refer to the [cv examples](https://github.com/intel/neural-compressor/blob/master/examples/3.x_api/pytorch/cv/static_quant) and [llm examples](https://github.com/intel/neural-compressor/blob/master/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e) for details on how to quantize a new model.
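For quick orientation, the end-to-end PT2E static quantization flow that both new examples follow condenses to the minimal sketch below. This is an illustrative aside rather than part of the patch: the toy `model` and `example_inputs` are placeholders, and the calls mirror the ones used in `main.py` and `run_clm_no_trainer.py` later in this patch.

```python
import torch

from neural_compressor.torch.export import export
from neural_compressor.torch.quantization import (
    convert,
    get_default_static_config,
    prepare,
)

# Toy float model and calibration input; stand-ins for a real workload.
model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.ReLU())
example_inputs = (torch.randn(4, 8),)

# Export the eager model into an FX graph module.
exported_model = export(model=model, example_inputs=example_inputs)

# Insert observers according to the default static quantization config.
quant_config = get_default_static_config()
prepared_model = prepare(exported_model, quant_config=quant_config)

# Calibrate with a few representative batches to collect statistics.
for _ in range(2):
    prepared_model(*example_inputs)

# Convert the calibrated model into a quantized model.
q_model = convert(prepared_model)

# Compile; with freezing enabled, Inductor folds the Q/DQ pattern into
# quantized operators.
from torch._inductor import config

config.freezing = True
opt_model = torch.compile(q_model)
opt_model(*example_inputs)
```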
diff --git a/examples/.config/model_params_pytorch_3x.json b/examples/.config/model_params_pytorch_3x.json index 3a21f42bd20..dfedb7486d3 100644 --- a/examples/.config/model_params_pytorch_3x.json +++ b/examples/.config/model_params_pytorch_3x.json @@ -1,53 +1,67 @@ -{ - "pytorch": { - "gpt_j_ipex":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 1 - }, - "gpt_j_ipex_sq":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 1 - }, - "llama2_7b_ipex":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 1 - }, - "llama2_7b_ipex_sq":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 1 - }, - "opt_125m_ipex":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 8 - }, - "opt_125m_ipex_sq":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 8 - }, - "dlrm_ipex": { - "model_src_dir": "recommendation/dlrm/static_quant/ipex", - "dataset_location": "/mnt/local_disk3/dataset/dlrm/dlrm/input", - "input_model": "/mnt/local_disk3/dataset/dlrm/dlrm/dlrm_weight/tb00_40M.pt", - "main_script": "dlrm_s_pytorch.py", - "batch_size": 16384 - } - } -} +{ + "pytorch": { + "gpt_j_ipex":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, + "gpt_j_ipex_sq":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, + "llama2_7b_ipex":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, + "llama2_7b_ipex_sq":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, + "opt_125m_ipex":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 8 + }, + "opt_125m_ipex_sq":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 8 + }, + "dlrm_ipex": { + "model_src_dir": "recommendation/dlrm/static_quant/ipex", + "dataset_location": "/mnt/local_disk3/dataset/dlrm/dlrm/input", + "input_model": "/mnt/local_disk3/dataset/dlrm/dlrm/dlrm_weight/tb00_40M.pt", + "main_script": "dlrm_s_pytorch.py", + "batch_size": 16384 + }, + 
"resnet18_pt2e_static":{ + "model_src_dir": "cv/static_quant", + "dataset_location": "/tf_dataset/pytorch/ImageNet/raw", + "input_model": "", + "main_script": "main.py", + "batch_size": 1 + }, + "opt_125m_pt2e_static":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + } + } +} diff --git a/examples/3.x_api/pytorch/cv/static_quant/README.md b/examples/3.x_api/pytorch/cv/static_quant/README.md new file mode 100644 index 00000000000..172f8b0e12f --- /dev/null +++ b/examples/3.x_api/pytorch/cv/static_quant/README.md @@ -0,0 +1,27 @@ +# ImageNet Quantization + +This implements quantization of popular model architectures, such as ResNet on the ImageNet dataset. + +## Requirements + +- Install requirements +- `pip install -r requirements.txt` +- Download the ImageNet dataset from http://www.image-net.org/ + - Then, move and extract the training and validation images to labeled subfolders, using [the following shell script](extract_ILSVRC.sh) + +## Quantizaiton + +To quant a model and validate accaracy, run `main.py` with the desired model architecture and the path to the ImageNet dataset: + +```bash +python main.py -a resnet18 [imagenet-folder with train and val folders] -q -e +``` + + +## Use Dummy Data + +ImageNet dataset is large and time-consuming to download. To get started quickly, run `main.py` using dummy data by "--dummy". Note that the loss or accuracy is useless in this case. + +```bash +python main.py -a resnet18 --dummy -q -e +``` \ No newline at end of file diff --git a/examples/3.x_api/pytorch/cv/static_quant/extract_ILSVRC.sh b/examples/3.x_api/pytorch/cv/static_quant/extract_ILSVRC.sh new file mode 100644 index 00000000000..3ec05e8f328 --- /dev/null +++ b/examples/3.x_api/pytorch/cv/static_quant/extract_ILSVRC.sh @@ -0,0 +1,80 @@ +#!/bin/bash +# +# script to extract ImageNet dataset +# ILSVRC2012_img_train.tar (about 138 GB) +# ILSVRC2012_img_val.tar (about 6.3 GB) +# make sure ILSVRC2012_img_train.tar & ILSVRC2012_img_val.tar in your current directory +# +# Adapted from: +# https://github.com/facebook/fb.resnet.torch/blob/master/INSTALL.md +# https://gist.github.com/BIGBALLON/8a71d225eff18d88e469e6ea9b39cef4 +# +# imagenet/train/ +# ├── n01440764 +# │ ├── n01440764_10026.JPEG +# │ ├── n01440764_10027.JPEG +# │ ├── ...... +# ├── ...... +# imagenet/val/ +# ├── n01440764 +# │ ├── ILSVRC2012_val_00000293.JPEG +# │ ├── ILSVRC2012_val_00002138.JPEG +# │ ├── ...... +# ├── ...... +# +# +# Make imagnet directory +# +mkdir imagenet +# +# Extract the training data: +# +# Create train directory; move .tar file; change directory +mkdir imagenet/train && mv ILSVRC2012_img_train.tar imagenet/train/ && cd imagenet/train +# Extract training set; remove compressed file +tar -xvf ILSVRC2012_img_train.tar && rm -f ILSVRC2012_img_train.tar +# +# At this stage imagenet/train will contain 1000 compressed .tar files, one for each category +# +# For each .tar file: +# 1. create directory with same name as .tar file +# 2. extract and copy contents of .tar file into directory +# 3. remove .tar file +find . -name "*.tar" | while read NAME ; do mkdir -p "${NAME%.tar}"; tar -xvf "${NAME}" -C "${NAME%.tar}"; rm -f "${NAME}"; done +# +# This results in a training directory like so: +# +# imagenet/train/ +# ├── n01440764 +# │ ├── n01440764_10026.JPEG +# │ ├── n01440764_10027.JPEG +# │ ├── ...... +# ├── ...... +# +# Change back to original directory +cd ../.. 
+# +# Extract the validation data and move images to subfolders: +# +# Create validation directory; move .tar file; change directory; extract validation .tar; remove compressed file +mkdir imagenet/val && mv ILSVRC2012_img_val.tar imagenet/val/ && cd imagenet/val && tar -xvf ILSVRC2012_img_val.tar && rm -f ILSVRC2012_img_val.tar +# Get the valprep.sh script from soumith and run it; this script creates all class directories and moves images into the corresponding directories +wget -qO- https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh | bash +# +# This results in a validation directory like so: +# +# imagenet/val/ +# ├── n01440764 +# │ ├── ILSVRC2012_val_00000293.JPEG +# │ ├── ILSVRC2012_val_00002138.JPEG +# │ ├── ...... +# ├── ...... +# +# +# Check the total number of files after extraction +# +# $ find train/ -name "*.JPEG" | wc -l +# 1281167 +# $ find val/ -name "*.JPEG" | wc -l +# 50000 +# \ No newline at end of file diff --git a/examples/3.x_api/pytorch/cv/static_quant/main.py b/examples/3.x_api/pytorch/cv/static_quant/main.py new file mode 100644 index 00000000000..3ab2d6bd6ad --- /dev/null +++ b/examples/3.x_api/pytorch/cv/static_quant/main.py @@ -0,0 +1,471 @@ +import argparse +import os +import random +import shutil +import time +import warnings +from enum import Enum + +import torch +import torch.backends.cudnn as cudnn +import torch.distributed as dist +import torch.multiprocessing as mp +import torch.nn as nn +import torch.nn.parallel +import torch.optim +import torch.utils.data +import torch.utils.data.distributed +import torchvision.datasets as datasets +import torchvision.models as models +import torchvision.transforms as transforms +from torch.optim.lr_scheduler import StepLR +from torch.utils.data import Subset + +model_names = sorted(name for name in models.__dict__ + if name.islower() and not name.startswith("__") + and callable(models.__dict__[name])) + +parser = argparse.ArgumentParser(description='PyTorch ImageNet Training') +parser.add_argument('data', metavar='DIR', nargs='?', default='imagenet', + help='path to dataset (default: imagenet)') +parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet18', + choices=model_names, + help='model architecture: ' + + ' | '.join(model_names) + + ' (default: resnet18)') +parser.add_argument('-j', '--workers', default=4, type=int, metavar='N', + help='number of data loading workers (default: 4)') +parser.add_argument('--epochs', default=90, type=int, metavar='N', + help='number of total epochs to run') +parser.add_argument('--start-epoch', default=0, type=int, metavar='N', + help='manual epoch number (useful on restarts)') +parser.add_argument('-b', '--batch-size', default=256, type=int, + metavar='N', + help='mini-batch size (default: 256), this is the total ' + 'batch size of all GPUs on the current node when ' + 'using Data Parallel or Distributed Data Parallel') +parser.add_argument('--lr', '--learning-rate', default=0.1, type=float, + metavar='LR', help='initial learning rate', dest='lr') +parser.add_argument('--momentum', default=0.9, type=float, metavar='M', + help='momentum') +parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float, + metavar='W', help='weight decay (default: 1e-4)', + dest='weight_decay') +parser.add_argument('-p', '--print-freq', default=10, type=int, + metavar='N', help='print frequency (default: 10)') +parser.add_argument('--resume', default='', type=str, metavar='PATH', + help='path to latest checkpoint (default: none)') +parser.add_argument('-e', '--evaluate', dest='evaluate',
action='store_true', + help='evaluate model on validation set') +parser.add_argument('--pretrained', dest='pretrained', action='store_true', + help='use pre-trained model') +parser.add_argument('--world-size', default=-1, type=int, + help='number of nodes for distributed training') +parser.add_argument('--rank', default=-1, type=int, + help='node rank for distributed training') +parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str, + help='url used to set up distributed training') +parser.add_argument('--dist-backend', default='nccl', type=str, + help='distributed backend') +parser.add_argument('--seed', default=None, type=int, + help='seed for initializing training. ') +parser.add_argument('--gpu', default=None, type=int, + help='GPU id to use.') +parser.add_argument('--multiprocessing-distributed', action='store_true', + help='Use multi-processing distributed training to launch ' + 'N processes per node, which has N GPUs. This is the ' + 'fastest way to use PyTorch for either single node or ' + 'multi node data parallel training') +parser.add_argument('--dummy', action='store_true', help="use fake data to benchmark") +parser.add_argument('-q', '--quantize', dest='quantize', action='store_true', + help='quantize model') +parser.add_argument("--calib_iters", default=2, type=int, + help="For calibration only.") + +best_acc1 = 0 + + +def main(): + args = parser.parse_args() + + if args.seed is not None: + random.seed(args.seed) + torch.manual_seed(args.seed) + cudnn.deterministic = True + cudnn.benchmark = False + warnings.warn('You have chosen to seed training. ' + 'This will turn on the CUDNN deterministic setting, ' + 'which can slow down your training considerably! ' + 'You may see unexpected behavior when restarting ' + 'from checkpoints.') + + if args.gpu is not None: + warnings.warn('You have chosen a specific GPU. 
This will completely ' + 'disable data parallelism.') + + if args.dist_url == "env://" and args.world_size == -1: + args.world_size = int(os.environ["WORLD_SIZE"]) + + args.distributed = args.world_size > 1 or args.multiprocessing_distributed + + if torch.cuda.is_available(): + ngpus_per_node = torch.cuda.device_count() + if ngpus_per_node == 1 and args.dist_backend == "nccl": + warnings.warn("nccl backend >=2.5 requires GPU count>1, see https://github.com/NVIDIA/nccl/issues/103 perhaps use 'gloo'") + else: + ngpus_per_node = 1 + + if args.multiprocessing_distributed: + # Since we have ngpus_per_node processes per node, the total world_size + # needs to be adjusted accordingly + args.world_size = ngpus_per_node * args.world_size + # Use torch.multiprocessing.spawn to launch distributed processes: the + # main_worker process function + mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args)) + else: + # Simply call main_worker function + main_worker(args.gpu, ngpus_per_node, args) + + +def main_worker(gpu, ngpus_per_node, args): + global best_acc1 + args.gpu = gpu + + if args.gpu is not None: + print("Use GPU: {} for training".format(args.gpu)) + + if args.distributed: + if args.dist_url == "env://" and args.rank == -1: + args.rank = int(os.environ["RANK"]) + if args.multiprocessing_distributed: + # For multiprocessing distributed training, rank needs to be the + # global rank among all the processes + args.rank = args.rank * ngpus_per_node + gpu + dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, + world_size=args.world_size, rank=args.rank) + # create model + if args.pretrained: + print("=> using pre-trained model '{}'".format(args.arch)) + model = models.__dict__[args.arch](pretrained=True) + else: + print("=> creating model '{}'".format(args.arch)) + model = models.__dict__[args.arch]() + + if not torch.cuda.is_available() and not torch.backends.mps.is_available(): + print('using CPU, this will be slow') + elif args.distributed: + # For multiprocessing distributed, DistributedDataParallel constructor + # should always set the single device scope, otherwise, + # DistributedDataParallel will use all available devices. + if torch.cuda.is_available(): + if args.gpu is not None: + torch.cuda.set_device(args.gpu) + model.cuda(args.gpu) + # When using a single GPU per process and per + # DistributedDataParallel, we need to divide the batch size + # ourselves based on the total number of GPUs of the current node. 
+ args.batch_size = int(args.batch_size / ngpus_per_node) + args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node) + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) + else: + model.cuda() + # DistributedDataParallel will divide and allocate batch_size to all + # available GPUs if device_ids are not set + model = torch.nn.parallel.DistributedDataParallel(model) + elif args.gpu is not None and torch.cuda.is_available(): + torch.cuda.set_device(args.gpu) + model = model.cuda(args.gpu) + elif torch.backends.mps.is_available(): + device = torch.device("mps") + model = model.to(device) + else: + # DataParallel will divide and allocate batch_size to all available GPUs + if args.arch.startswith('alexnet') or args.arch.startswith('vgg'): + model.features = torch.nn.DataParallel(model.features) + model.cuda() + else: + model = torch.nn.DataParallel(model).cuda() + + if torch.cuda.is_available(): + if args.gpu: + device = torch.device('cuda:{}'.format(args.gpu)) + else: + device = torch.device("cuda") + elif torch.backends.mps.is_available(): + device = torch.device("mps") + else: + device = torch.device("cpu") + # define loss function (criterion), optimizer, and learning rate scheduler + criterion = nn.CrossEntropyLoss().to(device) + + optimizer = torch.optim.SGD(model.parameters(), args.lr, + momentum=args.momentum, + weight_decay=args.weight_decay) + + """Sets the learning rate to the initial LR decayed by 10 every 30 epochs""" + scheduler = StepLR(optimizer, step_size=30, gamma=0.1) + + # optionally resume from a checkpoint + if args.resume: + if os.path.isfile(args.resume): + print("=> loading checkpoint '{}'".format(args.resume)) + if args.gpu is None: + checkpoint = torch.load(args.resume) + elif torch.cuda.is_available(): + # Map model to be loaded to specified single gpu. 
+ loc = 'cuda:{}'.format(args.gpu) + checkpoint = torch.load(args.resume, map_location=loc) + args.start_epoch = checkpoint['epoch'] + best_acc1 = checkpoint['best_acc1'] + if args.gpu is not None: + # best_acc1 may be from a checkpoint from a different GPU + best_acc1 = best_acc1.to(args.gpu) + model.load_state_dict(checkpoint['state_dict']) + optimizer.load_state_dict(checkpoint['optimizer']) + scheduler.load_state_dict(checkpoint['scheduler']) + print("=> loaded checkpoint '{}' (epoch {})" + .format(args.resume, checkpoint['epoch'])) + else: + print("=> no checkpoint found at '{}'".format(args.resume)) + + + # Data loading code + if args.dummy: + print("=> Dummy data is used!") + train_dataset = datasets.FakeData(1281167, (3, 224, 224), 1000, transforms.ToTensor()) + val_dataset = datasets.FakeData(50000, (3, 224, 224), 1000, transforms.ToTensor()) + else: + traindir = os.path.join(args.data, 'train') + valdir = os.path.join(args.data, 'val') + normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) + + train_dataset = datasets.ImageFolder( + traindir, + transforms.Compose([ + transforms.RandomResizedCrop(224), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + normalize, + ])) + + val_dataset = datasets.ImageFolder( + valdir, + transforms.Compose([ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + normalize, + ])) + + if args.distributed: + train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) + val_sampler = torch.utils.data.distributed.DistributedSampler(val_dataset, shuffle=False, drop_last=True) + else: + train_sampler = None + val_sampler = None + + train_loader = torch.utils.data.DataLoader( + train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None), + num_workers=args.workers, pin_memory=True, sampler=train_sampler) + + val_loader = torch.utils.data.DataLoader( + val_dataset, batch_size=args.batch_size, shuffle=False, + num_workers=args.workers, pin_memory=True, sampler=val_sampler) + + if args.quantize: + from neural_compressor.torch.export import export + from neural_compressor.torch.quantization import prepare, convert, get_default_static_config + + # Prepare the float model and example inputs for exporting the model + x = torch.randn(args.batch_size, 3, 224, 224).contiguous(memory_format=torch.channels_last) + example_inputs = (x,) + + # Export the eager model into an FX graph module + exported_model = export(model=model, example_inputs=example_inputs) + # Quantize the model + quant_config = get_default_static_config() + + prepared_model = prepare(exported_model, quant_config=quant_config) + # Calibrate by running a few forward passes to collect statistics + for i in range(args.calib_iters): + prepared_model(*example_inputs) + q_model = convert(prepared_model) + # Compile the quantized model and replace the Q/DQ pattern with quantized operators + from torch._inductor import config + + config.freezing = True + opt_model = torch.compile(q_model) + model = opt_model + + + if args.evaluate: + validate(val_loader, model, criterion, args) + return + + +def validate(val_loader, model, criterion, args): + + def run_validate(loader, base_progress=0): + with torch.no_grad(): + end = time.time() + for i, (images, target) in enumerate(loader): + i = base_progress + i + if args.gpu is not None and torch.cuda.is_available(): + images = images.cuda(args.gpu, non_blocking=True) + if torch.backends.mps.is_available(): + images = images.to('mps') + target = target.to('mps') + if torch.cuda.is_available(): + target = target.cuda(args.gpu,
non_blocking=True) + + # compute output + output = model(images) + loss = criterion(output, target) + + # measure accuracy and record loss + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + losses.update(loss.item(), images.size(0)) + top1.update(acc1[0], images.size(0)) + top5.update(acc5[0], images.size(0)) + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if i % args.print_freq == 0: + progress.display(i + 1) + + batch_time = AverageMeter('Time', ':6.3f', Summary.NONE) + losses = AverageMeter('Loss', ':.4e', Summary.NONE) + top1 = AverageMeter('Acc@1', ':6.2f', Summary.AVERAGE) + top5 = AverageMeter('Acc@5', ':6.2f', Summary.AVERAGE) + progress = ProgressMeter( + len(val_loader) + (args.distributed and (len(val_loader.sampler) * args.world_size < len(val_loader.dataset))), + [batch_time, losses, top1, top5], + prefix='Test: ') + + # no switch to evaluate mode here: a PT2E-exported model does not provide eval() or train() + # model.eval() + + run_validate(val_loader) + if args.distributed: + top1.all_reduce() + top5.all_reduce() + + if args.distributed and (len(val_loader.sampler) * args.world_size < len(val_loader.dataset)): + aux_val_dataset = Subset(val_loader.dataset, + range(len(val_loader.sampler) * args.world_size, len(val_loader.dataset))) + aux_val_loader = torch.utils.data.DataLoader( + aux_val_dataset, batch_size=args.batch_size, shuffle=False, + num_workers=args.workers, pin_memory=True) + run_validate(aux_val_loader, len(val_loader)) + + progress.display_summary() + + return top1.avg + + +def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'): + torch.save(state, filename) + if is_best: + shutil.copyfile(filename, 'model_best.pth.tar') + +class Summary(Enum): + NONE = 0 + AVERAGE = 1 + SUM = 2 + COUNT = 3 + +class AverageMeter(object): + """Computes and stores the average and current value""" + def __init__(self, name, fmt=':f', summary_type=Summary.AVERAGE): + self.name = name + self.fmt = fmt + self.summary_type = summary_type + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + def all_reduce(self): + if torch.cuda.is_available(): + device = torch.device("cuda") + elif torch.backends.mps.is_available(): + device = torch.device("mps") + else: + device = torch.device("cpu") + total = torch.tensor([self.sum, self.count], dtype=torch.float32, device=device) + dist.all_reduce(total, dist.ReduceOp.SUM, async_op=False) + self.sum, self.count = total.tolist() + self.avg = self.sum / self.count + + def __str__(self): + fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' + return fmtstr.format(**self.__dict__) + + def summary(self): + fmtstr = '' + if self.summary_type is Summary.NONE: + fmtstr = '' + elif self.summary_type is Summary.AVERAGE: + fmtstr = '{name} {avg:.3f}' + elif self.summary_type is Summary.SUM: + fmtstr = '{name} {sum:.3f}' + elif self.summary_type is Summary.COUNT: + fmtstr = '{name} {count:.3f}' + else: + raise ValueError('invalid summary type %r' % self.summary_type) + + return fmtstr.format(**self.__dict__) + + +class ProgressMeter(object): + def __init__(self, num_batches, meters, prefix=""): + self.batch_fmtstr = self._get_batch_fmtstr(num_batches) + self.meters = meters + self.prefix = prefix + + def display(self, batch): + entries = [self.prefix + self.batch_fmtstr.format(batch)] + entries += [str(meter) for meter in self.meters] + print('\t'.join(entries))
+ + def display_summary(self): + entries = [" *"] + entries += [meter.summary() for meter in self.meters] + print(' '.join(entries)) + + def _get_batch_fmtstr(self, num_batches): + num_digits = len(str(num_batches)) + fmt = '{:' + str(num_digits) + 'd}' + return '[' + fmt + '/' + fmt.format(num_batches) + ']' + +def accuracy(output, target, topk=(1,)): + """Computes the accuracy over the k top predictions for the specified values of k""" + with torch.no_grad(): + maxk = max(topk) + batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + + res = [] + for k in topk: + correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True) + res.append(correct_k.mul_(100.0 / batch_size)) + return res + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/examples/3.x_api/pytorch/cv/static_quant/requirements.txt b/examples/3.x_api/pytorch/cv/static_quant/requirements.txt new file mode 100644 index 00000000000..ebd3df6ae7a --- /dev/null +++ b/examples/3.x_api/pytorch/cv/static_quant/requirements.txt @@ -0,0 +1,3 @@ +torch +torchvision +neural-compressor \ No newline at end of file diff --git a/examples/3.x_api/pytorch/cv/static_quant/run_quant.sh b/examples/3.x_api/pytorch/cv/static_quant/run_quant.sh new file mode 100644 index 00000000000..ac4a5a2b668 --- /dev/null +++ b/examples/3.x_api/pytorch/cv/static_quant/run_quant.sh @@ -0,0 +1,45 @@ +#!/bin/bash +set -x + +function main { + + init_params "$@" + run_tuning + +} + +# init params +function init_params { + for var in "$@" + do + case $var in + --topology=*) + topology=$(echo $var |cut -f2 -d=) + ;; + --dataset_location=*) + dataset_location=$(echo $var |cut -f2 -d=) + ;; + --input_model=*) + input_model=$(echo $var |cut -f2 -d=) + ;; + --output_model=*) + tuned_checkpoint=$(echo $var |cut -f2 -d=) + ;; + *) + echo "Error: No such parameter: ${var}" + exit 1 + ;; + esac + done + +} + +# run_tuning +function run_tuning { + if [ "${topology}" = "resnet18_pt2e_static" ]; then + model_name_or_path="resnet18" + fi + python main.py -a ${model_name_or_path} ${dataset_location} -q -e +} + +main "$@" diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/README.md b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/README.md new file mode 100644 index 00000000000..7ad8b76bd1e --- /dev/null +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/README.md @@ -0,0 +1,27 @@ +Step-by-Step +============ +This document provides step-by-step instructions to run large language models (LLMs) on 4th Gen Intel® Xeon® Scalable Processor (codenamed Sapphire Rapids) with PyTorch 2 Export Quantization. + +Currently, users can use `run_clm_no_trainer.py` to quantize the `OPT` series models and validate the last word prediction accuracy with [lm_eval](https://github.com/EleutherAI/lm-evaluation-harness.git). We will add more models in the near future. + +# Prerequisite +## 1.
Create Environment +``` +# Installation +pip install -r requirements.txt +``` + +# Run + +Here is how to run the scripts: + +**Causal Language Modeling (CLM)** + +`run_clm_no_trainer.py` quantizes large language models using the [NeelNanda/pile-10k](https://huggingface.co/datasets/NeelNanda/pile-10k) dataset for calibration and validates accuracy on `lambada_openai`, `piqa`, `winogrande`, `hellaswag`, and other tasks provided by lm_eval. An example command is shown below. +### OPT-125m + +#### Quantization + +```bash +python run_clm_no_trainer.py --model facebook/opt-125m --quantize --accuracy +``` \ No newline at end of file diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/requirements.txt b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/requirements.txt new file mode 100644 index 00000000000..b6d9b6c55de --- /dev/null +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/requirements.txt @@ -0,0 +1,7 @@ +transformers +torch +sentencepiece +neural-compressor +intel-extension-for-transformers >= 1.4.1 +lm-eval==0.4.2 +peft \ No newline at end of file diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py new file mode 100644 index 00000000000..98d3f11a1dd --- /dev/null +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py @@ -0,0 +1,148 @@ +import argparse +import time +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer + +parser = argparse.ArgumentParser() +parser.add_argument( + "--model", nargs="?", default="facebook/opt-125m" +) +parser.add_argument( + "--trust_remote_code", default=True, + help="Transformers parameter: use the external repo") +parser.add_argument( + "--revision", default=None, + help="Transformers parameter: set the model hub commit number") +parser.add_argument("--dataset", nargs="?", default="NeelNanda/pile-10k", const="NeelNanda/pile-10k") +parser.add_argument("--output_dir", nargs="?", default="./saved_results") +parser.add_argument("--quantize", action="store_true") +parser.add_argument("--approach", type=str, default='static', + help="Select from ['dynamic', 'static', 'weight-only']") +parser.add_argument("--int8", action="store_true") +parser.add_argument("--accuracy", action="store_true") +parser.add_argument("--performance", action="store_true") +parser.add_argument("--calib_iters", default=2, type=int, + help="For calibration only.") +parser.add_argument("--iters", default=100, type=int, + help="For accuracy measurement only.") +parser.add_argument("--batch_size", default=1, type=int, + help="For accuracy measurement only.") +parser.add_argument("--tasks", default="lambada_openai,hellaswag,winogrande,piqa,wikitext", + type=str, help="tasks for accuracy validation") +parser.add_argument("--peft_model_id", type=str, default=None, help="model_name_or_path of peft model") +# ======================================= + +args = parser.parse_args() + + +def get_user_model(): + torchscript = False + user_model = AutoModelForCausalLM.from_pretrained( + args.model, + torchscript=torchscript, # torchscript will force `return_dict=False` to avoid jit errors + trust_remote_code=args.trust_remote_code, + revision=args.revision, + ) + tokenizer = 
AutoTokenizer.from_pretrained(args.model) + + if args.peft_model_id is not None: + from peft import PeftModel + user_model = PeftModel.from_pretrained(user_model, args.peft_model_id) + + # to channels last + user_model = user_model.to(memory_format=torch.channels_last) + user_model.eval() + return user_model, tokenizer + +user_model, tokenizer = get_user_model() +if args.quantize: + + from neural_compressor.torch.quantization import ( + convert, + get_default_static_config, + prepare, + ) + from neural_compressor.torch.export import export + from torch.export import Dim + def get_example_inputs(tokenizer): + text = "Hello, welcome to LLM world." + encoded_input = tokenizer(text, return_tensors="pt") + + example_inputs = encoded_input + input_ids = example_inputs["input_ids"] + input_ids_batch = torch.cat((input_ids, input_ids), dim=0) + print(f"input_ids_batch shape: {input_ids_batch.shape}") + tuple_inputs = (input_ids_batch,) + return tuple_inputs + # torch._dynamo.config.cache_size_limit = 4 # set limitation if out of memory + batch = Dim(name="batch_size") + seq_len = Dim(name="seq_len") + dynamic_shapes = {"input_ids": (batch, seq_len)} + example_inputs = get_example_inputs(tokenizer) + exported_model = export(user_model, example_inputs=example_inputs, dynamic_shapes=dynamic_shapes) + + quant_config = get_default_static_config() + # prepare + prepare_model = prepare(exported_model, quant_config) + + # calibrate + for i in range(args.calib_iters): + prepare_model(*example_inputs) + # convert + converted_model = convert(prepare_model) + # inference + from torch._inductor import config + + config.freezing = True + opt_model = torch.compile(converted_model) + + opt_model.config = user_model.config # for lm eval + user_model = opt_model + + +if args.accuracy: + from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser + eval_args = LMEvalParser( + model="hf", + user_model=user_model, + tokenizer=tokenizer, + batch_size=args.batch_size, + tasks=args.tasks, + device="cpu", + ) + results = evaluate(eval_args) + for task_name in args.tasks.split(","): + if task_name == "wikitext": + acc = results["results"][task_name]["word_perplexity,none"] + else: + acc = results["results"][task_name]["acc,none"] + print("Accuracy: %.5f" % acc) + print('Batch size = %d' % args.batch_size) + +if args.performance: + # user_model.eval() + from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser + import time + + samples = args.iters * args.batch_size + eval_args = LMEvalParser( + model="hf", + user_model=user_model, + tokenizer=tokenizer, + batch_size=args.batch_size, + tasks=args.tasks, + limit=samples, + device="cpu", + ) + start = time.time() + results = evaluate(eval_args) + end = time.time() + for task_name in args.tasks.split(","): + if task_name == "wikitext": + acc = results["results"][task_name]["word_perplexity,none"] + else: + acc = results["results"][task_name]["acc,none"] + print("Accuracy: %.5f" % acc) + print('Throughput: %.3f samples/sec' % (samples / (end - start))) + print('Latency: %.3f ms' % ((end - start) * 1000 / samples)) + print('Batch size = %d' % args.batch_size) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_quant.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_quant.sh new file mode 100644 index 00000000000..6bd599483ff --- /dev/null +++ 
b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_quant.sh @@ -0,0 +1,46 @@ +#!/bin/bash +set -x + +function main { + + init_params "$@" + run_tuning + +} + +# init params +function init_params { + for var in "$@" + do + case $var in + --topology=*) + topology=$(echo $var |cut -f2 -d=) + ;; + --dataset_location=*) + dataset_location=$(echo $var |cut -f2 -d=) + ;; + --input_model=*) + input_model=$(echo $var |cut -f2 -d=) + ;; + --output_model=*) + tuned_checkpoint=$(echo $var |cut -f2 -d=) + ;; + *) + echo "Error: No such parameter: ${var}" + exit 1 + ;; + esac + done + +} + +# run_tuning +function run_tuning { + + if [ "${topology}" = "opt_125m_pt2e_static" ]; then + model_name_or_path="facebook/opt-125m" + fi + python run_clm_no_trainer.py --model ${model_name_or_path} --quantize --accuracy --tasks "lambada_openai" +} + +main "$@"
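As a usage note, the two driver scripts added by this patch are typically invoked with the topologies registered in `examples/.config/model_params_pytorch_3x.json`; the ImageNet path below is a placeholder.

```bash
# CV example: PT2E static quantization of ResNet-18, followed by evaluation.
cd examples/3.x_api/pytorch/cv/static_quant
bash run_quant.sh --topology=resnet18_pt2e_static --dataset_location=/path/to/imagenet

# LLM example: PT2E static quantization of OPT-125m with lambada_openai accuracy.
cd examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e
bash run_quant.sh --topology=opt_125m_pt2e_static
```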