diff --git a/examples/.config/model_params_tensorflow_3x.json b/examples/.config/model_params_tensorflow_3x.json new file mode 100644 index 00000000000..70e1497e508 --- /dev/null +++ b/examples/.config/model_params_tensorflow_3x.json @@ -0,0 +1,48 @@ +{ + "tensorflow": { + "bert_large_squad_model_zoo": { + "model_src_dir": "nlp/bert_large_squad_model_zoo/quantization/ptq", + "dataset_location": "/tf_dataset/tensorflow/bert/data", + "input_model": "/tf_dataset/tensorflow/bert/fp32_bert_squad.pb", + "main_script": "main.py", + "batch_size": 64, + "fp32_model_url": "https://storage.googleapis.com/intel-optimized-tensorflow/models/v2_7_0/fp32_bert_squad.pb" + }, + "opt_125m_sq": { + "model_src_dir": "nlp/large_language_models/quantization/ptq/smoothquant", + "dataset_location": "", + "input_model": "facebook/opt-125m", + "main_script": "main.py", + "batch_size": 16 + }, + "gpt2_medium_sq": { + "model_src_dir": "nlp/large_language_models/quantization/ptq/smoothquant", + "dataset_location": "", + "input_model": "gpt2-medium", + "main_script": "main.py", + "batch_size": 16 + }, + "gpt-j-6B": { + "model_src_dir": "nlp/large_language_models/quantization/ptq/gpt-j", + "dataset_location": "", + "input_model": "/tf_dataset2/models/tensorflow/gpt-j-6B", + "main_script": "main.py", + "batch_size": 1 + }, + "ViT": { + "model_src_dir": "image_recognition/vision_transformer/quantization/ptq", + "dataset_location": "/tf_dataset/dataset/imagenet", + "input_model": "/tf_dataset/tensorflow/vit/HF-ViT-Base16-Img224-frozen.pb", + "main_script": "main.py", + "batch_size": 32 + }, + "GraphSage": { + "model_src_dir": "graph_networks/graphsage/quantization/ptq", + "dataset_location": "/tf_dataset/dataset/ppi", + "input_model": "/tf_dataset/tensorflow/graphsage/graphsage_frozen_model.pb", + "main_script": "main.py", + "batch_size": 1000 + } + } +} + diff --git a/examples/3.x_api/tensorflow/graph_networks/graphsage/quantization/ptq/README.md b/examples/3.x_api/tensorflow/graph_networks/graphsage/quantization/ptq/README.md new file mode 100644 index 00000000000..057b3559756 --- /dev/null +++ b/examples/3.x_api/tensorflow/graph_networks/graphsage/quantization/ptq/README.md @@ -0,0 +1,109 @@ +Step-by-Step +============ + +This document lists the steps for reproducing the tuning results of the TensorFlow GraphSage model. This example can run on Intel CPUs and GPUs. + +# Prerequisite + + +## 1. Environment +Python 3.6 or a higher version is recommended. + +### Install Intel® Neural Compressor +```shell +pip install neural-compressor +``` + +### Install Intel Tensorflow +```shell +pip install intel-tensorflow +``` +> Note: Validated TensorFlow [Version](/docs/source/installation_guide.md#validated-software-environment). + +### Install Dependency Packages +```shell +cd examples/3.x_api/tensorflow/graph_networks/graphsage/quantization/ptq +pip install -r requirements.txt +``` + +### Install Intel Extension for Tensorflow + +#### Quantizing the model on Intel GPU(Mandatory to install ITEX) +Intel Extension for Tensorflow is mandatory to be installed for quantizing the model on Intel GPUs. + +```shell +pip install --upgrade intel-extension-for-tensorflow[xpu] +``` +For more details, please follow the procedure in [install-gpu-drivers](https://github.com/intel/intel-extension-for-tensorflow/blob/main/docs/install/install_for_xpu.md#install-gpu-drivers) + +#### Quantizing the model on Intel CPU(Optional to install ITEX) +Intel Extension for Tensorflow for Intel CPUs is experimental currently.
It's not mandatory for quantizing the model on Intel CPUs. + +```shell +pip install --upgrade intel-extension-for-tensorflow[cpu] +``` + +> **Note**: +> The version compatibility of stock Tensorflow and ITEX can be checked [here](https://github.com/intel/intel-extension-for-tensorflow#compatibility-table). Please make sure you have installed compatible Tensorflow and ITEX. + +## 2. Prepare Model +Download Frozen graph: +```shell +wget https://storage.googleapis.com/intel-optimized-tensorflow/models/2_12_0/graphsage_frozen_model.pb +``` + +## 3. Prepare Dataset + +```shell +wget https://snap.stanford.edu/graphsage/ppi.zip +unzip ppi.zip +``` + +# Run + +## 1. Quantization + + ```shell + # The cmd of running graphsage + bash run_quant.sh --input_model=./graphsage_frozen_model.pb --output_model=./nc_graphsage_int8_model.pb --dataset_location=./ppi + ``` + +## 2. Benchmark + ```shell + bash run_benchmark.sh --input_model=./nc_graphsage_int8_model.pb --dataset_location=./ppi --mode=performance + ``` + +Details of enabling Intel® Neural Compressor on graphsage for Tensorflow. +========================= + +This is a tutorial of how to enable the graphsage model with Intel® Neural Compressor. +## User Code Analysis +The user specifies the fp32 *model*, the calibration dataset *calib_dataloader* and a custom *eval_func* which encapsulates the evaluation dataset and metric by itself. + +For graphsage, we apply this approach because our philosophy is to enable the model with minimal changes. Hence we need to make two changes to the original code: implement a calibration dataloader and make the necessary changes to *eval_func*. + +### Code update + +After the preparation steps are done, we just need to update main.py as below. +```python + if args.tune: + from neural_compressor.tensorflow import StaticQuantConfig, quantize_model + from neural_compressor.tensorflow.utils import BaseDataLoader + + dataset = CustomDataset() + calib_dataloader = BaseDataLoader(dataset=dataset, batch_size=1, collate_fn=collate_function) + quant_config = StaticQuantConfig() + q_model = quantize_model(args.input_graph, quant_config, calib_dataloader) + q_model.save(args.output_graph) + + if args.benchmark: + if args.mode == 'performance': + evaluate(args.input_graph) + elif args.mode == 'accuracy': + acc_result = evaluate(args.input_graph) + print("Batch size = %d" % args.batch_size) + print("Accuracy: %.5f" % acc_result) + +``` + +The quantize_model() function returns the quantized model, which is then saved to the specified output path. diff --git a/examples/3.x_api/tensorflow/graph_networks/graphsage/quantization/ptq/dataloader.py b/examples/3.x_api/tensorflow/graph_networks/graphsage/quantization/ptq/dataloader.py new file mode 100644 index 00000000000..e2a1d28d7d7 --- /dev/null +++ b/examples/3.x_api/tensorflow/graph_networks/graphsage/quantization/ptq/dataloader.py @@ -0,0 +1,80 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+# + +import numpy as np +import random +import json +import sys +import os + +import networkx as nx +from networkx.readwrite import json_graph + + +def load_data(prefix, normalize=True, load_walks=False): + G_data = json.load(open(prefix + "-G.json")) + G = json_graph.node_link_graph(G_data) + if isinstance(list(G.nodes())[0], int): + conversion = lambda n : int(n) + else: + conversion = lambda n : n + + if os.path.exists(prefix + "-feats.npy"): + feats = np.load(prefix + "-feats.npy") + else: + print("No features present.. Only identity features will be used.") + feats = None + id_map = json.load(open(prefix + "-id_map.json")) + id_map = {conversion(k):int(v) for k,v in id_map.items()} + walks = [] + class_map = json.load(open(prefix + "-class_map.json")) + if isinstance(list(class_map.values())[0], list): + lab_conversion = lambda n : n + else: + lab_conversion = lambda n : int(n) + + class_map = {conversion(k):lab_conversion(v) for k,v in class_map.items()} + + ## Remove all nodes that do not have val/test annotations + ## (necessary because of networkx weirdness with the Reddit data) + broken_count = 0 + for node in G.nodes(): + if not 'val' in G.nodes[node] or not 'test' in G.nodes[node]: + G.remove_node(node) + broken_count += 1 + print("Removed {:d} nodes that lacked proper annotations due to networkx versioning issues".format(broken_count)) + + ## Make sure the graph has edge train_removed annotations + ## (some datasets might already have this..) + print("Loaded data.. now preprocessing..") + for edge in G.edges(): + if (G.nodes[edge[0]]['val'] or G.nodes[edge[1]]['val'] or + G.nodes[edge[0]]['test'] or G.nodes[edge[1]]['test']): + G[edge[0]][edge[1]]['train_removed'] = True + else: + G[edge[0]][edge[1]]['train_removed'] = False + + if normalize and not feats is None: + from sklearn.preprocessing import StandardScaler + train_ids = np.array([id_map[n] for n in G.nodes() if not G.nodes[n]['val'] and not G.nodes[n]['test']]) + train_feats = feats[train_ids] + scaler = StandardScaler() + scaler.fit(train_feats) + feats = scaler.transform(feats) + + return G, feats, id_map, walks, class_map diff --git a/examples/3.x_api/tensorflow/graph_networks/graphsage/quantization/ptq/main.py b/examples/3.x_api/tensorflow/graph_networks/graphsage/quantization/ptq/main.py new file mode 100644 index 00000000000..87837510d3d --- /dev/null +++ b/examples/3.x_api/tensorflow/graph_networks/graphsage/quantization/ptq/main.py @@ -0,0 +1,189 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import os +import time +import utils +import dataloader +import numpy as np +import tensorflow as tf + +from tensorflow.python.platform import tf_logging +from tensorflow.core.protobuf import rewriter_config_pb2 + +from argparse import ArgumentParser + +np.random.seed(123) + +tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR) + +arg_parser = ArgumentParser(description='Parse args') +arg_parser.add_argument('-g', "--input-graph", + help='Specify the input graph for the transform tool', + dest='input_graph') +arg_parser.add_argument("--output-graph", + help='Specify tune result model save dir', + dest='output_graph') +arg_parser.add_argument('--benchmark', dest='benchmark', action='store_true', help='run benchmark') +arg_parser.add_argument('--mode', dest='mode', default='performance', help='benchmark mode') +arg_parser.add_argument('--tune', dest='tune', action='store_true', help='use neural_compressor to tune.') +arg_parser.add_argument('--dataset_location', dest='dataset_location', + help='location of calibration dataset and evaluate dataset') +arg_parser.add_argument('-e', "--num-inter-threads", + help='The number of inter-thread.', + dest='num_inter_threads', type=int, default=0) + +arg_parser.add_argument('-a', "--num-intra-threads", + help='The number of intra-thread.', + dest='num_intra_threads', type=int, default=0) +arg_parser.add_argument('--batch_size', type=int, default=1000, dest='batch_size', help='batch_size of benchmark') +arg_parser.add_argument('--iters', type=int, default=100, dest='iters', help='interations') +args = arg_parser.parse_args() + +def prepare_Dataset(): + data_location = args.dataset_location + pretrained_model = args.input_graph + data = dataloader.load_data(prefix=data_location+'/ppi') + G = data[0] + features = data[1] + id_map = data[2] + class_map = data[4] + if isinstance(list(class_map.values())[0], list): + num_classes = len(list(class_map.values())[0]) + else: + num_classes = len(set(class_map.values())) + + context_pairs = data[3] + placeholders = utils.construct_placeholders(num_classes) + minibatch = utils.NodeMinibatchIterator(G, + id_map, + placeholders, + class_map, + num_classes, + batch_size=args.batch_size, + context_pairs = context_pairs) + return minibatch + +class CustomDataset(object): + def __init__(self): + self.batch1 = [] + self.batch_labels = [] + minibatch = prepare_Dataset() + self.parse_minibatch(minibatch) + + def parse_minibatch(self, minibatch): + iter_num = 0 + finished = False + while not finished: + feed_dict_val, batch_labels, finished, _ = minibatch.incremental_node_val_feed_dict(args.batch_size, iter_num, test=True) + self.batch1.append(feed_dict_val['batch1:0']) + self.batch_labels.append(batch_labels) + iter_num += 1 + + def __getitem__(self, index): + return (self.batch1[index], len(self.batch1[index])), self.batch_labels[index] + + def __len__(self): + return len(self.batch1) + +def evaluate(model): + """Custom evaluate function to estimate the accuracy of the model. + + Args: + model (tf.Graph_def): The input model graph + + Returns: + accuracy (float): evaluation result, the larger is better. 
+ """ + from neural_compressor.tensorflow import Model + model = Model(model) + output_tensor = model.output_tensor if len(model.output_tensor)>1 else \ + model.output_tensor[0] + iteration = -1 + minibatch = prepare_Dataset() + if args.benchmark and args.mode == 'performance': + iteration = args.iters + + #output_tensor = model.sess.graph.get_tensor_by_name('Sigmoid:0') + def eval_func(size, output_tensor, minibatch, test): + t_test = time.time() + val_losses = [] + val_preds = [] + labels = [] + iter_num = 0 + finished = False + total_time = 0 + while not finished: + feed_dict_val, batch_labels, finished, _ = minibatch.incremental_node_val_feed_dict(size, iter_num, test=True) + tf_logging.warn('\n---> Start iteration {0}'.format(str(iter_num))) + start_time = time.time() + node_outs_val = model.sess.run([output_tensor],feed_dict=feed_dict_val) + time_consume = time.time() - start_time + val_preds.append(node_outs_val[0].astype(float)) + labels.append(batch_labels) + iter_num += 1 + total_time += time_consume + if iteration != -1 and iter_num >= iteration: + break + tf_logging.warn('\n---> Stop iteration {0}'.format(str(iter_num))) + val_preds = np.vstack(val_preds) + labels = np.vstack(labels) + f1_scores = utils.calc_f1(labels, val_preds) + time_average = total_time / iter_num + return f1_scores, (time.time() - t_test)/iter_num, time_average + + test_f1_micro, duration, time_average = eval_func(args.batch_size, output_tensor, minibatch, test=True) + if args.benchmark and args.mode == 'performance': + latency = time_average / args.batch_size + print("Batch size = {}".format(args.batch_size)) + print("Latency: {:.3f} ms".format(latency * 1000)) + print("Throughput: {:.3f} images/sec".format(1. / latency)) + return test_f1_micro + +def collate_function(batch): + return (batch[0][0][0], batch[0][0][1]), batch[0][1] + +class eval_graphsage_optimized_graph: + """Evaluate image classifier with optimized TensorFlow graph.""" + + def run(self): + """This is neural_compressor function include tuning, export and benchmark option.""" + from neural_compressor.common import set_random_seed + set_random_seed(9527) + + if args.tune: + from neural_compressor.tensorflow import StaticQuantConfig, quantize_model + from neural_compressor.tensorflow.utils import BaseDataLoader + + dataset = CustomDataset() + calib_dataloader = BaseDataLoader(dataset=dataset, batch_size=1, collate_fn=collate_function) + quant_config = StaticQuantConfig() + q_model = quantize_model(args.input_graph, quant_config, calib_dataloader) + q_model.save(args.output_graph) + + if args.benchmark: + if args.mode == 'performance': + evaluate(args.input_graph) + elif args.mode == 'accuracy': + acc_result = evaluate(args.input_graph) + print("Batch size = %d" % args.batch_size) + print("Accuracy: %.5f" % acc_result) + +if __name__ == "__main__": + evaluate_opt_graph = eval_graphsage_optimized_graph() + evaluate_opt_graph.run() diff --git a/examples/3.x_api/tensorflow/graph_networks/graphsage/quantization/ptq/requirements.txt b/examples/3.x_api/tensorflow/graph_networks/graphsage/quantization/ptq/requirements.txt new file mode 100644 index 00000000000..a6c2afe448c --- /dev/null +++ b/examples/3.x_api/tensorflow/graph_networks/graphsage/quantization/ptq/requirements.txt @@ -0,0 +1,2 @@ +networkx +scikit-learn \ No newline at end of file diff --git a/examples/3.x_api/tensorflow/graph_networks/graphsage/quantization/ptq/run_benchmark.sh b/examples/3.x_api/tensorflow/graph_networks/graphsage/quantization/ptq/run_benchmark.sh new file mode 100644 
index 00000000000..89c7cc19b6e --- /dev/null +++ b/examples/3.x_api/tensorflow/graph_networks/graphsage/quantization/ptq/run_benchmark.sh @@ -0,0 +1,51 @@ +#!/bin/bash +set -x + +function main { + + init_params "$@" + run_benchmark + +} + +# init params +function init_params { + batch_size=1000 + iters=100 + for var in "$@" + do + case $var in + --input_model=*) + input_model=$(echo $var |cut -f2 -d=) + ;; + --mode=*) + mode=$(echo $var |cut -f2 -d=) + ;; + --dataset_location=*) + dataset_location=$(echo "$var" |cut -f2 -d=) + ;; + --batch_size=*) + batch_size=$(echo $var |cut -f2 -d=) + ;; + --iters=*) + iters=$(echo $var |cut -f2 -d=) + ;; + esac + done + +} + + +# run_tuning +function run_benchmark { + + python main.py \ + --input-graph ${input_model} \ + --mode ${mode} \ + --dataset_location "${dataset_location}" \ + --batch_size ${batch_size} \ + --iters ${iters} \ + --benchmark +} + +main "$@" diff --git a/examples/3.x_api/tensorflow/graph_networks/graphsage/quantization/ptq/run_quant.sh b/examples/3.x_api/tensorflow/graph_networks/graphsage/quantization/ptq/run_quant.sh new file mode 100644 index 00000000000..f7046cc3df7 --- /dev/null +++ b/examples/3.x_api/tensorflow/graph_networks/graphsage/quantization/ptq/run_quant.sh @@ -0,0 +1,41 @@ +#!/bin/bash +set -x + +function main { + + init_params "$@" + + run_tuning + +} + +# init params +function init_params { + + for var in "$@" + do + case $var in + --input_model=*) + input_model=$(echo "$var" |cut -f2 -d=) + ;; + --output_model=*) + output_model=$(echo "$var" |cut -f2 -d=) + ;; + --dataset_location=*) + dataset_location=$(echo "$var" |cut -f2 -d=) + ;; + esac + done + +} + +# run_tuning +function run_tuning { + python main.py \ + --input-graph "${input_model}" \ + --output-graph "${output_model}" \ + --dataset_location "${dataset_location}" \ + --tune +} + +main "$@" diff --git a/examples/3.x_api/tensorflow/graph_networks/graphsage/quantization/ptq/utils.py b/examples/3.x_api/tensorflow/graph_networks/graphsage/quantization/ptq/utils.py new file mode 100644 index 00000000000..babe7146f5c --- /dev/null +++ b/examples/3.x_api/tensorflow/graph_networks/graphsage/quantization/ptq/utils.py @@ -0,0 +1,112 @@ +#!/usr/bin/env bash +# +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import numpy as np +import random +import json +import sys +import os +import json +import networkx as nx +from networkx.readwrite import json_graph +import tensorflow as tf +from sklearn import metrics + +def calc_f1(y_true, y_pred): + y_pred[y_pred > 0.5] = 1 + y_pred[y_pred <= 0.5] = 0 + return metrics.f1_score(y_true, y_pred, average="micro") + +def construct_placeholders(num_classes): + # Define placeholders + tf.compat.v1.disable_eager_execution() + placeholders = { + 'labels' : tf.compat.v1.placeholder(tf.float32, shape=(None, num_classes), name='labels'), + 'batch' : tf.compat.v1.placeholder(tf.int32, shape=(None), name='batch1'), + 'batch_size' : tf.compat.v1.placeholder(tf.int32, name='batch_size'), + } + return placeholders + + +class NodeMinibatchIterator(object): + + """ + This minibatch iterator iterates over nodes for supervised learning. + + G -- networkx graph + id2idx -- dict mapping node ids to integer values indexing feature tensor + placeholders -- standard tensorflow placeholders object for feeding + label_map -- map from node ids to class values (integer or list) + num_classes -- number of output classes + batch_size -- size of the minibatches + max_degree -- maximum size of the downsampled adjacency lists + """ + # (G, + # id_map, + # placeholders, + # class_map, + # num_classes, + # batch_size=FLAGS.batch_size, + # max_degree=FLAGS.max_degree, + # context_pairs = context_pairs) + def __init__(self, G, id2idx, + placeholders, label_map, num_classes, + batch_size=100, max_degree=25, + **kwargs): + + self.G = G + self.nodes = G.nodes() + self.id2idx = id2idx + self.placeholders = placeholders + self.batch_size = batch_size + self.max_degree = max_degree + self.batch_num = 0 + self.label_map = label_map + self.num_classes = num_classes + self.test_nodes = [n for n in self.G.nodes() if self.G.nodes[n]['test']] + + def _make_label_vec(self, node): + label = self.label_map[node] + if isinstance(label, list): + label_vec = np.array(label) + else: + label_vec = np.zeros((self.num_classes)) + class_ind = self.label_map[node] + label_vec[class_ind] = 1 + return label_vec + def batch_feed_dict(self, batch_nodes, val=False): + batch1id = batch_nodes + batch1 = [self.id2idx[n] for n in batch1id] + + labels = np.vstack([self._make_label_vec(node) for node in batch1id]) + feed_dict = dict() + feed_dict.update({'batch1:0': batch1}) + feed_dict.update({'batch_size:0' : len(batch1)}) + return feed_dict, labels + + + def incremental_node_val_feed_dict(self, size, iter_num, test=False): + if test: + val_nodes = self.test_nodes + else: + val_nodes = self.val_nodes + val_node_subset = val_nodes[iter_num*size:min((iter_num+1)*size, + len(val_nodes))] + + # add a dummy neighbor + ret_val = self.batch_feed_dict(val_node_subset) + return ret_val[0], ret_val[1], (iter_num+1)*size >= len(val_nodes), val_node_subset diff --git a/examples/3.x_api/tensorflow/image_recognition/prepare_dataset.sh b/examples/3.x_api/tensorflow/image_recognition/prepare_dataset.sh new file mode 100644 index 00000000000..4aad5d69a3f --- /dev/null +++ b/examples/3.x_api/tensorflow/image_recognition/prepare_dataset.sh @@ -0,0 +1,71 @@ +#!/bin/bash +# set -x + +OUTPUT_DIR="./data" +SUBSET="validation" +SHARDS=1 + +help() +{ + cat <<- EOF + Desc: Convert prepared raw imagnet dataset to tfrecord + -h --help help info + --output_dir Output data directory + default: './data' + --raw_dir Raw data directory + --shards Number of shards in TFRecord files. 
+ default: '1' + --subset Subset of imagenet, can be validation/train. + default: 'validation' +EOF + exit 0 +} + +function main { + init_params "$@" + convert_dataset +} + +# init params +function init_params { + for var in "$@" + do + case $var in + --output_dir=*) + OUTPUT_DIR=$(echo $var |cut -f2 -d=) + ;; + --raw_dir=*) + RAW_DIR=$(echo $var |cut -f2 -d=) + ;; + --shards=*) + SHARDS=$(echo $var |cut -f2 -d=) + ;; + --subset=*) + SUBSET=$(echo $var |cut -f2 -d=) + ;; + -h|--help) help + ;; + *) + echo "Error: No such parameter: ${var}" + exit 1 + ;; + esac + done +} + +# convert dataset +function convert_dataset { + if [ ! -d ${OUTPUT_DIR} ]; then + mkdir ${OUTPUT_DIR} + fi + python imagenet_prepare/build_imagenet_data.py \ + --imagenet_metadata_file "imagenet_prepare/imagenet_metadata.txt" \ + --labels_file "imagenet_prepare/imagenet_lsvrc_2015_synsets.txt" \ + --output_directory ${OUTPUT_DIR} \ + --subset ${SUBSET} \ + --raw_directory ${RAW_DIR} \ + --shards ${SHARDS} +} + +main "$@" + diff --git a/examples/3.x_api/tensorflow/image_recognition/vision_transformer/quantization/ptq/README.md b/examples/3.x_api/tensorflow/image_recognition/vision_transformer/quantization/ptq/README.md new file mode 100644 index 00000000000..7dcf3e7a363 --- /dev/null +++ b/examples/3.x_api/tensorflow/image_recognition/vision_transformer/quantization/ptq/README.md @@ -0,0 +1,79 @@ +Step-by-Step +============ + +This document list steps of reproducing Vision Transformer model tuning results via Neural Compressor. + +# Prerequisite + +## 1. Environment + +### Install Dependency Package + +``` +pip install -r requirements.txt +``` + +### Install Intel Extension for Tensorflow +#### Quantizing the model on Intel GPU(Mandatory to install ITEX) +Intel Extension for Tensorflow is mandatory to be installed for quantizing the model on Intel GPUs. + +```shell +pip install --upgrade intel-extension-for-tensorflow[xpu] +``` +For any more details, please follow the procedure in [install-gpu-drivers](https://github.com/intel/intel-extension-for-tensorflow/blob/main/docs/install/install_for_xpu.md#install-gpu-drivers) + +#### Quantizing the model on Intel CPU(Optional to install ITEX) +Intel Extension for Tensorflow for Intel CPUs is experimental currently. It's not mandatory for quantizing the model on Intel CPUs. + +```shell +pip install --upgrade intel-extension-for-tensorflow[cpu] +``` +> **Note**: +> The version compatibility of stock Tensorflow and ITEX can be checked [here](https://github.com/intel/intel-extension-for-tensorflow#compatibility-table). Please make sure you have installed compatible Tensorflow and ITEX. + +## 2. Prepare Pretrained model + +``` +wget https://storage.googleapis.com/intel-optimized-tensorflow/models/2_11_0/HF-ViT-Base16-Img224-frozen.pb +``` + +## 3. Prepare Dataset + + TensorFlow [models](https://github.com/tensorflow/models) repo provides [scripts and instructions](https://github.com/tensorflow/models/tree/master/research/slim#an-automated-script-for-processing-imagenet-data) to download, process and convert the ImageNet dataset to the TF records format. + We also prepared related scripts in ` examples/tensorflow/image_recognition/tensorflow_models/imagenet_prepare` directory. To download the raw images, the user must create an account with image-net.org. If you have downloaded the raw data and preprocessed the validation data by moving the images into the appropriate sub-directory based on the label (synset) of the image. we can use below command ro convert it to tf records format. 
+ + ```shell + cd examples/3.x_api/tensorflow/image_recognition/tensorflow_models/ + # convert validation subset + bash prepare_dataset.sh --output_dir=./vision_transformer/quantization/ptq/data --raw_dir=/PATH/TO/img_raw/val/ --subset=validation + # convert train subset + bash prepare_dataset.sh --output_dir=./vision_transformer/quantization/ptq/data --raw_dir=/PATH/TO/img_raw/train/ --subset=train + ``` +> **Note**: +> The raw ImageNet dataset, which resides in JPEG files, should be organized in the following directory structure. Taking the validation set as an example:
+>         /PATH/TO/img_raw/val/n01440764/ILSVRC2012_val_00000293.JPEG
+>         /PATH/TO/img_raw/val/n01440764/ILSVRC2012_val_00000543.JPEG
+> where 'n01440764' is the unique synset label associated with these images. + +# Run + +## 1. Quantization + +```shell +bash run_quant.sh --input_model= --output_model=./output --dataset_location= +``` + + +## 2. Benchmark + +### Benchmark the fp32 model + +```shell +bash run_benchmark.sh --input_model= --mode=accuracy --dataset_location= --batch_size=32 +``` + +### Benchmark the int8 model + +```shell +bash run_benchmark.sh --input_model=./output.pb --mode=accuracy --dataset_location= --batch_size=32 --int8=true +``` \ No newline at end of file diff --git a/examples/3.x_api/tensorflow/image_recognition/vision_transformer/quantization/ptq/__init__.py b/examples/3.x_api/tensorflow/image_recognition/vision_transformer/quantization/ptq/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/examples/3.x_api/tensorflow/image_recognition/vision_transformer/quantization/ptq/data_process.py b/examples/3.x_api/tensorflow/image_recognition/vision_transformer/quantization/ptq/data_process.py new file mode 100644 index 00000000000..8d28e4a3e17 --- /dev/null +++ b/examples/3.x_api/tensorflow/image_recognition/vision_transformer/quantization/ptq/data_process.py @@ -0,0 +1,576 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import os +import collections + +import numpy as np +import tensorflow as tf + +from abc import abstractmethod +from neural_compressor.common import logger +from neural_compressor.tensorflow.utils.data import default_collate + +class ParseDecodeImagenet: + """Parse features in Example proto. + + Returns: + tuple of parsed image and label + """ + + def __call__(self, sample): + """Parse features in example.""" + # Dense features in Example proto. + feature_map = { + "image/encoded": tf.io.FixedLenFeature([], dtype=tf.string, default_value=""), + "image/class/label": tf.io.FixedLenFeature([1], dtype=tf.int64, default_value=-1), + } + + sparse_float32 = tf.io.VarLenFeature(dtype=tf.float32) + # Sparse features in Example proto. + feature_map.update( + { + k: sparse_float32 + for k in [ + "image/object/bbox/xmin", + "image/object/bbox/ymin", + "image/object/bbox/xmax", + "image/object/bbox/ymax", + ] + } + ) + + features = tf.io.parse_single_example(serialized=sample, features=feature_map) + label = tf.cast(features["image/class/label"], dtype=tf.int32) + image = features["image/encoded"] + image = tf.image.decode_jpeg(image, channels=3, fancy_upscaling=False, dct_method="INTEGER_FAST") + return (image, label) + + +class ResizeCropImagenet(object): + """Combination of a series of transforms which is applicable to images in Imagenet. 
+ + Args: + height (int): Height of the result + width (int): Width of the result + random_crop (bool, default=False): whether to random crop + resize_side (int, default=256):desired shape after resize operation + random_flip_left_right (bool, default=False): whether to random flip left and right + mean_value (list, default=[0.0,0.0,0.0]):means for each channel + scale (float, default=1.0):std value + + Returns: + tuple of processed image and label + """ + + def __init__( + self, + height, + width, + random_crop=False, + resize_side=256, + resize_method="bilinear", + random_flip_left_right=False, + mean_value=[0.0, 0.0, 0.0], + scale=1.0, + data_format="channels_last", + subpixels="RGB", + ): + """Initialize `TensorflowResizeCropImagenetTransform` class.""" + self.height = height + self.width = width + self.mean_value = mean_value + self.scale = scale + self.random_crop = random_crop + self.random_flip_left_right = random_flip_left_right + self.resize_side = resize_side + self.resize_method = resize_method + self.data_format = data_format + self.subpixels = subpixels + + # sample is (images, labels) + def __call__(self, sample): + """Convert `TensorflowResizeCropImagenetTransform` feature.""" + image, label = sample + shape = tf.shape(input=image) + + height = ( + tf.cast(shape[0], dtype=tf.float32) + if self.data_format == "channels_last" + else tf.cast(shape[1], dtype=tf.float32) + ) + width = ( + tf.cast(shape[1], dtype=tf.float32) + if self.data_format == "channels_last" + else tf.cast(shape[2], dtype=tf.float32) + ) + scale = tf.cond( + pred=tf.greater(height, width), + true_fn=lambda: self.resize_side / width, + false_fn=lambda: self.resize_side / height, + ) + + scale = tf.cast(scale, dtype=tf.float32) + new_height = tf.cast(tf.math.rint(height * scale), dtype=tf.int32) + new_width = tf.cast(tf.math.rint(width * scale), dtype=tf.int32) + + if self.subpixels == "BGR" and self.data_format == "channels_first": + # 'RGB'->'BGR' + image = tf.cond( + tf.equal(tf.rank(image), 3), + lambda: tf.experimental.numpy.moveaxis(image[::-1, ...], 0, -1), + lambda: tf.experimental.numpy.moveaxis(image[:, ::-1, ...], 1, -1), + ) + elif self.subpixels == "BGR": + # 'RGB'->'BGR' + image = image[..., ::-1] + image = tf.expand_dims(image, 0) + image = tf.image.resize(image, [new_height, new_width], method=self.resize_method) + image = tf.squeeze(image) + shape = tf.shape(input=image) + if self.random_crop: + y0 = tf.random.uniform(shape=[], minval=0, maxval=(shape[0] - self.height + 1), dtype=tf.dtypes.int32) + x0 = tf.random.uniform(shape=[], minval=0, maxval=(shape[1] - self.width + 1), dtype=tf.dtypes.int32) + else: + y0 = (shape[0] - self.height) // 2 + x0 = (shape[1] - self.width) // 2 + + image = tf.image.crop_to_bounding_box(image, y0, x0, self.height, self.width) + image.set_shape([self.height, self.width, 3]) + if self.random_flip_left_right: + image = tf.image.random_flip_left_right(image) + means = tf.broadcast_to(self.mean_value, tf.shape(input=image)) + image = (image - means) * self.scale + return (image, label) + + +class TransposeLastChannel(object): + """Transpose NHWC to NCHW. + + Returns: + tuple of processed image and label + """ + + def __call__(self, sample): + image, label = sample + image = tf.transpose(image, perm=[2, 0, 1]) + return (image, label) + + +class ComposeTransform(object): + """Composes several transforms together. 
+ + Args: + transform_list (list of Transform objects): list of transforms to compose + + Returns: + sample (tuple): tuple of processed image and label + """ + + def __init__(self, transform_list): + """Initialize `ComposeTransform` class.""" + self.transform_list = transform_list + + def __call__(self, sample): + """Call transforms in transform_list.""" + for transform in self.transform_list: + sample = transform(sample) + return sample + + +class ShiftRescale(object): + """Label shift by 1 and rescale. + + Returns: + tuple of processed image and label + """ + + def __call__(self, sample): + image, label = sample + label -= 1 + image = (image - 127.5) / 127.5 + return (image, label) + + +class ImageRecordDataset(object): + """Tensorflow imageNet database in tf record format. + + Please arrange data in this way: + root/validation-000-of-100 + root/validation-001-of-100 + ... + root/validation-099-of-100 + The file name needs to follow this pattern: '* - * -of- *' + + Args: root (str): Root directory of dataset. + transform (transform object, default=None): transform to process input data. + filter (Filter objects, default=None): filter out examples according + to specific conditions. + """ + + """Configuration for Imagenet dataset.""" + + def __new__(cls, root, transform=None, filter=None): + """Build a new object of TensorflowImageRecord class.""" + from tensorflow.python.platform import gfile # pylint: disable=no-name-in-module + + glob_pattern = os.path.join(root, "*-*-of-*") + file_names = gfile.Glob(glob_pattern) + if not file_names: + raise ValueError("Found no files in --root matching: {}".format(glob_pattern)) + + # pylint: disable=no-name-in-module + from tensorflow.python.data.experimental import parallel_interleave + + ds = tf.data.TFRecordDataset.list_files(file_names, shuffle=False) + ds = ds.apply(parallel_interleave(tf.data.TFRecordDataset, cycle_length=len(file_names))) + + if transform is not None: + transform.transform_list.insert(0, ParseDecodeImagenet()) + else: + transform = ParseDecodeImagenet() + ds = ds.map(transform, num_parallel_calls=None) + ds = ds.prefetch(buffer_size=tf.data.experimental.AUTOTUNE) # this number can be tuned + return ds + + +class BaseMetric(object): + """The base class of Metric.""" + + def __init__(self, metric, single_output=False, hvd=None): + """Initialize the basic metric. + + Args: + metric: The metric class. + single_output: Whether the output is single or not, defaults to False. + hvd: The Horovod class for distributed training, defaults to None. + """ + self._metric_cls = metric + self._single_output = single_output + self._hvd = hvd + + def __call__(self, *args, **kwargs): + """Evaluate the model predictions, and the reference. + + Returns: + The class itself. + """ + self._metric = self._metric_cls(*args, **kwargs) + return self + + @abstractmethod + def update(self, preds, labels=None, sample_weight=None): + """Update the state that need to be evaluated. + + Args: + preds: The prediction result. + labels: The reference. Defaults to None. + sample_weight: The sampling weight. Defaults to None. + + Raises: + NotImplementedError: The method should be implemented by subclass. + """ + raise NotImplementedError + + @abstractmethod + def reset(self): + """Clear the predictions and labels. + + Raises: + NotImplementedError: The method should be implemented by subclass. + """ + raise NotImplementedError + + @abstractmethod + def result(self): + """Evaluate the difference between predictions and labels. 
+ + Raises: + NotImplementedError: The method should be implemented by subclass. + """ + raise NotImplementedError + + @property + def metric(self): + """Return its metric class. + + Returns: + The metric class. + """ + return self._metric_cls + + @property + def hvd(self): + """Return its hvd class. + + Returns: + The hvd class. + """ + return self._hvd + + @hvd.setter + def hvd(self, hvd): + """Set its hvd. + + Args: + hvd: The Horovod class for distributed training. + """ + self._hvd = hvd + + +class TopKMetric(BaseMetric): + """Compute Top-k Accuracy classification score for Tensorflow model. + + This metric computes the number of times where the correct label is among + the top k labels predicted. + + Attributes: + k (int): The number of most likely outcomes considered to find the correct label. + num_correct: The number of predictions that were correct classified. + num_sample: The total number of predictions. + """ + + def __init__(self, k=1): + """Initialize the k, number of samples and correct predictions. + + Args: + k: The number of most likely outcomes considered to find the correct label. + """ + self.k = k + self.num_correct = 0 + self.num_sample = 0 + + def update(self, preds, labels, sample_weight=None): + """Add the predictions and labels. + + Args: + preds: The predictions. + labels: The labels corresponding to the predictions. + sample_weight: The sample weight. + """ + preds, labels = TopKMetric._topk_shape_validate(preds, labels) + + labels = labels.reshape([len(labels)]) + with tf.Graph().as_default() as acc_graph: + topk = tf.nn.in_top_k( + predictions=tf.constant(preds, dtype=tf.float32), targets=tf.constant(labels, dtype=tf.int32), k=self.k + ) + fp32_topk = tf.cast(topk, tf.float32) + correct_tensor = tf.reduce_sum(input_tensor=fp32_topk) + + with tf.compat.v1.Session() as acc_sess: + correct = acc_sess.run(correct_tensor) + + self.num_sample += len(labels) + self.num_correct += correct + + def reset(self): + """Reset the number of samples and correct predictions.""" + self.num_correct = 0 + self.num_sample = 0 + + def result(self): + """Compute the top-k score. + + Returns: + The top-k score. 
+ """ + if self.num_sample == 0: + logger.warning("Sample num during evaluation is 0.") + return 0 + elif getattr(self, "_hvd", None) is not None: # pragma: no cover + allgather_num_correct = sum(self._hvd.allgather_object(self.num_correct)) + allgather_num_sample = sum(self._hvd.allgather_object(self.num_sample)) + return allgather_num_correct / allgather_num_sample + return self.num_correct / self.num_sample + + @staticmethod + def _topk_shape_validate(preds, labels): + # preds shape can be Nxclass_num or class_num(N=1 by default) + # it's more suitable for 'Accuracy' with preds shape Nx1(or 1) output from argmax + if isinstance(preds, int): + preds = [preds] + preds = np.array(preds) + elif isinstance(preds, np.ndarray): + preds = np.array(preds) + elif isinstance(preds, list): + preds = np.array(preds) + preds = preds.reshape((-1, preds.shape[-1])) + + # consider labels just int value 1x1 + if isinstance(labels, int): + labels = [labels] + labels = np.array(labels) + elif isinstance(labels, tuple): + labels = np.array([labels]) + labels = labels.reshape((labels.shape[-1], -1)) + elif isinstance(labels, list): + if isinstance(labels[0], int): + labels = np.array(labels) + labels = labels.reshape((labels.shape[0], 1)) + elif isinstance(labels[0], tuple): + labels = np.array(labels) + labels = labels.reshape((labels.shape[-1], -1)) + else: + labels = np.array(labels) + # labels most have 2 axis, 2 cases: N(or Nx1 sparse) or Nxclass_num(one-hot) + # only support 2 dimension one-shot labels + # or 1 dimension one-hot class_num will confuse with N + + if len(preds.shape) == 1: + N = 1 + class_num = preds.shape[0] + preds = preds.reshape([-1, class_num]) + elif len(preds.shape) >= 2: + N = preds.shape[0] + preds = preds.reshape([N, -1]) + class_num = preds.shape[1] + + label_N = labels.shape[0] + assert label_N == N, "labels batch size should same with preds" + labels = labels.reshape([N, -1]) + # one-hot labels will have 2 dimension not equal 1 + if labels.shape[1] != 1: + labels = labels.argsort()[..., -1:] + return preds, labels + + +class TFDataLoader(object): # pragma: no cover + """Tensorflow dataloader class. + + In tensorflow1.x dataloader is coupled with the graph, but it also support feed_dict + method to do session run, this dataloader is designed to satisfy the usage of feed dict + in tf1.x. Although it's a general dataloader and can be used in MXNet and PyTorch. + + Args: + dataset: obj. wrapper of needed data. + batch_size: int. 
batch size + """ + + def __init__(self, dataset, batch_size=1, last_batch="rollover"): + """Initialize `TFDataDataLoader` class.""" + self.dataset = dataset + self.last_batch = last_batch + self.batch_size = batch_size + dataset = dataset.batch(batch_size) + + def batch(self, batch_size, last_batch="rollover"): + """Dataset return data per batch.""" + drop_last = False if last_batch == "rollover" else True + self.batch_size = batch_size + self.dataset = self.dataset.batch(batch_size, drop_last) + + def __iter__(self): + """Iterate dataloader.""" + return self._generate_dataloader( + self.dataset, + batch_size=self.batch_size, + last_batch=self.last_batch, + ) + + def _generate_dataloader( + self, + dataset, + batch_size=1, + last_batch="rollover", + collate_fn=None, + sampler=None, + batch_sampler=None, + num_workers=None, + pin_memory=None, + distributed=False, + ): + """Yield data.""" + drop_last = False if last_batch == "rollover" else True + + def check_dynamic_shape(element_spec): + if isinstance(element_spec, collections.abc.Sequence): + return any([check_dynamic_shape(ele) for ele in element_spec]) + elif isinstance(element_spec, tf.TensorSpec): + return True if element_spec.shape.num_elements() is None else False + else: + raise ValueError("unrecognized element spec...") + + def squeeze_output(output): + if isinstance(output, collections.abc.Sequence): + return [squeeze_output(ele) for ele in output] + elif isinstance(output, np.ndarray): + return np.squeeze(output, axis=0) + else: + raise ValueError("not supported output format....") + + if tf.executing_eagerly(): + index = 0 + outputs = [] + for iter_tensors in dataset: + samples = [] + iter_inputs, iter_labels = iter_tensors[0], iter_tensors[1] + if isinstance(iter_inputs, tf.Tensor): + samples.append(iter_inputs.numpy()) + else: + samples.append(tuple(iter_input.numpy() for iter_input in iter_inputs)) + if isinstance(iter_labels, tf.Tensor): + samples.append(iter_labels.numpy()) + else: + samples.append([np.array(l) for l in iter_labels]) + index += 1 + outputs.append(samples) + if index == batch_size: + outputs = default_collate(outputs) + yield outputs + outputs = [] + index = 0 + if len(outputs) > 0: + outputs = default_collate(outputs) + yield outputs + else: + try_single_batch = check_dynamic_shape(dataset.element_spec) + dataset = dataset.batch(1 if try_single_batch else batch_size, drop_last) + ds_iterator = tf.compat.v1.data.make_one_shot_iterator(dataset) + iter_tensors = ds_iterator.get_next() + data_config = tf.compat.v1.ConfigProto() + data_config.use_per_session_threads = 1 + data_config.intra_op_parallelism_threads = 1 + data_config.inter_op_parallelism_threads = 16 + data_sess = tf.compat.v1.Session(config=data_config) + # pylint: disable=no-name-in-module + from tensorflow.python.framework.errors_impl import OutOfRangeError + + while True: + if not try_single_batch: + try: + outputs = data_sess.run(iter_tensors) + yield outputs + except OutOfRangeError: + data_sess.close() + return + else: + try: + outputs = [] + for i in range(0, batch_size): + outputs.append(squeeze_output(data_sess.run(iter_tensors))) + outputs = default_collate(outputs) + yield outputs + except OutOfRangeError: + if len(outputs) == 0: + data_sess.close() + return + else: + outputs = default_collate(outputs) + yield outputs + data_sess.close() + return diff --git a/examples/3.x_api/tensorflow/image_recognition/vision_transformer/quantization/ptq/main.py 
b/examples/3.x_api/tensorflow/image_recognition/vision_transformer/quantization/ptq/main.py new file mode 100644 index 00000000000..49b4771c61a --- /dev/null +++ b/examples/3.x_api/tensorflow/image_recognition/vision_transformer/quantization/ptq/main.py @@ -0,0 +1,187 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import time + +import numpy as np +import tensorflow as tf + +from tensorflow.python.tools.optimize_for_inference_lib import optimize_for_inference +from tensorflow.python.framework import dtypes +from tensorflow.core.protobuf import saved_model_pb2 + +from argparse import ArgumentParser +from data_process import ( + ImageRecordDataset, + ComposeTransform, + ResizeCropImagenet, + TransposeLastChannel, + ShiftRescale, + TFDataLoader, +) + + +INPUTS = 'inputs' +OUTPUTS = 'Identity' + +RESNET_IMAGE_SIZE = 224 +IMAGENET_VALIDATION_IMAGES = 50000 + +tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR) + +arg_parser = ArgumentParser(description='Parse args') +arg_parser.add_argument('-g', "--input-graph", + help='Specify the input graph for the transform tool', + dest='input_graph') +arg_parser.add_argument("--output-graph", + help='Specify tune result model save dir', + dest='output_graph') +arg_parser.add_argument('--benchmark', dest='benchmark', action='store_true', help='run benchmark') +arg_parser.add_argument('--mode', dest='mode', default='performance', help='benchmark mode') +arg_parser.add_argument('--tune', dest='tune', action='store_true', help='use neural_compressor to tune.') +arg_parser.add_argument('--diagnose', dest='diagnose', action='store_true', help='use Neural Insights to diagnose tuning and benchmark.') +arg_parser.add_argument('--dataset_location', dest='dataset_location', + help='location of calibration dataset and evaluate dataset') +arg_parser.add_argument('--batch_size', type=int, default=32, dest='batch_size', help='batch_size of benchmark') +arg_parser.add_argument('--iters', type=int, default=100, dest='iters', help='interations') +arg_parser.add_argument('--int8', dest='int8', action='store_true', help='whether to use int8 model for benchmark') +args = arg_parser.parse_args() + +def evaluate(model, eval_dataloader, preprocess=None): + """Custom evaluate function to estimate the accuracy of the model. + + Args: + model (tf.Graph_def): The input model graph + + Returns: + accuracy (float): evaluation result, the larger is better. 
+ """ + from data_process import TopKMetric + from neural_compressor.tensorflow import Model + model = Model(model) + input_tensor = model.input_tensor + output_tensor = model.output_tensor if len(model.output_tensor)>1 else \ + model.output_tensor[0] + iteration = -1 + metric = TopKMetric() + if args.benchmark and args.mode == 'performance': + iteration = args.iters + + def eval_func(dataloader): + latency_list = [] + for idx, (inputs, labels) in enumerate(dataloader): + # shift the label and rescale the inputs + inputs, labels = preprocess((inputs, labels)) + # dataloader should keep the order and len of inputs same with input_tensor + inputs = np.array([inputs]) + feed_dict = dict(zip(input_tensor, inputs)) + + start = time.time() + predictions = model.sess.run(output_tensor, feed_dict) + end = time.time() + + if isinstance(predictions, list): + if len(model.output_tensor_names) == 1: + predictions = predictions[0] + elif len(model.output_tensor_names) > 1: + predictions = predictions[1] + metric.update(predictions, labels) + latency_list.append(end-start) + if idx + 1 == iteration: + break + latency = np.array(latency_list).mean() / args.batch_size + return latency + + latency = eval_func(eval_dataloader) + if args.benchmark and args.mode == 'performance': + print("Batch size = {}".format(args.batch_size)) + print("Latency: {:.3f} ms".format(latency * 1000)) + print("Throughput: {:.3f} images/sec".format(1. / latency)) + acc = metric.result() + return acc + +class eval_classifier_optimized_graph: + """Evaluate image classifier with optimized TensorFlow graph.""" + + def run(self): + """This is neural_compressor function include tuning, export and benchmark option.""" + from neural_compressor.common import set_random_seed + set_random_seed(9527) + + if args.tune: + from neural_compressor.tensorflow import StaticQuantConfig, quantize_model + + dataset = ImageRecordDataset( + root=args.dataset_location, + transform=ComposeTransform(transform_list= [ + ResizeCropImagenet(height=224, width=224), + TransposeLastChannel(), + ] + ) + ) + calib_dataloader = TFDataLoader(dataset=dataset, batch_size=10) + + quant_config = StaticQuantConfig() + matmul_config = StaticQuantConfig(weight_dtype="fp32", act_dtype="fp32") + conv_config = StaticQuantConfig(weight_dtype="fp32", act_dtype="fp32") + quant_config.set_local("StatefulPartitionedCall/vit/encoder/layer_._9/output/dense/Tensordot/MatMul", matmul_config) + quant_config.set_local("Conv2D", conv_config) + + sm = saved_model_pb2.SavedModel() + with tf.io.gfile.GFile(args.input_graph, "rb") as f: + sm.ParseFromString(f.read()) + graph_def = sm.meta_graphs[0].graph_def + postprocess = ShiftRescale() + + q_model = quantize_model(graph_def, quant_config, calib_dataloader) + q_model.save(args.output_graph) + + if args.benchmark: + dataset = ImageRecordDataset( + root=args.dataset_location, + transform=ComposeTransform(transform_list= [ + ResizeCropImagenet(height=224, width=224), + TransposeLastChannel(), + ] + ) + ) + dataloader = TFDataLoader(dataset=dataset, batch_size=args.batch_size) + + if args.int8 or args.input_graph.endswith("-tune.pb"): + input_graph = args.input_graph + else: + sm = saved_model_pb2.SavedModel() + with tf.io.gfile.GFile(args.input_graph, "rb") as f: + sm.ParseFromString(f.read()) + graph_def = sm.meta_graphs[0].graph_def + input_graph = graph_def + + def eval(model): + preprocess = ShiftRescale() + return evaluate(model, dataloader, preprocess) + + if args.mode == 'performance': + eval(input_graph) + elif args.mode == 
'accuracy': + acc_result = eval(input_graph) + print("Batch size = %d" % dataloader.batch_size) + print("Accuracy: %.5f" % acc_result) + +if __name__ == "__main__": + evaluate_opt_graph = eval_classifier_optimized_graph() + evaluate_opt_graph.run() \ No newline at end of file diff --git a/examples/3.x_api/tensorflow/image_recognition/vision_transformer/quantization/ptq/requirements.txt b/examples/3.x_api/tensorflow/image_recognition/vision_transformer/quantization/ptq/requirements.txt new file mode 100644 index 00000000000..c8d21e74265 --- /dev/null +++ b/examples/3.x_api/tensorflow/image_recognition/vision_transformer/quantization/ptq/requirements.txt @@ -0,0 +1,2 @@ +tensorflow==2.11.0 +neural-compressor \ No newline at end of file diff --git a/examples/3.x_api/tensorflow/image_recognition/vision_transformer/quantization/ptq/run_benchmark.sh b/examples/3.x_api/tensorflow/image_recognition/vision_transformer/quantization/ptq/run_benchmark.sh new file mode 100644 index 00000000000..2348865d66e --- /dev/null +++ b/examples/3.x_api/tensorflow/image_recognition/vision_transformer/quantization/ptq/run_benchmark.sh @@ -0,0 +1,57 @@ +#!/bin/bash +set -x + +function main { + + init_params "$@" + run_benchmark + +} + +# init params +function init_params { + batch_size=32 + iters=100 + + for var in "$@" + do + case $var in + --input_model=*) + input_model=$(echo $var |cut -f2 -d=) + ;; + --mode=*) + mode=$(echo $var |cut -f2 -d=) + ;; + --dataset_location=*) + dataset_location=$(echo $var |cut -f2 -d=) + ;; + --batch_size=*) + batch_size=$(echo $var |cut -f2 -d=) + ;; + --iters=*) + iters=$(echo $var |cut -f2 -d=) + ;; + --int8=*) + int8=$(echo $var |cut -f2 -d=) + ;; + esac + done + +} + +# run_tuning +function run_benchmark { + if [[ ${int8} == "true" ]]; then + extra_cmd=$extra_cmd" --int8" + fi + python main.py \ + --input-graph ${input_model} \ + --mode ${mode} \ + --dataset_location ${dataset_location} \ + --batch_size ${batch_size} \ + --benchmark \ + --iters ${iters} \ + ${extra_cmd} +} + +main "$@" diff --git a/examples/3.x_api/tensorflow/image_recognition/vision_transformer/quantization/ptq/run_quant.sh b/examples/3.x_api/tensorflow/image_recognition/vision_transformer/quantization/ptq/run_quant.sh new file mode 100644 index 00000000000..6a9e1b859c9 --- /dev/null +++ b/examples/3.x_api/tensorflow/image_recognition/vision_transformer/quantization/ptq/run_quant.sh @@ -0,0 +1,39 @@ +#!/bin/bash +set -x + +function main { + init_params "$@" + run_tuning + +} + +# init params +function init_params { + + for var in "$@" + do + case $var in + --input_model=*) + input_model=$(echo $var |cut -f2 -d=) + ;; + --output_model=*) + output_model=$(echo $var |cut -f2 -d=) + ;; + --dataset_location=*) + dataset_location=$(echo $var |cut -f2 -d=) + ;; + esac + done + +} + +# run_tuning +function run_tuning { + python main.py \ + --input-graph ${input_model} \ + --output-graph ${output_model} \ + --dataset_location ${dataset_location} \ + --tune +} + +main "$@" diff --git a/examples/3.x_api/tensorflow/nlp/bert_large_squad_model_zoo/quantization/ptq/README.md b/examples/3.x_api/tensorflow/nlp/bert_large_squad_model_zoo/quantization/ptq/README.md new file mode 100644 index 00000000000..f82b696bd07 --- /dev/null +++ b/examples/3.x_api/tensorflow/nlp/bert_large_squad_model_zoo/quantization/ptq/README.md @@ -0,0 +1,92 @@ +Step-by-Step +============ + +This document is used to list steps of reproducing TensorFlow Intel® Neural Compressor tuning result of Intel® Model Zoo bert large model on squad v1.1 task. 
+This example can run on Intel CPUs and GPUs. + + +# Prerequisite + +## 1. Environment + +### Installation +```shell +# Install Intel® Neural Compressor +pip install neural-compressor +``` + +### Install Intel Tensorflow +```shell +pip install intel-tensorflow +``` + +> Note: Validated TensorFlow [Version](/docs/source/installation_guide.md#validated-software-environment). + +### Install Intel Extension for Tensorflow + +#### Quantizing the model on Intel GPU(Mandatory to install ITEX) +Intel Extension for Tensorflow is mandatory to be installed for quantizing the model on Intel GPUs. + +```shell +pip install --upgrade intel-extension-for-tensorflow[xpu] +``` +For any more details, please follow the procedure in [install-gpu-drivers](https://github.com/intel/intel-extension-for-tensorflow/blob/main/docs/install/install_for_xpu.md#install-gpu-drivers) + +#### Quantizing the model on Intel CPU(Optional to install ITEX) +Intel Extension for Tensorflow for Intel CPUs is experimental currently. It's not mandatory for quantizing the model on Intel CPUs. + +```shell +pip install --upgrade intel-extension-for-tensorflow[cpu] +``` +> **Note**: +> The version compatibility of stock Tensorflow and ITEX can be checked [here](https://github.com/intel/intel-extension-for-tensorflow#compatibility-table). Please make sure you have installed compatible Tensorflow and ITEX. + +## 2. Prepare Pretrained model +```shell +wget https://storage.googleapis.com/intel-optimized-tensorflow/models/v2_7_0/fp32_bert_squad.pb +``` + +## 3. Prepare Dataset +Please choose one way to prepare the dataset from the manual approach and the automatic approach. +### Manual approach +```shell +wget https://storage.googleapis.com/bert_models/2019_05_30/wwm_uncased_L-24_H-1024_A-16.zip +``` + +```shell +unzip wwm_uncased_L-24_H-1024_A-16.zip +``` + +```shell +wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json -P wwm_uncased_L-24_H-1024_A-16 +``` +wwm_uncased_L-24_H-1024_A-16 folder will be located on your data path. + +#### Automatic dataset download +Run the `prepare_dataset.sh` script located in `examples/tensorflow/nlp/bert_large_squad_model_zoo/quantization/ptq`. + +Usage: +```shell +cd examples/tensorflow/nlp/bert_large_squad_model_zoo/quantization/ptq +bash prepare_dataset.sh --output_dir=./data +``` + +### Convert the dataset to TF Record format +After the dataset is downloaded by either of ways above, the dataset should be converted to files of TF Record format. +```shell +python create_tf_record.py --vocab_file=data/vocab.txt --predict_file=data/dev-v1.1.json --output_file=./data/eval.tf_record +``` + +# Run Command + Please make sure below command should be executed with the same Tensorflow runtime version as above step. 
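Under the hood, `run_quant.sh` invokes `main.py`, which uses the Neural Compressor 3.x TensorFlow API in the same way as the other examples in this PR. A minimal sketch of that flow is shown below; the `build_squad_calib_dataset` helper is hypothetical and stands in for the code that reads the `eval.tf_record` file produced above.

```python
# Minimal sketch (not the exact contents of main.py): quantize the FP32 BERT
# graph with the 3.x TensorFlow API, calibrating on the SQuAD TF Record data.
from neural_compressor.tensorflow import StaticQuantConfig, quantize_model
from neural_compressor.tensorflow.utils import BaseDataLoader

# Hypothetical helper: yields (input_ids, input_mask, segment_ids) tuples
# parsed from ./data/eval.tf_record; replace with the example's real dataset.
calib_dataset = build_squad_calib_dataset("./data/eval.tf_record")
calib_dataloader = BaseDataLoader(dataset=calib_dataset, batch_size=64)

quant_config = StaticQuantConfig()
q_model = quantize_model("./fp32_bert_squad.pb", quant_config, calib_dataloader)
q_model.save("./bert_squad_int8.pb")
```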
+ +## Quantization + ```shell + bash run_quant.sh --input_model=./fp32_bert_squad.pb --output_model=./bert_squad_int8.pb --dataset_location=/path/to/evaluation/dataset + ``` + +## Benchmark + ```shell + bash run_benchmark.sh --input_model=./bert_squad_int8.pb --mode=accuracy --dataset_location=/path/to/evaluation/dataset --batch_size=64 + bash run_benchmark.sh --input_model=./bert_squad_int8.pb --mode=performance --dataset_location=/path/to/evaluation/dataset --batch_size=64 + ``` \ No newline at end of file diff --git a/examples/3.x_api/tensorflow/nlp/bert_large_squad_model_zoo/quantization/ptq/create_pretraining_data.py b/examples/3.x_api/tensorflow/nlp/bert_large_squad_model_zoo/quantization/ptq/create_pretraining_data.py new file mode 100644 index 00000000000..8adecb971fd --- /dev/null +++ b/examples/3.x_api/tensorflow/nlp/bert_large_squad_model_zoo/quantization/ptq/create_pretraining_data.py @@ -0,0 +1,475 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Create masked LM/next sentence masked_lm TF examples for BERT.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import random +import tokenization +import tensorflow as tf + +from absl import app +#from absl import flags +from absl import logging +flags = tf.compat.v1.flags + +FLAGS = flags.FLAGS + +flags.DEFINE_string("input_file", None, + "Input raw text file (or comma-separated list of files).") + +flags.DEFINE_string( + "output_file", None, + "Output TF example file (or comma-separated list of files).") + +flags.DEFINE_string("vocab_file", None, + "The vocabulary file that the BERT model was trained on.") + +flags.DEFINE_bool( + "do_lower_case", True, + "Whether to lower case the input text. 
Should be True for uncased " + "models and False for cased models.") + +flags.DEFINE_bool( + "do_whole_word_mask", False, + "Whether to use whole word masking rather than per-WordPiece masking.") + +flags.DEFINE_integer("max_seq_length", 128, "Maximum sequence length.") + +flags.DEFINE_integer("max_predictions_per_seq", 20, + "Maximum number of masked LM predictions per sequence.") + +flags.DEFINE_integer("random_seed", 12345, "Random seed for data generation.") + +flags.DEFINE_integer( + "dupe_factor", 10, + "Number of times to duplicate the input data (with different masks).") + +flags.DEFINE_float("masked_lm_prob", 0.15, "Masked LM probability.") + +flags.DEFINE_float( + "short_seq_prob", 0.1, + "Probability of creating sequences which are shorter than the " + "maximum length.") + + +class TrainingInstance(object): + """A single training instance (sentence pair).""" + + def __init__(self, tokens, segment_ids, masked_lm_positions, masked_lm_labels, + is_random_next): + self.tokens = tokens + self.segment_ids = segment_ids + self.is_random_next = is_random_next + self.masked_lm_positions = masked_lm_positions + self.masked_lm_labels = masked_lm_labels + + def __str__(self): + s = "" + s += "tokens: %s\n" % (" ".join( + [tokenization.printable_text(x) for x in self.tokens])) + s += "segment_ids: %s\n" % (" ".join([str(x) for x in self.segment_ids])) + s += "is_random_next: %s\n" % self.is_random_next + s += "masked_lm_positions: %s\n" % (" ".join( + [str(x) for x in self.masked_lm_positions])) + s += "masked_lm_labels: %s\n" % (" ".join( + [tokenization.printable_text(x) for x in self.masked_lm_labels])) + s += "\n" + return s + + def __repr__(self): + return self.__str__() + + +def write_instance_to_example_files(instances, tokenizer, max_seq_length, + max_predictions_per_seq, output_files): + """Create TF example files from `TrainingInstance`s.""" + writers = [] + for output_file in output_files: + writers.append(tf.io.TFRecordWriter(output_file)) + + writer_index = 0 + + total_written = 0 + for (inst_index, instance) in enumerate(instances): + input_ids = tokenizer.convert_tokens_to_ids(instance.tokens) + input_mask = [1] * len(input_ids) + segment_ids = list(instance.segment_ids) + assert len(input_ids) <= max_seq_length + + while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + masked_lm_positions = list(instance.masked_lm_positions) + masked_lm_ids = tokenizer.convert_tokens_to_ids(instance.masked_lm_labels) + masked_lm_weights = [1.0] * len(masked_lm_ids) + + while len(masked_lm_positions) < max_predictions_per_seq: + masked_lm_positions.append(0) + masked_lm_ids.append(0) + masked_lm_weights.append(0.0) + + next_sentence_label = 1 if instance.is_random_next else 0 + + features = collections.OrderedDict() + features["input_ids"] = create_int_feature(input_ids) + features["input_mask"] = create_int_feature(input_mask) + features["segment_ids"] = create_int_feature(segment_ids) + features["masked_lm_positions"] = create_int_feature(masked_lm_positions) + features["masked_lm_ids"] = create_int_feature(masked_lm_ids) + features["masked_lm_weights"] = create_float_feature(masked_lm_weights) + features["next_sentence_labels"] = create_int_feature([next_sentence_label]) + + tf_example = tf.train.Example(features=tf.train.Features(feature=features)) + + 
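    # Shard the serialized examples across the output TFRecord files in round-robin order.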
writers[writer_index].write(tf_example.SerializeToString()) + writer_index = (writer_index + 1) % len(writers) + + total_written += 1 + + if inst_index < 20: + tf.compat.v1.logging.info("*** Example ***") + tf.compat.v1.logging.info("tokens: %s" % " ".join( + [tokenization.printable_text(x) for x in instance.tokens])) + + for feature_name in features.keys(): + feature = features[feature_name] + values = [] + if feature.int64_list.value: + values = feature.int64_list.value + elif feature.float_list.value: + values = feature.float_list.value + tf.compat.v1.logging.info( + "%s: %s" % (feature_name, " ".join([str(x) for x in values]))) + + for writer in writers: + writer.close() + + tf.compat.v1.logging.info("Wrote %d total instances", total_written) + + +def create_int_feature(values): + feature = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) + return feature + + +def create_float_feature(values): + feature = tf.train.Feature(float_list=tf.train.FloatList(value=list(values))) + return feature + + +def create_training_instances(input_files, tokenizer, max_seq_length, + dupe_factor, short_seq_prob, masked_lm_prob, + max_predictions_per_seq, rng): + """Create `TrainingInstance`s from raw text.""" + all_documents = [[]] + + # Input file format: + # (1) One sentence per line. These should ideally be actual sentences, not + # entire paragraphs or arbitrary spans of text. (Because we use the + # sentence boundaries for the "next sentence prediction" task). + # (2) Blank lines between documents. Document boundaries are needed so + # that the "next sentence prediction" task doesn't span between documents. + for input_file in input_files: + with tf.io.gfile.GFile(input_file, "r") as reader: + while True: + line = tokenization.convert_to_unicode(reader.readline()) + if not line: + break + line = line.strip() + + # Empty lines are used as document delimiters + if not line: + all_documents.append([]) + tokens = tokenizer.tokenize(line) + if tokens: + all_documents[-1].append(tokens) + + # Remove empty documents + all_documents = [x for x in all_documents if x] + rng.shuffle(all_documents) + + vocab_words = list(tokenizer.vocab.keys()) + instances = [] + for _ in range(dupe_factor): + for document_index in range(len(all_documents)): + instances.extend( + create_instances_from_document( + all_documents, document_index, max_seq_length, short_seq_prob, + masked_lm_prob, max_predictions_per_seq, vocab_words, rng)) + + rng.shuffle(instances) + return instances + + +def create_instances_from_document( + all_documents, document_index, max_seq_length, short_seq_prob, + masked_lm_prob, max_predictions_per_seq, vocab_words, rng): + """Creates `TrainingInstance`s for a single document.""" + document = all_documents[document_index] + + # Account for [CLS], [SEP], [SEP] + max_num_tokens = max_seq_length - 3 + + # We *usually* want to fill up the entire sequence since we are padding + # to `max_seq_length` anyways, so short sequences are generally wasted + # computation. However, we *sometimes* + # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter + # sequences to minimize the mismatch between pre-training and fine-tuning. + # The `target_seq_length` is just a rough target however, whereas + # `max_seq_length` is a hard limit. 
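  # Illustration (assumed numbers): with the default max_seq_length=128,
  # max_num_tokens is 125; ~90% of instances target the full 125 tokens and
  # ~10% draw a target length uniformly from [2, 125] (short_seq_prob=0.1).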
+ target_seq_length = max_num_tokens + if rng.random() < short_seq_prob: + target_seq_length = rng.randint(2, max_num_tokens) + + # We DON'T just concatenate all of the tokens from a document into a long + # sequence and choose an arbitrary split point because this would make the + # next sentence prediction task too easy. Instead, we split the input into + # segments "A" and "B" based on the actual "sentences" provided by the user + # input. + instances = [] + current_chunk = [] + current_length = 0 + i = 0 + while i < len(document): + segment = document[i] + current_chunk.append(segment) + current_length += len(segment) + if i == len(document) - 1 or current_length >= target_seq_length: + if current_chunk: + # `a_end` is how many segments from `current_chunk` go into the `A` + # (first) sentence. + a_end = 1 + if len(current_chunk) >= 2: + a_end = rng.randint(1, len(current_chunk) - 1) + + tokens_a = [] + for j in range(a_end): + tokens_a.extend(current_chunk[j]) + + tokens_b = [] + # Random next + is_random_next = False + if len(current_chunk) == 1 or rng.random() < 0.5: + is_random_next = True + target_b_length = target_seq_length - len(tokens_a) + + # This should rarely go for more than one iteration for large + # corpora. However, just to be careful, we try to make sure that + # the random document is not the same as the document + # we're processing. + for _ in range(10): + random_document_index = rng.randint(0, len(all_documents) - 1) + if random_document_index != document_index: + break + + random_document = all_documents[random_document_index] + random_start = rng.randint(0, len(random_document) - 1) + for j in range(random_start, len(random_document)): + tokens_b.extend(random_document[j]) + if len(tokens_b) >= target_b_length: + break + # We didn't actually use these segments so we "put them back" so + # they don't go to waste. + num_unused_segments = len(current_chunk) - a_end + i -= num_unused_segments + # Actual next + else: + is_random_next = False + for j in range(a_end, len(current_chunk)): + tokens_b.extend(current_chunk[j]) + truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng) + + assert len(tokens_a) >= 1 + assert len(tokens_b) >= 1 + + tokens = [] + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in tokens_a: + tokens.append(token) + segment_ids.append(0) + + tokens.append("[SEP]") + segment_ids.append(0) + + for token in tokens_b: + tokens.append(token) + segment_ids.append(1) + tokens.append("[SEP]") + segment_ids.append(1) + + (tokens, masked_lm_positions, + masked_lm_labels) = create_masked_lm_predictions( + tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng) + instance = TrainingInstance( + tokens=tokens, + segment_ids=segment_ids, + is_random_next=is_random_next, + masked_lm_positions=masked_lm_positions, + masked_lm_labels=masked_lm_labels) + instances.append(instance) + current_chunk = [] + current_length = 0 + i += 1 + + return instances + + +MaskedLmInstance = collections.namedtuple("MaskedLmInstance", + ["index", "label"]) + + +def create_masked_lm_predictions(tokens, masked_lm_prob, + max_predictions_per_seq, vocab_words, rng): + """Creates the predictions for the masked LM objective.""" + + cand_indexes = [] + for (i, token) in enumerate(tokens): + if token == "[CLS]" or token == "[SEP]": + continue + # Whole Word Masking means that if we mask all of the wordpieces + # corresponding to an original word. 
When a word has been split into + # WordPieces, the first token does not have any marker and any subsequence + # tokens are prefixed with ##. So whenever we see the ## token, we + # append it to the previous set of word indexes. + # + # Note that Whole Word Masking does *not* change the training code + # at all -- we still predict each WordPiece independently, softmaxed + # over the entire vocabulary. + if (FLAGS.do_whole_word_mask and len(cand_indexes) >= 1 and + token.startswith("##")): + cand_indexes[-1].append(i) + else: + cand_indexes.append([i]) + + rng.shuffle(cand_indexes) + + output_tokens = list(tokens) + + num_to_predict = min(max_predictions_per_seq, + max(1, int(round(len(tokens) * masked_lm_prob)))) + + masked_lms = [] + covered_indexes = set() + for index_set in cand_indexes: + if len(masked_lms) >= num_to_predict: + break + # If adding a whole-word mask would exceed the maximum number of + # predictions, then just skip this candidate. + if len(masked_lms) + len(index_set) > num_to_predict: + continue + is_any_index_covered = False + for index in index_set: + if index in covered_indexes: + is_any_index_covered = True + break + if is_any_index_covered: + continue + for index in index_set: + covered_indexes.add(index) + + masked_token = None + # 80% of the time, replace with [MASK] + if rng.random() < 0.8: + masked_token = "[MASK]" + else: + # 10% of the time, keep original + if rng.random() < 0.5: + masked_token = tokens[index] + # 10% of the time, replace with random word + else: + masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)] + + output_tokens[index] = masked_token + + masked_lms.append(MaskedLmInstance(index=index, label=tokens[index])) + assert len(masked_lms) <= num_to_predict + masked_lms = sorted(masked_lms, key=lambda x: x.index) + + masked_lm_positions = [] + masked_lm_labels = [] + for p in masked_lms: + masked_lm_positions.append(p.index) + masked_lm_labels.append(p.label) + + return (output_tokens, masked_lm_positions, masked_lm_labels) + + +def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng): + """Truncates a pair of sequences to a maximum sequence length.""" + while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_num_tokens: + break + + trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b + assert len(trunc_tokens) >= 1 + + # We want to sometimes truncate from the front and sometimes from the + # back to add more randomness and avoid biases. 
+ if rng.random() < 0.5: + del trunc_tokens[0] + else: + trunc_tokens.pop() + + +def main(_): + tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO) + + tokenizer = tokenization.FullTokenizer( + vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) + + input_files = [] + for input_pattern in FLAGS.input_file.split(","): + input_files.extend(tf.io.gfile.glob(input_pattern)) + + tf.compat.v1.logging.info("*** Reading from input files ***") + for input_file in input_files: + tf.compat.v1.logging.info(" %s", input_file) + + rng = random.Random(FLAGS.random_seed) + instances = create_training_instances( + input_files, tokenizer, FLAGS.max_seq_length, FLAGS.dupe_factor, + FLAGS.short_seq_prob, FLAGS.masked_lm_prob, FLAGS.max_predictions_per_seq, + rng) + + output_files = FLAGS.output_file.split(",") + tf.compat.v1.logging.info("*** Writing to output files ***") + for output_file in output_files: + tf.compat.v1.logging.info(" %s", output_file) + + write_instance_to_example_files(instances, tokenizer, FLAGS.max_seq_length, + FLAGS.max_predictions_per_seq, output_files) + + +if __name__ == "__main__": + flags.mark_flag_as_required("input_file") + flags.mark_flag_as_required("output_file") + flags.mark_flag_as_required("vocab_file") + tf.compat.v1.app.run() diff --git a/examples/3.x_api/tensorflow/nlp/bert_large_squad_model_zoo/quantization/ptq/create_tf_record.py b/examples/3.x_api/tensorflow/nlp/bert_large_squad_model_zoo/quantization/ptq/create_tf_record.py new file mode 100644 index 00000000000..12c6486283d --- /dev/null +++ b/examples/3.x_api/tensorflow/nlp/bert_large_squad_model_zoo/quantization/ptq/create_tf_record.py @@ -0,0 +1,509 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""generate bert dataset""" + +import collections +import json +import os +import tokenization +import six +import tensorflow as tf + +from absl import app +#from absl import flags +from absl import logging + +flags = tf.compat.v1.flags +FLAGS = flags.FLAGS + +## Required parameters +flags.DEFINE_string("vocab_file", None, + "The vocabulary file that the BERT model was trained on.") + +flags.DEFINE_string( + "predict_file", None, + "SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json") + +flags.DEFINE_string( + "output_file", None, "The output tf_record for usage.") + +class SquadExample(object): + """A single training/test example for simple sequence classification. + + For examples without an answer, the start and end position are -1. 
+ """ + + def __init__(self, + qas_id, + question_text, + doc_tokens, + orig_answer_text=None, + start_position=None, + end_position=None, + is_impossible=False): + self.qas_id = qas_id + self.question_text = question_text + self.doc_tokens = doc_tokens + self.orig_answer_text = orig_answer_text + self.start_position = start_position + self.end_position = end_position + self.is_impossible = is_impossible + #self.startpb = 0 + + def __str__(self): + return self.__repr__() + + def __repr__(self): + s = "" + s += "qas_id: %s" % (tokenization.printable_text(self.qas_id)) + s += ", question_text: %s" % ( + tokenization.printable_text(self.question_text)) + s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) + if self.start_position: + s += ", start_position: %d" % (self.start_position) + if self.start_position: + s += ", end_position: %d" % (self.end_position) + if self.start_position: + s += ", is_impossible: %r" % (self.is_impossible) + return s + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, + unique_id, + example_index, + doc_span_index, + tokens, + token_to_orig_map, + token_is_max_context, + input_ids, + input_mask, + segment_ids, + start_position=None, + end_position=None, + is_impossible=None): + self.unique_id = unique_id + self.example_index = example_index + self.doc_span_index = doc_span_index + self.tokens = tokens + self.token_to_orig_map = token_to_orig_map + self.token_is_max_context = token_is_max_context + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.start_position = start_position + self.end_position = end_position + self.is_impossible = is_impossible + + +def read_squad_examples(input_file, is_training=None): + """Read a SQuAD json file into a list of SquadExample.""" + with tf.io.gfile.GFile(input_file, "r") as reader: + input_data = json.load(reader)["data"] + + def is_whitespace(c): + if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: + return True + return False + + examples = [] + for entry in input_data: + for paragraph in entry["paragraphs"]: + paragraph_text = paragraph["context"] + doc_tokens = [] + char_to_word_offset = [] + prev_is_whitespace = True + for c in paragraph_text: + if is_whitespace(c): + prev_is_whitespace = True + else: + if prev_is_whitespace: + doc_tokens.append(c) + else: + doc_tokens[-1] += c + prev_is_whitespace = False + char_to_word_offset.append(len(doc_tokens) - 1) + + for qa in paragraph["qas"]: + qas_id = qa["id"] + question_text = qa["question"] + start_position = None + end_position = None + orig_answer_text = None + is_impossible = False + if is_training: + + if FLAGS.version_2_with_negative: + is_impossible = qa["is_impossible"] + if (len(qa["answers"]) != 1) and (not is_impossible): + raise ValueError( + "For training, each question should have exactly 1 answer.") + if not is_impossible: + answer = qa["answers"][0] + orig_answer_text = answer["text"] + answer_offset = answer["answer_start"] + answer_length = len(orig_answer_text) + start_position = char_to_word_offset[answer_offset] + end_position = char_to_word_offset[answer_offset + answer_length - + 1] + # Only add answers where the text can be exactly recovered from the + # document. If this CAN'T happen it's likely due to weird Unicode + # stuff so we will just skip the example. + # + # Note that this means for training mode, every example is NOT + # guaranteed to be preserved. 
+ actual_text = " ".join( + doc_tokens[start_position:(end_position + 1)]) + cleaned_answer_text = " ".join( + tokenization.whitespace_tokenize(orig_answer_text)) + if actual_text.find(cleaned_answer_text) == -1: + tf.compat.v1.logging.warning("Could not find answer: '%s' vs. '%s'", + actual_text, cleaned_answer_text) + continue + else: + start_position = -1 + end_position = -1 + orig_answer_text = "" + + example = SquadExample( + qas_id=qas_id, + question_text=question_text, + doc_tokens=doc_tokens, + orig_answer_text=orig_answer_text, + start_position=start_position, + end_position=end_position, + is_impossible=is_impossible) + examples.append(example) + + return examples + + +def convert_examples_to_features(examples, tokenizer, max_seq_length, + doc_stride, max_query_length, is_training=None, + output_fn=None): + """Loads a data file into a list of `InputBatch`s.""" + + unique_id = 1000000000 + + for (example_index, example) in enumerate(examples): + query_tokens = tokenizer.tokenize(example.question_text) + + if len(query_tokens) > max_query_length: + query_tokens = query_tokens[0:max_query_length] + + tok_to_orig_index = [] + orig_to_tok_index = [] + all_doc_tokens = [] + for (i, token) in enumerate(example.doc_tokens): + orig_to_tok_index.append(len(all_doc_tokens)) + sub_tokens = tokenizer.tokenize(token) + for sub_token in sub_tokens: + tok_to_orig_index.append(i) + all_doc_tokens.append(sub_token) + + tok_start_position = None + tok_end_position = None + if is_training and example.is_impossible: + tok_start_position = -1 + tok_end_position = -1 + if is_training and not example.is_impossible: + tok_start_position = orig_to_tok_index[example.start_position] + if example.end_position < len(example.doc_tokens) - 1: + tok_end_position = orig_to_tok_index[example.end_position + 1] - 1 + else: + tok_end_position = len(all_doc_tokens) - 1 + (tok_start_position, tok_end_position) = _improve_answer_span( + all_doc_tokens, tok_start_position, tok_end_position, tokenizer, + example.orig_answer_text) + + # The -3 accounts for [CLS], [SEP] and [SEP] + max_tokens_for_doc = max_seq_length - len(query_tokens) - 3 + + # We can have documents that are longer than the maximum sequence length. + # To deal with this we do a sliding window approach, where we take chunks + # of the up to our max length with a stride of `doc_stride`. 
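    # Worked illustration (assumed numbers): with 230 document tokens,
    # max_tokens_for_doc=100 and doc_stride=64, the spans produced below are
    # (start, length): (0, 100), (64, 100), (128, 100), (192, 38).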
+ _DocSpan = collections.namedtuple( # pylint: disable=invalid-name + "DocSpan", ["start", "length"]) + doc_spans = [] + start_offset = 0 + while start_offset < len(all_doc_tokens): + length = len(all_doc_tokens) - start_offset + if length > max_tokens_for_doc: + length = max_tokens_for_doc + doc_spans.append(_DocSpan(start=start_offset, length=length)) + if start_offset + length == len(all_doc_tokens): + break + start_offset += min(length, doc_stride) + + for (doc_span_index, doc_span) in enumerate(doc_spans): + tokens = [] + token_to_orig_map = {} + token_is_max_context = {} + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in query_tokens: + tokens.append(token) + segment_ids.append(0) + tokens.append("[SEP]") + segment_ids.append(0) + + for i in range(doc_span.length): + split_token_index = doc_span.start + i + token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index] + + is_max_context = _check_is_max_context(doc_spans, doc_span_index, + split_token_index) + token_is_max_context[len(tokens)] = is_max_context + tokens.append(all_doc_tokens[split_token_index]) + segment_ids.append(1) + tokens.append("[SEP]") + segment_ids.append(1) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. + while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + start_position = None + end_position = None + if is_training and not example.is_impossible: + # For training, if our document chunk does not contain an annotation + # we throw it out, since there is nothing to predict. 
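      # If the answer falls inside this span, its position is re-expressed below
      # relative to the packed [CLS] query [SEP] document [SEP] input;
      # doc_offset = len(query_tokens) + 2 counts the [CLS], the query tokens,
      # and the first [SEP].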
+ doc_start = doc_span.start + doc_end = doc_span.start + doc_span.length - 1 + out_of_span = False + if not (tok_start_position >= doc_start and + tok_end_position <= doc_end): + out_of_span = True + if out_of_span: + start_position = 0 + end_position = 0 + else: + doc_offset = len(query_tokens) + 2 + start_position = tok_start_position - doc_start + doc_offset + end_position = tok_end_position - doc_start + doc_offset + + if is_training and example.is_impossible: + start_position = 0 + end_position = 0 + + if example_index < 1: + tf.compat.v1.logging.info("*** Example ***") + tf.compat.v1.logging.info("unique_id: %s" % (unique_id)) + tf.compat.v1.logging.info("example_index: %s" % (example_index)) + tf.compat.v1.logging.info("doc_span_index: %s" % (doc_span_index)) + tf.compat.v1.logging.info("tokens: %s" % " ".join( + [tokenization.printable_text(x) for x in tokens])) + tf.compat.v1.logging.info("token_to_orig_map: %s" % " ".join( + ["%d:%d" % (x, y) for (x, y) in six.iteritems(token_to_orig_map)])) + tf.compat.v1.logging.info("token_is_max_context: %s" % " ".join([ + "%d:%s" % (x, y) for (x, y) in six.iteritems(token_is_max_context) + ])) + tf.compat.v1.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + tf.compat.v1.logging.info( + "input_mask: %s" % " ".join([str(x) for x in input_mask])) + tf.compat.v1.logging.info( + "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + if is_training and example.is_impossible: + tf.compat.v1.logging.info("impossible example") + if is_training and not example.is_impossible: + answer_text = " ".join(tokens[start_position:(end_position + 1)]) + tf.compat.v1.logging.info("start_position: %d" % (start_position)) + tf.compat.v1.logging.info("end_position: %d" % (end_position)) + tf.compat.v1.logging.info( + "answer: %s" % (tokenization.printable_text(answer_text))) + + feature = InputFeatures( + unique_id=unique_id, + example_index=example_index, + doc_span_index=doc_span_index, + tokens=tokens, + token_to_orig_map=token_to_orig_map, + token_is_max_context=token_is_max_context, + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + start_position=start_position, + end_position=end_position, + is_impossible=example.is_impossible) + + # Run callback + output_fn(feature) + + unique_id += 1 + + +def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, + orig_answer_text): + """Returns tokenized answer spans that better match the annotated answer.""" + + # The SQuAD annotations are character based. We first project them to + # whitespace-tokenized words. But then after WordPiece tokenization, we can + # often find a "better match". For example: + # + # Question: What year was John Smith born? + # Context: The leader was John Smith (1895-1943). + # Answer: 1895 + # + # The original whitespace-tokenized answer will be "(1895-1943).". However + # after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match + # the exact answer, 1895. + # + # However, this is not always possible. Consider the following: + # + # Question: What country is the top exporter of electornics? + # Context: The Japanese electronics industry is the lagest in the world. + # Answer: Japan + # + # In this case, the annotator chose "Japan" as a character sub-span of + # the word "Japanese". Since our WordPiece tokenizer does not split + # "Japanese", we just use "Japanese" as the annotation. This is fairly rare + # in SQuAD, but does happen. 
+ tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text)) + + for new_start in range(input_start, input_end + 1): + for new_end in range(input_end, new_start - 1, -1): + text_span = " ".join(doc_tokens[new_start:(new_end + 1)]) + if text_span == tok_answer_text: + return (new_start, new_end) + + return (input_start, input_end) + + +def _check_is_max_context(doc_spans, cur_span_index, position): + """Check if this is the 'max context' doc span for the token.""" + + # Because of the sliding window approach taken to scoring documents, a single + # token can appear in multiple documents. E.g. + # Doc: the man went to the store and bought a gallon of milk + # Span A: the man went to the + # Span B: to the store and bought + # Span C: and bought a gallon of + # ... + # + # Now the word 'bought' will have two scores from spans B and C. We only + # want to consider the score with "maximum context", which we define as + # the *minimum* of its left and right context (the *sum* of left and + # right context will always be the same, of course). + # + # In the example the maximum context for 'bought' would be span C since + # it has 1 left context and 3 right context, while span B has 4 left context + # and 0 right context. + best_score = None + best_span_index = None + for (span_index, doc_span) in enumerate(doc_spans): + end = doc_span.start + doc_span.length - 1 + if position < doc_span.start: + continue + if position > end: + continue + num_left_context = position - doc_span.start + num_right_context = end - position + score = min(num_left_context, num_right_context) + 0.01 * doc_span.length + if best_score is None or score > best_score: + best_score = score + best_span_index = span_index + + return cur_span_index == best_span_index + +class FeatureWriter(object): + """Writes InputFeature to TF example file.""" + + def __init__(self, filename, is_training): + self.is_training = is_training + self.num_features = 0 + self.filename = filename + self._writer = tf.io.TFRecordWriter(self.filename) + + def process_feature(self, feature): + """Write a InputFeature to the TFRecordWriter as a tf.train.Example.""" + self.num_features += 1 + + def create_int_feature(values): + feature = tf.train.Feature( + int64_list=tf.train.Int64List(value=list(values))) + return feature + + features = collections.OrderedDict() + features["unique_ids"] = create_int_feature([feature.unique_id]) + features["input_ids"] = create_int_feature(feature.input_ids) + features["input_mask"] = create_int_feature(feature.input_mask) + features["segment_ids"] = create_int_feature(feature.segment_ids) + + if self.is_training: + features["start_positions"] = create_int_feature([feature.start_position]) + features["end_positions"] = create_int_feature([feature.end_position]) + impossible = 0 + if feature.is_impossible: + impossible = 1 + features["is_impossible"] = create_int_feature([impossible]) + + tf_example = tf.train.Example(features=tf.train.Features(feature=features)) + self._writer.write(tf_example.SerializeToString()) + + def close(self): + self._writer.close() + + def rm_tmp_file(self): + os.remove(self.filename) + +def main(_): + tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO) + + tokenizer = tokenization.FullTokenizer( + vocab_file=FLAGS.vocab_file, do_lower_case=True) + + eval_examples = read_squad_examples( + input_file=FLAGS.predict_file, is_training=False) + + eval_writer = FeatureWriter( + filename=FLAGS.output_file, is_training=False) + + eval_features = [] + def append_feature(feature): + 
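    # Keep each feature in memory and also serialize it into the output TFRecord file.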
eval_features.append(feature) + eval_writer.process_feature(feature) + convert_examples_to_features( + examples=eval_examples, + tokenizer=tokenizer, + max_seq_length=384, + doc_stride=128, + max_query_length=64, + is_training=False, + output_fn=append_feature) + + +if __name__ == "__main__": + flags.mark_flag_as_required("vocab_file") + flags.mark_flag_as_required("predict_file") + flags.mark_flag_as_required("output_file") + tf.compat.v1.app.run() diff --git a/examples/3.x_api/tensorflow/nlp/bert_large_squad_model_zoo/quantization/ptq/data_process.py b/examples/3.x_api/tensorflow/nlp/bert_large_squad_model_zoo/quantization/ptq/data_process.py new file mode 100644 index 00000000000..6e9d169ada5 --- /dev/null +++ b/examples/3.x_api/tensorflow/nlp/bert_large_squad_model_zoo/quantization/ptq/data_process.py @@ -0,0 +1,936 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import json +import re +import sys +import string +import collections + +import numpy as np +import tensorflow as tf + +from abc import abstractmethod +from collections import Counter +from neural_compressor.tensorflow.utils.data import default_collate, BaseDataLoader, BatchSampler, IterableFetcher + +def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): + """Calculate the max metric for each ground truth. + + For each answer in ground_truths, evaluate the metric of prediction with + this answer, and return the max metric. + + Args: + metric_fn: The function to calculate the metric. + prediction: The prediction result. + ground_truths: A list of correct answers. + + Returns: + The max metric. Float point number. + """ + scores_for_ground_truths = [] + for ground_truth in ground_truths: + score = metric_fn(prediction, ground_truth) + scores_for_ground_truths.append(score) + return max(scores_for_ground_truths) + +def normalize_answer(text: str) -> str: + """Normalize the answer text. + + Lower text, remove punctuation, articles and extra whitespace, + and replace other whitespace (newline, tab, etc.) to space. + + Args: + s: The text to be normalized. + + Returns: + The normalized text. + """ + + def _remove_articles(text): + return re.sub(r"\b(a|an|the)\b", " ", text) + + def _white_space_fix(text): + return " ".join(text.split()) + + def _remove_punc(text): + exclude = set(string.punctuation) + return "".join(ch for ch in text if ch not in exclude) + + def _lower(text): + return text.lower() + + return _white_space_fix(_remove_articles(_remove_punc(_lower(text)))) + +def exact_match_score(prediction, ground_truth): + """Compute the exact match score between prediction and ground truth. + + Args: + prediction: The result of predictions to be evaluated. + ground_truth: The ground truth. + + Returns: + The exact match score. + """ + return normalize_answer(prediction) == normalize_answer(ground_truth) + +def f1_score(prediction, ground_truth): + """Calculate the F1 score of the prediction and the ground_truth. 
+ + Args: + prediction: The predicted result. + ground_truth: The ground truth. + + Returns: + The F1 score of prediction. Float point number. + """ + prediction_tokens = normalize_answer(prediction).split() + ground_truth_tokens = normalize_answer(ground_truth).split() + common = Counter(prediction_tokens) & Counter(ground_truth_tokens) + num_same = sum(common.values()) + if num_same == 0: + return 0 + precision = 1.0 * num_same / len(prediction_tokens) + recall = 1.0 * num_same / len(ground_truth_tokens) + f1 = (2 * precision * recall) / (precision + recall) + return f1 + +def evaluate(dataset, predictions): + """Evaluate the average F1 score and the exact match score for Question-Answering results. + + Args: + dataset: The dataset to evaluate the prediction. A list instance of articles. + An article contains a list of paragraphs, a paragraph contains a list of + question-and-answers (qas), and a question-and-answer contains an id, a question, + and a list of correct answers. For example: + predictions: The result of predictions to be evaluated. A dict mapping the id of + a question to the predicted answer of the question. + + Returns: + The F1 score and the exact match score. + """ + f1 = exact_match = total = 0 + for article in dataset: + for paragraph in article["paragraphs"]: + for qa in paragraph["qas"]: + total += 1 + if qa["id"] not in predictions: + message = "Unanswered question " + qa["id"] + " will receive score 0." + print(message, file=sys.stderr) + continue + ground_truths = list(map(lambda x: x["text"], qa["answers"])) + prediction = predictions[qa["id"]] + exact_match += metric_max_over_ground_truths(exact_match_score, prediction, ground_truths) + f1 += metric_max_over_ground_truths(f1_score, prediction, ground_truths) + + exact_match = 100.0 * exact_match / total + f1 = 100.0 * f1 / total + + return {"exact_match": exact_match, "f1": f1} + + +class BaseMetric(object): + """The base class of Metric.""" + + def __init__(self, metric, single_output=False, hvd=None): + """Initialize the basic metric. + + Args: + metric: The metric class. + single_output: Whether the output is single or not, defaults to False. + hvd: The Horovod class for distributed training, defaults to None. + """ + self._metric_cls = metric + self._single_output = single_output + self._hvd = hvd + + def __call__(self, *args, **kwargs): + """Evaluate the model predictions, and the reference. + + Returns: + The class itself. + """ + self._metric = self._metric_cls(*args, **kwargs) + return self + + @abstractmethod + def update(self, preds, labels=None, sample_weight=None): + """Update the state that need to be evaluated. + + Args: + preds: The prediction result. + labels: The reference. Defaults to None. + sample_weight: The sampling weight. Defaults to None. + + Raises: + NotImplementedError: The method should be implemented by subclass. + """ + raise NotImplementedError + + @abstractmethod + def reset(self): + """Clear the predictions and labels. + + Raises: + NotImplementedError: The method should be implemented by subclass. + """ + raise NotImplementedError + + @abstractmethod + def result(self): + """Evaluate the difference between predictions and labels. + + Raises: + NotImplementedError: The method should be implemented by subclass. + """ + raise NotImplementedError + + @property + def metric(self): + """Return its metric class. + + Returns: + The metric class. + """ + return self._metric + + @property + def hvd(self): + """Return its hvd class. + + Returns: + The hvd class. 
+ """ + return self._hvd + + @hvd.setter + def hvd(self, hvd): + """Set its hvd. + + Args: + hvd: The Horovod class for distributed training. + """ + self._hvd = hvd + + +class SquadF1(BaseMetric): + """Evaluate for v1.1 of the SQuAD dataset.""" + + def __init__(self): + """Initialize the score list.""" + self._score_list = [] # squad metric only work when all data preds collected + + def update(self, preds, labels, sample_weight=None): + """Add the predictions and labels. + + Args: + preds: The predictions. + labels: The labels corresponding to the predictions. + sample_weight: The sample weight. + """ + if preds: + if getattr(self, "_hvd", None) is not None: + gathered_preds_list = self._hvd.allgather_object(preds) + gathered_labels_list = self._hvd.allgather_object(labels) + temp_preds_list, temp_labels_list = [], [] + for i in range(0, self._hvd.size()): + temp_preds_list += gathered_preds_list[i] + temp_labels_list += gathered_labels_list[i] + preds = temp_preds_list + labels = temp_labels_list + result = evaluate(labels, preds) + self._score_list.append(result["f1"]) + + def reset(self): + """Reset the score list.""" + self._score_list = [] + + def result(self): + """Compute F1 score.""" + if len(self._score_list) == 0: + return 0.0 + return np.array(self._score_list).mean() + + +class ParseDecodeBert: + """Helper function for TensorflowModelZooBertDataset. + + Parse the features from sample. + """ + + def __call__(self, sample): + """Parse the sample data. + + Args: + sample: Data to be parsed. + """ + # Dense features in Example proto. + feature_map = { + "input_ids": tf.compat.v1.VarLenFeature(dtype=tf.int64), + "input_mask": tf.compat.v1.VarLenFeature(dtype=tf.int64), + "segment_ids": tf.compat.v1.VarLenFeature(dtype=tf.int64), + } + + features = tf.io.parse_single_example(sample, feature_map) + + input_ids = features["input_ids"].values + input_mask = features["input_mask"].values + segment_ids = features["segment_ids"].values + + return (input_ids, input_mask, segment_ids) + + +class TFDataLoader(object): # pragma: no cover + """Tensorflow dataloader class. + + In tensorflow1.x dataloader is coupled with the graph, but it also support feed_dict + method to do session run, this dataloader is designed to satisfy the usage of feed dict + in tf1.x. Although it's a general dataloader and can be used in MXNet and PyTorch. + + Args: + dataset: obj. wrapper of needed data. + batch_size: int. 
batch size + """ + + def __init__(self, dataset, batch_size=1, last_batch="rollover"): + """Initialize `TFDataDataLoader` class.""" + self.dataset = dataset + self.last_batch = last_batch + self.batch_size = batch_size + dataset = dataset.batch(batch_size) + + def batch(self, batch_size, last_batch="rollover"): + """Dataset return data per batch.""" + drop_last = False if last_batch == "rollover" else True + self.batch_size = batch_size + self.dataset = self.dataset.batch(batch_size, drop_last) + + def __iter__(self): + """Iterate dataloader.""" + return self._generate_dataloader( + self.dataset, + batch_size=self.batch_size, + last_batch=self.last_batch, + ) + + def _generate_dataloader( + self, + dataset, + batch_size=1, + last_batch="rollover", + collate_fn=None, + sampler=None, + batch_sampler=None, + num_workers=None, + pin_memory=None, + distributed=False, + ): + """Yield data.""" + drop_last = False if last_batch == "rollover" else True + + def check_dynamic_shape(element_spec): + if isinstance(element_spec, collections.abc.Sequence): + return any([check_dynamic_shape(ele) for ele in element_spec]) + elif isinstance(element_spec, tf.TensorSpec): + return True if element_spec.shape.num_elements() is None else False + else: + raise ValueError("unrecognized element spec...") + + def squeeze_output(output): + if isinstance(output, collections.abc.Sequence): + return [squeeze_output(ele) for ele in output] + elif isinstance(output, np.ndarray): + return np.squeeze(output, axis=0) + else: + raise ValueError("not supported output format....") + + if tf.executing_eagerly(): + index = 0 + outputs = [] + for iter_tensors in dataset: + samples = [] + iter_inputs, iter_labels = iter_tensors[0], iter_tensors[1] + if isinstance(iter_inputs, tf.Tensor): + samples.append(iter_inputs.numpy()) + else: + samples.append(tuple(iter_input.numpy() for iter_input in iter_inputs)) + if isinstance(iter_labels, tf.Tensor): + samples.append(iter_labels.numpy()) + else: + samples.append([np.array(l) for l in iter_labels]) + index += 1 + outputs.append(samples) + if index == batch_size: + outputs = default_collate(outputs) + yield outputs + outputs = [] + index = 0 + if len(outputs) > 0: + outputs = default_collate(outputs) + yield outputs + else: + try_single_batch = check_dynamic_shape(dataset.element_spec) + dataset = dataset.batch(1 if try_single_batch else batch_size, drop_last) + ds_iterator = tf.compat.v1.data.make_one_shot_iterator(dataset) + iter_tensors = ds_iterator.get_next() + data_config = tf.compat.v1.ConfigProto() + data_config.use_per_session_threads = 1 + data_config.intra_op_parallelism_threads = 1 + data_config.inter_op_parallelism_threads = 16 + data_sess = tf.compat.v1.Session(config=data_config) + # pylint: disable=no-name-in-module + from tensorflow.python.framework.errors_impl import OutOfRangeError + + while True: + if not try_single_batch: + try: + outputs = data_sess.run(iter_tensors) + yield outputs + except OutOfRangeError: + data_sess.close() + return + else: + try: + outputs = [] + for i in range(0, batch_size): + outputs.append(squeeze_output(data_sess.run(iter_tensors))) + outputs = default_collate(outputs) + yield outputs + except OutOfRangeError: + if len(outputs) == 0: + data_sess.close() + return + else: + outputs = default_collate(outputs) + yield outputs + data_sess.close() + return + + +class ModelZooBertDataset(object): + """Tensorflow dataset for three-input Bert in tf record format. + + Root is a full path to tfrecord file, which contains the file name. 
+ Please use Resize transform when batch_size > 1 + Args: root (str): path of dataset. + label_file (str): path of label file. + task (str, default='squad'): task type of model. + model_type (str, default='bert'): model type, support 'bert'. + transform (transform object, default=None): transform to process input data. + filter (Filter objects, default=None): filter out examples according. + """ + + def __init__(self, root, label_file, task="squad", model_type="bert", transform=None, filter=None, num_cores=28): + """Initialize the attributes of class.""" + with open(label_file) as lf: + label_json = json.load(lf) + assert label_json["version"] == "1.1", "only support squad 1.1" + self.label = label_json["data"] + + record_iterator = tf.compat.v1.python_io.tf_record_iterator(root) + example = tf.train.SequenceExample() + for element in record_iterator: + example.ParseFromString(element) + break + feature = example.context.feature + if len(feature["input_ids"].int64_list.value) == 0 and len(feature["input_mask"].int64_list.value) == 0: + raise ValueError( + "Tfrecord format is incorrect, please refer\ + 'https://github.com/tensorflow/models/blob/master/research/\ + object_detection/dataset_tools/' to create correct tfrecord" + ) + # pylint: disable=no-name-in-module + from tensorflow.python.data.experimental import parallel_interleave + + tfrecord_paths = [root] + ds = tf.data.TFRecordDataset.list_files(tfrecord_paths) + ds = ds.apply( + parallel_interleave( + tf.data.TFRecordDataset, + cycle_length=num_cores, + block_length=5, + sloppy=True, + buffer_output_elements=10000, + prefetch_input_elements=10000, + ) + ) + if transform is not None: + transform.transform_list.insert(0, ParseDecodeBert()) + else: + transform = ParseDecodeBert() + ds = ds.map(transform, num_parallel_calls=None) + if filter is not None: + ds = ds.filter(filter) + ds = ds.prefetch(buffer_size=1000) + ds = TFDataLoader(ds) + self.root = [] + for inputs in ds: + self.root.append(inputs) + self.transform = transform + self.filter = filter + + def __getitem__(self, index): + """Magic method. + + x[i] is roughly equivalent to type(x).__getitem__(x, index) + """ + return self.root[index], self.label + + def __len__(self): + """Length of the dataset.""" + return len(self.root) + + +class TFSquadV1PostTransform(object): + """Postprocess the predictions of bert on SQuAD. + + Args: + label_file (str): path of label file + vocab_file(str): path of vocabulary file + n_best_size (int, default=20): + The total number of n-best predictions to generate in nbest_predictions.json + max_seq_length (int, default=384): + The maximum total input sequence length after WordPiece tokenization. + Sequences longer than this will be truncated, shorter than this will be padded + max_query_length (int, default=64): + The maximum number of tokens for the question. + Questions longer than this will be truncated to this length + max_answer_length (int, default=30): + The maximum length of an answer that can be generated. This is needed because + the start and end predictions are not conditioned on one another + do_lower_case (bool, default=True): + Whether to lower case the input text. 
+ Should be True for uncased models and False for cased models + doc_stride (int, default=128): + When splitting up a long document into chunks, + how much stride to take between chunks + + Returns: + tuple of processed prediction and label + """ + + def __init__( + self, + label_file, + vocab_file, + n_best_size=20, + max_seq_length=384, + max_query_length=64, + max_answer_length=30, + do_lower_case=True, + doc_stride=128, + ): + """Initialize `TFSquadV1PostTransform` class.""" + from tokenization import FullTokenizer + from create_tf_record import read_squad_examples, convert_examples_to_features + self.eval_examples = read_squad_examples(label_file) + tokenizer = FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case) + + self.eval_features = [] + + def append_feature(feature): + self.eval_features.append(feature) + + convert_examples_to_features( + examples=self.eval_examples, + tokenizer=tokenizer, + max_seq_length=max_seq_length, + doc_stride=doc_stride, + max_query_length=max_query_length, + output_fn=append_feature, + ) + + self.n_best_size = n_best_size + self.max_answer_length = max_answer_length + self.do_lower_case = do_lower_case + self.RawResult = collections.namedtuple("RawResult", ["unique_id", "start_logits", "end_logits"]) + + def process_result(self, results): + """Get the processed results.""" + processed_results = [] + # notice the result list sequence + for unique_id, start_logits, end_logits in zip(*results): + processed_results.append( + self.RawResult( + unique_id=int(unique_id), + start_logits=[float(x) for x in start_logits.flat], + end_logits=[float(x) for x in end_logits.flat], + ) + ) + + return processed_results + + def get_postprocess_result(self, sample): + """Get the post processed results.""" + if sample == (None, None): + return (None, None) + all_results, label = sample + all_results = self.process_result(all_results) + example_index_to_features = collections.defaultdict(list) + for feature in self.eval_features: + example_index_to_features[feature.example_index].append(feature) + + unique_id_to_result = {} + for result in all_results: + unique_id_to_result[result.unique_id] = result + + _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name + "PrelimPrediction", ["feature_index", "start_index", "end_index", "start_logit", "end_logit"] + ) + + all_predictions = collections.OrderedDict() + for example_index, example in enumerate(self.eval_examples): + features = example_index_to_features[example_index] + + prelim_predictions = [] + # keep track of the minimum score of null start+end of position 0 + score_null = 1000000 # large and positive + min_null_feature_index = 0 # the paragraph slice with min mull score + null_start_logit = 0 # the start logit at the slice with min null score + null_end_logit = 0 # the end logit at the slice with min null score + for feature_index, feature in enumerate(features): + # skip the case that is not predicted + if feature.unique_id not in unique_id_to_result: + all_predictions[example.qas_id] = "*#skip this example#*" + continue + result = unique_id_to_result[feature.unique_id] + start_indexes = TFSquadV1PostTransform._get_best_indexes(result.start_logits, self.n_best_size) + end_indexes = TFSquadV1PostTransform._get_best_indexes(result.end_logits, self.n_best_size) + + for start_index in start_indexes: + for end_index in end_indexes: + # We could hypothetically create invalid predictions, e.g., predict + # that the start of the span is in the question. 
We throw out all + # invalid predictions. + if start_index >= len(feature.tokens): + continue + if end_index >= len(feature.tokens): + continue + if start_index not in feature.token_to_orig_map: + continue + if end_index not in feature.token_to_orig_map: + continue + if not feature.token_is_max_context.get(start_index, False): + continue + if end_index < start_index: + continue + length = end_index - start_index + 1 + if length > self.max_answer_length: + continue + prelim_predictions.append( + _PrelimPrediction( + feature_index=feature_index, + start_index=start_index, + end_index=end_index, + start_logit=result.start_logits[start_index], + end_logit=result.end_logits[end_index], + ) + ) + + prelim_predictions = sorted( + prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True + ) + _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name + "NbestPrediction", ["text", "start_logit", "end_logit"] + ) + + seen_predictions = {} + nbest = [] + for pred in prelim_predictions: + if len(nbest) >= self.n_best_size: + break + feature = features[pred.feature_index] + if pred.start_index > 0: # this is a non-null prediction + tok_tokens = feature.tokens[pred.start_index : (pred.end_index + 1)] + orig_doc_start = feature.token_to_orig_map[pred.start_index] + orig_doc_end = feature.token_to_orig_map[pred.end_index] + orig_tokens = example.doc_tokens[orig_doc_start : (orig_doc_end + 1)] + tok_text = " ".join(tok_tokens) + + # De-tokenize WordPieces that have been split off. + tok_text = tok_text.replace(" ##", "") + tok_text = tok_text.replace("##", "") + + # Clean whitespace + tok_text = tok_text.strip() + tok_text = " ".join(tok_text.split()) + orig_text = " ".join(orig_tokens) + + final_text = TFSquadV1PostTransform.get_final_text(tok_text, orig_text, self.do_lower_case) + if final_text in seen_predictions: + continue + + seen_predictions[final_text] = True + else: + final_text = "" + seen_predictions[final_text] = True + + nbest.append( + _NbestPrediction(text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit) + ) + + # In very rare edge cases we could have no valid predictions. So we + # just create a nonce prediction in this case to avoid failure. 
+ if not nbest: + nbest.append(_NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) + + assert len(nbest) >= 1 + + total_scores = [] + best_non_null_entry = None + for entry in nbest: + total_scores.append(entry.start_logit + entry.end_logit) + if not best_non_null_entry: + if entry.text: + best_non_null_entry = entry + probs = TFSquadV1PostTransform._compute_softmax(total_scores) + + nbest_json = [] + for i, entry in enumerate(nbest): + output = collections.OrderedDict() + output["text"] = entry.text + output["probability"] = probs[i] + output["start_logit"] = entry.start_logit + output["end_logit"] = entry.end_logit + nbest_json.append(output) + + assert len(nbest_json) >= 1 + all_predictions[example.qas_id] = nbest_json[0]["text"] + return (all_predictions, label) + + @staticmethod + def _get_best_indexes(logits, n_best_size): + """Get the n-best logits from a list.""" + index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True) + + best_indexes = [] + for i in range(len(index_and_score)): + if i >= n_best_size: + break + best_indexes.append(index_and_score[i][0]) + return best_indexes + + @staticmethod + def _compute_softmax(scores): + """Compute softmax probability over raw logits.""" + import math + + if not scores: + return [] + + max_score = None + for score in scores: + if max_score is None or score > max_score: + max_score = score + + exp_scores = [] + total_sum = 0.0 + for score in scores: + x = math.exp(score - max_score) + exp_scores.append(x) + total_sum += x + + probs = [] + for score in exp_scores: + probs.append(score / total_sum) + return probs + + @staticmethod + def get_final_text(pred_text, orig_text, do_lower_case): + """Project the tokenized prediction back to the original text.""" + import six + + from tokenization import BasicTokenizer + + def _strip_spaces(text): + ns_chars = [] + ns_to_s_map = collections.OrderedDict() + for i, c in enumerate(text): + if c == " ": + continue + ns_to_s_map[len(ns_chars)] = i + ns_chars.append(c) + ns_text = "".join(ns_chars) + return (ns_text, ns_to_s_map) + + tokenizer = BasicTokenizer(do_lower_case=do_lower_case) + tok_text = " ".join(tokenizer.tokenize(orig_text)) + start_position = tok_text.find(pred_text) + if start_position == -1: + return orig_text + end_position = start_position + len(pred_text) - 1 + + (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text) + (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text) + + if len(orig_ns_text) != len(tok_ns_text): + return orig_text + + tok_s_to_ns_map = {} + for i, tok_index in six.iteritems(tok_ns_to_s_map): + tok_s_to_ns_map[tok_index] = i + + orig_start_position = None + if start_position in tok_s_to_ns_map: + ns_start_position = tok_s_to_ns_map[start_position] + if ns_start_position in orig_ns_to_s_map: + orig_start_position = orig_ns_to_s_map[ns_start_position] + + if orig_start_position is None: + return orig_text + + orig_end_position = None + if end_position in tok_s_to_ns_map: + ns_end_position = tok_s_to_ns_map[end_position] + if ns_end_position in orig_ns_to_s_map: + orig_end_position = orig_ns_to_s_map[ns_end_position] + + if orig_end_position is None: + return orig_text + + output_text = orig_text[orig_start_position : (orig_end_position + 1)] + return output_text + + def __call__(self, sample): + """Call the get_postprocess_result.""" + return self.get_postprocess_result(sample) + + +class CollectTransform(object): + """Postprocess the predictions, collect data.""" + + def __init__(self, length=10833): + """Initialize 
`CollectTransform` class.""" + self.length = length + self.unique_id = [] + self.start_logits = [] + self.end_logits = [] + self.all_sample = (None, None) + self.idx = 1000000000 + + def __call__(self, sample): + """Collect postprocess data.""" + all_results, label = sample + result_list = [np.expand_dims(result, 0) for result in all_results] + for result in result_list: + if len(self.unique_id) < self.length: + result = result.transpose(2, 0, 1) + self.unique_id.append(self.idx) + self.start_logits.append(result[0]) + self.end_logits.append(result[1]) + self.idx += 1 + if len(self.unique_id) == self.length: + self.all_sample = ([self.unique_id, self.start_logits, self.end_logits], label) + return self.all_sample + + +class TFModelZooCollectTransform(CollectTransform): + """Postprocess the predictions of model zoo, collect data.""" + + def __call__(self, sample): + """Collect postprocess data.""" + all_results, label = sample + if len(all_results) == 1: + all_results = all_results.reshape((2, 1, 384)) + all_results = zip(all_results[0], all_results[1]) + for start_logits, end_logits in all_results: + if len(self.unique_id) < self.length: + self.unique_id.append(self.idx) + self.start_logits.append(start_logits) + self.end_logits.append(end_logits) + self.idx += 1 + if len(self.unique_id) == self.length: + self.all_sample = ([self.unique_id, self.start_logits, self.end_logits], label) + return self.all_sample + + +class TFSquadV1ModelZooPostTransform(TFSquadV1PostTransform): + """Postprocess the predictions of bert on SQuADV1.1. + + See class TFSquadV1PostTransform for more details + """ + + def __init__( + self, + label_file, + vocab_file, + n_best_size=20, + max_seq_length=384, + max_query_length=64, + max_answer_length=30, + do_lower_case=True, + doc_stride=128, + ): + """Initialize `TFSquadV1ModelZooPostTransform` class.""" + super().__init__( + label_file, + vocab_file, + n_best_size, + max_seq_length, + max_query_length, + max_answer_length, + do_lower_case, + doc_stride, + ) + self.length = len(self.eval_features) + self.collect_data = TFModelZooCollectTransform(length=self.length) + + def __call__(self, sample): + """Collect data and get postprocess results.""" + sample = self.collect_data(sample) + return self.get_postprocess_result(sample) + + +class ModelZooBertDataLoader(BaseDataLoader): # pragma: no cover + """This dataloader is designed to satisfy the usage of Model Zoo Bert models.""" + + def _generate_dataloader( + self, + dataset, + batch_size, + last_batch, + collate_fn, + sampler, + batch_sampler, + num_workers, + pin_memory, + shuffle, + distributed, + ): + def bert_collate_fn(batch): + input_ids = [] + input_mask = [] + segment_ids = [] + for elem in batch: + input_ids.append(elem[0][0][0]) + input_mask.append(elem[0][1][0]) + segment_ids.append(elem[0][2][0]) + inputs = [input_ids, input_mask, segment_ids] + return inputs, batch[0][1] + + drop_last = False if last_batch == "rollover" else True + sampler = self._generate_sampler(dataset, distributed) + self.batch_sampler = BatchSampler(sampler, batch_size, drop_last) + self.fetcher = IterableFetcher(dataset, bert_collate_fn, drop_last, distributed) + + inputs = [] + for batched_indices in self.batch_sampler: + try: + data = self.fetcher(batched_indices) + yield data + except StopIteration: + return diff --git a/examples/3.x_api/tensorflow/nlp/bert_large_squad_model_zoo/quantization/ptq/main.py b/examples/3.x_api/tensorflow/nlp/bert_large_squad_model_zoo/quantization/ptq/main.py new file mode 100644 index 
00000000000..b5403618f40 --- /dev/null +++ b/examples/3.x_api/tensorflow/nlp/bert_large_squad_model_zoo/quantization/ptq/main.py @@ -0,0 +1,141 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Run BERT on SQuAD 1.1 and SQuAD 2.0.""" +import os +import time + +import numpy as np +import tensorflow as tf + +from data_process import SquadF1, ModelZooBertDataset, TFSquadV1ModelZooPostTransform, ModelZooBertDataLoader + +flags = tf.compat.v1.flags +FLAGS = flags.FLAGS + +## Required parameters +flags.DEFINE_string( + 'input_model', None, 'Run inference with specified pb graph.') + +flags.DEFINE_string( + 'output_model', None, 'The output model of the quantized model.') + +flags.DEFINE_string( + 'mode', 'performance', 'define benchmark mode for accuracy or performance') + +flags.DEFINE_bool( + 'tune', False, 'whether to tune the model') + +flags.DEFINE_bool( + 'benchmark', False, 'whether to benchmark the model') + +flags.DEFINE_bool( + 'strip_iterator', False, 'whether to strip the iterator of the model') + +flags.DEFINE_string('dataset_location', None, + 'location of calibration dataset and evaluate dataset') + +flags.DEFINE_integer("batch_size", 64, "run batch size") + +flags.DEFINE_integer("iters", 100, "The iteration used for benchmark.") + + +def evaluate(model, dataloader, metric, postprocess): + """Custom evaluate function to estimate the accuracy of the bert model. + + Args: + model (tf.Graph_def): The input model graph + + Returns: + accuracy (float): evaluation result, the larger is better. + """ + from neural_compressor.tensorflow.quantization.utils.utility import iterator_sess_run + from neural_compressor.tensorflow.utils import Model, BaseModel + if not isinstance(model, BaseModel): + model = Model(model) + model.input_tensor_names = ['input_ids', 'input_mask', 'segment_ids'] + model.output_tensor_names = ['start_logits', 'end_logits'] + input_tensor = model.input_tensor + output_tensor = model.output_tensor if len(model.output_tensor)>1 else \ + model.output_tensor[0] + iteration = -1 + if FLAGS.benchmark and FLAGS.mode == 'performance': + iteration = FLAGS.iters + + warmup = 5 + latency_list = [] + for idx, (inputs, labels) in enumerate(dataloader): + # dataloader should keep the order and len of inputs same with input_tensor + assert len(input_tensor) == len(inputs), \ + 'inputs len must equal with input_tensor' + feed_dict = dict(zip(input_tensor, inputs)) + start_time = time.time() + predictions = model.sess.run(output_tensor, feed_dict) + latency_list.append(time.time() - start_time) + predictions, labels = postprocess((predictions, labels)) + metric.update(predictions, labels) + if idx + 1 == iteration: + break + + latency = np.array(latency_list[warmup:]).mean() / FLAGS.batch_size + + if FLAGS.benchmark and FLAGS.mode == 'performance': + print("Batch size = {}".format(FLAGS.batch_size)) + print("Latency: {:.3f} ms".format(latency * 1000)) + print("Throughput: {:.3f} images/sec".format(1. 
/ latency)) + acc = metric.result() + return acc + +def main(_): + tf.compat.v1.disable_eager_execution() + tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO) + + data_path = os.path.join(FLAGS.dataset_location, 'eval.tf_record') + label_path = os.path.join(FLAGS.dataset_location, 'dev-v1.1.json') + vocab_path = os.path.join(FLAGS.dataset_location, 'vocab.txt') + + dataset = ModelZooBertDataset(root=data_path, label_file=label_path) + dataloader = ModelZooBertDataLoader(dataset=dataset, batch_size=FLAGS.batch_size) + + def eval(model): + metric = SquadF1() + postprocess = TFSquadV1ModelZooPostTransform(label_file=label_path, vocab_file=vocab_path) + return evaluate(model, dataloader, metric, postprocess) + + if FLAGS.benchmark: + if FLAGS.mode == 'performance': + eval(FLAGS.input_model) + elif FLAGS.mode == 'accuracy': + acc_result = eval(FLAGS.input_model) + print("Batch size = %d" % dataloader.batch_size) + print("Accuracy: %.5f" % acc_result) + + elif FLAGS.tune: + from neural_compressor.tensorflow import StaticQuantConfig, quantize_model, Model + + model = Model(FLAGS.input_model) + model.input_tensor_names = ['input_ids', 'input_mask', 'segment_ids'] + model.output_tensor_names = ['start_logits', 'end_logits'] + quant_config = StaticQuantConfig() + q_model = quantize_model(model, quant_config, dataloader) + q_model.save(FLAGS.output_model) + + dataset = ModelZooBertDataset(root=data_path, label_file=label_path) + dataloader = ModelZooBertDataLoader(dataset=dataset, batch_size=FLAGS.batch_size) + +if __name__ == "__main__": + tf.compat.v1.app.run() diff --git a/examples/3.x_api/tensorflow/nlp/bert_large_squad_model_zoo/quantization/ptq/prepare_dataset.sh b/examples/3.x_api/tensorflow/nlp/bert_large_squad_model_zoo/quantization/ptq/prepare_dataset.sh new file mode 100644 index 00000000000..acae8ce944d --- /dev/null +++ b/examples/3.x_api/tensorflow/nlp/bert_large_squad_model_zoo/quantization/ptq/prepare_dataset.sh @@ -0,0 +1,53 @@ +#!/bin/bash +# set -x + +OUTPUT_DIR="./data" + +help() +{ + cat <<- EOF + Desc: Prepare bert dataset + -h --help help info + --output_dir Output data directory + default: './data' +EOF + exit 0 +} + +function main { + init_params "$@" + convert_dataset +} + +# init params +function init_params { + for var in "$@" + do + case $var in + --output_dir=*) + OUTPUT_DIR=$(echo $var |cut -f2 -d=) + ;; + -h|--help) help + ;; + *) + echo "Error: No such parameter: ${var}" + exit 1 + ;; + esac + done +} + +# convert dataset +function convert_dataset { + if [ ! -d ${OUTPUT_DIR} ]; then + echo '${OUTPUT_DIR} already exists, please check...' 
+ fi + wget https://storage.googleapis.com/bert_models/2019_05_30/wwm_uncased_L-24_H-1024_A-16.zip + unzip wwm_uncased_L-24_H-1024_A-16.zip + wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json -P wwm_uncased_L-24_H-1024_A-16 + mv wwm_uncased_L-24_H-1024_A-16 ${OUTPUT_DIR} + +} + +main "$@" + diff --git a/examples/3.x_api/tensorflow/nlp/bert_large_squad_model_zoo/quantization/ptq/requirements.txt b/examples/3.x_api/tensorflow/nlp/bert_large_squad_model_zoo/quantization/ptq/requirements.txt new file mode 100644 index 00000000000..d42132a4e87 --- /dev/null +++ b/examples/3.x_api/tensorflow/nlp/bert_large_squad_model_zoo/quantization/ptq/requirements.txt @@ -0,0 +1 @@ +intel-tensorflow>=2.12.0 \ No newline at end of file diff --git a/examples/3.x_api/tensorflow/nlp/bert_large_squad_model_zoo/quantization/ptq/run_benchmark.sh b/examples/3.x_api/tensorflow/nlp/bert_large_squad_model_zoo/quantization/ptq/run_benchmark.sh new file mode 100644 index 00000000000..aa8d269a79a --- /dev/null +++ b/examples/3.x_api/tensorflow/nlp/bert_large_squad_model_zoo/quantization/ptq/run_benchmark.sh @@ -0,0 +1,50 @@ +#!/bin/bash +set -x + +function main { + + init_params "$@" + run_benchmark + +} + +# init params +function init_params { + iters=100 + for var in "$@" + do + case $var in + --input_model=*) + input_model=$(echo $var |cut -f2 -d=) + ;; + --mode=*) + mode=$(echo $var |cut -f2 -d=) + ;; + --dataset_location=*) + dataset_location=$(echo $var |cut -f2 -d=) + ;; + --batch_size=*) + batch_size=$(echo $var |cut -f2 -d=) + ;; + --iters=*) + iters=$(echo $var |cut -f2 -d=) + ;; + esac + done + +} + + +# run_tuning +function run_benchmark { + + python main.py \ + --input_model=${input_model} \ + --mode=${mode} \ + --dataset_location=${dataset_location} \ + --batch_size=${batch_size} \ + --benchmark \ + +} + +main "$@" diff --git a/examples/3.x_api/tensorflow/nlp/bert_large_squad_model_zoo/quantization/ptq/run_quant.sh b/examples/3.x_api/tensorflow/nlp/bert_large_squad_model_zoo/quantization/ptq/run_quant.sh new file mode 100644 index 00000000000..ddc30b40177 --- /dev/null +++ b/examples/3.x_api/tensorflow/nlp/bert_large_squad_model_zoo/quantization/ptq/run_quant.sh @@ -0,0 +1,46 @@ +#!/bin/bash +# set -x + +function main { + + init_params "$@" + + run_tuning + +} + +# init params +function init_params { + batch_size=64 + for var in "$@" + do + case $var in + --input_model=*) + input_model=$(echo "$var" |cut -f2 -d=) + ;; + --output_model=*) + output_model=$(echo "$var" |cut -f2 -d=) + ;; + --dataset_location=*) + dataset_location=$(echo $var |cut -f2 -d=) + ;; + --batch_size=*) + batch_size=$(echo $var |cut -f2 -d=) + ;; + esac + done + +} + +# run_tuning +function run_tuning { + python main.py \ + --input_model=${input_model} \ + --output_model=${output_model} \ + --dataset_location=${dataset_location} \ + --batch_size=${batch_size} \ + --tune \ + +} + +main "$@" diff --git a/examples/3.x_api/tensorflow/nlp/bert_large_squad_model_zoo/quantization/ptq/tokenization.py b/examples/3.x_api/tensorflow/nlp/bert_large_squad_model_zoo/quantization/ptq/tokenization.py new file mode 100644 index 00000000000..77c3175db07 --- /dev/null +++ b/examples/3.x_api/tensorflow/nlp/bert_large_squad_model_zoo/quantization/ptq/tokenization.py @@ -0,0 +1,402 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Tokenization classes.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import re +import unicodedata +import six +import tensorflow as tf + + +def validate_case_matches_checkpoint(do_lower_case, init_checkpoint): + """Checks whether the casing config is consistent with the checkpoint name.""" + + # The casing has to be passed in by the user and there is no explicit check + # as to whether it matches the checkpoint. The casing information probably + # should have been stored in the bert_config.json file, but it's not, so + # we have to heuristically detect it to validate. + + if not init_checkpoint: + return + + m = re.match("^.*?([A-Za-z0-9_-]+)/bert_model.ckpt", init_checkpoint) + if m is None: + return + + model_name = m.group(1) + + lower_models = [ + "uncased_L-24_H-1024_A-16", "uncased_L-12_H-768_A-12", + "multilingual_L-12_H-768_A-12", "chinese_L-12_H-768_A-12" + ] + + cased_models = [ + "cased_L-12_H-768_A-12", "cased_L-24_H-1024_A-16", + "multi_cased_L-12_H-768_A-12" + ] + + is_bad_config = False + if model_name in lower_models and not do_lower_case: + is_bad_config = True + actual_flag = "False" + case_name = "lowercased" + opposite_flag = "True" + + if model_name in cased_models and do_lower_case: + is_bad_config = True + actual_flag = "True" + case_name = "cased" + opposite_flag = "False" + + if is_bad_config: + raise ValueError( + "You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. " + "However, `%s` seems to be a %s model, so you " + "should pass in `--do_lower_case=%s` so that the fine-tuning matches " + "how the model was pre-training. If this error is wrong, please " + "just comment out this check." % (actual_flag, init_checkpoint, + model_name, case_name, opposite_flag)) + + +def convert_to_unicode(text): + """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" + if six.PY3: + if isinstance(text, str): + return text + elif isinstance(text, bytes): + return text.decode("utf-8", "ignore") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + elif six.PY2: + if isinstance(text, str): + return text.decode("utf-8", "ignore") + elif isinstance(text, unicode): + return text + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + else: + raise ValueError("Not running on Python2 or Python 3?") + + +def printable_text(text): + """Returns text encoded in a way suitable for print or `tf.logging`.""" + + # These functions want `str` for both Python2 and Python3, but in one case + # it's a Unicode string and in the other it's a byte string. 
+ if six.PY3: + if isinstance(text, str): + return text + elif isinstance(text, bytes): + return text.decode("utf-8", "ignore") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + elif six.PY2: + if isinstance(text, str): + return text + elif isinstance(text, unicode): + return text.encode("utf-8") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + else: + raise ValueError("Not running on Python2 or Python 3?") + + +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + index = 0 + with tf.io.gfile.GFile(vocab_file, "r") as reader: + while True: + token = convert_to_unicode(reader.readline()) + if not token: + break + token = token.strip() + vocab[token] = index + index += 1 + return vocab + + +def convert_by_vocab(vocab, items): + """Converts a sequence of [tokens|ids] using the vocab.""" + output = [] + for item in items: + output.append(vocab[item]) + return output + + +def convert_tokens_to_ids(vocab, tokens): + return convert_by_vocab(vocab, tokens) + + +def convert_ids_to_tokens(inv_vocab, ids): + return convert_by_vocab(inv_vocab, ids) + + +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a piece of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +class FullTokenizer(object): + """Runs end-to-end tokenziation.""" + + def __init__(self, vocab_file, do_lower_case=True): + self.vocab = load_vocab(vocab_file) + self.inv_vocab = {v: k for k, v in self.vocab.items()} + self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) + + def tokenize(self, text): + split_tokens = [] + for token in self.basic_tokenizer.tokenize(text): + for sub_token in self.wordpiece_tokenizer.tokenize(token): + split_tokens.append(sub_token) + + return split_tokens + + def convert_tokens_to_ids(self, tokens): + return convert_by_vocab(self.vocab, tokens) + + def convert_ids_to_tokens(self, ids): + return convert_by_vocab(self.inv_vocab, ids) + + +class BasicTokenizer(object): + """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" + + def __init__(self, do_lower_case=True): + """Constructs a BasicTokenizer. + + Args: + do_lower_case: Whether to lower case the input. + """ + self.do_lower_case = do_lower_case + + def tokenize(self, text): + """Tokenizes a piece of text.""" + text = convert_to_unicode(text) + text = self._clean_text(text) + + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). 
+ text = self._tokenize_chinese_chars(text) + + orig_tokens = whitespace_tokenize(text) + split_tokens = [] + for token in orig_tokens: + if self.do_lower_case: + token = token.lower() + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token)) + + output_tokens = whitespace_tokenize(" ".join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text): + """Splits punctuation on a piece of text.""" + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. + if ((cp >= 0x4E00 and cp <= 0x9FFF) or # + (cp >= 0x3400 and cp <= 0x4DBF) or # + (cp >= 0x20000 and cp <= 0x2A6DF) or # + (cp >= 0x2A700 and cp <= 0x2B73F) or # + (cp >= 0x2B740 and cp <= 0x2B81F) or # + (cp >= 0x2B820 and cp <= 0x2CEAF) or + (cp >= 0xF900 and cp <= 0xFAFF) or # + (cp >= 0x2F800 and cp <= 0x2FA1F)): # + return True + + return False + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xfffd or _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + + +class WordpieceTokenizer(object): + """Runs WordPiece tokenziation.""" + + def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, text): + """Tokenizes a piece of text into its word pieces. + + This uses a greedy longest-match-first algorithm to perform tokenization + using the given vocabulary. + + For example: + input = "unaffable" + output = ["un", "##aff", "##able"] + + Args: + text: A single token or whitespace separated tokens. This should have + already been passed through `BasicTokenizer. + + Returns: + A list of wordpiece tokens. 
+ """ + + text = convert_to_unicode(text) + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = "".join(chars[start:end]) + if start > 0: + substr = "##" + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens + + +def _is_whitespace(char): + """Checks whether `chars` is a whitespace character.""" + # \t, \n, and \r are technically control characters but we treat them + # as whitespace since they are generally considered as such. + if char == " " or char == "\t" or char == "\n" or char == "\r": + return True + cat = unicodedata.category(char) + if cat == "Zs": + return True + return False + + +def _is_control(char): + """Checks whether `chars` is a control character.""" + # These are technically control characters but we count them as whitespace + # characters. + if char == "\t" or char == "\n" or char == "\r": + return False + cat = unicodedata.category(char) + if cat in ("Cc", "Cf"): + return True + return False + + +def _is_punctuation(char): + """Checks whether `chars` is a punctuation character.""" + cp = ord(char) + # We treat all non-letter/number ASCII as punctuation. + # Characters such as "^", "$", and "`" are not in the Unicode + # Punctuation class but we treat them as punctuation anyways, for + # consistency. + if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or + (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): + return True + cat = unicodedata.category(char) + if cat.startswith("P"): + return True + return False diff --git a/examples/3.x_api/tensorflow/nlp/large_language_models/quantization/ptq/gpt-j/README.md b/examples/3.x_api/tensorflow/nlp/large_language_models/quantization/ptq/gpt-j/README.md new file mode 100644 index 00000000000..7a8c22631e0 --- /dev/null +++ b/examples/3.x_api/tensorflow/nlp/large_language_models/quantization/ptq/gpt-j/README.md @@ -0,0 +1,141 @@ +Step-by-Step +============ + +This document is used to list steps of reproducing TensorFlow Intel® Neural Compressor smooth quantization of language models gpt-j-6B. + +# Prerequisite + +## 1. Environment + +### Installation +```shell +# Install Intel® Neural Compressor +pip install neural-compressor +pip install -r requirements.txt +``` + +## 2. Prepare Pretrained model +Run the follow script to download gpt-j-6B saved_model to ```./gpt-j-6B```: + ``` +bash prepare_model.sh + ``` + +## 3. Install TensorFlow 2.11.dev202242 +Build a TensorFlow pip package from [intel-tensorflow spr_ww42 branch](https://github.com/Intel-tensorflow/tensorflow/tree/spr_ww42) and install it. How to build a TensorFlow pip package from source please refer to this [tutorial](https://www.tensorflow.org/install/source). + +The performance of int8 gpt-j-6B would be better once intel-tensorflow for gnr is released. + +## 4. Prepare Dataset +The dataset will be automatically loaded. 
+ +# Run + +## Smooth Quantization + +```shell +bash run_quant.sh --input_model= --output_model= +``` + +## Benchmark + +### Evaluate Performance + +```shell +bash run_benchmark.sh --input_model= --mode=benchmark +``` + +### Evaluate Accuracy + +```shell +bash run_benchmark.sh --input_model= --mode=accuracy +``` + + +Details of enabling Intel® Neural Compressor on gpt-j-6B for TensorFlow +========================= + +This is a tutorial of how to enable gpt-j-6B model with Intel® Neural Compressor. +## User Code Analysis + +User specifies fp32 *model*, calibration dataloader *q_dataloader* and a custom *eval_func* which encapsulates the evaluation dataloader and metric by itself. + +### calib_dataloader Part Adaption +Below dataloader class uses generator function to provide the model with input. + +```python +class MyDataloader: + def __init__(self, dataset, batch_size=1): + self.dataset = dataset + self.batch_size = batch_size + self.length = math.ceil(len(dataset) / self.batch_size) + + def generate_data(self, data, pad_token_id=50256): + input_ids = tf.convert_to_tensor([data[:-1]], dtype=tf.int32) + cur_len = len(data)-1 + input_ids_padding = tf.ones((self.batch_size, 1), dtype=tf.int32) * (pad_token_id or 0) + generated = tf.concat([input_ids, input_ids_padding], axis=-1) + model_kwargs = {'attention_mask': prepare_attention_mask_for_generation(input_ids)} + if model_kwargs.get("past_key_values") is None: + input_ids = generated[:, :cur_len] + else: + input_ids = tf.expand_dims(generated[:, cur_len - 1], -1) + return model_kwargs['attention_mask'], input_ids + + def __iter__(self): + labels = None + for _, data in enumerate(self.dataset): + cur_input = self.generate_data(data) + yield (cur_input, labels) + + def __len__(self): + return self.length +``` + + +### Code Update +After prepare step is done, we add the code for quantization tuning to generate quantized model. + +Firstly, let's load a INC inner class model from the path of gpt-j-6B saved_model. +```python + from neural_compressor import Model + model = Model(run_args.input_model, modelType='llm_saved_model') +``` + +#### Tune + +To apply quantization, the function that maps names from AutoTrackable variables to graph nodes must be defined to match names of nodes in different format. +```python + def weight_name_mapping(name): + """The function that maps name from AutoTrackable variables to graph nodes""" + name = name.replace('tfgptj_for_causal_lm', 'StatefulPartitionedCall') + name = name.replace('kernel:0', 'Tensordot/ReadVariableOp') + return name +``` + +Please use the recipe to set smooth quantization. 
+```python + from neural_compressor.tensorflow import StaticQuantConfig, SmoothQuantConfig, autotune + from neural_compressor.tensorflow.quantization import TuningConfig + from neural_compressor.tensorflow.utils import BaseDataLoader + + calib_dataloader = MyDataloader(mydata, batch_size=run_args.batch_size) + quant_config = [SmoothQuantConfig(alpha=0.52705), StaticQuantConfig(act_dtype="int8", weight_dtype="int8")] + tune_config = TuningConfig(config_set=quant_config, max_trials=1) + model.weight_name_mapping = weight_name_mapping + q_model = autotune(model, + tune_config, + eval_fn=evaluate, + calib_dataloader=calib_dataloader) + q_model.save(run_args.output_model) +``` +#### Benchmark +```python + if run_args.mode == "performance": + evaluate(model.model) + elif run_args.mode == "accuracy": + acc_result = evaluate(model.model) + print("Batch size = %d" % run_args.batch_size) + print("Accuracy: %.5f" % acc_result) +``` + +The Intel® Neural Compressor quantization.fit() function will return a best quantized model under time constraint. \ No newline at end of file diff --git a/examples/3.x_api/tensorflow/nlp/large_language_models/quantization/ptq/gpt-j/main.py b/examples/3.x_api/tensorflow/nlp/large_language_models/quantization/ptq/gpt-j/main.py new file mode 100644 index 00000000000..faf54b65bd0 --- /dev/null +++ b/examples/3.x_api/tensorflow/nlp/large_language_models/quantization/ptq/gpt-j/main.py @@ -0,0 +1,349 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import os +import time +import math +import numpy as np +import logging +import datasets +import tensorflow as tf +from typing import Optional +from itertools import chain +from datasets import load_dataset +from collections import defaultdict +from dataclasses import dataclass, field + +import transformers +from transformers import ( + TF_MODEL_FOR_CAUSAL_LM_MAPPING, + AutoConfig, + AutoTokenizer, + HfArgumentParser, + TFAutoModelForCausalLM, + TFTrainingArguments, + set_seed, +) +from transformers.utils.versions import require_version + +logger = logging.getLogger(__name__) +require_version("datasets>=1.8.0", "To fix: pip install -r requirements.txt") +MODEL_CONFIG_CLASSES = list(TF_MODEL_FOR_CAUSAL_LM_MAPPING.keys()) +MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) + +@dataclass +class ModelArguments: + """ + Arguments pertaining to which model/config/tokenizer we are going to use. + """ + + model_name_or_path: Optional[str] = field( + default="EleutherAI/gpt-j-6B", + metadata={ + "help": ( + "The model checkpoint for GPT-J weights." + ) + }, + ) + config_overrides: Optional[str] = field( + default=None, + metadata={ + "help": ( + "Override some existing default config settings when a model is trained from scratch. 
Example: " + "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index" + ) + }, + ) + checkpoint: Optional[str] = field( + default=None, + metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, + ) + use_fast_tokenizer: bool = field( + default=True, + metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, + ) + precision: Optional[str] = field( + default="fp32", + metadata={"help": "The precision that we want to run with."}, + ) + + + +@dataclass +class DataTrainingArguments: + """ + Arguments pertaining to what data we are going to input our model for evaluation. + """ + + dataset_name: Optional[str] = field( + default="EleutherAI/lambada_openai", metadata={"help": "The name of the dataset to use (via the datasets library)."} + ) + dataset_config_name: Optional[str] = field( + default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} + ) + block_size: Optional[int] = field( + default=None, + metadata={ + "help": ( + "Optional input sequence length after tokenization. " + "The training dataset will be truncated in block of this size for training. " + "Default to the model max input length for single sentence inputs (take into account special tokens)." + ) + }, + ) + preprocessing_num_workers: Optional[int] = field( + default=None, + metadata={"help": "The number of processes to use for the preprocessing."}, + ) + + +@dataclass +class RunningArguments: + """ + Arguments for options of running. + """ + + input_model: Optional[str] = field( + default="./gpt-j-6B", + metadata={ + "help": ( + "The path of input model." + ) + }, + ) + output_model: Optional[str] = field( + default="./nc_int8_gpt-j-6B", + metadata={ + "help": ( + "The path save quantized gpt-j-6B int8 model." + ) + }, + ) + tune: bool = field( + default=False, + metadata={"help": "Whether to apply quantization."}, + ) + benchmark: bool = field( + default=False, + metadata={"help": "Whether to apply benchmarking."}, + ) + mode: Optional[str] = field( + default="performance", + metadata={"help": ("Evaluate performance or accuracy benchmark." 
+ "Set it to be accuracy or performance.")}, + ) + batch_size: Optional[int] = field( + default=1, + metadata={"help": "The number of processes to use for the preprocessing."}, + ) + iteration: Optional[int] = field( + default=200, + metadata={"help": "The number of processes to use for the preprocessing."}, + ) + + + +parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments, RunningArguments)) +model_args, data_args, train_args, run_args = parser.parse_args_into_dataclasses() + +logger.setLevel(logging.INFO) +datasets.utils.logging.set_verbosity_warning() +transformers.utils.logging.set_verbosity_info() + +if train_args.seed is not None: + set_seed(train_args.seed) + +raw_datasets = load_dataset( + data_args.dataset_name, + data_args.dataset_config_name, + cache_dir=model_args.checkpoint, + use_auth_token=None, + ) + +config = AutoConfig.from_pretrained(model_args.model_name_or_path) +tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path) +column_names = raw_datasets["test"].column_names +text_column_name = "text" if "text" in column_names else column_names[0] + +mydata = tokenizer(raw_datasets["test"][text_column_name], return_tensors="np").input_ids + + +def prepare_attention_mask_for_generation( + inputs: tf.Tensor, + pad_token_id=50256, + eos_token_id=50256, +) -> tf.Tensor: + """Generate attention_mask from input_ids. + + Args: + inputs (tf.Tensor): The tensor of input_ids. + + Returns: + attention_mask (tf.Tensor): The tensor of attention_mask. + """ + is_input_ids = len(inputs.shape) == 2 and inputs.dtype in (tf.int32, tf.int64) + is_pad_token_in_inputs = (pad_token_id is not None) and tf.math.reduce_any(inputs == pad_token_id) + is_pad_token_not_equal_to_eos_token_id = (eos_token_id is None) or (pad_token_id != eos_token_id) + + # Check if input is input_ids and padded -> only then is attention_mask defined + attention_mask = tf.cast(tf.math.not_equal(inputs, pad_token_id), dtype=tf.int32) \ + if is_input_ids and is_pad_token_in_inputs and is_pad_token_not_equal_to_eos_token_id \ + else tf.ones(inputs.shape[:2], dtype=tf.int32) + + return attention_mask + +class MyDataloader: + def __init__(self, dataset, batch_size=1): + self.dataset = dataset + self.batch_size = batch_size + self.length = math.ceil(len(dataset) / self.batch_size) + + def generate_data(self, data, pad_token_id=50256): + input_ids = tf.convert_to_tensor([data[:-1]], dtype=tf.int32) + cur_len = len(data)-1 + input_ids_padding = tf.ones((self.batch_size, 1), dtype=tf.int32) * (pad_token_id or 0) + generated = tf.concat([input_ids, input_ids_padding], axis=-1) + model_kwargs = {'attention_mask': prepare_attention_mask_for_generation(input_ids)} + if model_kwargs.get("past_key_values") is None: + input_ids = generated[:, :cur_len] + else: + input_ids = tf.expand_dims(generated[:, cur_len - 1], -1) + return model_kwargs['attention_mask'], input_ids + + def __iter__(self): + labels = None + for _, data in enumerate(self.dataset): + cur_input = self.generate_data(data) + yield (cur_input, labels) + + def __len__(self): + return self.length + +def postprocess(outputs, generated, batch_size, cur_len): + """The function that processes the inference outputs to prediction""" + finished_sequences = tf.convert_to_tensor([False]) + next_token_logits = outputs['logits'][:, -1] + # pre-process distribution + next_tokens_scores = next_token_logits + # argmax + next_tokens = tf.argmax(next_tokens_scores, axis=-1, output_type=tf.int32) + + pad_token_id = 50256 + eos_token_id = [50256] + 
+ unfinished_seq = 1 - tf.cast(finished_sequences, tf.int32) + next_tokens = next_tokens * unfinished_seq + pad_token_id * (1 - unfinished_seq) + next_token_is_eos = tf.math.reduce_any( + tf.equal( + tf.broadcast_to(next_tokens, (len(eos_token_id), batch_size)), tf.expand_dims(eos_token_id, -1) + ), + axis=0, + ) + finished_sequences = finished_sequences | next_token_is_eos + + # update `generated` and `cur_len` + update_indices = tf.stack([tf.range(batch_size), tf.broadcast_to(cur_len, [batch_size])], axis=-1) + return tf.tensor_scatter_nd_update(tensor=generated, indices=update_indices, updates=next_tokens) + +def evaluate(model, tf_eval_dataset=mydata): + """Evaluate function that inference the model to apply calibration or benchmarking. + + Args: + model (tf.python.trackable.autotrackable): The model to be evaluated. + The object is usually gotten by using tf.saved_model.load(model_dir) API. + + Returns: + accuracy (float): The accuracy result. + """ + warmup = 5 + batch_size = run_args.batch_size + pad_token_id = 50256 + iteration = run_args.iteration + correct = 0 + latency_list = [] + from neural_compressor.tensorflow.utils import BaseModel + + if isinstance(model, BaseModel): + model = model.model + infer = model.signatures["serving_default"] + for idx, data in enumerate(tf_eval_dataset): + input_ids = tf.convert_to_tensor([data[:-1]], dtype=tf.int32) + cur_len = len(data)-1 + input_ids_padding = tf.ones((batch_size, 1), dtype=tf.int32) * (pad_token_id or 0) + generated = tf.concat([input_ids, input_ids_padding], axis=-1) + input_ids = generated[:, :cur_len] + attention_mask = prepare_attention_mask_for_generation(input_ids) + inputs = {'input_ids': input_ids, 'attention_mask': attention_mask} + + start = time.time() + outputs = infer(**inputs) + end = time.time() + dur = end-start + + predictions = postprocess(outputs, generated, batch_size, cur_len) + if data[-1] == predictions[0][-1].numpy(): + correct+=1 + + latency_list.append(dur) + if idx >= iteration: + break + latency = np.array(latency_list[warmup:]).mean() / 1 + acc = correct/(iteration+1) + if run_args.benchmark and run_args.mode == 'performance': + print("Batch size = {}".format(run_args.batch_size)) + print("Latency: {:.3f} ms".format(latency * 1000)) + print("Throughput: {:.3f} images/sec".format(1. 
/ latency)) + return acc + +def weight_name_mapping(name): + """The function that maps name from AutoTrackable variables to graph nodes""" + name = name.replace('tfgptj_for_causal_lm', 'StatefulPartitionedCall') + name = name.replace('kernel:0', 'Tensordot/ReadVariableOp') + return name + +def main(): + with train_args.strategy.scope(): + options = tf.data.Options() + options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF + from neural_compressor.tensorflow import Model + model = Model(run_args.input_model, modelType='llm_saved_model') + + if run_args.tune: + from neural_compressor.tensorflow import StaticQuantConfig, SmoothQuantConfig, autotune + from neural_compressor.tensorflow.quantization import TuningConfig + from neural_compressor.tensorflow.utils import BaseDataLoader + + calib_dataloader = MyDataloader(mydata, batch_size=run_args.batch_size) + quant_config = [SmoothQuantConfig(alpha=0.52705), StaticQuantConfig(act_dtype="int8", weight_dtype="int8")] + tune_config = TuningConfig(config_set=quant_config, max_trials=1) + model.weight_name_mapping = weight_name_mapping + q_model = autotune(model, + tune_config, + eval_fn=evaluate, + calib_dataloader=calib_dataloader) + q_model.save(run_args.output_model) + if run_args.benchmark: + if run_args.mode == "performance": + evaluate(model.model) + elif run_args.mode == "accuracy": + acc_result = evaluate(model.model) + print("Batch size = %d" % run_args.batch_size) + print("Accuracy: %.5f" % acc_result) + +if __name__ == "__main__": + main() diff --git a/examples/3.x_api/tensorflow/nlp/large_language_models/quantization/ptq/gpt-j/prepare_model.py b/examples/3.x_api/tensorflow/nlp/large_language_models/quantization/ptq/gpt-j/prepare_model.py new file mode 100644 index 00000000000..cb4cd7f3f29 --- /dev/null +++ b/examples/3.x_api/tensorflow/nlp/large_language_models/quantization/ptq/gpt-j/prepare_model.py @@ -0,0 +1,23 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from transformers import AutoTokenizer, TFAutoModelForCausalLM + +tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B") +model = TFAutoModelForCausalLM.from_pretrained("EleutherAI/gpt-j-6B") +model.save_pretrained("./gpt-j-6B", saved_model=True) \ No newline at end of file diff --git a/examples/3.x_api/tensorflow/nlp/large_language_models/quantization/ptq/gpt-j/prepare_model.sh b/examples/3.x_api/tensorflow/nlp/large_language_models/quantization/ptq/gpt-j/prepare_model.sh new file mode 100644 index 00000000000..67e59f983f5 --- /dev/null +++ b/examples/3.x_api/tensorflow/nlp/large_language_models/quantization/ptq/gpt-j/prepare_model.sh @@ -0,0 +1,6 @@ +pip install transformers==4.25.0 +python prepare_model.py +mv ./gpt-j-6B/saved_model/1 ./ +rm -r ./gpt-j-6B +mv ./1 ./gpt-j-6B +pip install transformers==4.35 \ No newline at end of file diff --git a/examples/3.x_api/tensorflow/nlp/large_language_models/quantization/ptq/gpt-j/requirements.txt b/examples/3.x_api/tensorflow/nlp/large_language_models/quantization/ptq/gpt-j/requirements.txt new file mode 100644 index 00000000000..23c79d8bbd3 --- /dev/null +++ b/examples/3.x_api/tensorflow/nlp/large_language_models/quantization/ptq/gpt-j/requirements.txt @@ -0,0 +1,4 @@ +tensorflow==2.12 +transformers +datasets==2.17 +numpy \ No newline at end of file diff --git a/examples/3.x_api/tensorflow/nlp/large_language_models/quantization/ptq/gpt-j/run_benchmark.sh b/examples/3.x_api/tensorflow/nlp/large_language_models/quantization/ptq/gpt-j/run_benchmark.sh new file mode 100644 index 00000000000..75196199bce --- /dev/null +++ b/examples/3.x_api/tensorflow/nlp/large_language_models/quantization/ptq/gpt-j/run_benchmark.sh @@ -0,0 +1,42 @@ +#!/bin/bash +set -x + +function main { + + init_params "$@" + run_benchmark + +} + +# init params +function init_params { + batch_size=1 + for var in "$@" + do + case $var in + --input_model=*) + input_model=$(echo $var |cut -f2 -d=) + ;; + --mode=*) + mode=$(echo $var |cut -f2 -d=) + ;; + --batch_size=*) + batch_size=$(echo $var |cut -f2 -d=) + ;; + esac + done + +} + +# run_tuning +function run_benchmark { + python main.py \ + --input_model ${input_model} \ + --mode ${mode} \ + --batch_size ${batch_size} \ + --benchmark \ + --output_dir "./outputs" + +} + +main "$@" diff --git a/examples/3.x_api/tensorflow/nlp/large_language_models/quantization/ptq/gpt-j/run_quant.sh b/examples/3.x_api/tensorflow/nlp/large_language_models/quantization/ptq/gpt-j/run_quant.sh new file mode 100644 index 00000000000..e8ad1f1dc19 --- /dev/null +++ b/examples/3.x_api/tensorflow/nlp/large_language_models/quantization/ptq/gpt-j/run_quant.sh @@ -0,0 +1,36 @@ +#!/bin/bash +set -x + +function main { + + init_params "$@" + run_tuning + +} + +# init params +function init_params { + for var in "$@" + do + case $var in + --input_model=*) + input_model=$(echo "$var" |cut -f2 -d=) + ;; + --output_model=*) + output_model=$(echo "$var" |cut -f2 -d=) + ;; + esac + done + +} + +# run_tuning +function run_tuning { + python main.py \ + --input_model=${input_model} \ + --output_model=${output_model} \ + --output_dir="./outputs" \ + --tune +} + +main "$@" diff --git a/examples/3.x_api/tensorflow/nlp/large_language_models/quantization/ptq/smoothquant/README.md b/examples/3.x_api/tensorflow/nlp/large_language_models/quantization/ptq/smoothquant/README.md new file mode 100644 index 00000000000..fa45adbd5ef --- /dev/null +++ b/examples/3.x_api/tensorflow/nlp/large_language_models/quantization/ptq/smoothquant/README.md @@ -0,0 +1,52 @@ 
+Step-by-Step +============ + +This document is used to list steps of reproducing TensorFlow Intel® Neural Compressor quantization and smooth quantization of language models such as OPT and GPT2. + +## Prerequisite + +```shell +# Install Intel® Neural Compressor +pip install neural-compressor +pip install -r requirements +``` +## Run + + +### Basic quantization + +``` +python main.py --model_name_or_path +``` + +`` can be following: + +- gpt2-medium +- facebook/opt-125m + +### Smooth quant + +```shell +bash run_quant.sh --input_model= +``` + +Or you can use + +``` +python main.py --model_name_or_path --sq +``` + +## Benchmark + +### Get the FP32 performance + +```shell +bash run_benchmark.sh --input_model= +``` + +### Get the INT8 performance + +```shell +bash run_benchmark.sh --input_model= --int8=true +``` + diff --git a/examples/3.x_api/tensorflow/nlp/large_language_models/quantization/ptq/smoothquant/benchmark.py b/examples/3.x_api/tensorflow/nlp/large_language_models/quantization/ptq/smoothquant/benchmark.py new file mode 100644 index 00000000000..673d50c034f --- /dev/null +++ b/examples/3.x_api/tensorflow/nlp/large_language_models/quantization/ptq/smoothquant/benchmark.py @@ -0,0 +1,190 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import os.path +import transformers +import tensorflow as tf +from tqdm import tqdm +import sys +import argparse +from datasets import load_dataset +import numpy as np +import time + +sys.path.insert(0, './') + +parser = argparse.ArgumentParser() +parser.add_argument('--int8', action='store_true', help="eval fp32 model or int8 model") +parser.add_argument('--model_name_or_path', type=str, default='facebook/opt-125m') +parser.add_argument('--batch_size', type=int, default=16) +parser.add_argument('--warmup', type=int, default=10) +args = parser.parse_args() + +class Evaluator: + def __init__(self, dataset, tokenizer, device, batch_size=args.batch_size): + self.dataset = dataset + self.tokenizer = tokenizer + self.device = device + self.dataloader = INCDataloader(dataset, tokenizer, batch_size, device) + + def evaluate(self, model): + # model.eval() + # The task is to predict the last word of the input. 
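+        # For each batch, the logits at the last non-pad position are selected, the argmax
+        # over the vocabulary is taken as the predicted word, and accuracy is hit / total.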
+ total, hit = 0, 0 + index = 1 + for input_ids, label, label_indices in tqdm(self.dataloader): + # TFCausalLMOutputWithPast len: 2 + # first element shape (16, 196, 50272) + # second element shape (16, 12, 196, 64) + outputs = model(input_ids) + last_token_logits = outputs[0].numpy()[np.arange(len(label_indices)), label_indices, :] + pred = last_token_logits.argmax(axis=-1) + total += label.shape[0] + hit += (pred == label.numpy()).sum().item() + index += 1 + acc = hit / total + print(acc, flush=True) + return acc + + def get_attention_mask(self, input_ids): + return tf.constant(1 - (input_ids==1).numpy().astype(int)) + + def evaluate_tf_v1(self, model): + total, hit = 0, 0 + index = 1 + infer = model.signatures["serving_default"] + overall_infer_duration = 0 + for input_ids, label, label_indices in tqdm(self.dataloader): + attention_mask = self.get_attention_mask(input_ids) + input_ids = tf.constant(input_ids.numpy(), dtype=infer.inputs[0].dtype) + attention_mask = tf.constant(attention_mask.numpy(), dtype=infer.inputs[0].dtype) + start = time.time() + results = infer(input_ids=input_ids, attention_mask=attention_mask) # len: 25 Identity: [16, 196, 50272], Identity_1: [16, 12, 196, 64] + batch_infer_time = time.time() - start + if index > args.warmup: + overall_infer_duration += batch_infer_time + last_token_logits = results['Identity'].numpy()[np.arange(len(label_indices)), label_indices, :] + pred = last_token_logits.argmax(axis=-1) + total += label.shape[0] + hit += (pred == label.numpy()).sum().item() + index += 1 + acc = hit / total + print("\nEvaluation result: ") + print(f"Batch size = {args.batch_size}") + print(f"Accuracy: {acc}") + print( + f"Throughput: {(len(self.dataloader) - args.warmup * args.batch_size) / overall_infer_duration} samples/sec" + ) + +class INCDataloader: + # for_calib=True in quantization, only input_id is needed, =False in evaluation need label + def __init__(self, dataset, tokenizer, batch_size=1, device='cpu', for_calib=False): + self.dataset = dataset + self.tokenizer = tokenizer + self.device = device + self.batch_size = batch_size + self.for_calib = for_calib + import math + self.length = math.ceil(len(dataset) / self.batch_size) # batch number + self.pad_len = 196 + + # tokenize the dataset + def tokenize_function(examples): + example = self.tokenizer(examples['text']) + return example + + self.dataset = self.dataset.map(tokenize_function, batched=True) + self.dataset.set_format(type='tensorflow', columns=['input_ids']) + def get_attention_mask(self, input_ids): + return 1 - (input_ids==1).numpy().astype(int) + def pad_input(self, input): # input: a record + input_id = input['input_ids'] + if input_id.numpy().shape[0] > self.pad_len: # truncate the sequence to pad_len if the sequence is longer than pad_len + input_id = input_id[:self.pad_len] + label = input_id[-1] + pad_len = self.pad_len - input_id.numpy().shape[0] + label_index = -2 - pad_len # last logit index + input_id = tf.pad(input_id, tf.constant([[0,pad_len]]), constant_values=1) + input_id = tf.expand_dims(input_id, axis=0) + label = tf.expand_dims(label, axis=0) + return (input_id, label, label_index) + + def __iter__(self): + if self.for_calib: + labels = None + # label_indices = None + for idx, record in enumerate(self.dataset): + input_id, label, label_index = self.pad_input(record) + attention_mask = self.get_attention_mask(input_id) + # compose attention_mask and input_id together + # during the calibration, it requires to yield a + # cur_input = tf.constant(np.append(attention_mask, 
input_id.numpy(), axis=0)) + cur_input = {"input_ids": input_id.numpy(), "attention_mask": attention_mask} + assert self.batch_size == 1 + yield (cur_input, label) + else: + input_ids = None + labels = None + label_indices = None + for idx, record in enumerate(self.dataset): + input_id, label, label_index = self.pad_input(record) + if input_ids is None: + input_ids = input_id + labels = label + label_indices = [label_index] + else: + input_ids = tf.concat([input_ids, input_id], 0) + labels = tf.concat([labels, label], 0) + + label_indices.append(label_index) + + if (idx + 1) % self.batch_size == 0: + yield (input_ids, labels, label_indices) + input_ids = None + labels = None + label_indices = None + if (idx + 1) % self.batch_size != 0: + yield (input_ids, labels, label_indices) + + def __len__(self): + return self.length + +from datasets import load_dataset + +model_name = args.model_name_or_path +tokenizer = transformers.AutoTokenizer.from_pretrained( + model_name, +) +eval_dataset = load_dataset('lambada', split='validation') + +evaluator = Evaluator(eval_dataset, tokenizer, 'cpu') + +if args.int8: + print("benchmarking int8 model") + int8_folder = model_name.split('/')[-1] + "_int8" + if not os.path.exists(int8_folder): + print(f"could not find int8 folder {int8_folder} ") + exit() + model = tf.saved_model.load(int8_folder) # tensorflow.python.trackable.autotrackable.AutoTrackable object +else: + print("benchmaking fp32 model") + model = transformers.TFAutoModelForCausalLM.from_pretrained(model_name) + from neural_compressor.tensorflow import Model + + model = Model(model).model # tensorflow.python.trackable.autotrackable.AutoTrackable object + +evaluator.evaluate_tf_v1(model) diff --git a/examples/3.x_api/tensorflow/nlp/large_language_models/quantization/ptq/smoothquant/main.py b/examples/3.x_api/tensorflow/nlp/large_language_models/quantization/ptq/smoothquant/main.py new file mode 100644 index 00000000000..8f012ceb404 --- /dev/null +++ b/examples/3.x_api/tensorflow/nlp/large_language_models/quantization/ptq/smoothquant/main.py @@ -0,0 +1,140 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +import os +import transformers +import tensorflow as tf +from tqdm import tqdm +import sys +import argparse +from datasets import load_dataset +import numpy as np + +sys.path.insert(0, './') + +parser = argparse.ArgumentParser() +parser.add_argument('--sq', action='store_true', default=False, help="whether to use smooth quant") +parser.add_argument('--model_name_or_path', type=str, default="facebook/opt-125m") +parser.add_argument('--alpha', type=float, default=0.5, help="alpha value for smoothing.") +parser.add_argument('--log_frequency', type=int, default=100) +parser.add_argument('--batch_size', type=int, default=16) +parser.add_argument('--kl', action='store_true', default=False, help="whether to use kl divergence for calibration") +parser.add_argument('--fallback_add', action='store_true', default=False, help="Whether to add fp32 fallback option" ) +args = parser.parse_args() + +class CustomDataloader: + # for_calib=True in quantization, only input_id is needed, =False in evaluation need label + def __init__(self, dataset, tokenizer, batch_size=1, device='cpu', for_calib=False): + self.dataset = dataset + self.tokenizer = tokenizer + self.device = device + self.batch_size = batch_size + self.for_calib = for_calib + import math + self.length = math.ceil(len(dataset) / self.batch_size) # batch number + self.pad_len = 196 + + # tokenize the dataset + def tokenize_function(examples): + example = self.tokenizer(examples['text']) + return example + + self.dataset = self.dataset.map(tokenize_function, batched=True) + self.dataset.set_format(type='tensorflow', columns=['input_ids']) + def get_attention_mask(self, input_ids): + return 1 - (input_ids==1).numpy().astype(int) + def pad_input(self, input): # input: a record + input_id = input['input_ids'] + if input_id.numpy().shape[0] > self.pad_len: # truncate the sequence to pad_len if the sequence is longer than pad_len + input_id = input_id[:self.pad_len] + label = input_id[-1] + pad_len = self.pad_len - input_id.numpy().shape[0] + label_index = -2 - pad_len # last logit index + input_id = tf.pad(input_id, tf.constant([[0,pad_len]]), constant_values=1) # TODO need to check why pad with 1 + input_id = tf.expand_dims(input_id, axis=0) + label = tf.expand_dims(label, axis=0) + return (input_id, label, label_index) + + def __iter__(self): + if self.for_calib: + labels = None + for idx, record in enumerate(self.dataset): + input_id, label, label_index = self.pad_input(record) + attention_mask = self.get_attention_mask(input_id) + cur_input = {"input_ids": input_id.numpy(), "attention_mask": attention_mask} + assert self.batch_size == 1 + yield (cur_input, label) + else: + input_ids = None + labels = None + label_indices = None + for idx, record in enumerate(self.dataset): + input_id, label, label_index = self.pad_input(record) + if input_ids is None: + input_ids = input_id + labels = label + label_indices = [label_index] + else: + input_ids = tf.concat([input_ids, input_id], 0) + labels = tf.concat([labels, label], 0) + + label_indices.append(label_index) + + if (idx + 1) % self.batch_size == 0: + yield (input_ids, labels, label_indices) + input_ids = None + labels = None + label_indices = None + if (idx + 1) % self.batch_size != 0: + yield (input_ids, labels, label_indices) + + def __len__(self): + return self.length + + +model_name = args.model_name_or_path + +tokenizer = transformers.AutoTokenizer.from_pretrained(model_name) +model = transformers.TFAutoModelForCausalLM.from_pretrained(model_name) + +calib_dataset = load_dataset('lambada', 
split='validation') +calib_dataset = calib_dataset.shuffle(seed=42) +calib_dataloader = CustomDataloader(calib_dataset, tokenizer, device='cpu', batch_size=1, for_calib=True) + +from neural_compressor.tensorflow import StaticQuantConfig, SmoothQuantConfig, quantize_model + +ptq_config = None +quant_config = [] + +if args.sq: + quant_config.append(SmoothQuantConfig(alpha=args.alpha)) +if args.kl: + ptq_config = StaticQuantConfig(act_dtype="int8", weight_dtype="int8", act_algorithm="kl") +if args.fallback_add: + ptq_config = StaticQuantConfig(act_dtype="int8", weight_dtype="int8") + ptq_config.set_local("Add", StaticQuantConfig(act_dtype="fp32", weight_dtype="fp32")) + +if not ptq_config: + ptq_config = StaticQuantConfig(act_dtype="int8", weight_dtype="int8") +quant_config.append(ptq_config) + +q_model = quantize_model(model, + quant_config, + calib_dataloader=calib_dataloader) + +save_model_name = model_name.split("/")[-1] +q_model.save(f"{save_model_name}_int8") diff --git a/examples/3.x_api/tensorflow/nlp/large_language_models/quantization/ptq/smoothquant/requirements.txt b/examples/3.x_api/tensorflow/nlp/large_language_models/quantization/ptq/smoothquant/requirements.txt new file mode 100644 index 00000000000..3486c09473c --- /dev/null +++ b/examples/3.x_api/tensorflow/nlp/large_language_models/quantization/ptq/smoothquant/requirements.txt @@ -0,0 +1,3 @@ +tensorflow==2.15 +datasets +transformers==4.35 \ No newline at end of file diff --git a/examples/3.x_api/tensorflow/nlp/large_language_models/quantization/ptq/smoothquant/run_benchmark.sh b/examples/3.x_api/tensorflow/nlp/large_language_models/quantization/ptq/smoothquant/run_benchmark.sh new file mode 100644 index 00000000000..b8fad17eebd --- /dev/null +++ b/examples/3.x_api/tensorflow/nlp/large_language_models/quantization/ptq/smoothquant/run_benchmark.sh @@ -0,0 +1,47 @@ +#!/bin/bash +set -x + +function main { + + init_params "$@" + run_benchmark + +} + +# init params +function init_params { + int8=false + batch_size=16 + for var in "$@" + do + case $var in + --input_model=*) + input_model=$(echo $var |cut -f2 -d=) + ;; + --int8=*) + int8=$(echo $var |cut -f2 -d=) + ;; + --batch_size=*) + batch_size=$(echo $var |cut -f2 -d=) + ;; + esac + done + +} + +# run_benchmark +function run_benchmark { + if [[ "${int8}" == "true" ]]; then + python benchmark.py \ + --model_name_or_path ${input_model} \ + --batch_size ${batch_size} \ + --int8 + else + python benchmark.py \ + --model_name_or_path ${input_model} \ + --batch_size ${batch_size} + fi + +} + +main "$@" \ No newline at end of file diff --git a/examples/3.x_api/tensorflow/nlp/large_language_models/quantization/ptq/smoothquant/run_quant.sh b/examples/3.x_api/tensorflow/nlp/large_language_models/quantization/ptq/smoothquant/run_quant.sh new file mode 100644 index 00000000000..4295060acb9 --- /dev/null +++ b/examples/3.x_api/tensorflow/nlp/large_language_models/quantization/ptq/smoothquant/run_quant.sh @@ -0,0 +1,39 @@ +#!/bin/bash +set -x + +function main { + + init_params "$@" + run_tuning + +} + +# init params +function init_params { + for var in "$@" + do + case $var in + --input_model=*) + input_model=$(echo $var |cut -f2 -d=) + ;; + --sq=*) + sq=$(echo ${var} |cut -f2 -d=) + ;; + esac + done + +} + +# run_tuning +function run_tuning { + + ext_cmd="" + if [[ ${sq} == "True" ]]; then + ext_cmd="--sq" + fi + python main.py \ + --model_name_or_path ${input_model} \ + ${ext_cmd} +} + +main "$@" \ No newline at end of file diff --git
a/examples/tensorflow/nlp/large_language_models/quantization/ptq/smoothquant/main.py b/examples/tensorflow/nlp/large_language_models/quantization/ptq/smoothquant/main.py index 78c91f446bb..b88cd9f7a09 100644 --- a/examples/tensorflow/nlp/large_language_models/quantization/ptq/smoothquant/main.py +++ b/examples/tensorflow/nlp/large_language_models/quantization/ptq/smoothquant/main.py @@ -188,7 +188,6 @@ def eval_func(model): from neural_compressor.config import AccuracyCriterion from neural_compressor import quantization -os.environ["TF_USE_LEGACY_KERAS"]="False" recipes = {} if args.sq: recipes = {"smooth_quant": True, "smooth_quant_args": {'alpha': args.alpha}} diff --git a/neural_compressor/tensorflow/quantization/__init__.py b/neural_compressor/tensorflow/quantization/__init__.py index c675d8ae2dc..e9b0f25ffa4 100644 --- a/neural_compressor/tensorflow/quantization/__init__.py +++ b/neural_compressor/tensorflow/quantization/__init__.py @@ -13,7 +13,7 @@ # limitations under the License. from neural_compressor.tensorflow.quantization.quantize import quantize_model -from neural_compressor.tensorflow.quantization.autotune import autotune, get_all_config_set +from neural_compressor.tensorflow.quantization.autotune import autotune, get_all_config_set, TuningConfig from neural_compressor.tensorflow.quantization.algorithm_entry import static_quant_entry, smooth_quant_entry from neural_compressor.tensorflow.quantization.config import ( StaticQuantConfig, diff --git a/neural_compressor/tensorflow/quantization/autotune.py b/neural_compressor/tensorflow/quantization/autotune.py index 5bd588c0c0c..ab0d3a61949 100644 --- a/neural_compressor/tensorflow/quantization/autotune.py +++ b/neural_compressor/tensorflow/quantization/autotune.py @@ -23,7 +23,7 @@ from neural_compressor.common.utils import dump_elapsed_time from neural_compressor.tensorflow.quantization import quantize_model from neural_compressor.tensorflow.quantization.config import FRAMEWORK_NAME, StaticQuantConfig -from neural_compressor.tensorflow.utils import BaseModel, constants +from neural_compressor.tensorflow.utils import BaseModel, Model, constants __all__ = [ "autotune", @@ -45,6 +45,7 @@ def autotune( calib_iteration: int = 100, ) -> Optional[BaseModel]: """The main entry of auto-tune.""" + model = Model(model) best_quant_model = None eval_func_wrapper = EvaluationFuncWrapper(eval_fn, eval_args) config_loader, tuning_logger, tuning_monitor = init_tuning(tuning_config=tune_config) diff --git a/neural_compressor/tensorflow/quantization/config.py b/neural_compressor/tensorflow/quantization/config.py index a49832b6bad..752f8d4ecbe 100644 --- a/neural_compressor/tensorflow/quantization/config.py +++ b/neural_compressor/tensorflow/quantization/config.py @@ -134,8 +134,7 @@ def register_supported_configs(cls) -> List[OperatorConfig]: supported_configs.append(OperatorConfig(config=static_quant_config, operators=operators)) cls.supported_configs = supported_configs - @staticmethod - def get_model_info(model) -> List[Tuple[str, Callable]]: + def get_model_info(self, model) -> List[Tuple[str, Callable]]: white_list = [ "Conv2D", "FusedBatchNormV3", @@ -154,6 +153,9 @@ def get_model_info(model) -> List[Tuple[str, Callable]]: "Conv2DBackpropInput", "Conv3DBackpropInputV2", ] + for key in self._local_config.keys(): + if key in white_list: + white_list.remove(key) filter_result = [] for node in model.graph_def.node: if node.op in white_list: diff --git a/neural_compressor/tensorflow/utils/model_wrappers.py 
b/neural_compressor/tensorflow/utils/model_wrappers.py index b9fc4a54a63..a2b65b7ad96 100644 --- a/neural_compressor/tensorflow/utils/model_wrappers.py +++ b/neural_compressor/tensorflow/utils/model_wrappers.py @@ -86,7 +86,7 @@ def get_model_type(model): return "keras" else: # otherwise, the backend will fallback to tensorflow_itex - return "AutoTrackable" + return "saved_model" if isinstance(model, tf.Graph): return "graph" elif isinstance(model, tf.compat.v1.GraphDef):