From d99068c6c1b50a96bd4a9d06cd1b5a4ede49c1d6 Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Tue, 14 May 2019 05:16:08 +0000 Subject: [PATCH 01/11] rename folder --- .../bert/{ => export}/staticbert/__init__.py | 0 .../{ => export}/staticbert/static_bert.py | 0 .../staticbert/static_bert_qa_model.py | 0 .../staticbert/static_export_base.py | 0 .../staticbert/static_export_squad.py | 0 .../bert/staticbert/static_finetune_squad.py | 530 ------------------ 6 files changed, 530 deletions(-) rename scripts/bert/{ => export}/staticbert/__init__.py (100%) rename scripts/bert/{ => export}/staticbert/static_bert.py (100%) rename scripts/bert/{ => export}/staticbert/static_bert_qa_model.py (100%) rename scripts/bert/{ => export}/staticbert/static_export_base.py (100%) rename scripts/bert/{ => export}/staticbert/static_export_squad.py (100%) delete mode 100644 scripts/bert/staticbert/static_finetune_squad.py diff --git a/scripts/bert/staticbert/__init__.py b/scripts/bert/export/staticbert/__init__.py similarity index 100% rename from scripts/bert/staticbert/__init__.py rename to scripts/bert/export/staticbert/__init__.py diff --git a/scripts/bert/staticbert/static_bert.py b/scripts/bert/export/staticbert/static_bert.py similarity index 100% rename from scripts/bert/staticbert/static_bert.py rename to scripts/bert/export/staticbert/static_bert.py diff --git a/scripts/bert/staticbert/static_bert_qa_model.py b/scripts/bert/export/staticbert/static_bert_qa_model.py similarity index 100% rename from scripts/bert/staticbert/static_bert_qa_model.py rename to scripts/bert/export/staticbert/static_bert_qa_model.py diff --git a/scripts/bert/staticbert/static_export_base.py b/scripts/bert/export/staticbert/static_export_base.py similarity index 100% rename from scripts/bert/staticbert/static_export_base.py rename to scripts/bert/export/staticbert/static_export_base.py diff --git a/scripts/bert/staticbert/static_export_squad.py b/scripts/bert/export/staticbert/static_export_squad.py similarity index 100% rename from scripts/bert/staticbert/static_export_squad.py rename to scripts/bert/export/staticbert/static_export_squad.py diff --git a/scripts/bert/staticbert/static_finetune_squad.py b/scripts/bert/staticbert/static_finetune_squad.py deleted file mode 100644 index 91db1f8268..0000000000 --- a/scripts/bert/staticbert/static_finetune_squad.py +++ /dev/null @@ -1,530 +0,0 @@ -# coding=utf-8 - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# pylint:disable=redefined-outer-name,logging-format-interpolation -""" -SQuAD with Static Bidirectional Encoder Representations from Transformers (BERT) - -========================================================================================= - -This example shows how to finetune a model with pre-trained BERT parameters with static shape for -SQuAD, with Gluon NLP Toolkit. - -@article{devlin2018bert, - title={BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding}, - author={Devlin, Jacob and Chang, Ming- \ - Wei and Lee, Kenton and Toutanova, Kristina}, - journal={arXiv preprint arXiv:1810.04805}, - year={2018} -} -""" -import argparse -import collections -import json -import logging -import os -import random -import time -import warnings -import sys - -import numpy as np -import mxnet as mx -from mxnet import gluon, nd - -import gluonnlp as nlp - -from static_bert_qa_model import BertForQALoss, StaticBertForQA -from bert_qa_dataset import (SQuADTransform, preprocess_dataset) -from bert_qa_evaluate import get_F1_EM, predictions -from static_bert import get_model - -sys.path.append('..') - -np.random.seed(6) -random.seed(6) -mx.random.seed(6) - -log = logging.getLogger('gluonnlp') -log.setLevel(logging.DEBUG) -formatter = logging.Formatter( - fmt='%(levelname)s:%(name)s:%(asctime)s %(message)s', datefmt='%H:%M:%S') - -parser = argparse.ArgumentParser(description='BERT QA example.' - 'We fine-tune the BERT model on SQuAD dataset.') - -parser.add_argument('--only_predict', - action='store_true', - help='Whether to predict only.') - -parser.add_argument('--model_parameters', - type=str, - default=None, - help='Model parameter file') - -parser.add_argument('--bert_model', - type=str, - default='bert_12_768_12', - help='BERT model name. options are bert_12_768_12 and bert_24_1024_16.') - -parser.add_argument('--bert_dataset', - type=str, - default='book_corpus_wiki_en_uncased', - help='BERT dataset name.' - 'options are book_corpus_wiki_en_uncased and book_corpus_wiki_en_cased.') - -parser.add_argument('--pretrained_bert_parameters', - type=str, - default=None, - help='Pre-trained bert model parameter file. default is None') - -parser.add_argument('--uncased', - action='store_false', - help='if not set, inputs are converted to lower case.') - -parser.add_argument('--output_dir', - type=str, - default='./output_dir', - help='The output directory where the model params will be written.' - ' default is ./output_dir') - -parser.add_argument('--epochs', - type=int, - default=3, - help='number of epochs, default is 3') - -parser.add_argument('--batch_size', - type=int, - default=32, - help='Batch size. Number of examples per gpu in a minibatch. default is 32') - -parser.add_argument('--test_batch_size', - type=int, - default=24, - help='Test batch size. default is 24') - -parser.add_argument('--optimizer', - type=str, - default='bertadam', - help='optimization algorithm. default is bertadam(mxnet >= 1.5.0.)') - -parser.add_argument('--accumulate', - type=int, - default=None, - help='The number of batches for ' - 'gradients accumulation to simulate large batch size. Default is None') - -parser.add_argument('--lr', - type=float, - default=5e-5, - help='Initial learning rate. default is 5e-5') - -parser.add_argument('--warmup_ratio', - type=float, - default=0.1, - help='ratio of warmup steps that linearly increase learning rate from ' - '0 to target learning rate. default is 0.1') - -parser.add_argument('--log_interval', - type=int, - default=50, - help='report interval. 
default is 50') - -parser.add_argument('--max_seq_length', - type=int, - default=384, - help='The maximum total input sequence length after WordPiece tokenization.' - 'Sequences longer than this will be truncated, and sequences shorter ' - 'than this will be padded. default is 384') - -parser.add_argument('--doc_stride', - type=int, - default=128, - help='When splitting up a long document into chunks, how much stride to ' - 'take between chunks. default is 128') - -parser.add_argument('--max_query_length', - type=int, - default=64, - help='The maximum number of tokens for the question. Questions longer than ' - 'this will be truncated to this length. default is 64') - -parser.add_argument('--n_best_size', - type=int, - default=20, - help='The total number of n-best predictions to generate in the ' - 'nbest_predictions.json output file. default is 20') - -parser.add_argument('--max_answer_length', - type=int, - default=30, - help='The maximum length of an answer that can be generated. This is needed ' - 'because the start and end predictions are not conditioned on one another.' - ' default is 30') - -parser.add_argument('--version_2', - action='store_true', - help='SQuAD examples whether contain some that do not have an answer.') - -parser.add_argument('--null_score_diff_threshold', - type=float, - default=0.0, - help='If null_score - best_non_null is greater than the threshold predict null.' - 'Typical values are between -1.0 and -5.0. default is 0.0') - -parser.add_argument('--gpu', type=str, help='single gpu id') - -parser.add_argument('--seq_length', - type=int, - default=384, - help='The sequence length of the input') - -parser.add_argument('--input_size', - type=int, - default=768, - help='The embedding size of the input') - -parser.add_argument('--export', - action='store_true', - help='Whether to export the model.') - -args = parser.parse_args() - - -output_dir = args.output_dir -if not os.path.exists(output_dir): - os.mkdir(output_dir) - -fh = logging.FileHandler(os.path.join( - args.output_dir, 'static_finetune_squad.log'), mode='w') -fh.setLevel(logging.INFO) -fh.setFormatter(formatter) -console = logging.StreamHandler() -console.setLevel(logging.INFO) -console.setFormatter(formatter) -log.addHandler(console) -log.addHandler(fh) - -log.info(args) - -model_name = args.bert_model -dataset_name = args.bert_dataset -only_predict = args.only_predict -model_parameters = args.model_parameters -pretrained_bert_parameters = args.pretrained_bert_parameters -lower = args.uncased - -epochs = args.epochs -batch_size = args.batch_size -test_batch_size = args.test_batch_size -lr = args.lr -ctx = mx.cpu() if not args.gpu else mx.gpu(int(args.gpu)) - -accumulate = args.accumulate -log_interval = args.log_interval * accumulate if accumulate else args.log_interval -if accumulate: - log.info('Using gradient accumulation. Effective batch size = {}'. 
- format(accumulate * batch_size)) - -optimizer = args.optimizer -warmup_ratio = args.warmup_ratio - -version_2 = args.version_2 -null_score_diff_threshold = args.null_score_diff_threshold - -max_seq_length = args.max_seq_length -doc_stride = args.doc_stride -max_query_length = args.max_query_length -n_best_size = args.n_best_size -max_answer_length = args.max_answer_length - -if max_seq_length <= max_query_length + 3: - raise ValueError('The max_seq_length (%d) must be greater than max_query_length ' - '(%d) + 3' % (max_seq_length, max_query_length)) - -bert, vocab = get_model( - name=model_name, - dataset_name=dataset_name, - pretrained=not model_parameters and not pretrained_bert_parameters, - ctx=ctx, - use_pooler=False, - use_decoder=False, - use_classifier=False, - input_size=args.input_size, - seq_length=args.seq_length) - -batchify_fn = nlp.data.batchify.Tuple( - nlp.data.batchify.Stack(), - nlp.data.batchify.Pad(axis=0, pad_val=vocab[vocab.padding_token]), - nlp.data.batchify.Pad(axis=0, pad_val=vocab[vocab.padding_token]), - nlp.data.batchify.Stack('float32'), - nlp.data.batchify.Stack('float32'), - nlp.data.batchify.Stack('float32')) - -berttoken = nlp.data.BERTTokenizer(vocab=vocab, lower=lower) - - -############################################################################### -# Hybridize the model # -############################################################################### -net = StaticBertForQA(bert=bert) -if pretrained_bert_parameters and not model_parameters: - bert.load_parameters(pretrained_bert_parameters, ctx=ctx, - ignore_extra=True) -if not model_parameters: - net.span_classifier.initialize(init=mx.init.Normal(0.02), ctx=ctx) -else: - net.load_parameters(model_parameters, ctx=ctx) -net.hybridize(static_alloc=True, static_shape=True) - -loss_function = BertForQALoss() -loss_function.hybridize(static_alloc=True, static_shape=True) - - -def train(): - """Training function.""" - log.info('Loader Train data...') - if version_2: - train_data = nlp.data.SQuAD('train', version='2.0') - else: - train_data = nlp.data.SQuAD('train', version='1.1') - log.info('Number of records in Train data:{}'.format(len(train_data))) - - train_data_transform, _ = preprocess_dataset( - train_data, SQuADTransform( - berttoken, - max_seq_length=max_seq_length, - doc_stride=doc_stride, - max_query_length=max_query_length, - is_pad=True, - is_training=True)) - log.info('The number of examples after preprocessing:{}'.format( - len(train_data_transform))) - - train_dataloader = mx.gluon.data.DataLoader( - train_data_transform, batchify_fn=batchify_fn, - batch_size=batch_size, num_workers=4, shuffle=True) - - log.info('Start Training') - - optimizer_params = {'learning_rate': lr} - try: - trainer = gluon.Trainer(net.collect_params(), optimizer, - optimizer_params, update_on_kvstore=False) - except ValueError as e: - print(e) - warnings.warn('AdamW optimizer is not found. Please consider upgrading to ' - 'mxnet>=1.5.0. 
Now the original Adam optimizer is used instead.') - trainer = gluon.Trainer(net.collect_params(), 'adam', - optimizer_params, update_on_kvstore=False) - - num_train_examples = len(train_data_transform) - step_size = batch_size * accumulate if accumulate else batch_size - num_train_steps = int(num_train_examples / step_size * epochs) - num_warmup_steps = int(num_train_steps * warmup_ratio) - step_num = 0 - - def set_new_lr(step_num, batch_id): - """set new learning rate""" - # set grad to zero for gradient accumulation - if accumulate: - if batch_id % accumulate == 0: - net.collect_params().zero_grad() - step_num += 1 - else: - step_num += 1 - # learning rate schedule - # Notice that this learning rate scheduler is adapted from traditional linear learning - # rate scheduler where step_num >= num_warmup_steps, new_lr = 1 - step_num/num_train_steps - if step_num < num_warmup_steps: - new_lr = lr * step_num / num_warmup_steps - else: - offset = (step_num - num_warmup_steps) * lr / \ - (num_train_steps - num_warmup_steps) - new_lr = lr - offset - trainer.set_learning_rate(new_lr) - return step_num - - # Do not apply weight decay on LayerNorm and bias terms - for _, v in net.collect_params('.*beta|.*gamma|.*bias').items(): - v.wd_mult = 0.0 - # Collect differentiable parameters - params = [p for p in net.collect_params().values() - if p.grad_req != 'null'] - # Set grad_req if gradient accumulation is required - if accumulate: - for p in params: - p.grad_req = 'add' - - epoch_tic = time.time() - total_num = 0 - log_num = 0 - for epoch_id in range(epochs): - step_loss = 0.0 - tic = time.time() - for batch_id, data in enumerate(train_dataloader): - # set new lr - step_num = set_new_lr(step_num, batch_id) - # forward and backward - with mx.autograd.record(): - _, inputs, token_types, valid_length, start_label, end_label = data - - log_num += len(inputs) - total_num += len(inputs) - - out = net(inputs.astype('float32').as_in_context(ctx), - token_types.astype('float32').as_in_context(ctx), - valid_length.astype('float32').as_in_context(ctx)) - - ls = loss_function(out, [ - start_label.astype('float32').as_in_context(ctx), - end_label.astype('float32').as_in_context(ctx)]).mean() - - if accumulate: - ls = ls / accumulate - ls.backward() - # update - if not accumulate or (batch_id + 1) % accumulate == 0: - trainer.allreduce_grads() - nlp.utils.clip_grad_global_norm(params, 1) - trainer.update(1) - - step_loss += ls.asscalar() - - if (batch_id + 1) % log_interval == 0: - toc = time.time() - log.info('Epoch: %d, Batch: %d/%d, Loss=%.4f, lr=%.7f ' - 'Time cost=%.1f Thoughput=%.2f samples/s', - epoch_id, batch_id, len(train_dataloader), - step_loss / log_interval, - trainer.learning_rate, toc - tic, log_num / (toc - tic)) - tic = time.time() - step_loss = 0.0 - log_num = 0 - epoch_toc = time.time() - log.info('Epoch: {}, Time cost={:.2f} s, Thoughput={:.2f} samples/s' - .format(epoch_id, epoch_toc - epoch_tic, - len(train_dataloader) / (epoch_toc - epoch_tic))) - - net.save_parameters(os.path.join(output_dir, 'net.params')) - - -def evaluate(): - """Evaluate the model on validation dataset. 
- """ - log.info('Loader dev data...') - if version_2: - dev_data = nlp.data.SQuAD('dev', version='2.0') - else: - dev_data = nlp.data.SQuAD('dev', version='1.1') - log.info('Number of records in Train data:{}'.format(len(dev_data))) - - dev_dataset = dev_data.transform( - SQuADTransform( - berttoken, - max_seq_length=max_seq_length, - doc_stride=doc_stride, - max_query_length=max_query_length, - is_pad=True, - is_training=False)._transform) - - dev_data_transform, _ = preprocess_dataset( - dev_data, SQuADTransform( - berttoken, - max_seq_length=max_seq_length, - doc_stride=doc_stride, - max_query_length=max_query_length, - is_pad=True, - is_training=False)) - log.info('The number of examples after preprocessing:{}'.format( - len(dev_data_transform))) - - dev_dataloader = mx.gluon.data.DataLoader( - dev_data_transform, - batchify_fn=batchify_fn, - num_workers=4, batch_size=test_batch_size, shuffle=False, last_batch='keep') - - log.info('Start predict') - - _Result = collections.namedtuple( - '_Result', ['example_id', 'start_logits', 'end_logits']) - all_results = {} - - epoch_tic = time.time() - total_num = 0 - for data in dev_dataloader: - example_ids, inputs, token_types, valid_length, _, _ = data - total_num += len(inputs) - out = net(inputs.astype('float32').as_in_context(ctx), - token_types.astype('float32').as_in_context(ctx), - valid_length.astype('float32').as_in_context(ctx)) - - output = nd.split(out, axis=2, num_outputs=2) - start_logits = output[0].reshape((0, -3)).asnumpy() - end_logits = output[1].reshape((0, -3)).asnumpy() - - for example_id, start, end in zip(example_ids, start_logits, end_logits): - example_id = example_id.asscalar() - if example_id not in all_results: - all_results[example_id] = [] - all_results[example_id].append( - _Result(example_id, start.tolist(), end.tolist())) - epoch_toc = time.time() - log.info('Inference time cost={:.2f} s, Thoughput={:.2f} samples/s' - .format(epoch_toc - epoch_tic, - len(dev_dataloader) / (epoch_toc - epoch_tic))) - log.info('Get prediction results...') - - all_predictions, all_nbest_json, scores_diff_json = predictions( - dev_dataset=dev_dataset, - all_results=all_results, - tokenizer=nlp.data.BERTBasicTokenizer(lower=lower), - max_answer_length=max_answer_length, - null_score_diff_threshold=null_score_diff_threshold, - n_best_size=n_best_size, - version_2=version_2) - - with open(os.path.join(output_dir, 'predictions.json'), - 'w', encoding='utf-8') as all_predictions_write: - all_predictions_write.write(json.dumps(all_predictions)) - - with open(os.path.join(output_dir, 'nbest_predictions.json'), - 'w', encoding='utf-8') as all_predictions_write: - all_predictions_write.write(json.dumps(all_nbest_json)) - - if version_2: - with open(os.path.join(output_dir, 'null_odds.json'), - 'w', encoding='utf-8') as all_predictions_write: - all_predictions_write.write(json.dumps(scores_diff_json)) - else: - log.info(get_F1_EM(dev_data, all_predictions)) - - -############################################################################### -# Export the model # -############################################################################### -if __name__ == '__main__': - if not only_predict: - train() - evaluate() - if args.export: - net.export(os.path.join(args.output_dir, 'static_net'), epoch=args.epochs) - elif model_parameters: - evaluate() - if args.export: - net.export(os.path.join(args.output_dir, 'static_net'), epoch=args.epochs) From 98c3e90aab1ba274a96064f5a09a1e8ae5bb6098 Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Tue, 
14 May 2019 06:36:37 +0000 Subject: [PATCH 02/11] rename folder --- scripts/bert/export/{staticbert => }/__init__.py | 0 .../export/{staticbert/static_export_base.py => export_static.py} | 0 scripts/bert/export/{staticbert => }/static_bert.py | 0 scripts/bert/export/{staticbert => }/static_bert_qa_model.py | 0 scripts/bert/export/{staticbert => }/static_export_squad.py | 0 5 files changed, 0 insertions(+), 0 deletions(-) rename scripts/bert/export/{staticbert => }/__init__.py (100%) rename scripts/bert/export/{staticbert/static_export_base.py => export_static.py} (100%) rename scripts/bert/export/{staticbert => }/static_bert.py (100%) rename scripts/bert/export/{staticbert => }/static_bert_qa_model.py (100%) rename scripts/bert/export/{staticbert => }/static_export_squad.py (100%) diff --git a/scripts/bert/export/staticbert/__init__.py b/scripts/bert/export/__init__.py similarity index 100% rename from scripts/bert/export/staticbert/__init__.py rename to scripts/bert/export/__init__.py diff --git a/scripts/bert/export/staticbert/static_export_base.py b/scripts/bert/export/export_static.py similarity index 100% rename from scripts/bert/export/staticbert/static_export_base.py rename to scripts/bert/export/export_static.py diff --git a/scripts/bert/export/staticbert/static_bert.py b/scripts/bert/export/static_bert.py similarity index 100% rename from scripts/bert/export/staticbert/static_bert.py rename to scripts/bert/export/static_bert.py diff --git a/scripts/bert/export/staticbert/static_bert_qa_model.py b/scripts/bert/export/static_bert_qa_model.py similarity index 100% rename from scripts/bert/export/staticbert/static_bert_qa_model.py rename to scripts/bert/export/static_bert_qa_model.py diff --git a/scripts/bert/export/staticbert/static_export_squad.py b/scripts/bert/export/static_export_squad.py similarity index 100% rename from scripts/bert/export/staticbert/static_export_squad.py rename to scripts/bert/export/static_export_squad.py From 7815a5353990309725897e6a9e948e62e9527116 Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Tue, 14 May 2019 06:37:33 +0000 Subject: [PATCH 03/11] draft for export --- scripts/bert/export/export_static.py | 174 ++++++++------------ scripts/bert/export/static_bert.py | 109 +++++++++++- scripts/bert/export/static_bert_qa_model.py | 106 ------------ scripts/bert/index.rst | 97 +---------- 4 files changed, 178 insertions(+), 308 deletions(-) delete mode 100644 scripts/bert/export/static_bert_qa_model.py diff --git a/scripts/bert/export/export_static.py b/scripts/bert/export/export_static.py index 21607c630e..43248693fa 100644 --- a/scripts/bert/export/export_static.py +++ b/scripts/bert/export/export_static.py @@ -1,9 +1,16 @@ """ -Export Base Static Model (BERT) +Export the BERT Model for Deployment -========================================================================================= +==================================== -This will export the base BERT model to a static model suitable for use in MXNet Module API. +This script exports the BERT model to a static model suitable for use with MXNet Module API. 
+ +@article{devlin2018bert, + title={BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding}, + author={Devlin, Jacob and Chang, Ming- \ + Wei and Lee, Kenton and Toutanova, Kristina}, + journal={arXiv preprint arXiv:1810.04805}, + year={2018} } """ @@ -29,96 +36,71 @@ import argparse import logging +import warnings import os import time import mxnet as mx - +import gluonnlp as nlp from static_bert import get_model -log = logging.getLogger('gluonnlp') -log.setLevel(logging.DEBUG) -formatter = logging.Formatter( - fmt='%(levelname)s:%(name)s:%(asctime)s %(message)s', datefmt='%H:%M:%S') - -parser = argparse.ArgumentParser(description='export static BERT base model.') +parser = argparse.ArgumentParser(description='Export static BERT base model.') parser.add_argument('--model_parameters', type=str, default=None, - help='Model parameter file') + help='The model parameter file saved from training.') -parser.add_argument('--bert_model', +parser.add_argument('--model_name', type=str, default='bert_12_768_12', - help='BERT model name. options are bert_12_768_12 and bert_24_1024_16.') + choices=['bert_12_768_12', 'bert_24_1024_16'], + help='BERT model name. Options are "bert_12_768_12" and "bert_24_1024_16"') -parser.add_argument('--bert_dataset', - type=str, - default='book_corpus_wiki_en_uncased', - help='BERT dataset name.' - 'options are book_corpus_wiki_en_uncased and book_corpus_wiki_en_cased.') - -parser.add_argument('--pretrained_bert_parameters', +parser.add_argument('--task', type=str, default=None, - help='Pre-trained bert model parameter file. default is None') + choices=['classification', 'regression', 'qa'], + help='Task to export. Options are "classification", "regression", "qa". ' + 'If not set, the model for masked language model and next sentence ' + 'prediction will be exported.') -parser.add_argument('--uncased', - action='store_false', - help='if not set, inputs are converted to lower case.') +parser.add_argument('--dataset_name', + type=str, + default='book_corpus_wiki_en_uncased', + choices=['book_corpus_wiki_en_uncased', 'book_corpus_wiki_en_cased', + 'wiki_multilingual_uncased', 'wiki_multilingual_cased', + 'wiki_cn_cased'], + help='BERT dataset name. Options include ' + '"book_corpus_wiki_en_uncased", "book_corpus_wiki_en_cased", ' + '"wiki_multilingual_uncased", "wiki_multilingual_cased", ' + '"wiki_cn_cased"') parser.add_argument('--output_dir', type=str, default='./output_dir', - help='The output directory where the model params will be written.' - ' default is ./output_dir') - -parser.add_argument('--test_batch_size', - type=int, - default=24, - help='Test batch size. default is 24') - -parser.add_argument('--max_seq_length', - type=int, - default=384, - help='The maximum total input sequence length after WordPiece tokenization.' - 'Sequences longer than this will be truncated, and sequences shorter ' - 'than this will be padded. default is 384') - -parser.add_argument('--doc_stride', - type=int, - default=128, - help='When splitting up a long document into chunks, how much stride to ' - 'take between chunks. default is 128') - -parser.add_argument('--max_query_length', - type=int, - default=64, - help='The maximum number of tokens for the question. Questions longer than ' - 'this will be truncated to this length. default is 64') - -parser.add_argument('--gpu', type=str, help='single gpu id') + help='The directory where the exported model symbol will be created. 
' + 'The default is ./output_dir') parser.add_argument('--seq_length', type=int, default=384, - help='The sequence length of the input') - -parser.add_argument('--input_size', - type=int, - default=768, - help='The embedding size of the input') + help='The maximum total input sequence length after WordPiece tokenization.' + 'Sequences longer than this needs to be truncated, and sequences shorter ' + 'than this needs to be padded. Default is 384') args = parser.parse_args() - +# create output dir output_dir = args.output_dir -if not os.path.exists(output_dir): - os.mkdir(output_dir) +nlp.utils.mkdir(output_dir) -fh = logging.FileHandler(os.path.join( - args.output_dir, 'static_export_bert_base.log'), mode='w') +# logging +log = logging.getLogger('gluonnlp') +log.setLevel(logging.DEBUG) +formatter = logging.Formatter(fmt='%(levelname)s:%(name)s:%(asctime)s %(message)s', + datefmt='%H:%M:%S') +fh = logging.FileHandler(os.path.join(args.output_dir, 'static_export_bert.log'), mode='w') fh.setLevel(logging.INFO) fh.setFormatter(formatter) console = logging.StreamHandler() @@ -129,49 +111,31 @@ log.info(args) -model_name = args.bert_model -dataset_name = args.bert_dataset model_parameters = args.model_parameters -pretrained_bert_parameters = args.pretrained_bert_parameters -lower = args.uncased - seq_length = args.seq_length -input_size = args.input_size -test_batch_size = args.test_batch_size -ctx = mx.cpu() if not args.gpu else mx.gpu(int(args.gpu)) - -max_seq_length = args.max_seq_length -doc_stride = args.doc_stride -max_query_length = args.max_query_length - -if max_seq_length <= max_query_length + 3: - raise ValueError('The max_seq_length (%d) must be greater than max_query_length ' - '(%d) + 3' % (max_seq_length, max_query_length)) - +test_batch_size = 1 +ctx = mx.cpu() ############################################################################### # Prepare dummy input data # ############################################################################### -inputs = mx.nd.arange(test_batch_size * seq_length).reshape(shape=(test_batch_size, seq_length)) +inputs = mx.nd.arange(test_batch_size * seq_length) +inputs = inputs.reshape(shape=(test_batch_size, seq_length)) token_types = mx.nd.zeros_like(inputs) -valid_length = mx.nd.arange(seq_length)[:test_batch_size] +valid_length = mx.nd.arange(test_batch_size) batch = inputs, token_types, valid_length num_batch = 10 -sample_dataset = [] -for _ in range(num_batch): - sample_dataset.append(batch) - +sample_dataset = [batch for _ in range(10)] bert, vocab = get_model( - name=model_name, - dataset_name=dataset_name, - pretrained=not model_parameters and not pretrained_bert_parameters, + name=args.model_name, + dataset_name=args.dataset_name, + pretrained=True, ctx=ctx, use_pooler=False, use_decoder=False, use_classifier=False, - input_size=args.input_size, seq_length=args.seq_length) @@ -179,28 +143,26 @@ # Hybridize the model # ############################################################################### net = bert -if pretrained_bert_parameters and not model_parameters: - bert.load_parameters(pretrained_bert_parameters, ctx=ctx, - ignore_extra=True) -net.hybridize(static_alloc=True, static_shape=True) +if args.task == 'classification': + net = StaticBERTClassifier(net, num_classes=2) +if model_parameters: + bert.load_parameters(model_parameters, ctx=ctx) +else: + warnings.warn('using random initialization') + +net.hybridize(static_alloc=True, static_shape=True) def evaluate(data_source): - """Evaluate the model on a mini-batch. 
- """ - log.info('Start predict') + """Evaluate the model on a mini-batch.""" + log.info('start predicting ... ') tic = time.time() - for batch in data_source: - inputs, token_types, valid_length = batch - net(inputs.astype('float32').as_in_context(ctx), - token_types.astype('float32').as_in_context(ctx), - valid_length.astype('float32').as_in_context(ctx)) + for inputs, token_types, valid_length in data_source: + net(inputs.as_in_context(ctx), token_types.as_in_context(ctx), + valid_length.as_in_context(ctx)) toc = time.time() log.info('Inference time cost={:.2f} s, Thoughput={:.2f} samples/s' - .format(toc - tic, - len(data_source) / (toc - tic))) - - + .format(toc - tic, len(data_source) / (toc - tic))) ############################################################################### # Export the model # diff --git a/scripts/bert/export/static_bert.py b/scripts/bert/export/static_bert.py index 569942d705..cc11670adc 100644 --- a/scripts/bert/export/static_bert.py +++ b/scripts/bert/export/static_bert.py @@ -19,7 +19,11 @@ """Static BERT models.""" __all__ = ['StaticBERTModel', 'StaticBERTEncoder', - 'get_model', 'bert_12_768_12', 'bert_24_1024_16', 'get_static_bert_model'] + 'get_model', 'bert_12_768_12', 'bert_24_1024_16', 'get_static_bert_model', + 'StaticBertForQA', 'StaticBERTClassifier'] + +from mxnet.gluon import HybridBlock, loss, nn +from mxnet.gluon.loss import Loss import os import math @@ -783,3 +787,106 @@ def get_static_bert_model(model_name=None, dataset_name=None, vocab=None, _load_pretrained_params(net, model_name, dataset_name, root, ctx, ignore_extra=ignore_extra) return net, bert_vocab + +#create a hybridizable task guided model using BERT +class StaticBertForQA(HybridBlock): + """Hybridizable Model for SQuAD task with BERT. + + The model feeds token ids and token type ids into BERT to get the + pooled BERT sequence representation, then apply a Dense layer for QA task. + + Parameters + ---------- + bert: BERTModel + Bidirectional encoder with transformer. + prefix : str or None + See document of `mx.gluon.HybridBlock`. + params : ParameterDict or None + See document of `mx.gluon.HybridBlock`. + """ + + def __init__(self, bert, prefix=None, params=None): + super(StaticBertForQA, self).__init__(prefix=prefix, params=params) + self.bert = bert + with self.name_scope(): + self.span_classifier = nn.Dense(units=2, flatten=False) + + def hybrid_forward(self, F, inputs, token_types, valid_length=None): + # pylint: disable=arguments-differ + # pylint: disable=unused-argument + """Generate the unnormalized score for the given the input sequences. + + Parameters + ---------- + inputs : NDArray, shape (batch_size, seq_length) + Input words for the sequences. + token_types : NDArray, shape (batch_size, seq_length) + Token types for the sequences, used to indicate whether the word belongs to the + first sentence or the second one. + valid_length : NDArray or None, shape (batch_size,) + Valid length of the sequence. This is used to mask the padded tokens. + + Returns + ------- + outputs : NDArray + Shape (batch_size, seq_length, 2) + """ + bert_output = self.bert(inputs, token_types, valid_length) + output = self.span_classifier(bert_output) + return output + +class StaticBERTClassifier(HybridBlock): + """Model for sentence (pair) classification task with BERT. + + The model feeds token ids and token type ids into BERT to get the + pooled BERT sequence representation, then apply a Dense layer for + classification. 
+ + Parameters + ---------- + bert: BERTModel + Bidirectional encoder with transformer. + num_classes : int, default is 2 + The number of target classes. + dropout : float or None, default 0.0. + Dropout probability for the bert output. + prefix : str or None + See document of `mx.gluon.Block`. + params : ParameterDict or None + See document of `mx.gluon.Block`. + """ + + def __init__(self, + bert, + num_classes=2, + dropout=0.0, + prefix=None, + params=None): + super(BERTClassifier, self).__init__(prefix=prefix, params=params) + self.bert = bert + with self.name_scope(): + self.classifier = nn.HybridSequential(prefix=prefix) + if dropout: + self.classifier.add(nn.Dropout(rate=dropout)) + self.classifier.add(nn.Dense(units=num_classes)) + + def hybrid_forward(self, F, inputs, token_types, valid_length=None): # pylint: disable=arguments-differ + """Generate the unnormalized score for the given the input sequences. + + Parameters + ---------- + inputs : NDArray, shape (batch_size, seq_length) + Input words for the sequences. + token_types : NDArray, shape (batch_size, seq_length) + Token types for the sequences, used to indicate whether the word belongs to the + first sentence or the second one. + valid_length : NDArray or None, shape (batch_size) + Valid length of the sequence. This is used to mask the padded tokens. + + Returns + ------- + outputs : NDArray + Shape (batch_size, num_classes) + """ + _, pooler_out = self.bert(inputs, token_types, valid_length) + return self.classifier(pooler_out) diff --git a/scripts/bert/export/static_bert_qa_model.py b/scripts/bert/export/static_bert_qa_model.py deleted file mode 100644 index e7980cd2e7..0000000000 --- a/scripts/bert/export/static_bert_qa_model.py +++ /dev/null @@ -1,106 +0,0 @@ -# coding: utf-8 - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""Static BertForQA models.""" - -__all__ = ['StaticBertForQA', 'BertForQALoss'] - -from mxnet.gluon import HybridBlock, loss, nn -from mxnet.gluon.loss import Loss - - -#create a hybridizable task guided model using BERT -class StaticBertForQA(HybridBlock): - """Hybridizable Model for SQuAD task with BERT. - - The model feeds token ids and token type ids into BERT to get the - pooled BERT sequence representation, then apply a Dense layer for QA task. - - Parameters - ---------- - bert: BERTModel - Bidirectional encoder with transformer. - prefix : str or None - See document of `mx.gluon.HybridBlock`. - params : ParameterDict or None - See document of `mx.gluon.HybridBlock`. 
- """ - - def __init__(self, bert, prefix=None, params=None): - super(StaticBertForQA, self).__init__(prefix=prefix, params=params) - self.bert = bert - with self.name_scope(): - self.span_classifier = nn.Dense(units=2, flatten=False) - - def hybrid_forward(self, F, inputs, token_types, valid_length=None): - # pylint: disable=arguments-differ - # pylint: disable=unused-argument - """Generate the unnormalized score for the given the input sequences. - - Parameters - ---------- - inputs : NDArray, shape (batch_size, seq_length) - Input words for the sequences. - token_types : NDArray, shape (batch_size, seq_length) - Token types for the sequences, used to indicate whether the word belongs to the - first sentence or the second one. - valid_length : NDArray or None, shape (batch_size,) - Valid length of the sequence. This is used to mask the padded tokens. - - Returns - ------- - outputs : NDArray - Shape (batch_size, seq_length, 2) - """ - bert_output = self.bert(inputs, token_types, valid_length) - output = self.span_classifier(bert_output) - return output - - -class BertForQALoss(Loss): - """Loss for SQuAD task with BERT. - - """ - - def __init__(self, weight=None, batch_axis=0, **kwargs): # pylint: disable=unused-argument - super(BertForQALoss, self).__init__( - weight=None, batch_axis=0, **kwargs) - self.loss = loss.SoftmaxCELoss() - - def hybrid_forward(self, F, pred, label): # pylint: disable=arguments-differ - """ - Parameters - ---------- - pred : NDArray, shape (batch_size, seq_length, 2) - BERTSquad forward output. - label : list, length is 2, each shape is (batch_size,1) - label[0] is the starting position of the answer, - label[1] is the ending position of the answer. - - Returns - ------- - outputs : NDArray - Shape (batch_size,) - """ - pred = F.split(pred, axis=2, num_outputs=2) - start_pred = pred[0].reshape((0, -3)) - start_label = label[0] - end_pred = pred[1].reshape((0, -3)) - end_label = label[1] - return (self.loss(start_pred, start_label) + self.loss( - end_pred, end_label)) / 2 diff --git a/scripts/bert/index.rst b/scripts/bert/index.rst index 07ae1b4d8c..6a5def3eeb 100644 --- a/scripts/bert/index.rst +++ b/scripts/bert/index.rst @@ -319,8 +319,8 @@ Command line interface -0.1820574 , -0.16115054], dtype=float32)] -Example Usage of Exporting Hybridizable BERT -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Export BERT for Deployment +~~~~~~~~~~~~~~~~~~~~~~~~~~ The BERTModel class is a subclass of Block, rather than HybridBlock. To support exporting BERT model to json format for deployment, we introduce the StaticBERT class. @@ -376,96 +376,3 @@ To load and export the BERT base pretrained model that that is suitable for fine $ cd staticbert $ python static_export_base.py --model_parameters --seq_length 128 - - - -Example Usage of Finetuning Hybridizable BERT -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -This example mainly introduces the steps needed to use the hybridizable BERT models to finetune on a specific NLP task. -We use SQuAD dataset for Question Answering as an example. - - -Step 1-3 are the same as in previous section 'Example Usage of Exporting Hybridizable BERT', -where an example of Step 1 can be found in 'staticbert/static_bert_for_qa_model.py', -an example of Step 2-3 can be found in 'staticbert/static_finetune_squad.py'. -To export the model, in 'staticbert/static_finetune_squad.py', set export=True. - - -For all model settings above, we set learning rate = 3e-5 and optimizer = adam. 
-Besides, seq_length stands for the sequence length of the input, input_size represents the embedding size of the input. -The options can be specified in the following command lines. - - -+-----------------------+----------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ -| | SQuAD 1.1 | SQuAD 1.1 | SQuAD 2.0 | -+=======================+============================================================================================================================+=============================================================================================================================+=============================================================================================================================+ -| model | bert_12_768_12 | bert_24_1024_16 | bert_24_1024_16 | -+-----------------------+----------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ -| F1 | 88.54 | 90.84 | 81.46 | -+-----------------------+----------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ -| EM | 81.10 | 84.03 | 78.49 | -+-----------------------+----------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ -| batch_size | 12 | 4 | 4 | -+-----------------------+----------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ -| gradient accumulation | None | 6 | 8 | -+-----------------------+----------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ -| epochs | 2 | 2 | 2 | 
-+-----------------------+----------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ -| training log | `log `__ | `log `__ | `log `__ | -+-----------------------+----------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ -| command | [8] | [9] | [10] | -+-----------------------+----------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ - -BERT BASE on SQuAD 1.1 -++++++++++++++++++++++ - -[8] bert_12_768_12 - -.. code-block:: console - - $ cd staticbert - $ python static_finetune_squad.py --optimizer adam --batch_size 12 --lr 3e-5 --epochs 2 --gpu 0 --export - - -BERT LARGE on SQuAD 1.1 -+++++++++++++++++++++++ - -[9] bert_24_1024_16 - -.. code-block:: console - - $ cd staticbert - $ python static_finetune_squad.py --bert_model bert_24_1024_16 --optimizer adam --accumulate 6 --batch_size 4 --lr 3e-5 --epochs 2 --gpu 0 --export - - -BERT LARGE on SQuAD 2.0 -+++++++++++++++++++++++ - -[10] bert_24_1024_16 - -.. code-block:: console - - $ cd staticbert - $ python static_finetune_squad.py --bert_model bert_24_1024_16 --optimizer adam --accumulate 8 --batch_size 4 --lr 3e-5 --epochs 2 --gpu 0 --null_score_diff_threshold -2.0 --version_2 --export - -To get the score of the dev data, you need to download the dev dataset (`dev-v2.0.json `_) and the evaluate script (`evaluate-2.0.py `_). Then use the following command to get the score of the dev dataset. - -.. code-block:: console - - $ cd staticbert - $ python evaluate-v2.0.py dev-v2.0.json predictions.json - -.. code-block:: json - - { - "exact": 78.49743114629833, - "f1": 81.46366127573552, - "total": 11873, - "HasAns_exact": 73.38056680161944, - "HasAns_f1": 79.32153345593925, - "HasAns_total": 5928, - "NoAns_exact": 83.59966358284272, - "NoAns_f1": 83.59966358284272, - "NoAns_total": 5945 - } From 492d98cb8f51a218cf3f4489a5a8ef0eb169e64e Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Tue, 14 May 2019 22:08:00 +0000 Subject: [PATCH 04/11] suppport qa,regression, classification --- scripts/bert/export/__init__.py | 4 +- .../export/{export_static.py => export.py} | 140 ++++++++++++------ .../export/{static_bert.py => hybrid_bert.py} | 137 +++++++++++------ 3 files changed, 185 insertions(+), 96 deletions(-) rename scripts/bert/export/{export_static.py => export.py} (60%) rename scripts/bert/export/{static_bert.py => hybrid_bert.py} (90%) diff --git a/scripts/bert/export/__init__.py b/scripts/bert/export/__init__.py index e1627b1e3a..791e5fd1bd 100644 --- a/scripts/bert/export/__init__.py +++ b/scripts/bert/export/__init__.py @@ -18,5 +18,5 @@ # under the License. 
# pylint: disable=wildcard-import -"""static BERT example.""" -from . import static_bert, static_bert_qa_model +"""Hybrid BERT for deployment.""" +from . import hybrid_bert diff --git a/scripts/bert/export/export_static.py b/scripts/bert/export/export.py similarity index 60% rename from scripts/bert/export/export_static.py rename to scripts/bert/export/export.py index 43248693fa..79f77e27a7 100644 --- a/scripts/bert/export/export_static.py +++ b/scripts/bert/export/export.py @@ -3,7 +3,7 @@ ==================================== -This script exports the BERT model to a static model suitable for use with MXNet Module API. +This script exports the BERT model to a hybrid model suitable for use with MXNet Module API. @article{devlin2018bert, title={BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding}, @@ -42,9 +42,10 @@ import mxnet as mx import gluonnlp as nlp -from static_bert import get_model +from hybrid_bert import get_hybrid_model +from hybrid_bert import HybridBERTClassifier, HybridBERTRegression, HybridBERTForQA -parser = argparse.ArgumentParser(description='Export static BERT base model.') +parser = argparse.ArgumentParser(description='Export hybrid BERT base model.') parser.add_argument('--model_parameters', type=str, @@ -59,11 +60,10 @@ parser.add_argument('--task', type=str, - default=None, - choices=['classification', 'regression', 'qa'], - help='Task to export. Options are "classification", "regression", "qa". ' - 'If not set, the model for masked language model and next sentence ' - 'prediction will be exported.') + choices=['classification', 'regression', 'question_answering'], + required=True, + help='Task to export. Options are "classification", "regression", ' + '"question_answering"') parser.add_argument('--dataset_name', type=str, @@ -89,18 +89,26 @@ 'Sequences longer than this needs to be truncated, and sequences shorter ' 'than this needs to be padded. 
Default is 384') +parser.add_argument('--dropout', + type=float, + default=0.1, + help='The dropout probability for the classification/regression head.') + args = parser.parse_args() # create output dir output_dir = args.output_dir nlp.utils.mkdir(output_dir) -# logging +############################################################################### +# Logging # +############################################################################### + log = logging.getLogger('gluonnlp') log.setLevel(logging.DEBUG) formatter = logging.Formatter(fmt='%(levelname)s:%(name)s:%(asctime)s %(message)s', datefmt='%H:%M:%S') -fh = logging.FileHandler(os.path.join(args.output_dir, 'static_export_bert.log'), mode='w') +fh = logging.FileHandler(os.path.join(args.output_dir, 'hybrid_export_bert.log'), mode='w') fh.setLevel(logging.INFO) fh.setFormatter(formatter) console = logging.StreamHandler() @@ -108,65 +116,99 @@ console.setFormatter(formatter) log.addHandler(console) log.addHandler(fh) - log.info(args) -model_parameters = args.model_parameters +############################################################################### +# Hybridize the model # +############################################################################### + seq_length = args.seq_length -test_batch_size = 1 -ctx = mx.cpu() + +if args.task == 'classification': + bert, _ = get_hybrid_model( + name=args.model_name, + dataset_name=args.dataset_name, + pretrained=False, + use_pooler=True, + use_decoder=False, + use_classifier=False, + seq_length=args.seq_length) + net = HybridBERTClassifier(bert, num_classes=2, dropout=args.dropout) +elif args.task == 'regression': + bert, _ = get_hybrid_model( + name=args.model_name, + dataset_name=args.dataset_name, + pretrained=False, + use_pooler=True, + use_decoder=False, + use_classifier=False, + seq_length=args.seq_length) + net = HybridBERTRegression(bert, dropout=args.dropout) +elif args.task == 'question_answering': + bert, _ = get_hybrid_model( + name=args.model_name, + dataset_name=args.dataset_name, + pretrained=False, + use_pooler=False, + use_decoder=False, + use_classifier=False, + seq_length=args.seq_length) + net = HybridBERTForQA(bert) +else: + raise ValueError('unknown task: %s'%args.task) + +if args.model_parameters: + net.load_parameters(args.model_parameters) +else: + net.initialize() + warnings.warn('--model_parameters is not provided. The parameter checkpoint (.params) ' + 'file will be created based on default parameter intialization.') + +net.hybridize(static_alloc=True, static_shape=True) ############################################################################### -# Prepare dummy input data # +# Prepare dummy input data # ############################################################################### +test_batch_size = 1 + inputs = mx.nd.arange(test_batch_size * seq_length) inputs = inputs.reshape(shape=(test_batch_size, seq_length)) token_types = mx.nd.zeros_like(inputs) valid_length = mx.nd.arange(test_batch_size) batch = inputs, token_types, valid_length -num_batch = 10 -sample_dataset = [batch for _ in range(10)] -bert, vocab = get_model( - name=args.model_name, - dataset_name=args.dataset_name, - pretrained=True, - ctx=ctx, - use_pooler=False, - use_decoder=False, - use_classifier=False, - seq_length=args.seq_length) +def export(batch, prefix): + """Export the model.""" + log.info('Exporting the model ... 
') + inputs, token_types, valid_length = batch + net(inputs, token_types, valid_length) + net.export(prefix, epoch=0) + assert os.path.isfile(prefix + '-symbol.json') + assert os.path.isfile(prefix + '-0000.params') - -############################################################################### -# Hybridize the model # -############################################################################### -net = bert -if args.task == 'classification': - net = StaticBERTClassifier(net, num_classes=2) - -if model_parameters: - bert.load_parameters(model_parameters, ctx=ctx) -else: - warnings.warn('using random initialization') - -net.hybridize(static_alloc=True, static_shape=True) - -def evaluate(data_source): +def infer(batch, prefix): """Evaluate the model on a mini-batch.""" - log.info('start predicting ... ') + log.info('Start inference ... ') tic = time.time() - for inputs, token_types, valid_length in data_source: - net(inputs.as_in_context(ctx), token_types.as_in_context(ctx), - valid_length.as_in_context(ctx)) + # import with SymbolBlock. Alternatively, you can use Module.load APIs. + inputs, token_types, valid_length = batch + num_trials = 10 + imported_net = mx.gluon.nn.SymbolBlock.imports(prefix + '-symbol.json', + ['data0','data1','data2'], + prefix + '-0000.params') + for _ in range(num_trials): + net(inputs, token_types, valid_length) + mx.nd.waitall() toc = time.time() log.info('Inference time cost={:.2f} s, Thoughput={:.2f} samples/s' - .format(toc - tic, len(data_source) / (toc - tic))) + .format(toc - tic, num_trials / (toc - tic))) + ############################################################################### # Export the model # ############################################################################### if __name__ == '__main__': - evaluate(sample_dataset) - net.export(os.path.join(args.output_dir, 'static_bert_base_net'), epoch=0) + prefix = os.path.join(args.output_dir, args.task) + export(batch, prefix) + infer(batch, prefix) diff --git a/scripts/bert/export/static_bert.py b/scripts/bert/export/hybrid_bert.py similarity index 90% rename from scripts/bert/export/static_bert.py rename to scripts/bert/export/hybrid_bert.py index cc11670adc..c971eda4de 100644 --- a/scripts/bert/export/static_bert.py +++ b/scripts/bert/export/hybrid_bert.py @@ -16,11 +16,12 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
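For reference, a typical invocation of the new export.py might look like the following (the parameter file path is only an example; --task, --model_parameters, --seq_length and --output_dir are the options defined in the script above):

    $ cd scripts/bert/export
    $ python export.py --task question_answering --model_parameters ./output_dir/net.params --seq_length 384 --output_dir ./output_dir

Since prefix = os.path.join(args.output_dir, args.task), this writes ./output_dir/question_answering-symbol.json and ./output_dir/question_answering-0000.params. If --model_parameters is omitted, the exported checkpoint is created from the default parameter initialization, as the warning in the script notes.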
-"""Static BERT models.""" +"""Hybrid BERT models.""" -__all__ = ['StaticBERTModel', 'StaticBERTEncoder', - 'get_model', 'bert_12_768_12', 'bert_24_1024_16', 'get_static_bert_model', - 'StaticBertForQA', 'StaticBERTClassifier'] +__all__ = ['HybridBERTModel', 'HybridBERTEncoder', + 'get_hybrid_model', 'hybrid_bert_12_768_12', 'hybrid_bert_24_1024_16', + 'get_hybrid_bert_model', + 'HybridBERTForQA', 'HybridBERTClassifier', 'HybridBERTRegression'] from mxnet.gluon import HybridBlock, loss, nn from mxnet.gluon.loss import Loss @@ -33,21 +34,19 @@ from mxnet.gluon import nn import mxnet as mx from gluonnlp.model.block import GELU -from gluonnlp.model.bert import BERTLayerNorm, BERTEncoderCell, _load_vocab, \ - _load_pretrained_params, bert_hparams -from gluonnlp.model.transformer import TransformerEncoderCell, _get_layer_norm, \ - _position_encoding_init +from gluonnlp.model.bert import BERTLayerNorm, BERTEncoderCell, _load_vocab +from gluonnlp.model.bert import _load_pretrained_params, bert_hparams +from gluonnlp.model.transformer import TransformerEncoderCell, _get_layer_norm +from gluonnlp.model.transformer import _position_encoding_init from gluonnlp.vocab import BERTVocab from gluonnlp.base import get_home_dir - ############################################################################### # COMPONENTS # ############################################################################### - -class StaticBaseTransformerEncoder(HybridBlock): - """Base Structure of the Static Transformer Encoder. +class HybridBaseTransformerEncoder(HybridBlock): + """Base Structure of the Hybrid Transformer Encoder. Parameters ---------- @@ -109,7 +108,7 @@ def __init__(self, attention_cell='multi_head', num_layers=2, positional_weight='sinusoidal', use_bert_encoder=False, use_layer_norm_before_dropout=False, scale_embed=True, input_size=None, seq_length=None, prefix=None, params=None): - super(StaticBaseTransformerEncoder, self).__init__(prefix=prefix, params=params) + super(HybridBaseTransformerEncoder, self).__init__(prefix=prefix, params=params) assert units % num_heads == 0, \ 'In TransformerEncoder, The units should be divided exactly ' \ 'by the number of heads. Received units={}, num_heads={}' \ @@ -253,11 +252,11 @@ def hybrid_forward(self, F, inputs, states=None, return outputs, additional_outputs -class StaticBERTEncoder(StaticBaseTransformerEncoder): - """Structure of the Static BERT Encoder. +class HybridBERTEncoder(HybridBaseTransformerEncoder): + """Structure of the Hybrid BERT Encoder. Different from the original encoder for transformer, - `StaticBERTEncoder` uses learnable positional embedding, `BERTPositionwiseFFN` + `HybridBERTEncoder` uses learnable positional embedding, `BERTPositionwiseFFN` and `BERTLayerNorm`. 
Parameters @@ -318,7 +317,7 @@ def __init__(self, attention_cell='multi_head', num_layers=2, use_residual=True, output_attention=False, output_all_encodings=False, weight_initializer=None, bias_initializer='zeros', input_size=None, seq_length=None, prefix=None, params=None): - super(StaticBERTEncoder, self).__init__(attention_cell=attention_cell, + super(HybridBERTEncoder, self).__init__(attention_cell=attention_cell, num_layers=num_layers, units=units, hidden_size=hidden_size, max_length=max_length, num_heads=num_heads, scaled=scaled, dropout=dropout, @@ -341,12 +340,12 @@ def __init__(self, attention_cell='multi_head', num_layers=2, # FULL MODEL # ############################################################################### -class StaticBERTModel(HybridBlock): - """Static Model for BERT (Bidirectional Encoder Representations from Transformers). +class HybridBERTModel(HybridBlock): + """Hybrid Model for BERT (Bidirectional Encoder Representations from Transformers). Parameters ---------- - encoder : StaticBERTEncoder + encoder : HybridBERTEncoder Bidirectional encoder that encodes the input sentence. vocab_size : int or None, default None The size of the vocabulary. @@ -396,7 +395,7 @@ class StaticBERTModel(HybridBlock): layer of the Encoder, or a list of all sequence encodings of all layers. In both cases shape of the tensor(s) is/are (batch_size, seq_length, units). - **attention_outputs**: output list of all intermediate encodings per layer - Returned only if StaticBERTEncoder.output_attention is True. + Returned only if HybridBERTEncoder.output_attention is True. List of num_layers length of tensors of shape (num_masks, num_attention_heads, seq_length, seq_length) - **pooled_output**: output tensor of pooled representation of the first tokens. @@ -412,7 +411,7 @@ def __init__(self, encoder, vocab_size=None, token_type_vocab_size=None, units=N embed_size=None, embed_dropout=0.0, embed_initializer=None, word_embed=None, token_type_embed=None, use_pooler=True, use_decoder=True, use_classifier=True, prefix=None, params=None): - super(StaticBERTModel, self).__init__(prefix=prefix, params=params) + super(HybridBERTModel, self).__init__(prefix=prefix, params=params) self._use_decoder = use_decoder self._use_classifier = use_classifier self._use_pooler = use_pooler @@ -487,7 +486,7 @@ def hybrid_forward(self, F, inputs, token_types, valid_length=None, masked_posit # pylint: disable=unused-argument """Generate the representation given the inputs. - This is used in training or fine-tuning a static (hybridized) BERT model. + This is used in training or fine-tuning a hybrid (hybridized) BERT model. """ outputs = [] seq_out, attention_out = self._encode_sequence(F, inputs, token_types, valid_length) @@ -519,7 +518,7 @@ def _encode_sequence(self, F, inputs, token_types, valid_length=None): # pylint: disable=unused-argument """Generate the representation given the input sequences. - This is used for pre-training or fine-tuning a static (hybridized) BERT model. + This is used for pre-training or fine-tuning a hybrid (hybridized) BERT model. """ # embedding word_embedding = self.word_embed(inputs) @@ -534,7 +533,7 @@ def _apply_pooling(self, sequence): This is used for pre-training or fine-tuning a BERT model. 
""" - outputs = sequence[:, 0, :] + outputs = sequence.slice(begin=(None, 0, None), end=(None, 1, None)) return self.pooler(outputs) def _decode(self, sequence, masked_positions): @@ -573,8 +572,8 @@ def _decode(self, sequence, masked_positions): # GET MODEL # ############################################################################### -def get_model(name, dataset_name='wikitext-2', **kwargs): - """Returns a pre-defined model by name. +def get_hybrid_model(name, dataset_name='wikitext-2', **kwargs): + """Returns a pre-defined hybrid model by name. Parameters ---------- @@ -596,7 +595,7 @@ def get_model(name, dataset_name='wikitext-2', **kwargs): Returns ------- - gluon.Block, gluonnlp.Vocab, (optional) gluonnlp.Vocab + gluon.HybridBlock, BERTVocab """ models = {'bert_12_768_12': bert_12_768_12, 'bert_24_1024_16': bert_24_1024_16} @@ -613,7 +612,7 @@ def bert_12_768_12(dataset_name=None, vocab=None, pretrained=True, ctx=mx.cpu(), root=os.path.join(get_home_dir(), 'models'), use_pooler=True, use_decoder=True, use_classifier=True, input_size=None, seq_length=None, **kwargs): - """Static BERT BASE model. + """Hybrid BERT BASE model. The number of layers (L) is 12, number of units (H) is 768, and the number of self-attention heads (A) is 12. @@ -647,9 +646,9 @@ def bert_12_768_12(dataset_name=None, vocab=None, pretrained=True, ctx=mx.cpu(), Returns ------- - StaticBERTModel, gluonnlp.vocab.BERTVocab + HybridBERTModel, gluonnlp.vocab.BERTVocab """ - return get_static_bert_model(model_name='bert_12_768_12', vocab=vocab, + return get_hybrid_bert_model(model_name='bert_12_768_12', vocab=vocab, dataset_name=dataset_name, pretrained=pretrained, ctx=ctx, use_pooler=use_pooler, use_decoder=use_decoder, use_classifier=use_classifier, root=root, input_size=input_size, @@ -660,7 +659,7 @@ def bert_24_1024_16(dataset_name=None, vocab=None, pretrained=True, ctx=mx.cpu() use_pooler=True, use_decoder=True, use_classifier=True, root=os.path.join(get_home_dir(), 'models'), input_size=None, seq_length=None, **kwargs): - """Static BERT LARGE model. + """Hybrid BERT LARGE model. The number of layers (L) is 24, number of units (H) is 1024, and the number of self-attention heads (A) is 16. @@ -693,22 +692,22 @@ def bert_24_1024_16(dataset_name=None, vocab=None, pretrained=True, ctx=mx.cpu() Returns ------- - StaticBERTModel, gluonnlp.vocab.BERTVocab + HybridBERTModel, gluonnlp.vocab.BERTVocab """ - return get_static_bert_model(model_name='bert_24_1024_16', vocab=vocab, + return get_hybrid_bert_model(model_name='bert_24_1024_16', vocab=vocab, dataset_name=dataset_name, pretrained=pretrained, ctx=ctx, use_pooler=use_pooler, use_decoder=use_decoder, use_classifier=use_classifier, root=root, input_size=input_size, seq_length=seq_length, **kwargs) -def get_static_bert_model(model_name=None, dataset_name=None, vocab=None, +def get_hybrid_bert_model(model_name=None, dataset_name=None, vocab=None, pretrained=True, ctx=mx.cpu(), use_pooler=True, use_decoder=True, use_classifier=True, output_attention=False, output_all_encodings=False, root=os.path.join(get_home_dir(), 'models'), input_size=None, seq_length=None, **kwargs): - """Any Static BERT pretrained model. + """Any Hybrid BERT pretrained model. 
Parameters ---------- @@ -747,7 +746,7 @@ def get_static_bert_model(model_name=None, dataset_name=None, vocab=None, Returns ------- - StaticBERTModel, gluonnlp.vocab.BERTVocab + HybridBERTModel, gluonnlp.vocab.BERTVocab """ predefined_args = bert_hparams[model_name] mutable_args = ['use_residual', 'dropout', 'embed_dropout', 'word_embed'] @@ -756,7 +755,7 @@ def get_static_bert_model(model_name=None, dataset_name=None, vocab=None, 'Cannot override predefined model settings.' predefined_args.update(kwargs) # encoder - encoder = StaticBERTEncoder(attention_cell=predefined_args['attention_cell'], + encoder = HybridBERTEncoder(attention_cell=predefined_args['attention_cell'], num_layers=predefined_args['num_layers'], units=predefined_args['units'], hidden_size=predefined_args['hidden_size'], @@ -774,7 +773,7 @@ def get_static_bert_model(model_name=None, dataset_name=None, vocab=None, ' Please use wiki_cn_cased/wiki_multilingual_uncased instead.') bert_vocab = _load_vocab(dataset_name, vocab, root, cls=BERTVocab) # BERT - net = StaticBERTModel(encoder, len(bert_vocab), + net = HybridBERTModel(encoder, len(bert_vocab), token_type_vocab_size=predefined_args['token_type_vocab_size'], units=predefined_args['units'], embed_size=predefined_args['embed_size'], @@ -788,8 +787,7 @@ def get_static_bert_model(model_name=None, dataset_name=None, vocab=None, ignore_extra=ignore_extra) return net, bert_vocab -#create a hybridizable task guided model using BERT -class StaticBertForQA(HybridBlock): +class HybridBERTForQA(HybridBlock): """Hybridizable Model for SQuAD task with BERT. The model feeds token ids and token type ids into BERT to get the @@ -806,14 +804,13 @@ class StaticBertForQA(HybridBlock): """ def __init__(self, bert, prefix=None, params=None): - super(StaticBertForQA, self).__init__(prefix=prefix, params=params) + super(HybridBERTForQA, self).__init__(prefix=prefix, params=params) self.bert = bert with self.name_scope(): self.span_classifier = nn.Dense(units=2, flatten=False) def hybrid_forward(self, F, inputs, token_types, valid_length=None): # pylint: disable=arguments-differ - # pylint: disable=unused-argument """Generate the unnormalized score for the given the input sequences. Parameters @@ -835,7 +832,7 @@ def hybrid_forward(self, F, inputs, token_types, valid_length=None): output = self.span_classifier(bert_output) return output -class StaticBERTClassifier(HybridBlock): +class HybridBERTClassifier(HybridBlock): """Model for sentence (pair) classification task with BERT. The model feeds token ids and token type ids into BERT to get the @@ -862,7 +859,7 @@ def __init__(self, dropout=0.0, prefix=None, params=None): - super(BERTClassifier, self).__init__(prefix=prefix, params=params) + super(HybridBERTClassifier, self).__init__(prefix=prefix, params=params) self.bert = bert with self.name_scope(): self.classifier = nn.HybridSequential(prefix=prefix) @@ -890,3 +887,53 @@ def hybrid_forward(self, F, inputs, token_types, valid_length=None): # pylint: """ _, pooler_out = self.bert(inputs, token_types, valid_length) return self.classifier(pooler_out) + +class HybridBERTRegression(HybridBlock): + """Model for sentence (pair) regression task with BERT. + + The model feeds token ids and token type ids into BERT to get the + pooled BERT sequence representation, then apply a Dense layer for + regression. + + Parameters + ---------- + bert: BERTModel + Bidirectional encoder with transformer. + dropout : float or None, default 0.0. + Dropout probability for the bert output. 
+    prefix : str or None
+        See document of `mx.gluon.Block`.
+    params : ParameterDict or None
+        See document of `mx.gluon.Block`.
+    """
+
+    def __init__(self, bert, dropout=0.0, prefix=None, params=None):
+        super(HybridBERTRegression, self).__init__(prefix=prefix, params=params)
+        self.bert = bert
+        with self.name_scope():
+            self.regression = nn.HybridSequential(prefix=prefix)
+            if dropout:
+                self.regression.add(nn.Dropout(rate=dropout))
+            self.regression.add(nn.Dense(1))
+
+    def hybrid_forward(self, _, inputs, token_types, valid_length=None):  # pylint: disable=arguments-differ
+        """Generate the unnormalized score for the given input sequences.
+
+        Parameters
+        ----------
+        inputs : NDArray, shape (batch_size, seq_length)
+            Input words for the sequences.
+        token_types : NDArray, shape (batch_size, seq_length)
+            Token types for the sequences, used to indicate whether the word belongs to the
+            first sentence or the second one.
+        valid_length : NDArray or None, shape (batch_size)
+            Valid length of the sequence. This is used to mask the padded tokens.
+
+        Returns
+        -------
+        outputs : NDArray
+            Shape (batch_size, 1)
+        """
+        _, pooler_out = self.bert(inputs, token_types, valid_length)
+        return self.regression(pooler_out)
+

From 705b970405d06c9a4c424d0836ffc6b5693c4b9d Mon Sep 17 00:00:00 2001
From: EC2 Default User
Date: Tue, 14 May 2019 22:35:57 +0000
Subject: [PATCH 05/11] add readme and tasks

---
 scripts/bert/export/export.py              |   9 +-
 scripts/bert/export/hybrid_bert.py         |   2 +-
 scripts/bert/export/static_export_squad.py | 239 ---------------------
 scripts/bert/index.rst                     |  54 +----
 scripts/tests/test_scripts.py              |   7 +
 5 files changed, 18 insertions(+), 293 deletions(-)
 delete mode 100644 scripts/bert/export/static_export_squad.py

diff --git a/scripts/bert/export/export.py b/scripts/bert/export/export.py
index 79f77e27a7..8903924ab9 100644
--- a/scripts/bert/export/export.py
+++ b/scripts/bert/export/export.py
@@ -3,7 +3,8 @@
 
 ====================================
 
-This script exports the BERT model to a hybrid model suitable for use with MXNet Module API.
+This script exports the BERT model to a hybrid model serialized as a symbol.json file,
+which is suitable for deployment, or use with MXNet Module API.
 
 @article{devlin2018bert,
   title={BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding},
@@ -191,12 +192,14 @@ def infer(batch, prefix):
     """Evaluate the model on a mini-batch."""
     log.info('Start inference ... ')
     tic = time.time()
+    # import with SymbolBlock. Alternatively, you can use Module.load APIs.
- inputs, token_types, valid_length = batch - num_trials = 10 imported_net = mx.gluon.nn.SymbolBlock.imports(prefix + '-symbol.json', ['data0','data1','data2'], prefix + '-0000.params') + # run forward inference + inputs, token_types, valid_length = batch + num_trials = 10 for _ in range(num_trials): net(inputs, token_types, valid_length) mx.nd.waitall() diff --git a/scripts/bert/export/hybrid_bert.py b/scripts/bert/export/hybrid_bert.py index c971eda4de..d055946f4c 100644 --- a/scripts/bert/export/hybrid_bert.py +++ b/scripts/bert/export/hybrid_bert.py @@ -19,7 +19,7 @@ """Hybrid BERT models.""" __all__ = ['HybridBERTModel', 'HybridBERTEncoder', - 'get_hybrid_model', 'hybrid_bert_12_768_12', 'hybrid_bert_24_1024_16', + 'get_hybrid_model', 'bert_12_768_12', 'bert_24_1024_16', 'get_hybrid_bert_model', 'HybridBERTForQA', 'HybridBERTClassifier', 'HybridBERTRegression'] diff --git a/scripts/bert/export/static_export_squad.py b/scripts/bert/export/static_export_squad.py deleted file mode 100644 index da87914179..0000000000 --- a/scripts/bert/export/static_export_squad.py +++ /dev/null @@ -1,239 +0,0 @@ -""" -Export SQuAD with Static Bidirectional Encoder Representations from Transformers (BERT) - -========================================================================================= - -This example shows how to export a Block based BERT model with pre-trained BERT parameters -with static shape, we are using SQuAD as an example. - -@article{devlin2018bert, - title={BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding}, - author={Devlin, Jacob and Chang, Ming- \ - Wei and Lee, Kenton and Toutanova, Kristina}, - journal={arXiv preprint arXiv:1810.04805}, - year={2018} -} -""" - -# coding=utf-8 - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# pylint:disable=redefined-outer-name,logging-format-interpolation - -import argparse -import logging -import os -import time - -import mxnet as mx - -from static_bert_qa_model import StaticBertForQA -from static_bert import get_model - -log = logging.getLogger('gluonnlp') -log.setLevel(logging.DEBUG) -formatter = logging.Formatter( - fmt='%(levelname)s:%(name)s:%(asctime)s %(message)s', datefmt='%H:%M:%S') - -parser = argparse.ArgumentParser(description='export static BERT QA example.') - -parser.add_argument('--model_parameters', - type=str, - default=None, - help='Model parameter file') - -parser.add_argument('--bert_model', - type=str, - default='bert_12_768_12', - help='BERT model name. options are bert_12_768_12 and bert_24_1024_16.') - -parser.add_argument('--bert_dataset', - type=str, - default='book_corpus_wiki_en_uncased', - help='BERT dataset name.' 
- 'options are book_corpus_wiki_en_uncased and book_corpus_wiki_en_cased.') - -parser.add_argument('--pretrained_bert_parameters', - type=str, - default=None, - help='Pre-trained bert model parameter file. default is None') - -parser.add_argument('--uncased', - action='store_false', - help='if not set, inputs are converted to lower case.') - -parser.add_argument('--output_dir', - type=str, - default='./output_dir', - help='The output directory where the model params will be written.' - ' default is ./output_dir') - -parser.add_argument('--test_batch_size', - type=int, - default=24, - help='Test batch size. default is 24') - -parser.add_argument('--max_seq_length', - type=int, - default=384, - help='The maximum total input sequence length after WordPiece tokenization.' - 'Sequences longer than this will be truncated, and sequences shorter ' - 'than this will be padded. default is 384') - -parser.add_argument('--doc_stride', - type=int, - default=128, - help='When splitting up a long document into chunks, how much stride to ' - 'take between chunks. default is 128') - -parser.add_argument('--max_query_length', - type=int, - default=64, - help='The maximum number of tokens for the question. Questions longer than ' - 'this will be truncated to this length. default is 64') - -parser.add_argument('--gpu', type=str, help='single gpu id') - -parser.add_argument('--seq_length', - type=int, - default=384, - help='The sequence length of the input') - -parser.add_argument('--input_size', - type=int, - default=768, - help='The embedding size of the input') - -parser.add_argument('--export', - action='store_true', - help='Whether to export the model.') - -parser.add_argument('--evaluate', - action='store_true', - help='Whether to evaluate the model.') - -args = parser.parse_args() - - -output_dir = args.output_dir -if not os.path.exists(output_dir): - os.mkdir(output_dir) - -fh = logging.FileHandler(os.path.join( - args.output_dir, 'static_export_squad.log'), mode='w') -fh.setLevel(logging.INFO) -fh.setFormatter(formatter) -console = logging.StreamHandler() -console.setLevel(logging.INFO) -console.setFormatter(formatter) -log.addHandler(console) -log.addHandler(fh) - -log.info(args) - -model_name = args.bert_model -dataset_name = args.bert_dataset -model_parameters = args.model_parameters -pretrained_bert_parameters = args.pretrained_bert_parameters -lower = args.uncased - -seq_length = args.seq_length -input_size = args.input_size -test_batch_size = args.test_batch_size -ctx = mx.cpu() if not args.gpu else mx.gpu(int(args.gpu)) - -max_seq_length = args.max_seq_length -doc_stride = args.doc_stride -max_query_length = args.max_query_length - -if max_seq_length <= max_query_length + 3: - raise ValueError('The max_seq_length (%d) must be greater than max_query_length ' - '(%d) + 3' % (max_seq_length, max_query_length)) - - -############################################################################### -# Prepare dummy input data # -############################################################################### -if args.evaluate: - inputs = mx.nd.arange(test_batch_size * seq_length).reshape(shape=(test_batch_size, seq_length)) - token_types = mx.nd.zeros_like(inputs) - valid_length = mx.nd.arange(seq_length)[:test_batch_size] - batch = inputs, token_types, valid_length - num_batch = 10 - sample_dataset = [] - for _ in range(num_batch): - sample_dataset.append(batch) - - -bert, vocab = get_model( - name=model_name, - dataset_name=dataset_name, - pretrained=not model_parameters and not 
pretrained_bert_parameters, - ctx=ctx, - use_pooler=False, - use_decoder=False, - use_classifier=False, - input_size=args.input_size, - seq_length=args.seq_length) - - -############################################################################### -# Hybridize the model # -############################################################################### -net = StaticBertForQA(bert=bert) -if pretrained_bert_parameters and not model_parameters: - bert.load_parameters(pretrained_bert_parameters, ctx=ctx, - ignore_extra=True) -if not model_parameters: - net.span_classifier.initialize(init=mx.init.Normal(0.02), ctx=ctx) -else: - net.load_parameters(model_parameters, ctx=ctx) - -net.hybridize(static_alloc=True, static_shape=True) - - -def evaluate(data_source): - """Evaluate the model on a mini-batch. - """ - log.info('Start predict') - tic = time.time() - for batch in data_source: - inputs, token_types, valid_length = batch - net(inputs.astype('float32').as_in_context(ctx), - token_types.astype('float32').as_in_context(ctx), - valid_length.astype('float32').as_in_context(ctx)) - mx.nd.waitall() - toc = time.time() - log.info('Inference time cost={:.2f} s, Thoughput={:.2f} samples/s' - .format(toc - tic, - len(data_source) / (toc - tic))) - - - -############################################################################### -# Export the model # -############################################################################### -if __name__ == '__main__': - if args.export: - net.export(os.path.join(args.output_dir, 'static_net'), epoch=0) - if args.evaluate: - net.load_parameters(os.path.join(args.output_dir, 'static_net-0000.params')) - evaluate(sample_dataset) - else: - if args.evaluate: - evaluate(sample_dataset) diff --git a/scripts/bert/index.rst b/scripts/bert/index.rst index 6a5def3eeb..95d85ddcb5 100644 --- a/scripts/bert/index.rst +++ b/scripts/bert/index.rst @@ -322,57 +322,11 @@ Command line interface Export BERT for Deployment ~~~~~~~~~~~~~~~~~~~~~~~~~~ -The BERTModel class is a subclass of Block, rather than HybridBlock. -To support exporting BERT model to json format for deployment, we introduce the StaticBERT class. -Specifically, by exporting hybridizable BERT, we mean the BERT with fixed input embedding size and sequence length can be exported through -a static shape based implementation of hybridblock based BERT. By using this, we can export a block based BERT model. - -Please follow the steps below for exporting the model. - - -Step 1: create a hybridizable task guided model using BERT: - -.. code-block:: python - - class StaticBertForQA(HybridBlock) - -An example can be found in 'staticbert/static_bert_for_qa_model.py'. - - -Step 2: hybridize the model in the script: - -.. code-block:: python - - net = StaticBertForQA(bert=bert) - net.hybridize(static_alloc=True, static_shape=True) - -An example can be found in 'staticbert/static_export_squad.py'. - - -Step 3: export trained model: - -.. code-block:: python - - net.export(os.path.join(args.output_dir, 'static_net'), epoch=args.epochs) - -To export the model, in 'staticbert/static_export_squad.py', set export=True. - -To run the example, if you would like to export the Block parameters -and test the HybridBlock on your datasets with the specified input size and sequence length, +Current export/export.py support exporting BERT models. Supported values for --task argument include classification, regression and question_answering. .. 
code-block:: console

-   $ cd staticbert
-   $ python static_export_squad.py --model_parameters output_dir/net.params --export --evaluate --seq_length 384 --input_size 768 --gpu 0
-
-This will load the the StaticBERTQA HybridBlock with parameter (requirement: output_dir/net.params should exist)
-trained by a normal BERTQA Block, and export the HybridBlock to json format.
-
-Besides, Where seq_length stands for the sequence length of the input, input_size represents the embedding size of the input.
-
-To load and export the BERT base pretrained model that that is suitable for fine tuning, use the following:
-
-.. code-block:: console
+    $ python export/export.py --task classification --model_parameters /path/to/saved/ckpt.params --output_dir /path/to/output/dir/ --seq_length 256
 
-    $ cd staticbert
-    $ python static_export_base.py --model_parameters --seq_length 128
+This will export the BERT model for classification to a symbol.json file, saved to the directory specified by --output_dir.
+The --model_parameters argument is optional. If not set, the .params file saved in the output directory will contain randomly initialized parameters.
diff --git a/scripts/tests/test_scripts.py b/scripts/tests/test_scripts.py
index 45f7034891..40935f3ee8 100644
--- a/scripts/tests/test_scripts.py
+++ b/scripts/tests/test_scripts.py
@@ -343,3 +343,10 @@ def test_finetune_train(dataset):
     process = subprocess.check_call([sys.executable, './scripts/bert/finetune_classifier.py',
                                      '--task_name', dataset,
                                      '--optimizer', 'adam'] + arguments)
+
+@pytest.mark.serial
+@pytest.mark.integration
+@pytest.mark.parametrize('task', ['classification', 'regression', 'question_answering'])
+def test_export(task):
+    process = subprocess.check_call([sys.executable, './scripts/bert/export/export.py',
+                                     '--task', task])

From 1d9c780e8856e3de05eea57f8ce00ae12d2df573 Mon Sep 17 00:00:00 2001
From: EC2 Default User
Date: Tue, 14 May 2019 22:46:04 +0000
Subject: [PATCH 06/11] fix lint

---
 scripts/bert/export/export.py      | 4 ++--
 scripts/bert/export/hybrid_bert.py | 7 ++-----
 2 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/scripts/bert/export/export.py b/scripts/bert/export/export.py
index 8903924ab9..f1ae1a01a7 100644
--- a/scripts/bert/export/export.py
+++ b/scripts/bert/export/export.py
@@ -195,13 +195,13 @@ def infer(batch, prefix):
     # import with SymbolBlock. Alternatively, you can use Module.load APIs.
imported_net = mx.gluon.nn.SymbolBlock.imports(prefix + '-symbol.json', - ['data0','data1','data2'], + ['data0', 'data1', 'data2'], prefix + '-0000.params') # run forward inference inputs, token_types, valid_length = batch num_trials = 10 for _ in range(num_trials): - net(inputs, token_types, valid_length) + imported_net(inputs, token_types, valid_length) mx.nd.waitall() toc = time.time() log.info('Inference time cost={:.2f} s, Thoughput={:.2f} samples/s' diff --git a/scripts/bert/export/hybrid_bert.py b/scripts/bert/export/hybrid_bert.py index d055946f4c..63313380d8 100644 --- a/scripts/bert/export/hybrid_bert.py +++ b/scripts/bert/export/hybrid_bert.py @@ -23,16 +23,14 @@ 'get_hybrid_bert_model', 'HybridBERTForQA', 'HybridBERTClassifier', 'HybridBERTRegression'] -from mxnet.gluon import HybridBlock, loss, nn -from mxnet.gluon.loss import Loss - import os import math import warnings +import mxnet as mx from mxnet.gluon import Block, HybridBlock from mxnet.gluon import nn -import mxnet as mx + from gluonnlp.model.block import GELU from gluonnlp.model.bert import BERTLayerNorm, BERTEncoderCell, _load_vocab from gluonnlp.model.bert import _load_pretrained_params, bert_hparams @@ -936,4 +934,3 @@ def hybrid_forward(self, _, inputs, token_types, valid_length=None): # pylint: """ _, pooler_out = self.bert(inputs, token_types, valid_length) return self.regression(pooler_out) - From 3b593fd99edc2fabdb029bea65dd80a377c85eaf Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Wed, 15 May 2019 01:30:12 +0000 Subject: [PATCH 07/11] fix unit test --- scripts/bert/export/hybrid_bert.py | 2 +- scripts/tests/{test_static_bert.py => test_hybrid_bert.py} | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) rename scripts/tests/{test_static_bert.py => test_hybrid_bert.py} (97%) diff --git a/scripts/bert/export/hybrid_bert.py b/scripts/bert/export/hybrid_bert.py index 63313380d8..a875cb2c18 100644 --- a/scripts/bert/export/hybrid_bert.py +++ b/scripts/bert/export/hybrid_bert.py @@ -531,7 +531,7 @@ def _apply_pooling(self, sequence): This is used for pre-training or fine-tuning a BERT model. """ - outputs = sequence.slice(begin=(None, 0, None), end=(None, 1, None)) + outputs = sequence.slice(begin=(None, 0, None), end=(None, 1, None)).squeeze(axis=1) return self.pooler(outputs) def _decode(self, sequence, masked_positions): diff --git a/scripts/tests/test_static_bert.py b/scripts/tests/test_hybrid_bert.py similarity index 97% rename from scripts/tests/test_static_bert.py rename to scripts/tests/test_hybrid_bert.py index 868cfe5531..430774da1a 100644 --- a/scripts/tests/test_static_bert.py +++ b/scripts/tests/test_hybrid_bert.py @@ -17,7 +17,7 @@ # specific language governing permissions and limitations # under the License. 
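The _apply_pooling change above adds a squeeze on top of the earlier slice so that the pooled first-token representation keeps shape (batch_size, units); slice and squeeze are used instead of the original `sequence[:, 0, :]` because a hybridized block must stick to operators that exist for both NDArray and Symbol. A small NDArray check of the equivalence (the shapes are assumptions for illustration only):

.. code-block:: python

    import mxnet as mx

    seq = mx.nd.random.uniform(shape=(2, 5, 8))   # (batch_size, seq_length, units), example shapes
    a = seq[:, 0, :]                              # basic indexing, NDArray only
    b = seq.slice(begin=(None, 0, None), end=(None, 1, None)).squeeze(axis=1)
    assert (a == b).asnumpy().all()               # both give the (2, 8) first-token slice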
-"""Test static bert models.""" +"""Test hybrid bert models.""" from __future__ import print_function @@ -30,12 +30,12 @@ import pytest -from ..bert.staticbert.static_bert import get_model +from ..bert.export.hybrid_bert import get_model @pytest.mark.serial @pytest.mark.remote_required -def test_static_bert_models(): +def test_hybrid_bert_models(): models = ['bert_12_768_12', 'bert_24_1024_16'] layers = [12, 24] attention_heads = [12, 16] From 25c4d0ed1d1ae4103df6614db3947c129e233665 Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Wed, 15 May 2019 03:19:19 +0000 Subject: [PATCH 08/11] fix test --- scripts/tests/test_hybrid_bert.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/tests/test_hybrid_bert.py b/scripts/tests/test_hybrid_bert.py index 430774da1a..2d44d6ec48 100644 --- a/scripts/tests/test_hybrid_bert.py +++ b/scripts/tests/test_hybrid_bert.py @@ -30,7 +30,7 @@ import pytest -from ..bert.export.hybrid_bert import get_model +from ..bert.export.hybrid_bert import get_hybrid_model @pytest.mark.serial @@ -109,10 +109,10 @@ def collect_shapes(item, shapes): for kwarg, expected_shape in zip(kwargs, expected_shapes): expected_shape = infer_shape(expected_shape, unit) - model, _ = get_model(model_name, dataset_name=dataset, - pretrained=False, root='tests/data/model/', - seq_length=seq_len, input_size=unit, - **kwarg) + model, _ = get_hybrid_model(model_name, dataset_name=dataset, + pretrained=False, root='tests/data/model/', + seq_length=seq_len, input_size=unit, + **kwarg) model.initialize() if kwarg['use_decoder']: # position tensor is required for decoding From 18ed5b3e60c82bfa492071d8ae4369c91a164b62 Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Wed, 15 May 2019 03:59:36 +0000 Subject: [PATCH 09/11] remove test_bert_static_base_export --- scripts/tests/test_scripts.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/scripts/tests/test_scripts.py b/scripts/tests/test_scripts.py index 40935f3ee8..e3de469aa0 100644 --- a/scripts/tests/test_scripts.py +++ b/scripts/tests/test_scripts.py @@ -176,16 +176,6 @@ def test_bert_embedding(use_pretrained): time.sleep(5) -@pytest.mark.serial -@pytest.mark.remote_required -@pytest.mark.gpu -@pytest.mark.integration -def test_bert_static_base_export(): - args = ['--gpu', '0', '--seq_length', '128'] - process = subprocess.check_call([sys.executable, './scripts/bert/staticbert/static_export_base.py'] + args) - time.sleep(5) - - @pytest.mark.serial @pytest.mark.gpu @pytest.mark.remote_required From 1ac214029d7b99e9bdbb5755ccb421cd74667e6e Mon Sep 17 00:00:00 2001 From: Haibin Lin Date: Wed, 15 May 2019 15:28:13 -0700 Subject: [PATCH 10/11] address comments --- scripts/bert/export/export.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/bert/export/export.py b/scripts/bert/export/export.py index f1ae1a01a7..0e7c20ab8e 100644 --- a/scripts/bert/export/export.py +++ b/scripts/bert/export/export.py @@ -191,12 +191,12 @@ def export(batch, prefix): def infer(batch, prefix): """Evaluate the model on a mini-batch.""" log.info('Start inference ... ') - tic = time.time() # import with SymbolBlock. Alternatively, you can use Module.load APIs. 
     imported_net = mx.gluon.nn.SymbolBlock.imports(prefix + '-symbol.json',
                                                    ['data0', 'data1', 'data2'],
                                                    prefix + '-0000.params')
+    tic = time.time()
     # run forward inference
     inputs, token_types, valid_length = batch
     num_trials = 10

From 33cabc12970218e73bfa3895efbfbbe919a42473 Mon Sep 17 00:00:00 2001
From: Haibin Lin
Date: Fri, 17 May 2019 15:20:16 -0700
Subject: [PATCH 11/11] update default seq len

---
 scripts/bert/index.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/bert/index.rst b/scripts/bert/index.rst
index 95d85ddcb5..7c129add49 100644
--- a/scripts/bert/index.rst
+++ b/scripts/bert/index.rst
@@ -326,7 +326,7 @@ Current export/export.py support exporting BERT models. Supported values for --t
 
 .. code-block:: console
 
-    $ python export/export.py --task classification --model_parameters /path/to/saved/ckpt.params --output_dir /path/to/output/dir/ --seq_length 256
+    $ python export/export.py --task classification --model_parameters /path/to/saved/ckpt.params --output_dir /path/to/output/dir/ --seq_length 128
 
 This will export the BERT model for classification to a symbol.json file, saved to the directory specified by --output_dir.
 The --model_parameters argument is optional. If not set, the .params file saved in the output directory will contain randomly initialized parameters.
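Once the model has been exported as described above, the symbol and parameter files can be reloaded for inference without any of the Python model definitions in this package. A minimal sketch, assuming the <output_dir>/<task> prefix written by export.py and the 128-token sequence length shown above (the path and the all-zero batch are placeholders):

.. code-block:: python

    import mxnet as mx

    prefix = '/path/to/output/dir/classification'   # placeholder: <output_dir>/<task>
    net = mx.gluon.nn.SymbolBlock.imports(prefix + '-symbol.json',
                                          ['data0', 'data1', 'data2'],
                                          prefix + '-0000.params')

    inputs = mx.nd.zeros((1, 128))        # token ids, shape (batch_size, seq_length)
    token_types = mx.nd.zeros((1, 128))   # segment ids
    valid_length = mx.nd.array([128])     # number of non-padded tokens per sample
    print(net(inputs, token_types, valid_length))   # classification scores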