From d99068c6c1b50a96bd4a9d06cd1b5a4ede49c1d6 Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Tue, 14 May 2019 05:16:08 +0000 Subject: [PATCH 01/11] rename folder --- .../bert/{ => export}/staticbert/__init__.py | 0 .../{ => export}/staticbert/static_bert.py | 0 .../staticbert/static_bert_qa_model.py | 0 .../staticbert/static_export_base.py | 0 .../staticbert/static_export_squad.py | 0 .../bert/staticbert/static_finetune_squad.py | 530 ------------------ 6 files changed, 530 deletions(-) rename scripts/bert/{ => export}/staticbert/__init__.py (100%) rename scripts/bert/{ => export}/staticbert/static_bert.py (100%) rename scripts/bert/{ => export}/staticbert/static_bert_qa_model.py (100%) rename scripts/bert/{ => export}/staticbert/static_export_base.py (100%) rename scripts/bert/{ => export}/staticbert/static_export_squad.py (100%) delete mode 100644 scripts/bert/staticbert/static_finetune_squad.py diff --git a/scripts/bert/staticbert/__init__.py b/scripts/bert/export/staticbert/__init__.py similarity index 100% rename from scripts/bert/staticbert/__init__.py rename to scripts/bert/export/staticbert/__init__.py diff --git a/scripts/bert/staticbert/static_bert.py b/scripts/bert/export/staticbert/static_bert.py similarity index 100% rename from scripts/bert/staticbert/static_bert.py rename to scripts/bert/export/staticbert/static_bert.py diff --git a/scripts/bert/staticbert/static_bert_qa_model.py b/scripts/bert/export/staticbert/static_bert_qa_model.py similarity index 100% rename from scripts/bert/staticbert/static_bert_qa_model.py rename to scripts/bert/export/staticbert/static_bert_qa_model.py diff --git a/scripts/bert/staticbert/static_export_base.py b/scripts/bert/export/staticbert/static_export_base.py similarity index 100% rename from scripts/bert/staticbert/static_export_base.py rename to scripts/bert/export/staticbert/static_export_base.py diff --git a/scripts/bert/staticbert/static_export_squad.py b/scripts/bert/export/staticbert/static_export_squad.py similarity index 100% rename from scripts/bert/staticbert/static_export_squad.py rename to scripts/bert/export/staticbert/static_export_squad.py diff --git a/scripts/bert/staticbert/static_finetune_squad.py b/scripts/bert/staticbert/static_finetune_squad.py deleted file mode 100644 index 91db1f8268..0000000000 --- a/scripts/bert/staticbert/static_finetune_squad.py +++ /dev/null @@ -1,530 +0,0 @@ -# coding=utf-8 - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# pylint:disable=redefined-outer-name,logging-format-interpolation -""" -SQuAD with Static Bidirectional Encoder Representations from Transformers (BERT) - -========================================================================================= - -This example shows how to finetune a model with pre-trained BERT parameters with static shape for -SQuAD, with Gluon NLP Toolkit. - -@article{devlin2018bert, - title={BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding}, - author={Devlin, Jacob and Chang, Ming- \ - Wei and Lee, Kenton and Toutanova, Kristina}, - journal={arXiv preprint arXiv:1810.04805}, - year={2018} -} -""" -import argparse -import collections -import json -import logging -import os -import random -import time -import warnings -import sys - -import numpy as np -import mxnet as mx -from mxnet import gluon, nd - -import gluonnlp as nlp - -from static_bert_qa_model import BertForQALoss, StaticBertForQA -from bert_qa_dataset import (SQuADTransform, preprocess_dataset) -from bert_qa_evaluate import get_F1_EM, predictions -from static_bert import get_model - -sys.path.append('..') - -np.random.seed(6) -random.seed(6) -mx.random.seed(6) - -log = logging.getLogger('gluonnlp') -log.setLevel(logging.DEBUG) -formatter = logging.Formatter( - fmt='%(levelname)s:%(name)s:%(asctime)s %(message)s', datefmt='%H:%M:%S') - -parser = argparse.ArgumentParser(description='BERT QA example.' - 'We fine-tune the BERT model on SQuAD dataset.') - -parser.add_argument('--only_predict', - action='store_true', - help='Whether to predict only.') - -parser.add_argument('--model_parameters', - type=str, - default=None, - help='Model parameter file') - -parser.add_argument('--bert_model', - type=str, - default='bert_12_768_12', - help='BERT model name. options are bert_12_768_12 and bert_24_1024_16.') - -parser.add_argument('--bert_dataset', - type=str, - default='book_corpus_wiki_en_uncased', - help='BERT dataset name.' - 'options are book_corpus_wiki_en_uncased and book_corpus_wiki_en_cased.') - -parser.add_argument('--pretrained_bert_parameters', - type=str, - default=None, - help='Pre-trained bert model parameter file. default is None') - -parser.add_argument('--uncased', - action='store_false', - help='if not set, inputs are converted to lower case.') - -parser.add_argument('--output_dir', - type=str, - default='./output_dir', - help='The output directory where the model params will be written.' - ' default is ./output_dir') - -parser.add_argument('--epochs', - type=int, - default=3, - help='number of epochs, default is 3') - -parser.add_argument('--batch_size', - type=int, - default=32, - help='Batch size. Number of examples per gpu in a minibatch. default is 32') - -parser.add_argument('--test_batch_size', - type=int, - default=24, - help='Test batch size. default is 24') - -parser.add_argument('--optimizer', - type=str, - default='bertadam', - help='optimization algorithm. default is bertadam(mxnet >= 1.5.0.)') - -parser.add_argument('--accumulate', - type=int, - default=None, - help='The number of batches for ' - 'gradients accumulation to simulate large batch size. Default is None') - -parser.add_argument('--lr', - type=float, - default=5e-5, - help='Initial learning rate. default is 5e-5') - -parser.add_argument('--warmup_ratio', - type=float, - default=0.1, - help='ratio of warmup steps that linearly increase learning rate from ' - '0 to target learning rate. default is 0.1') - -parser.add_argument('--log_interval', - type=int, - default=50, - help='report interval. 
default is 50') - -parser.add_argument('--max_seq_length', - type=int, - default=384, - help='The maximum total input sequence length after WordPiece tokenization.' - 'Sequences longer than this will be truncated, and sequences shorter ' - 'than this will be padded. default is 384') - -parser.add_argument('--doc_stride', - type=int, - default=128, - help='When splitting up a long document into chunks, how much stride to ' - 'take between chunks. default is 128') - -parser.add_argument('--max_query_length', - type=int, - default=64, - help='The maximum number of tokens for the question. Questions longer than ' - 'this will be truncated to this length. default is 64') - -parser.add_argument('--n_best_size', - type=int, - default=20, - help='The total number of n-best predictions to generate in the ' - 'nbest_predictions.json output file. default is 20') - -parser.add_argument('--max_answer_length', - type=int, - default=30, - help='The maximum length of an answer that can be generated. This is needed ' - 'because the start and end predictions are not conditioned on one another.' - ' default is 30') - -parser.add_argument('--version_2', - action='store_true', - help='SQuAD examples whether contain some that do not have an answer.') - -parser.add_argument('--null_score_diff_threshold', - type=float, - default=0.0, - help='If null_score - best_non_null is greater than the threshold predict null.' - 'Typical values are between -1.0 and -5.0. default is 0.0') - -parser.add_argument('--gpu', type=str, help='single gpu id') - -parser.add_argument('--seq_length', - type=int, - default=384, - help='The sequence length of the input') - -parser.add_argument('--input_size', - type=int, - default=768, - help='The embedding size of the input') - -parser.add_argument('--export', - action='store_true', - help='Whether to export the model.') - -args = parser.parse_args() - - -output_dir = args.output_dir -if not os.path.exists(output_dir): - os.mkdir(output_dir) - -fh = logging.FileHandler(os.path.join( - args.output_dir, 'static_finetune_squad.log'), mode='w') -fh.setLevel(logging.INFO) -fh.setFormatter(formatter) -console = logging.StreamHandler() -console.setLevel(logging.INFO) -console.setFormatter(formatter) -log.addHandler(console) -log.addHandler(fh) - -log.info(args) - -model_name = args.bert_model -dataset_name = args.bert_dataset -only_predict = args.only_predict -model_parameters = args.model_parameters -pretrained_bert_parameters = args.pretrained_bert_parameters -lower = args.uncased - -epochs = args.epochs -batch_size = args.batch_size -test_batch_size = args.test_batch_size -lr = args.lr -ctx = mx.cpu() if not args.gpu else mx.gpu(int(args.gpu)) - -accumulate = args.accumulate -log_interval = args.log_interval * accumulate if accumulate else args.log_interval -if accumulate: - log.info('Using gradient accumulation. Effective batch size = {}'. 
- format(accumulate * batch_size)) - -optimizer = args.optimizer -warmup_ratio = args.warmup_ratio - -version_2 = args.version_2 -null_score_diff_threshold = args.null_score_diff_threshold - -max_seq_length = args.max_seq_length -doc_stride = args.doc_stride -max_query_length = args.max_query_length -n_best_size = args.n_best_size -max_answer_length = args.max_answer_length - -if max_seq_length <= max_query_length + 3: - raise ValueError('The max_seq_length (%d) must be greater than max_query_length ' - '(%d) + 3' % (max_seq_length, max_query_length)) - -bert, vocab = get_model( - name=model_name, - dataset_name=dataset_name, - pretrained=not model_parameters and not pretrained_bert_parameters, - ctx=ctx, - use_pooler=False, - use_decoder=False, - use_classifier=False, - input_size=args.input_size, - seq_length=args.seq_length) - -batchify_fn = nlp.data.batchify.Tuple( - nlp.data.batchify.Stack(), - nlp.data.batchify.Pad(axis=0, pad_val=vocab[vocab.padding_token]), - nlp.data.batchify.Pad(axis=0, pad_val=vocab[vocab.padding_token]), - nlp.data.batchify.Stack('float32'), - nlp.data.batchify.Stack('float32'), - nlp.data.batchify.Stack('float32')) - -berttoken = nlp.data.BERTTokenizer(vocab=vocab, lower=lower) - - -############################################################################### -# Hybridize the model # -############################################################################### -net = StaticBertForQA(bert=bert) -if pretrained_bert_parameters and not model_parameters: - bert.load_parameters(pretrained_bert_parameters, ctx=ctx, - ignore_extra=True) -if not model_parameters: - net.span_classifier.initialize(init=mx.init.Normal(0.02), ctx=ctx) -else: - net.load_parameters(model_parameters, ctx=ctx) -net.hybridize(static_alloc=True, static_shape=True) - -loss_function = BertForQALoss() -loss_function.hybridize(static_alloc=True, static_shape=True) - - -def train(): - """Training function.""" - log.info('Loader Train data...') - if version_2: - train_data = nlp.data.SQuAD('train', version='2.0') - else: - train_data = nlp.data.SQuAD('train', version='1.1') - log.info('Number of records in Train data:{}'.format(len(train_data))) - - train_data_transform, _ = preprocess_dataset( - train_data, SQuADTransform( - berttoken, - max_seq_length=max_seq_length, - doc_stride=doc_stride, - max_query_length=max_query_length, - is_pad=True, - is_training=True)) - log.info('The number of examples after preprocessing:{}'.format( - len(train_data_transform))) - - train_dataloader = mx.gluon.data.DataLoader( - train_data_transform, batchify_fn=batchify_fn, - batch_size=batch_size, num_workers=4, shuffle=True) - - log.info('Start Training') - - optimizer_params = {'learning_rate': lr} - try: - trainer = gluon.Trainer(net.collect_params(), optimizer, - optimizer_params, update_on_kvstore=False) - except ValueError as e: - print(e) - warnings.warn('AdamW optimizer is not found. Please consider upgrading to ' - 'mxnet>=1.5.0. 
Now the original Adam optimizer is used instead.') - trainer = gluon.Trainer(net.collect_params(), 'adam', - optimizer_params, update_on_kvstore=False) - - num_train_examples = len(train_data_transform) - step_size = batch_size * accumulate if accumulate else batch_size - num_train_steps = int(num_train_examples / step_size * epochs) - num_warmup_steps = int(num_train_steps * warmup_ratio) - step_num = 0 - - def set_new_lr(step_num, batch_id): - """set new learning rate""" - # set grad to zero for gradient accumulation - if accumulate: - if batch_id % accumulate == 0: - net.collect_params().zero_grad() - step_num += 1 - else: - step_num += 1 - # learning rate schedule - # Notice that this learning rate scheduler is adapted from traditional linear learning - # rate scheduler where step_num >= num_warmup_steps, new_lr = 1 - step_num/num_train_steps - if step_num < num_warmup_steps: - new_lr = lr * step_num / num_warmup_steps - else: - offset = (step_num - num_warmup_steps) * lr / \ - (num_train_steps - num_warmup_steps) - new_lr = lr - offset - trainer.set_learning_rate(new_lr) - return step_num - - # Do not apply weight decay on LayerNorm and bias terms - for _, v in net.collect_params('.*beta|.*gamma|.*bias').items(): - v.wd_mult = 0.0 - # Collect differentiable parameters - params = [p for p in net.collect_params().values() - if p.grad_req != 'null'] - # Set grad_req if gradient accumulation is required - if accumulate: - for p in params: - p.grad_req = 'add' - - epoch_tic = time.time() - total_num = 0 - log_num = 0 - for epoch_id in range(epochs): - step_loss = 0.0 - tic = time.time() - for batch_id, data in enumerate(train_dataloader): - # set new lr - step_num = set_new_lr(step_num, batch_id) - # forward and backward - with mx.autograd.record(): - _, inputs, token_types, valid_length, start_label, end_label = data - - log_num += len(inputs) - total_num += len(inputs) - - out = net(inputs.astype('float32').as_in_context(ctx), - token_types.astype('float32').as_in_context(ctx), - valid_length.astype('float32').as_in_context(ctx)) - - ls = loss_function(out, [ - start_label.astype('float32').as_in_context(ctx), - end_label.astype('float32').as_in_context(ctx)]).mean() - - if accumulate: - ls = ls / accumulate - ls.backward() - # update - if not accumulate or (batch_id + 1) % accumulate == 0: - trainer.allreduce_grads() - nlp.utils.clip_grad_global_norm(params, 1) - trainer.update(1) - - step_loss += ls.asscalar() - - if (batch_id + 1) % log_interval == 0: - toc = time.time() - log.info('Epoch: %d, Batch: %d/%d, Loss=%.4f, lr=%.7f ' - 'Time cost=%.1f Thoughput=%.2f samples/s', - epoch_id, batch_id, len(train_dataloader), - step_loss / log_interval, - trainer.learning_rate, toc - tic, log_num / (toc - tic)) - tic = time.time() - step_loss = 0.0 - log_num = 0 - epoch_toc = time.time() - log.info('Epoch: {}, Time cost={:.2f} s, Thoughput={:.2f} samples/s' - .format(epoch_id, epoch_toc - epoch_tic, - len(train_dataloader) / (epoch_toc - epoch_tic))) - - net.save_parameters(os.path.join(output_dir, 'net.params')) - - -def evaluate(): - """Evaluate the model on validation dataset. 
- """ - log.info('Loader dev data...') - if version_2: - dev_data = nlp.data.SQuAD('dev', version='2.0') - else: - dev_data = nlp.data.SQuAD('dev', version='1.1') - log.info('Number of records in Train data:{}'.format(len(dev_data))) - - dev_dataset = dev_data.transform( - SQuADTransform( - berttoken, - max_seq_length=max_seq_length, - doc_stride=doc_stride, - max_query_length=max_query_length, - is_pad=True, - is_training=False)._transform) - - dev_data_transform, _ = preprocess_dataset( - dev_data, SQuADTransform( - berttoken, - max_seq_length=max_seq_length, - doc_stride=doc_stride, - max_query_length=max_query_length, - is_pad=True, - is_training=False)) - log.info('The number of examples after preprocessing:{}'.format( - len(dev_data_transform))) - - dev_dataloader = mx.gluon.data.DataLoader( - dev_data_transform, - batchify_fn=batchify_fn, - num_workers=4, batch_size=test_batch_size, shuffle=False, last_batch='keep') - - log.info('Start predict') - - _Result = collections.namedtuple( - '_Result', ['example_id', 'start_logits', 'end_logits']) - all_results = {} - - epoch_tic = time.time() - total_num = 0 - for data in dev_dataloader: - example_ids, inputs, token_types, valid_length, _, _ = data - total_num += len(inputs) - out = net(inputs.astype('float32').as_in_context(ctx), - token_types.astype('float32').as_in_context(ctx), - valid_length.astype('float32').as_in_context(ctx)) - - output = nd.split(out, axis=2, num_outputs=2) - start_logits = output[0].reshape((0, -3)).asnumpy() - end_logits = output[1].reshape((0, -3)).asnumpy() - - for example_id, start, end in zip(example_ids, start_logits, end_logits): - example_id = example_id.asscalar() - if example_id not in all_results: - all_results[example_id] = [] - all_results[example_id].append( - _Result(example_id, start.tolist(), end.tolist())) - epoch_toc = time.time() - log.info('Inference time cost={:.2f} s, Thoughput={:.2f} samples/s' - .format(epoch_toc - epoch_tic, - len(dev_dataloader) / (epoch_toc - epoch_tic))) - log.info('Get prediction results...') - - all_predictions, all_nbest_json, scores_diff_json = predictions( - dev_dataset=dev_dataset, - all_results=all_results, - tokenizer=nlp.data.BERTBasicTokenizer(lower=lower), - max_answer_length=max_answer_length, - null_score_diff_threshold=null_score_diff_threshold, - n_best_size=n_best_size, - version_2=version_2) - - with open(os.path.join(output_dir, 'predictions.json'), - 'w', encoding='utf-8') as all_predictions_write: - all_predictions_write.write(json.dumps(all_predictions)) - - with open(os.path.join(output_dir, 'nbest_predictions.json'), - 'w', encoding='utf-8') as all_predictions_write: - all_predictions_write.write(json.dumps(all_nbest_json)) - - if version_2: - with open(os.path.join(output_dir, 'null_odds.json'), - 'w', encoding='utf-8') as all_predictions_write: - all_predictions_write.write(json.dumps(scores_diff_json)) - else: - log.info(get_F1_EM(dev_data, all_predictions)) - - -############################################################################### -# Export the model # -############################################################################### -if __name__ == '__main__': - if not only_predict: - train() - evaluate() - if args.export: - net.export(os.path.join(args.output_dir, 'static_net'), epoch=args.epochs) - elif model_parameters: - evaluate() - if args.export: - net.export(os.path.join(args.output_dir, 'static_net'), epoch=args.epochs) From 98c3e90aab1ba274a96064f5a09a1e8ae5bb6098 Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Tue, 
14 May 2019 06:36:37 +0000 Subject: [PATCH 02/11] rename folder --- scripts/bert/export/{staticbert => }/__init__.py | 0 .../export/{staticbert/static_export_base.py => export_static.py} | 0 scripts/bert/export/{staticbert => }/static_bert.py | 0 scripts/bert/export/{staticbert => }/static_bert_qa_model.py | 0 scripts/bert/export/{staticbert => }/static_export_squad.py | 0 5 files changed, 0 insertions(+), 0 deletions(-) rename scripts/bert/export/{staticbert => }/__init__.py (100%) rename scripts/bert/export/{staticbert/static_export_base.py => export_static.py} (100%) rename scripts/bert/export/{staticbert => }/static_bert.py (100%) rename scripts/bert/export/{staticbert => }/static_bert_qa_model.py (100%) rename scripts/bert/export/{staticbert => }/static_export_squad.py (100%) diff --git a/scripts/bert/export/staticbert/__init__.py b/scripts/bert/export/__init__.py similarity index 100% rename from scripts/bert/export/staticbert/__init__.py rename to scripts/bert/export/__init__.py diff --git a/scripts/bert/export/staticbert/static_export_base.py b/scripts/bert/export/export_static.py similarity index 100% rename from scripts/bert/export/staticbert/static_export_base.py rename to scripts/bert/export/export_static.py diff --git a/scripts/bert/export/staticbert/static_bert.py b/scripts/bert/export/static_bert.py similarity index 100% rename from scripts/bert/export/staticbert/static_bert.py rename to scripts/bert/export/static_bert.py diff --git a/scripts/bert/export/staticbert/static_bert_qa_model.py b/scripts/bert/export/static_bert_qa_model.py similarity index 100% rename from scripts/bert/export/staticbert/static_bert_qa_model.py rename to scripts/bert/export/static_bert_qa_model.py diff --git a/scripts/bert/export/staticbert/static_export_squad.py b/scripts/bert/export/static_export_squad.py similarity index 100% rename from scripts/bert/export/staticbert/static_export_squad.py rename to scripts/bert/export/static_export_squad.py From 7815a5353990309725897e6a9e948e62e9527116 Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Tue, 14 May 2019 06:37:33 +0000 Subject: [PATCH 03/11] draft for export --- scripts/bert/export/export_static.py | 174 ++++++++------------ scripts/bert/export/static_bert.py | 109 +++++++++++- scripts/bert/export/static_bert_qa_model.py | 106 ------------ scripts/bert/index.rst | 97 +---------- 4 files changed, 178 insertions(+), 308 deletions(-) delete mode 100644 scripts/bert/export/static_bert_qa_model.py diff --git a/scripts/bert/export/export_static.py b/scripts/bert/export/export_static.py index 21607c630e..43248693fa 100644 --- a/scripts/bert/export/export_static.py +++ b/scripts/bert/export/export_static.py @@ -1,9 +1,16 @@ """ -Export Base Static Model (BERT) +Export the BERT Model for Deployment -========================================================================================= +==================================== -This will export the base BERT model to a static model suitable for use in MXNet Module API. +This script exports the BERT model to a static model suitable for use with MXNet Module API. 
+ +@article{devlin2018bert, + title={BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding}, + author={Devlin, Jacob and Chang, Ming- \ + Wei and Lee, Kenton and Toutanova, Kristina}, + journal={arXiv preprint arXiv:1810.04805}, + year={2018} } """ @@ -29,96 +36,71 @@ import argparse import logging +import warnings import os import time import mxnet as mx - +import gluonnlp as nlp from static_bert import get_model -log = logging.getLogger('gluonnlp') -log.setLevel(logging.DEBUG) -formatter = logging.Formatter( - fmt='%(levelname)s:%(name)s:%(asctime)s %(message)s', datefmt='%H:%M:%S') - -parser = argparse.ArgumentParser(description='export static BERT base model.') +parser = argparse.ArgumentParser(description='Export static BERT base model.') parser.add_argument('--model_parameters', type=str, default=None, - help='Model parameter file') + help='The model parameter file saved from training.') -parser.add_argument('--bert_model', +parser.add_argument('--model_name', type=str, default='bert_12_768_12', - help='BERT model name. options are bert_12_768_12 and bert_24_1024_16.') + choices=['bert_12_768_12', 'bert_24_1024_16'], + help='BERT model name. Options are "bert_12_768_12" and "bert_24_1024_16"') -parser.add_argument('--bert_dataset', - type=str, - default='book_corpus_wiki_en_uncased', - help='BERT dataset name.' - 'options are book_corpus_wiki_en_uncased and book_corpus_wiki_en_cased.') - -parser.add_argument('--pretrained_bert_parameters', +parser.add_argument('--task', type=str, default=None, - help='Pre-trained bert model parameter file. default is None') + choices=['classification', 'regression', 'qa'], + help='Task to export. Options are "classification", "regression", "qa". ' + 'If not set, the model for masked language model and next sentence ' + 'prediction will be exported.') -parser.add_argument('--uncased', - action='store_false', - help='if not set, inputs are converted to lower case.') +parser.add_argument('--dataset_name', + type=str, + default='book_corpus_wiki_en_uncased', + choices=['book_corpus_wiki_en_uncased', 'book_corpus_wiki_en_cased', + 'wiki_multilingual_uncased', 'wiki_multilingual_cased', + 'wiki_cn_cased'], + help='BERT dataset name. Options include ' + '"book_corpus_wiki_en_uncased", "book_corpus_wiki_en_cased", ' + '"wiki_multilingual_uncased", "wiki_multilingual_cased", ' + '"wiki_cn_cased"') parser.add_argument('--output_dir', type=str, default='./output_dir', - help='The output directory where the model params will be written.' - ' default is ./output_dir') - -parser.add_argument('--test_batch_size', - type=int, - default=24, - help='Test batch size. default is 24') - -parser.add_argument('--max_seq_length', - type=int, - default=384, - help='The maximum total input sequence length after WordPiece tokenization.' - 'Sequences longer than this will be truncated, and sequences shorter ' - 'than this will be padded. default is 384') - -parser.add_argument('--doc_stride', - type=int, - default=128, - help='When splitting up a long document into chunks, how much stride to ' - 'take between chunks. default is 128') - -parser.add_argument('--max_query_length', - type=int, - default=64, - help='The maximum number of tokens for the question. Questions longer than ' - 'this will be truncated to this length. default is 64') - -parser.add_argument('--gpu', type=str, help='single gpu id') + help='The directory where the exported model symbol will be created. 
' + 'The default is ./output_dir') parser.add_argument('--seq_length', type=int, default=384, - help='The sequence length of the input') - -parser.add_argument('--input_size', - type=int, - default=768, - help='The embedding size of the input') + help='The maximum total input sequence length after WordPiece tokenization.' + 'Sequences longer than this needs to be truncated, and sequences shorter ' + 'than this needs to be padded. Default is 384') args = parser.parse_args() - +# create output dir output_dir = args.output_dir -if not os.path.exists(output_dir): - os.mkdir(output_dir) +nlp.utils.mkdir(output_dir) -fh = logging.FileHandler(os.path.join( - args.output_dir, 'static_export_bert_base.log'), mode='w') +# logging +log = logging.getLogger('gluonnlp') +log.setLevel(logging.DEBUG) +formatter = logging.Formatter(fmt='%(levelname)s:%(name)s:%(asctime)s %(message)s', + datefmt='%H:%M:%S') +fh = logging.FileHandler(os.path.join(args.output_dir, 'static_export_bert.log'), mode='w') fh.setLevel(logging.INFO) fh.setFormatter(formatter) console = logging.StreamHandler() @@ -129,49 +111,31 @@ log.info(args) -model_name = args.bert_model -dataset_name = args.bert_dataset model_parameters = args.model_parameters -pretrained_bert_parameters = args.pretrained_bert_parameters -lower = args.uncased - seq_length = args.seq_length -input_size = args.input_size -test_batch_size = args.test_batch_size -ctx = mx.cpu() if not args.gpu else mx.gpu(int(args.gpu)) - -max_seq_length = args.max_seq_length -doc_stride = args.doc_stride -max_query_length = args.max_query_length - -if max_seq_length <= max_query_length + 3: - raise ValueError('The max_seq_length (%d) must be greater than max_query_length ' - '(%d) + 3' % (max_seq_length, max_query_length)) - +test_batch_size = 1 +ctx = mx.cpu() ############################################################################### # Prepare dummy input data # ############################################################################### -inputs = mx.nd.arange(test_batch_size * seq_length).reshape(shape=(test_batch_size, seq_length)) +inputs = mx.nd.arange(test_batch_size * seq_length) +inputs = inputs.reshape(shape=(test_batch_size, seq_length)) token_types = mx.nd.zeros_like(inputs) -valid_length = mx.nd.arange(seq_length)[:test_batch_size] +valid_length = mx.nd.arange(test_batch_size) batch = inputs, token_types, valid_length num_batch = 10 -sample_dataset = [] -for _ in range(num_batch): - sample_dataset.append(batch) - +sample_dataset = [batch for _ in range(10)] bert, vocab = get_model( - name=model_name, - dataset_name=dataset_name, - pretrained=not model_parameters and not pretrained_bert_parameters, + name=args.model_name, + dataset_name=args.dataset_name, + pretrained=True, ctx=ctx, use_pooler=False, use_decoder=False, use_classifier=False, - input_size=args.input_size, seq_length=args.seq_length) @@ -179,28 +143,26 @@ # Hybridize the model # ############################################################################### net = bert -if pretrained_bert_parameters and not model_parameters: - bert.load_parameters(pretrained_bert_parameters, ctx=ctx, - ignore_extra=True) -net.hybridize(static_alloc=True, static_shape=True) +if args.task == 'classification': + net = StaticBERTClassifier(net, num_classes=2) +if model_parameters: + bert.load_parameters(model_parameters, ctx=ctx) +else: + warnings.warn('using random initialization') + +net.hybridize(static_alloc=True, static_shape=True) def evaluate(data_source): - """Evaluate the model on a mini-batch. 
- """ - log.info('Start predict') + """Evaluate the model on a mini-batch.""" + log.info('start predicting ... ') tic = time.time() - for batch in data_source: - inputs, token_types, valid_length = batch - net(inputs.astype('float32').as_in_context(ctx), - token_types.astype('float32').as_in_context(ctx), - valid_length.astype('float32').as_in_context(ctx)) + for inputs, token_types, valid_length in data_source: + net(inputs.as_in_context(ctx), token_types.as_in_context(ctx), + valid_length.as_in_context(ctx)) toc = time.time() log.info('Inference time cost={:.2f} s, Thoughput={:.2f} samples/s' - .format(toc - tic, - len(data_source) / (toc - tic))) - - + .format(toc - tic, len(data_source) / (toc - tic))) ############################################################################### # Export the model # diff --git a/scripts/bert/export/static_bert.py b/scripts/bert/export/static_bert.py index 569942d705..cc11670adc 100644 --- a/scripts/bert/export/static_bert.py +++ b/scripts/bert/export/static_bert.py @@ -19,7 +19,11 @@ """Static BERT models.""" __all__ = ['StaticBERTModel', 'StaticBERTEncoder', - 'get_model', 'bert_12_768_12', 'bert_24_1024_16', 'get_static_bert_model'] + 'get_model', 'bert_12_768_12', 'bert_24_1024_16', 'get_static_bert_model', + 'StaticBertForQA', 'StaticBERTClassifier'] + +from mxnet.gluon import HybridBlock, loss, nn +from mxnet.gluon.loss import Loss import os import math @@ -783,3 +787,106 @@ def get_static_bert_model(model_name=None, dataset_name=None, vocab=None, _load_pretrained_params(net, model_name, dataset_name, root, ctx, ignore_extra=ignore_extra) return net, bert_vocab + +#create a hybridizable task guided model using BERT +class StaticBertForQA(HybridBlock): + """Hybridizable Model for SQuAD task with BERT. + + The model feeds token ids and token type ids into BERT to get the + pooled BERT sequence representation, then apply a Dense layer for QA task. + + Parameters + ---------- + bert: BERTModel + Bidirectional encoder with transformer. + prefix : str or None + See document of `mx.gluon.HybridBlock`. + params : ParameterDict or None + See document of `mx.gluon.HybridBlock`. + """ + + def __init__(self, bert, prefix=None, params=None): + super(StaticBertForQA, self).__init__(prefix=prefix, params=params) + self.bert = bert + with self.name_scope(): + self.span_classifier = nn.Dense(units=2, flatten=False) + + def hybrid_forward(self, F, inputs, token_types, valid_length=None): + # pylint: disable=arguments-differ + # pylint: disable=unused-argument + """Generate the unnormalized score for the given the input sequences. + + Parameters + ---------- + inputs : NDArray, shape (batch_size, seq_length) + Input words for the sequences. + token_types : NDArray, shape (batch_size, seq_length) + Token types for the sequences, used to indicate whether the word belongs to the + first sentence or the second one. + valid_length : NDArray or None, shape (batch_size,) + Valid length of the sequence. This is used to mask the padded tokens. + + Returns + ------- + outputs : NDArray + Shape (batch_size, seq_length, 2) + """ + bert_output = self.bert(inputs, token_types, valid_length) + output = self.span_classifier(bert_output) + return output + +class StaticBERTClassifier(HybridBlock): + """Model for sentence (pair) classification task with BERT. + + The model feeds token ids and token type ids into BERT to get the + pooled BERT sequence representation, then apply a Dense layer for + classification. 
+ + Parameters + ---------- + bert: BERTModel + Bidirectional encoder with transformer. + num_classes : int, default is 2 + The number of target classes. + dropout : float or None, default 0.0. + Dropout probability for the bert output. + prefix : str or None + See document of `mx.gluon.Block`. + params : ParameterDict or None + See document of `mx.gluon.Block`. + """ + + def __init__(self, + bert, + num_classes=2, + dropout=0.0, + prefix=None, + params=None): + super(BERTClassifier, self).__init__(prefix=prefix, params=params) + self.bert = bert + with self.name_scope(): + self.classifier = nn.HybridSequential(prefix=prefix) + if dropout: + self.classifier.add(nn.Dropout(rate=dropout)) + self.classifier.add(nn.Dense(units=num_classes)) + + def hybrid_forward(self, F, inputs, token_types, valid_length=None): # pylint: disable=arguments-differ + """Generate the unnormalized score for the given the input sequences. + + Parameters + ---------- + inputs : NDArray, shape (batch_size, seq_length) + Input words for the sequences. + token_types : NDArray, shape (batch_size, seq_length) + Token types for the sequences, used to indicate whether the word belongs to the + first sentence or the second one. + valid_length : NDArray or None, shape (batch_size) + Valid length of the sequence. This is used to mask the padded tokens. + + Returns + ------- + outputs : NDArray + Shape (batch_size, num_classes) + """ + _, pooler_out = self.bert(inputs, token_types, valid_length) + return self.classifier(pooler_out) diff --git a/scripts/bert/export/static_bert_qa_model.py b/scripts/bert/export/static_bert_qa_model.py deleted file mode 100644 index e7980cd2e7..0000000000 --- a/scripts/bert/export/static_bert_qa_model.py +++ /dev/null @@ -1,106 +0,0 @@ -# coding: utf-8 - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""Static BertForQA models.""" - -__all__ = ['StaticBertForQA', 'BertForQALoss'] - -from mxnet.gluon import HybridBlock, loss, nn -from mxnet.gluon.loss import Loss - - -#create a hybridizable task guided model using BERT -class StaticBertForQA(HybridBlock): - """Hybridizable Model for SQuAD task with BERT. - - The model feeds token ids and token type ids into BERT to get the - pooled BERT sequence representation, then apply a Dense layer for QA task. - - Parameters - ---------- - bert: BERTModel - Bidirectional encoder with transformer. - prefix : str or None - See document of `mx.gluon.HybridBlock`. - params : ParameterDict or None - See document of `mx.gluon.HybridBlock`. 
- """ - - def __init__(self, bert, prefix=None, params=None): - super(StaticBertForQA, self).__init__(prefix=prefix, params=params) - self.bert = bert - with self.name_scope(): - self.span_classifier = nn.Dense(units=2, flatten=False) - - def hybrid_forward(self, F, inputs, token_types, valid_length=None): - # pylint: disable=arguments-differ - # pylint: disable=unused-argument - """Generate the unnormalized score for the given the input sequences. - - Parameters - ---------- - inputs : NDArray, shape (batch_size, seq_length) - Input words for the sequences. - token_types : NDArray, shape (batch_size, seq_length) - Token types for the sequences, used to indicate whether the word belongs to the - first sentence or the second one. - valid_length : NDArray or None, shape (batch_size,) - Valid length of the sequence. This is used to mask the padded tokens. - - Returns - ------- - outputs : NDArray - Shape (batch_size, seq_length, 2) - """ - bert_output = self.bert(inputs, token_types, valid_length) - output = self.span_classifier(bert_output) - return output - - -class BertForQALoss(Loss): - """Loss for SQuAD task with BERT. - - """ - - def __init__(self, weight=None, batch_axis=0, **kwargs): # pylint: disable=unused-argument - super(BertForQALoss, self).__init__( - weight=None, batch_axis=0, **kwargs) - self.loss = loss.SoftmaxCELoss() - - def hybrid_forward(self, F, pred, label): # pylint: disable=arguments-differ - """ - Parameters - ---------- - pred : NDArray, shape (batch_size, seq_length, 2) - BERTSquad forward output. - label : list, length is 2, each shape is (batch_size,1) - label[0] is the starting position of the answer, - label[1] is the ending position of the answer. - - Returns - ------- - outputs : NDArray - Shape (batch_size,) - """ - pred = F.split(pred, axis=2, num_outputs=2) - start_pred = pred[0].reshape((0, -3)) - start_label = label[0] - end_pred = pred[1].reshape((0, -3)) - end_label = label[1] - return (self.loss(start_pred, start_label) + self.loss( - end_pred, end_label)) / 2 diff --git a/scripts/bert/index.rst b/scripts/bert/index.rst index 07ae1b4d8c..6a5def3eeb 100644 --- a/scripts/bert/index.rst +++ b/scripts/bert/index.rst @@ -319,8 +319,8 @@ Command line interface -0.1820574 , -0.16115054], dtype=float32)] -Example Usage of Exporting Hybridizable BERT -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Export BERT for Deployment +~~~~~~~~~~~~~~~~~~~~~~~~~~ The BERTModel class is a subclass of Block, rather than HybridBlock. To support exporting BERT model to json format for deployment, we introduce the StaticBERT class. @@ -376,96 +376,3 @@ To load and export the BERT base pretrained model that that is suitable for fine $ cd staticbert $ python static_export_base.py --model_parameters --seq_length 128 - - - -Example Usage of Finetuning Hybridizable BERT -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -This example mainly introduces the steps needed to use the hybridizable BERT models to finetune on a specific NLP task. -We use SQuAD dataset for Question Answering as an example. - - -Step 1-3 are the same as in previous section 'Example Usage of Exporting Hybridizable BERT', -where an example of Step 1 can be found in 'staticbert/static_bert_for_qa_model.py', -an example of Step 2-3 can be found in 'staticbert/static_finetune_squad.py'. -To export the model, in 'staticbert/static_finetune_squad.py', set export=True. - - -For all model settings above, we set learning rate = 3e-5 and optimizer = adam. 
-Besides, seq_length stands for the sequence length of the input, input_size represents the embedding size of the input. -The options can be specified in the following command lines. - - -+-----------------------+----------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ -| | SQuAD 1.1 | SQuAD 1.1 | SQuAD 2.0 | -+=======================+============================================================================================================================+=============================================================================================================================+=============================================================================================================================+ -| model | bert_12_768_12 | bert_24_1024_16 | bert_24_1024_16 | -+-----------------------+----------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ -| F1 | 88.54 | 90.84 | 81.46 | -+-----------------------+----------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ -| EM | 81.10 | 84.03 | 78.49 | -+-----------------------+----------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ -| batch_size | 12 | 4 | 4 | -+-----------------------+----------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ -| gradient accumulation | None | 6 | 8 | -+-----------------------+----------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ -| epochs | 2 | 2 | 2 | 
-+-----------------------+----------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ -| training log | `log `__ | `log `__ | `log `__ | -+-----------------------+----------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ -| command | [8] | [9] | [10] | -+-----------------------+----------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ - -BERT BASE on SQuAD 1.1 -++++++++++++++++++++++ - -[8] bert_12_768_12 - -.. code-block:: console - - $ cd staticbert - $ python static_finetune_squad.py --optimizer adam --batch_size 12 --lr 3e-5 --epochs 2 --gpu 0 --export - - -BERT LARGE on SQuAD 1.1 -+++++++++++++++++++++++ - -[9] bert_24_1024_16 - -.. code-block:: console - - $ cd staticbert - $ python static_finetune_squad.py --bert_model bert_24_1024_16 --optimizer adam --accumulate 6 --batch_size 4 --lr 3e-5 --epochs 2 --gpu 0 --export - - -BERT LARGE on SQuAD 2.0 -+++++++++++++++++++++++ - -[10] bert_24_1024_16 - -.. code-block:: console - - $ cd staticbert - $ python static_finetune_squad.py --bert_model bert_24_1024_16 --optimizer adam --accumulate 8 --batch_size 4 --lr 3e-5 --epochs 2 --gpu 0 --null_score_diff_threshold -2.0 --version_2 --export - -To get the score of the dev data, you need to download the dev dataset (`dev-v2.0.json `_) and the evaluate script (`evaluate-2.0.py `_). Then use the following command to get the score of the dev dataset. - -.. code-block:: console - - $ cd staticbert - $ python evaluate-v2.0.py dev-v2.0.json predictions.json - -.. code-block:: json - - { - "exact": 78.49743114629833, - "f1": 81.46366127573552, - "total": 11873, - "HasAns_exact": 73.38056680161944, - "HasAns_f1": 79.32153345593925, - "HasAns_total": 5928, - "NoAns_exact": 83.59966358284272, - "NoAns_f1": 83.59966358284272, - "NoAns_total": 5945 - } From 492d98cb8f51a218cf3f4489a5a8ef0eb169e64e Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Tue, 14 May 2019 22:08:00 +0000 Subject: [PATCH 04/11] suppport qa,regression, classification --- scripts/bert/export/__init__.py | 4 +- .../export/{export_static.py => export.py} | 140 ++++++++++++------ .../export/{static_bert.py => hybrid_bert.py} | 137 +++++++++++------ 3 files changed, 185 insertions(+), 96 deletions(-) rename scripts/bert/export/{export_static.py => export.py} (60%) rename scripts/bert/export/{static_bert.py => hybrid_bert.py} (90%) diff --git a/scripts/bert/export/__init__.py b/scripts/bert/export/__init__.py index e1627b1e3a..791e5fd1bd 100644 --- a/scripts/bert/export/__init__.py +++ b/scripts/bert/export/__init__.py @@ -18,5 +18,5 @@ # under the License. 
# pylint: disable=wildcard-import -"""static BERT example.""" -from . import static_bert, static_bert_qa_model +"""Hybrid BERT for deployment.""" +from . import hybrid_bert diff --git a/scripts/bert/export/export_static.py b/scripts/bert/export/export.py similarity index 60% rename from scripts/bert/export/export_static.py rename to scripts/bert/export/export.py index 43248693fa..79f77e27a7 100644 --- a/scripts/bert/export/export_static.py +++ b/scripts/bert/export/export.py @@ -3,7 +3,7 @@ ==================================== -This script exports the BERT model to a static model suitable for use with MXNet Module API. +This script exports the BERT model to a hybrid model suitable for use with MXNet Module API. @article{devlin2018bert, title={BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding}, @@ -42,9 +42,10 @@ import mxnet as mx import gluonnlp as nlp -from static_bert import get_model +from hybrid_bert import get_hybrid_model +from hybrid_bert import HybridBERTClassifier, HybridBERTRegression, HybridBERTForQA -parser = argparse.ArgumentParser(description='Export static BERT base model.') +parser = argparse.ArgumentParser(description='Export hybrid BERT base model.') parser.add_argument('--model_parameters', type=str, @@ -59,11 +60,10 @@ parser.add_argument('--task', type=str, - default=None, - choices=['classification', 'regression', 'qa'], - help='Task to export. Options are "classification", "regression", "qa". ' - 'If not set, the model for masked language model and next sentence ' - 'prediction will be exported.') + choices=['classification', 'regression', 'question_answering'], + required=True, + help='Task to export. Options are "classification", "regression", ' + '"question_answering"') parser.add_argument('--dataset_name', type=str, @@ -89,18 +89,26 @@ 'Sequences longer than this needs to be truncated, and sequences shorter ' 'than this needs to be padded. 
Default is 384') +parser.add_argument('--dropout', + type=float, + default=0.1, + help='The dropout probability for the classification/regression head.') + args = parser.parse_args() # create output dir output_dir = args.output_dir nlp.utils.mkdir(output_dir) -# logging +############################################################################### +# Logging # +############################################################################### + log = logging.getLogger('gluonnlp') log.setLevel(logging.DEBUG) formatter = logging.Formatter(fmt='%(levelname)s:%(name)s:%(asctime)s %(message)s', datefmt='%H:%M:%S') -fh = logging.FileHandler(os.path.join(args.output_dir, 'static_export_bert.log'), mode='w') +fh = logging.FileHandler(os.path.join(args.output_dir, 'hybrid_export_bert.log'), mode='w') fh.setLevel(logging.INFO) fh.setFormatter(formatter) console = logging.StreamHandler() @@ -108,65 +116,99 @@ console.setFormatter(formatter) log.addHandler(console) log.addHandler(fh) - log.info(args) -model_parameters = args.model_parameters +############################################################################### +# Hybridize the model # +############################################################################### + seq_length = args.seq_length -test_batch_size = 1 -ctx = mx.cpu() + +if args.task == 'classification': + bert, _ = get_hybrid_model( + name=args.model_name, + dataset_name=args.dataset_name, + pretrained=False, + use_pooler=True, + use_decoder=False, + use_classifier=False, + seq_length=args.seq_length) + net = HybridBERTClassifier(bert, num_classes=2, dropout=args.dropout) +elif args.task == 'regression': + bert, _ = get_hybrid_model( + name=args.model_name, + dataset_name=args.dataset_name, + pretrained=False, + use_pooler=True, + use_decoder=False, + use_classifier=False, + seq_length=args.seq_length) + net = HybridBERTRegression(bert, dropout=args.dropout) +elif args.task == 'question_answering': + bert, _ = get_hybrid_model( + name=args.model_name, + dataset_name=args.dataset_name, + pretrained=False, + use_pooler=False, + use_decoder=False, + use_classifier=False, + seq_length=args.seq_length) + net = HybridBERTForQA(bert) +else: + raise ValueError('unknown task: %s'%args.task) + +if args.model_parameters: + net.load_parameters(args.model_parameters) +else: + net.initialize() + warnings.warn('--model_parameters is not provided. The parameter checkpoint (.params) ' + 'file will be created based on default parameter intialization.') + +net.hybridize(static_alloc=True, static_shape=True) ############################################################################### -# Prepare dummy input data # +# Prepare dummy input data # ############################################################################### +test_batch_size = 1 + inputs = mx.nd.arange(test_batch_size * seq_length) inputs = inputs.reshape(shape=(test_batch_size, seq_length)) token_types = mx.nd.zeros_like(inputs) valid_length = mx.nd.arange(test_batch_size) batch = inputs, token_types, valid_length -num_batch = 10 -sample_dataset = [batch for _ in range(10)] -bert, vocab = get_model( - name=args.model_name, - dataset_name=args.dataset_name, - pretrained=True, - ctx=ctx, - use_pooler=False, - use_decoder=False, - use_classifier=False, - seq_length=args.seq_length) +def export(batch, prefix): + """Export the model.""" + log.info('Exporting the model ... 
') + inputs, token_types, valid_length = batch + net(inputs, token_types, valid_length) + net.export(prefix, epoch=0) + assert os.path.isfile(prefix + '-symbol.json') + assert os.path.isfile(prefix + '-0000.params') - -############################################################################### -# Hybridize the model # -############################################################################### -net = bert -if args.task == 'classification': - net = StaticBERTClassifier(net, num_classes=2) - -if model_parameters: - bert.load_parameters(model_parameters, ctx=ctx) -else: - warnings.warn('using random initialization') - -net.hybridize(static_alloc=True, static_shape=True) - -def evaluate(data_source): +def infer(batch, prefix): """Evaluate the model on a mini-batch.""" - log.info('start predicting ... ') + log.info('Start inference ... ') tic = time.time() - for inputs, token_types, valid_length in data_source: - net(inputs.as_in_context(ctx), token_types.as_in_context(ctx), - valid_length.as_in_context(ctx)) + # import with SymbolBlock. Alternatively, you can use Module.load APIs. + inputs, token_types, valid_length = batch + num_trials = 10 + imported_net = mx.gluon.nn.SymbolBlock.imports(prefix + '-symbol.json', + ['data0','data1','data2'], + prefix + '-0000.params') + for _ in range(num_trials): + net(inputs, token_types, valid_length) + mx.nd.waitall() toc = time.time() log.info('Inference time cost={:.2f} s, Thoughput={:.2f} samples/s' - .format(toc - tic, len(data_source) / (toc - tic))) + .format(toc - tic, num_trials / (toc - tic))) + ############################################################################### # Export the model # ############################################################################### if __name__ == '__main__': - evaluate(sample_dataset) - net.export(os.path.join(args.output_dir, 'static_bert_base_net'), epoch=0) + prefix = os.path.join(args.output_dir, args.task) + export(batch, prefix) + infer(batch, prefix) diff --git a/scripts/bert/export/static_bert.py b/scripts/bert/export/hybrid_bert.py similarity index 90% rename from scripts/bert/export/static_bert.py rename to scripts/bert/export/hybrid_bert.py index cc11670adc..c971eda4de 100644 --- a/scripts/bert/export/static_bert.py +++ b/scripts/bert/export/hybrid_bert.py @@ -16,11 +16,12 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
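For reference, a typical invocation of the new export.py might look like the following (the parameter file path is only an example; --task, --model_parameters, --seq_length and --output_dir are the options defined in the script above):

    $ cd scripts/bert/export
    $ python export.py --task question_answering --model_parameters ./output_dir/net.params --seq_length 384 --output_dir ./output_dir

Since prefix = os.path.join(args.output_dir, args.task), this writes ./output_dir/question_answering-symbol.json and ./output_dir/question_answering-0000.params. If --model_parameters is omitted, the exported checkpoint is created from the default parameter initialization, as the warning in the script notes.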
-"""Static BERT models.""" +"""Hybrid BERT models.""" -__all__ = ['StaticBERTModel', 'StaticBERTEncoder', - 'get_model', 'bert_12_768_12', 'bert_24_1024_16', 'get_static_bert_model', - 'StaticBertForQA', 'StaticBERTClassifier'] +__all__ = ['HybridBERTModel', 'HybridBERTEncoder', + 'get_hybrid_model', 'hybrid_bert_12_768_12', 'hybrid_bert_24_1024_16', + 'get_hybrid_bert_model', + 'HybridBERTForQA', 'HybridBERTClassifier', 'HybridBERTRegression'] from mxnet.gluon import HybridBlock, loss, nn from mxnet.gluon.loss import Loss @@ -33,21 +34,19 @@ from mxnet.gluon import nn import mxnet as mx from gluonnlp.model.block import GELU -from gluonnlp.model.bert import BERTLayerNorm, BERTEncoderCell, _load_vocab, \ - _load_pretrained_params, bert_hparams -from gluonnlp.model.transformer import TransformerEncoderCell, _get_layer_norm, \ - _position_encoding_init +from gluonnlp.model.bert import BERTLayerNorm, BERTEncoderCell, _load_vocab +from gluonnlp.model.bert import _load_pretrained_params, bert_hparams +from gluonnlp.model.transformer import TransformerEncoderCell, _get_layer_norm +from gluonnlp.model.transformer import _position_encoding_init from gluonnlp.vocab import BERTVocab from gluonnlp.base import get_home_dir - ############################################################################### # COMPONENTS # ############################################################################### - -class StaticBaseTransformerEncoder(HybridBlock): - """Base Structure of the Static Transformer Encoder. +class HybridBaseTransformerEncoder(HybridBlock): + """Base Structure of the Hybrid Transformer Encoder. Parameters ---------- @@ -109,7 +108,7 @@ def __init__(self, attention_cell='multi_head', num_layers=2, positional_weight='sinusoidal', use_bert_encoder=False, use_layer_norm_before_dropout=False, scale_embed=True, input_size=None, seq_length=None, prefix=None, params=None): - super(StaticBaseTransformerEncoder, self).__init__(prefix=prefix, params=params) + super(HybridBaseTransformerEncoder, self).__init__(prefix=prefix, params=params) assert units % num_heads == 0, \ 'In TransformerEncoder, The units should be divided exactly ' \ 'by the number of heads. Received units={}, num_heads={}' \ @@ -253,11 +252,11 @@ def hybrid_forward(self, F, inputs, states=None, return outputs, additional_outputs -class StaticBERTEncoder(StaticBaseTransformerEncoder): - """Structure of the Static BERT Encoder. +class HybridBERTEncoder(HybridBaseTransformerEncoder): + """Structure of the Hybrid BERT Encoder. Different from the original encoder for transformer, - `StaticBERTEncoder` uses learnable positional embedding, `BERTPositionwiseFFN` + `HybridBERTEncoder` uses learnable positional embedding, `BERTPositionwiseFFN` and `BERTLayerNorm`. 
Parameters @@ -318,7 +317,7 @@ def __init__(self, attention_cell='multi_head', num_layers=2, use_residual=True, output_attention=False, output_all_encodings=False, weight_initializer=None, bias_initializer='zeros', input_size=None, seq_length=None, prefix=None, params=None): - super(StaticBERTEncoder, self).__init__(attention_cell=attention_cell, + super(HybridBERTEncoder, self).__init__(attention_cell=attention_cell, num_layers=num_layers, units=units, hidden_size=hidden_size, max_length=max_length, num_heads=num_heads, scaled=scaled, dropout=dropout, @@ -341,12 +340,12 @@ def __init__(self, attention_cell='multi_head', num_layers=2, # FULL MODEL # ############################################################################### -class StaticBERTModel(HybridBlock): - """Static Model for BERT (Bidirectional Encoder Representations from Transformers). +class HybridBERTModel(HybridBlock): + """Hybrid Model for BERT (Bidirectional Encoder Representations from Transformers). Parameters ---------- - encoder : StaticBERTEncoder + encoder : HybridBERTEncoder Bidirectional encoder that encodes the input sentence. vocab_size : int or None, default None The size of the vocabulary. @@ -396,7 +395,7 @@ class StaticBERTModel(HybridBlock): layer of the Encoder, or a list of all sequence encodings of all layers. In both cases shape of the tensor(s) is/are (batch_size, seq_length, units). - **attention_outputs**: output list of all intermediate encodings per layer - Returned only if StaticBERTEncoder.output_attention is True. + Returned only if HybridBERTEncoder.output_attention is True. List of num_layers length of tensors of shape (num_masks, num_attention_heads, seq_length, seq_length) - **pooled_output**: output tensor of pooled representation of the first tokens. @@ -412,7 +411,7 @@ def __init__(self, encoder, vocab_size=None, token_type_vocab_size=None, units=N embed_size=None, embed_dropout=0.0, embed_initializer=None, word_embed=None, token_type_embed=None, use_pooler=True, use_decoder=True, use_classifier=True, prefix=None, params=None): - super(StaticBERTModel, self).__init__(prefix=prefix, params=params) + super(HybridBERTModel, self).__init__(prefix=prefix, params=params) self._use_decoder = use_decoder self._use_classifier = use_classifier self._use_pooler = use_pooler @@ -487,7 +486,7 @@ def hybrid_forward(self, F, inputs, token_types, valid_length=None, masked_posit # pylint: disable=unused-argument """Generate the representation given the inputs. - This is used in training or fine-tuning a static (hybridized) BERT model. + This is used in training or fine-tuning a hybrid (hybridized) BERT model. """ outputs = [] seq_out, attention_out = self._encode_sequence(F, inputs, token_types, valid_length) @@ -519,7 +518,7 @@ def _encode_sequence(self, F, inputs, token_types, valid_length=None): # pylint: disable=unused-argument """Generate the representation given the input sequences. - This is used for pre-training or fine-tuning a static (hybridized) BERT model. + This is used for pre-training or fine-tuning a hybrid (hybridized) BERT model. """ # embedding word_embedding = self.word_embed(inputs) @@ -534,7 +533,7 @@ def _apply_pooling(self, sequence): This is used for pre-training or fine-tuning a BERT model. 
""" - outputs = sequence[:, 0, :] + outputs = sequence.slice(begin=(None, 0, None), end=(None, 1, None)) return self.pooler(outputs) def _decode(self, sequence, masked_positions): @@ -573,8 +572,8 @@ def _decode(self, sequence, masked_positions): # GET MODEL # ############################################################################### -def get_model(name, dataset_name='wikitext-2', **kwargs): - """Returns a pre-defined model by name. +def get_hybrid_model(name, dataset_name='wikitext-2', **kwargs): + """Returns a pre-defined hybrid model by name. Parameters ---------- @@ -596,7 +595,7 @@ def get_model(name, dataset_name='wikitext-2', **kwargs): Returns ------- - gluon.Block, gluonnlp.Vocab, (optional) gluonnlp.Vocab + gluon.HybridBlock, BERTVocab """ models = {'bert_12_768_12': bert_12_768_12, 'bert_24_1024_16': bert_24_1024_16} @@ -613,7 +612,7 @@ def bert_12_768_12(dataset_name=None, vocab=None, pretrained=True, ctx=mx.cpu(), root=os.path.join(get_home_dir(), 'models'), use_pooler=True, use_decoder=True, use_classifier=True, input_size=None, seq_length=None, **kwargs): - """Static BERT BASE model. + """Hybrid BERT BASE model. The number of layers (L) is 12, number of units (H) is 768, and the number of self-attention heads (A) is 12. @@ -647,9 +646,9 @@ def bert_12_768_12(dataset_name=None, vocab=None, pretrained=True, ctx=mx.cpu(), Returns ------- - StaticBERTModel, gluonnlp.vocab.BERTVocab + HybridBERTModel, gluonnlp.vocab.BERTVocab """ - return get_static_bert_model(model_name='bert_12_768_12', vocab=vocab, + return get_hybrid_bert_model(model_name='bert_12_768_12', vocab=vocab, dataset_name=dataset_name, pretrained=pretrained, ctx=ctx, use_pooler=use_pooler, use_decoder=use_decoder, use_classifier=use_classifier, root=root, input_size=input_size, @@ -660,7 +659,7 @@ def bert_24_1024_16(dataset_name=None, vocab=None, pretrained=True, ctx=mx.cpu() use_pooler=True, use_decoder=True, use_classifier=True, root=os.path.join(get_home_dir(), 'models'), input_size=None, seq_length=None, **kwargs): - """Static BERT LARGE model. + """Hybrid BERT LARGE model. The number of layers (L) is 24, number of units (H) is 1024, and the number of self-attention heads (A) is 16. @@ -693,22 +692,22 @@ def bert_24_1024_16(dataset_name=None, vocab=None, pretrained=True, ctx=mx.cpu() Returns ------- - StaticBERTModel, gluonnlp.vocab.BERTVocab + HybridBERTModel, gluonnlp.vocab.BERTVocab """ - return get_static_bert_model(model_name='bert_24_1024_16', vocab=vocab, + return get_hybrid_bert_model(model_name='bert_24_1024_16', vocab=vocab, dataset_name=dataset_name, pretrained=pretrained, ctx=ctx, use_pooler=use_pooler, use_decoder=use_decoder, use_classifier=use_classifier, root=root, input_size=input_size, seq_length=seq_length, **kwargs) -def get_static_bert_model(model_name=None, dataset_name=None, vocab=None, +def get_hybrid_bert_model(model_name=None, dataset_name=None, vocab=None, pretrained=True, ctx=mx.cpu(), use_pooler=True, use_decoder=True, use_classifier=True, output_attention=False, output_all_encodings=False, root=os.path.join(get_home_dir(), 'models'), input_size=None, seq_length=None, **kwargs): - """Any Static BERT pretrained model. + """Any Hybrid BERT pretrained model. 
Parameters ---------- @@ -747,7 +746,7 @@ def get_static_bert_model(model_name=None, dataset_name=None, vocab=None, Returns ------- - StaticBERTModel, gluonnlp.vocab.BERTVocab + HybridBERTModel, gluonnlp.vocab.BERTVocab """ predefined_args = bert_hparams[model_name] mutable_args = ['use_residual', 'dropout', 'embed_dropout', 'word_embed'] @@ -756,7 +755,7 @@ def get_static_bert_model(model_name=None, dataset_name=None, vocab=None, 'Cannot override predefined model settings.' predefined_args.update(kwargs) # encoder - encoder = StaticBERTEncoder(attention_cell=predefined_args['attention_cell'], + encoder = HybridBERTEncoder(attention_cell=predefined_args['attention_cell'], num_layers=predefined_args['num_layers'], units=predefined_args['units'], hidden_size=predefined_args['hidden_size'], @@ -774,7 +773,7 @@ def get_static_bert_model(model_name=None, dataset_name=None, vocab=None, ' Please use wiki_cn_cased/wiki_multilingual_uncased instead.') bert_vocab = _load_vocab(dataset_name, vocab, root, cls=BERTVocab) # BERT - net = StaticBERTModel(encoder, len(bert_vocab), + net = HybridBERTModel(encoder, len(bert_vocab), token_type_vocab_size=predefined_args['token_type_vocab_size'], units=predefined_args['units'], embed_size=predefined_args['embed_size'], @@ -788,8 +787,7 @@ def get_static_bert_model(model_name=None, dataset_name=None, vocab=None, ignore_extra=ignore_extra) return net, bert_vocab -#create a hybridizable task guided model using BERT -class StaticBertForQA(HybridBlock): +class HybridBERTForQA(HybridBlock): """Hybridizable Model for SQuAD task with BERT. The model feeds token ids and token type ids into BERT to get the @@ -806,14 +804,13 @@ class StaticBertForQA(HybridBlock): """ def __init__(self, bert, prefix=None, params=None): - super(StaticBertForQA, self).__init__(prefix=prefix, params=params) + super(HybridBERTForQA, self).__init__(prefix=prefix, params=params) self.bert = bert with self.name_scope(): self.span_classifier = nn.Dense(units=2, flatten=False) def hybrid_forward(self, F, inputs, token_types, valid_length=None): # pylint: disable=arguments-differ - # pylint: disable=unused-argument """Generate the unnormalized score for the given the input sequences. Parameters @@ -835,7 +832,7 @@ def hybrid_forward(self, F, inputs, token_types, valid_length=None): output = self.span_classifier(bert_output) return output -class StaticBERTClassifier(HybridBlock): +class HybridBERTClassifier(HybridBlock): """Model for sentence (pair) classification task with BERT. The model feeds token ids and token type ids into BERT to get the @@ -862,7 +859,7 @@ def __init__(self, dropout=0.0, prefix=None, params=None): - super(BERTClassifier, self).__init__(prefix=prefix, params=params) + super(HybridBERTClassifier, self).__init__(prefix=prefix, params=params) self.bert = bert with self.name_scope(): self.classifier = nn.HybridSequential(prefix=prefix) @@ -890,3 +887,53 @@ def hybrid_forward(self, F, inputs, token_types, valid_length=None): # pylint: """ _, pooler_out = self.bert(inputs, token_types, valid_length) return self.classifier(pooler_out) + +class HybridBERTRegression(HybridBlock): + """Model for sentence (pair) regression task with BERT. + + The model feeds token ids and token type ids into BERT to get the + pooled BERT sequence representation, then apply a Dense layer for + regression. + + Parameters + ---------- + bert: BERTModel + Bidirectional encoder with transformer. + dropout : float or None, default 0.0. + Dropout probability for the bert output. 
+    prefix : str or None
+        See document of `mx.gluon.Block`.
+    params : ParameterDict or None
+        See document of `mx.gluon.Block`.
+    """
+
+    def __init__(self, bert, dropout=0.0, prefix=None, params=None):
+        super(HybridBERTRegression, self).__init__(prefix=prefix, params=params)
+        self.bert = bert
+        with self.name_scope():
+            self.regression = nn.HybridSequential(prefix=prefix)
+            if dropout:
+                self.regression.add(nn.Dropout(rate=dropout))
+            self.regression.add(nn.Dense(1))
+
+    def hybrid_forward(self, _, inputs, token_types, valid_length=None):  # pylint: disable=arguments-differ
+        """Generate the unnormalized score for the given input sequences.
+
+        Parameters
+        ----------
+        inputs : NDArray, shape (batch_size, seq_length)
+            Input words for the sequences.
+        token_types : NDArray, shape (batch_size, seq_length)
+            Token types for the sequences, used to indicate whether the word belongs to the
+            first sentence or the second one.
+        valid_length : NDArray or None, shape (batch_size)
+            Valid length of the sequence. This is used to mask the padded tokens.
+
+        Returns
+        -------
+        outputs : NDArray
+            Shape (batch_size, 1)
+        """
+        _, pooler_out = self.bert(inputs, token_types, valid_length)
+        return self.regression(pooler_out)
+

From 705b970405d06c9a4c424d0836ffc6b5693c4b9d Mon Sep 17 00:00:00 2001
From: EC2 Default User
Date: Tue, 14 May 2019 22:35:57 +0000
Subject: [PATCH 05/11] add readme and tasks

---
 scripts/bert/export/export.py              |   9 +-
 scripts/bert/export/hybrid_bert.py         |   2 +-
 scripts/bert/export/static_export_squad.py | 239 ---------------------
 scripts/bert/index.rst                     |  54 +----
 scripts/tests/test_scripts.py              |   7 +
 5 files changed, 18 insertions(+), 293 deletions(-)
 delete mode 100644 scripts/bert/export/static_export_squad.py

diff --git a/scripts/bert/export/export.py b/scripts/bert/export/export.py
index 79f77e27a7..8903924ab9 100644
--- a/scripts/bert/export/export.py
+++ b/scripts/bert/export/export.py
@@ -3,7 +3,8 @@
 
 ====================================
 
-This script exports the BERT model to a hybrid model suitable for use with MXNet Module API.
+This script exports the BERT model to a hybrid model serialized as a symbol.json file,
+which is suitable for deployment, or use with MXNet Module API.
 
 @article{devlin2018bert,
   title={BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding},
@@ -191,12 +192,14 @@ def infer(batch, prefix):
     """Evaluate the model on a mini-batch."""
     log.info('Start inference ... ')
     tic = time.time()
+    # import with SymbolBlock. Alternatively, you can use Module.load APIs.
- inputs, token_types, valid_length = batch - num_trials = 10 imported_net = mx.gluon.nn.SymbolBlock.imports(prefix + '-symbol.json', ['data0','data1','data2'], prefix + '-0000.params') + # run forward inference + inputs, token_types, valid_length = batch + num_trials = 10 for _ in range(num_trials): net(inputs, token_types, valid_length) mx.nd.waitall() diff --git a/scripts/bert/export/hybrid_bert.py b/scripts/bert/export/hybrid_bert.py index c971eda4de..d055946f4c 100644 --- a/scripts/bert/export/hybrid_bert.py +++ b/scripts/bert/export/hybrid_bert.py @@ -19,7 +19,7 @@ """Hybrid BERT models.""" __all__ = ['HybridBERTModel', 'HybridBERTEncoder', - 'get_hybrid_model', 'hybrid_bert_12_768_12', 'hybrid_bert_24_1024_16', + 'get_hybrid_model', 'bert_12_768_12', 'bert_24_1024_16', 'get_hybrid_bert_model', 'HybridBERTForQA', 'HybridBERTClassifier', 'HybridBERTRegression'] diff --git a/scripts/bert/export/static_export_squad.py b/scripts/bert/export/static_export_squad.py deleted file mode 100644 index da87914179..0000000000 --- a/scripts/bert/export/static_export_squad.py +++ /dev/null @@ -1,239 +0,0 @@ -""" -Export SQuAD with Static Bidirectional Encoder Representations from Transformers (BERT) - -========================================================================================= - -This example shows how to export a Block based BERT model with pre-trained BERT parameters -with static shape, we are using SQuAD as an example. - -@article{devlin2018bert, - title={BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding}, - author={Devlin, Jacob and Chang, Ming- \ - Wei and Lee, Kenton and Toutanova, Kristina}, - journal={arXiv preprint arXiv:1810.04805}, - year={2018} -} -""" - -# coding=utf-8 - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# pylint:disable=redefined-outer-name,logging-format-interpolation - -import argparse -import logging -import os -import time - -import mxnet as mx - -from static_bert_qa_model import StaticBertForQA -from static_bert import get_model - -log = logging.getLogger('gluonnlp') -log.setLevel(logging.DEBUG) -formatter = logging.Formatter( - fmt='%(levelname)s:%(name)s:%(asctime)s %(message)s', datefmt='%H:%M:%S') - -parser = argparse.ArgumentParser(description='export static BERT QA example.') - -parser.add_argument('--model_parameters', - type=str, - default=None, - help='Model parameter file') - -parser.add_argument('--bert_model', - type=str, - default='bert_12_768_12', - help='BERT model name. options are bert_12_768_12 and bert_24_1024_16.') - -parser.add_argument('--bert_dataset', - type=str, - default='book_corpus_wiki_en_uncased', - help='BERT dataset name.' 
- 'options are book_corpus_wiki_en_uncased and book_corpus_wiki_en_cased.') - -parser.add_argument('--pretrained_bert_parameters', - type=str, - default=None, - help='Pre-trained bert model parameter file. default is None') - -parser.add_argument('--uncased', - action='store_false', - help='if not set, inputs are converted to lower case.') - -parser.add_argument('--output_dir', - type=str, - default='./output_dir', - help='The output directory where the model params will be written.' - ' default is ./output_dir') - -parser.add_argument('--test_batch_size', - type=int, - default=24, - help='Test batch size. default is 24') - -parser.add_argument('--max_seq_length', - type=int, - default=384, - help='The maximum total input sequence length after WordPiece tokenization.' - 'Sequences longer than this will be truncated, and sequences shorter ' - 'than this will be padded. default is 384') - -parser.add_argument('--doc_stride', - type=int, - default=128, - help='When splitting up a long document into chunks, how much stride to ' - 'take between chunks. default is 128') - -parser.add_argument('--max_query_length', - type=int, - default=64, - help='The maximum number of tokens for the question. Questions longer than ' - 'this will be truncated to this length. default is 64') - -parser.add_argument('--gpu', type=str, help='single gpu id') - -parser.add_argument('--seq_length', - type=int, - default=384, - help='The sequence length of the input') - -parser.add_argument('--input_size', - type=int, - default=768, - help='The embedding size of the input') - -parser.add_argument('--export', - action='store_true', - help='Whether to export the model.') - -parser.add_argument('--evaluate', - action='store_true', - help='Whether to evaluate the model.') - -args = parser.parse_args() - - -output_dir = args.output_dir -if not os.path.exists(output_dir): - os.mkdir(output_dir) - -fh = logging.FileHandler(os.path.join( - args.output_dir, 'static_export_squad.log'), mode='w') -fh.setLevel(logging.INFO) -fh.setFormatter(formatter) -console = logging.StreamHandler() -console.setLevel(logging.INFO) -console.setFormatter(formatter) -log.addHandler(console) -log.addHandler(fh) - -log.info(args) - -model_name = args.bert_model -dataset_name = args.bert_dataset -model_parameters = args.model_parameters -pretrained_bert_parameters = args.pretrained_bert_parameters -lower = args.uncased - -seq_length = args.seq_length -input_size = args.input_size -test_batch_size = args.test_batch_size -ctx = mx.cpu() if not args.gpu else mx.gpu(int(args.gpu)) - -max_seq_length = args.max_seq_length -doc_stride = args.doc_stride -max_query_length = args.max_query_length - -if max_seq_length <= max_query_length + 3: - raise ValueError('The max_seq_length (%d) must be greater than max_query_length ' - '(%d) + 3' % (max_seq_length, max_query_length)) - - -############################################################################### -# Prepare dummy input data # -############################################################################### -if args.evaluate: - inputs = mx.nd.arange(test_batch_size * seq_length).reshape(shape=(test_batch_size, seq_length)) - token_types = mx.nd.zeros_like(inputs) - valid_length = mx.nd.arange(seq_length)[:test_batch_size] - batch = inputs, token_types, valid_length - num_batch = 10 - sample_dataset = [] - for _ in range(num_batch): - sample_dataset.append(batch) - - -bert, vocab = get_model( - name=model_name, - dataset_name=dataset_name, - pretrained=not model_parameters and not 
pretrained_bert_parameters, - ctx=ctx, - use_pooler=False, - use_decoder=False, - use_classifier=False, - input_size=args.input_size, - seq_length=args.seq_length) - - -############################################################################### -# Hybridize the model # -############################################################################### -net = StaticBertForQA(bert=bert) -if pretrained_bert_parameters and not model_parameters: - bert.load_parameters(pretrained_bert_parameters, ctx=ctx, - ignore_extra=True) -if not model_parameters: - net.span_classifier.initialize(init=mx.init.Normal(0.02), ctx=ctx) -else: - net.load_parameters(model_parameters, ctx=ctx) - -net.hybridize(static_alloc=True, static_shape=True) - - -def evaluate(data_source): - """Evaluate the model on a mini-batch. - """ - log.info('Start predict') - tic = time.time() - for batch in data_source: - inputs, token_types, valid_length = batch - net(inputs.astype('float32').as_in_context(ctx), - token_types.astype('float32').as_in_context(ctx), - valid_length.astype('float32').as_in_context(ctx)) - mx.nd.waitall() - toc = time.time() - log.info('Inference time cost={:.2f} s, Thoughput={:.2f} samples/s' - .format(toc - tic, - len(data_source) / (toc - tic))) - - - -############################################################################### -# Export the model # -############################################################################### -if __name__ == '__main__': - if args.export: - net.export(os.path.join(args.output_dir, 'static_net'), epoch=0) - if args.evaluate: - net.load_parameters(os.path.join(args.output_dir, 'static_net-0000.params')) - evaluate(sample_dataset) - else: - if args.evaluate: - evaluate(sample_dataset) diff --git a/scripts/bert/index.rst b/scripts/bert/index.rst index 6a5def3eeb..95d85ddcb5 100644 --- a/scripts/bert/index.rst +++ b/scripts/bert/index.rst @@ -322,57 +322,11 @@ Command line interface Export BERT for Deployment ~~~~~~~~~~~~~~~~~~~~~~~~~~ -The BERTModel class is a subclass of Block, rather than HybridBlock. -To support exporting BERT model to json format for deployment, we introduce the StaticBERT class. -Specifically, by exporting hybridizable BERT, we mean the BERT with fixed input embedding size and sequence length can be exported through -a static shape based implementation of hybridblock based BERT. By using this, we can export a block based BERT model. - -Please follow the steps below for exporting the model. - - -Step 1: create a hybridizable task guided model using BERT: - -.. code-block:: python - - class StaticBertForQA(HybridBlock) - -An example can be found in 'staticbert/static_bert_for_qa_model.py'. - - -Step 2: hybridize the model in the script: - -.. code-block:: python - - net = StaticBertForQA(bert=bert) - net.hybridize(static_alloc=True, static_shape=True) - -An example can be found in 'staticbert/static_export_squad.py'. - - -Step 3: export trained model: - -.. code-block:: python - - net.export(os.path.join(args.output_dir, 'static_net'), epoch=args.epochs) - -To export the model, in 'staticbert/static_export_squad.py', set export=True. - -To run the example, if you would like to export the Block parameters -and test the HybridBlock on your datasets with the specified input size and sequence length, +Current export/export.py support exporting BERT models. Supported values for --task argument include classification, regression and question_answering. .. 
code-block:: console

-   $ cd staticbert
-   $ python static_export_squad.py --model_parameters output_dir/net.params --export --evaluate --seq_length 384 --input_size 768 --gpu 0
-
-This will load the the StaticBERTQA HybridBlock with parameter (requirement: output_dir/net.params should exist)
-trained by a normal BERTQA Block, and export the HybridBlock to json format.
-
-Besides, Where seq_length stands for the sequence length of the input, input_size represents the embedding size of the input.
-
-To load and export the BERT base pretrained model that that is suitable for fine tuning, use the following:
-
-.. code-block:: console
+    $ python export/export.py --task classification --model_parameters /path/to/saved/ckpt.params --output_dir /path/to/output/dir/ --seq_length 256
 
-    $ cd staticbert
-    $ python static_export_base.py --model_parameters --seq_length 128
+This will export the BERT model for classification to a symbol.json file, saved to the directory specified by --output_dir.
+The --model_parameters argument is optional. If not set, the .params file saved in the output directory will contain randomly initialized parameters.
diff --git a/scripts/tests/test_scripts.py b/scripts/tests/test_scripts.py
index 45f7034891..40935f3ee8 100644
--- a/scripts/tests/test_scripts.py
+++ b/scripts/tests/test_scripts.py
@@ -343,3 +343,10 @@ def test_finetune_train(dataset):
     process = subprocess.check_call([sys.executable, './scripts/bert/finetune_classifier.py',
                                      '--task_name', dataset,
                                      '--optimizer', 'adam'] + arguments)
+
+@pytest.mark.serial
+@pytest.mark.integration
+@pytest.mark.parametrize('task', ['classification', 'regression', 'question_answering'])
+def test_export(task):
+    process = subprocess.check_call([sys.executable, './scripts/bert/export/export.py',
+                                     '--task', task])

From 1d9c780e8856e3de05eea57f8ce00ae12d2df573 Mon Sep 17 00:00:00 2001
From: EC2 Default User
Date: Tue, 14 May 2019 22:46:04 +0000
Subject: [PATCH 06/11] fix lint

---
 scripts/bert/export/export.py      | 4 ++--
 scripts/bert/export/hybrid_bert.py | 7 ++-----
 2 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/scripts/bert/export/export.py b/scripts/bert/export/export.py
index 8903924ab9..f1ae1a01a7 100644
--- a/scripts/bert/export/export.py
+++ b/scripts/bert/export/export.py
@@ -195,13 +195,13 @@ def infer(batch, prefix):
     # import with SymbolBlock. Alternatively, you can use Module.load APIs.
imported_net = mx.gluon.nn.SymbolBlock.imports(prefix + '-symbol.json', - ['data0','data1','data2'], + ['data0', 'data1', 'data2'], prefix + '-0000.params') # run forward inference inputs, token_types, valid_length = batch num_trials = 10 for _ in range(num_trials): - net(inputs, token_types, valid_length) + imported_net(inputs, token_types, valid_length) mx.nd.waitall() toc = time.time() log.info('Inference time cost={:.2f} s, Thoughput={:.2f} samples/s' diff --git a/scripts/bert/export/hybrid_bert.py b/scripts/bert/export/hybrid_bert.py index d055946f4c..63313380d8 100644 --- a/scripts/bert/export/hybrid_bert.py +++ b/scripts/bert/export/hybrid_bert.py @@ -23,16 +23,14 @@ 'get_hybrid_bert_model', 'HybridBERTForQA', 'HybridBERTClassifier', 'HybridBERTRegression'] -from mxnet.gluon import HybridBlock, loss, nn -from mxnet.gluon.loss import Loss - import os import math import warnings +import mxnet as mx from mxnet.gluon import Block, HybridBlock from mxnet.gluon import nn -import mxnet as mx + from gluonnlp.model.block import GELU from gluonnlp.model.bert import BERTLayerNorm, BERTEncoderCell, _load_vocab from gluonnlp.model.bert import _load_pretrained_params, bert_hparams @@ -936,4 +934,3 @@ def hybrid_forward(self, _, inputs, token_types, valid_length=None): # pylint: """ _, pooler_out = self.bert(inputs, token_types, valid_length) return self.regression(pooler_out) - From 3b593fd99edc2fabdb029bea65dd80a377c85eaf Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Wed, 15 May 2019 01:30:12 +0000 Subject: [PATCH 07/11] fix unit test --- scripts/bert/export/hybrid_bert.py | 2 +- scripts/tests/{test_static_bert.py => test_hybrid_bert.py} | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) rename scripts/tests/{test_static_bert.py => test_hybrid_bert.py} (97%) diff --git a/scripts/bert/export/hybrid_bert.py b/scripts/bert/export/hybrid_bert.py index 63313380d8..a875cb2c18 100644 --- a/scripts/bert/export/hybrid_bert.py +++ b/scripts/bert/export/hybrid_bert.py @@ -531,7 +531,7 @@ def _apply_pooling(self, sequence): This is used for pre-training or fine-tuning a BERT model. """ - outputs = sequence.slice(begin=(None, 0, None), end=(None, 1, None)) + outputs = sequence.slice(begin=(None, 0, None), end=(None, 1, None)).squeeze(axis=1) return self.pooler(outputs) def _decode(self, sequence, masked_positions): diff --git a/scripts/tests/test_static_bert.py b/scripts/tests/test_hybrid_bert.py similarity index 97% rename from scripts/tests/test_static_bert.py rename to scripts/tests/test_hybrid_bert.py index 868cfe5531..430774da1a 100644 --- a/scripts/tests/test_static_bert.py +++ b/scripts/tests/test_hybrid_bert.py @@ -17,7 +17,7 @@ # specific language governing permissions and limitations # under the License. 
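The _apply_pooling change above adds a squeeze on top of the earlier slice so that the pooled first-token representation keeps shape (batch_size, units); slice and squeeze are used instead of the original `sequence[:, 0, :]` because a hybridized block must stick to operators that exist for both NDArray and Symbol. A small NDArray check of the equivalence (the shapes are assumptions for illustration only):

.. code-block:: python

    import mxnet as mx

    seq = mx.nd.random.uniform(shape=(2, 5, 8))   # (batch_size, seq_length, units), example shapes
    a = seq[:, 0, :]                              # basic indexing, NDArray only
    b = seq.slice(begin=(None, 0, None), end=(None, 1, None)).squeeze(axis=1)
    assert (a == b).asnumpy().all()               # both give the (2, 8) first-token slice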
-"""Test static bert models.""" +"""Test hybrid bert models.""" from __future__ import print_function @@ -30,12 +30,12 @@ import pytest -from ..bert.staticbert.static_bert import get_model +from ..bert.export.hybrid_bert import get_model @pytest.mark.serial @pytest.mark.remote_required -def test_static_bert_models(): +def test_hybrid_bert_models(): models = ['bert_12_768_12', 'bert_24_1024_16'] layers = [12, 24] attention_heads = [12, 16] From 25c4d0ed1d1ae4103df6614db3947c129e233665 Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Wed, 15 May 2019 03:19:19 +0000 Subject: [PATCH 08/11] fix test --- scripts/tests/test_hybrid_bert.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/tests/test_hybrid_bert.py b/scripts/tests/test_hybrid_bert.py index 430774da1a..2d44d6ec48 100644 --- a/scripts/tests/test_hybrid_bert.py +++ b/scripts/tests/test_hybrid_bert.py @@ -30,7 +30,7 @@ import pytest -from ..bert.export.hybrid_bert import get_model +from ..bert.export.hybrid_bert import get_hybrid_model @pytest.mark.serial @@ -109,10 +109,10 @@ def collect_shapes(item, shapes): for kwarg, expected_shape in zip(kwargs, expected_shapes): expected_shape = infer_shape(expected_shape, unit) - model, _ = get_model(model_name, dataset_name=dataset, - pretrained=False, root='tests/data/model/', - seq_length=seq_len, input_size=unit, - **kwarg) + model, _ = get_hybrid_model(model_name, dataset_name=dataset, + pretrained=False, root='tests/data/model/', + seq_length=seq_len, input_size=unit, + **kwarg) model.initialize() if kwarg['use_decoder']: # position tensor is required for decoding From 18ed5b3e60c82bfa492071d8ae4369c91a164b62 Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Wed, 15 May 2019 03:59:36 +0000 Subject: [PATCH 09/11] remove test_bert_static_base_export --- scripts/tests/test_scripts.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/scripts/tests/test_scripts.py b/scripts/tests/test_scripts.py index 40935f3ee8..e3de469aa0 100644 --- a/scripts/tests/test_scripts.py +++ b/scripts/tests/test_scripts.py @@ -176,16 +176,6 @@ def test_bert_embedding(use_pretrained): time.sleep(5) -@pytest.mark.serial -@pytest.mark.remote_required -@pytest.mark.gpu -@pytest.mark.integration -def test_bert_static_base_export(): - args = ['--gpu', '0', '--seq_length', '128'] - process = subprocess.check_call([sys.executable, './scripts/bert/staticbert/static_export_base.py'] + args) - time.sleep(5) - - @pytest.mark.serial @pytest.mark.gpu @pytest.mark.remote_required From 1ac214029d7b99e9bdbb5755ccb421cd74667e6e Mon Sep 17 00:00:00 2001 From: Haibin Lin Date: Wed, 15 May 2019 15:28:13 -0700 Subject: [PATCH 10/11] address comments --- scripts/bert/export/export.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/bert/export/export.py b/scripts/bert/export/export.py index f1ae1a01a7..0e7c20ab8e 100644 --- a/scripts/bert/export/export.py +++ b/scripts/bert/export/export.py @@ -191,12 +191,12 @@ def export(batch, prefix): def infer(batch, prefix): """Evaluate the model on a mini-batch.""" log.info('Start inference ... ') - tic = time.time() # import with SymbolBlock. Alternatively, you can use Module.load APIs. 
     imported_net = mx.gluon.nn.SymbolBlock.imports(prefix + '-symbol.json',
                                                    ['data0', 'data1', 'data2'],
                                                    prefix + '-0000.params')
+    tic = time.time()
     # run forward inference
     inputs, token_types, valid_length = batch
     num_trials = 10

From 33cabc12970218e73bfa3895efbfbbe919a42473 Mon Sep 17 00:00:00 2001
From: Haibin Lin
Date: Fri, 17 May 2019 15:20:16 -0700
Subject: [PATCH 11/11] update default seq len

---
 scripts/bert/index.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/bert/index.rst b/scripts/bert/index.rst
index 95d85ddcb5..7c129add49 100644
--- a/scripts/bert/index.rst
+++ b/scripts/bert/index.rst
@@ -326,7 +326,7 @@ Current export/export.py support exporting BERT models. Supported values for --t
 
 .. code-block:: console
 
-    $ python export/export.py --task classification --model_parameters /path/to/saved/ckpt.params --output_dir /path/to/output/dir/ --seq_length 256
+    $ python export/export.py --task classification --model_parameters /path/to/saved/ckpt.params --output_dir /path/to/output/dir/ --seq_length 128
 
 This will export the BERT model for classification to a symbol.json file, saved to the directory specified by --output_dir.
 The --model_parameters argument is optional. If not set, the .params file saved in the output directory will contain randomly initialized parameters.
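Once the model has been exported as described above, the symbol and parameter files can be reloaded for inference without any of the Python model definitions in this package. A minimal sketch, assuming the <output_dir>/<task> prefix written by export.py and the 128-token sequence length shown above (the path and the all-zero batch are placeholders):

.. code-block:: python

    import mxnet as mx

    prefix = '/path/to/output/dir/classification'   # placeholder: <output_dir>/<task>
    net = mx.gluon.nn.SymbolBlock.imports(prefix + '-symbol.json',
                                          ['data0', 'data1', 'data2'],
                                          prefix + '-0000.params')

    inputs = mx.nd.zeros((1, 128))        # token ids, shape (batch_size, seq_length)
    token_types = mx.nd.zeros((1, 128))   # segment ids
    valid_length = mx.nd.array([128])     # number of non-padded tokens per sample
    print(net(inputs, token_types, valid_length))   # classification scores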