diff --git a/README.md b/README.md
index b1c73cfa12..675147a2c5 100644
--- a/README.md
+++ b/README.md
@@ -34,16 +34,16 @@ First of all, install the latest MXNet. You may use the following commands:
 
 ```bash
 # Install the version with CUDA 10.0
-python3 -m pip install -U --pre "mxnet-cu100>=2.0.0b20200802" -f https://dist.mxnet.io/python
+python3 -m pip install -U --pre "mxnet-cu100>=2.0.0b20200926" -f https://dist.mxnet.io/python
 
 # Install the version with CUDA 10.1
-python3 -m pip install -U --pre "mxnet-cu101>=2.0.0b20200802" -f https://dist.mxnet.io/python
+python3 -m pip install -U --pre "mxnet-cu101>=2.0.0b20200926" -f https://dist.mxnet.io/python
 
 # Install the version with CUDA 10.2
-python3 -m pip install -U --pre "mxnet-cu102>=2.0.0b20200802" -f https://dist.mxnet.io/python
+python3 -m pip install -U --pre "mxnet-cu102>=2.0.0b20200926" -f https://dist.mxnet.io/python
 
 # Install the cpu-only version
-python3 -m pip install -U --pre "mxnet>=2.0.0b20200802" -f https://dist.mxnet.io/python
+python3 -m pip install -U --pre "mxnet>=2.0.0b20200926" -f https://dist.mxnet.io/python
 ```
 
@@ -92,8 +92,13 @@ You may go to [tests](tests) to see how to run the unittests.
 You can use Docker to launch a JupyterLab development environment with GluonNLP installed.
 
 ```
+# GPU Instance
 docker pull gluonai/gluon-nlp:gpu-latest
-docker run --gpus all --rm -it -p 8888:8888 -p 8787:8787 -p 8786:8786 --shm-size=4g gluonai/gluon-nlp:gpu-latest
+docker run --gpus all --rm -it -p 8888:8888 -p 8787:8787 -p 8786:8786 --shm-size=2g gluonai/gluon-nlp:gpu-latest
+
+# CPU Instance
+docker pull gluonai/gluon-nlp:cpu-latest
+docker run --rm -it -p 8888:8888 -p 8787:8787 -p 8786:8786 --shm-size=2g gluonai/gluon-nlp:cpu-latest
 ```
 
 For more details, you can refer to the guidance in [tools/docker](tools/docker).
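As a quick sanity check after installing one of the builds above, the one-liner below prints the installed MXNet version and the number of visible GPUs; `npx.num_gpus()` is part of the MXNet 2.x numpy extension API and reports 0 on the cpu-only build:

```bash
# Print the installed MXNet version and the visible GPU count.
python3 -c 'import mxnet as mx; from mxnet import npx; print(mx.__version__, npx.num_gpus())'
```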
diff --git a/docs/conf.py b/docs/conf.py
index 916edd0a99..9d4c32eefa 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -234,10 +234,10 @@ def setup(app):
         'auto_doc_ref': True
     }, True)
     app.add_transform(AutoStructify)
-    app.add_javascript('google_analytics.js')
-    app.add_javascript('hidebib.js')
-    app.add_javascript('install-options.js')
-    app.add_stylesheet('custom.css')
+    app.add_js_file('google_analytics.js')
+    app.add_js_file('hidebib.js')
+    app.add_js_file('install-options.js')
+    app.add_css_file('custom.css')
 
 
 sphinx_gallery_conf = {
diff --git a/scripts/benchmarks/benchmark_utils.py b/scripts/benchmarks/benchmark_utils.py
index c022caff87..7475d4b5d5 100644
--- a/scripts/benchmarks/benchmark_utils.py
+++ b/scripts/benchmarks/benchmark_utils.py
@@ -792,12 +792,9 @@ def train_step():
                 raise NotImplementedError
             timeit.repeat(train_step, repeat=1, number=3)
             mxnet.npx.waitall()
-            for ctx in mx_all_contexts:
-                ctx.empty_cache()
             runtimes = timeit.repeat(train_step, repeat=self._repeat, number=3)
             mxnet.npx.waitall()
-            for ctx in mx_all_contexts:
-                ctx.empty_cache()
+            ctx.empty_cache()
             mxnet.npx.waitall()
             # Profile memory
             if self._use_gpu:
@@ -844,8 +841,6 @@ def run(self):
                     infer_time = np.nan
                     infer_memory = np.nan
                 inference_result[model_name][workload] = (infer_time, infer_memory)
-                for ctx in mx_all_contexts:
-                    ctx.empty_cache()
                 mxnet.npx.waitall()
         self.save_to_csv(inference_result, self._inference_out_csv_file)
         if self._profile_train:
@@ -858,8 +853,6 @@ def run(self):
                     train_time = np.nan
                     train_memory = np.nan
                 train_result[model_name][workload] = (train_time, train_memory)
-                for ctx in mx_all_contexts:
-                    ctx.empty_cache()
                 mxnet.npx.waitall()
         self.save_to_csv(train_result, self._train_out_csv_file)
 
diff --git a/scripts/datasets/general_nlp_benchmark/README.md b/scripts/datasets/general_nlp_benchmark/README.md
index 84dc9f5524..0f49b258dc 100644
--- a/scripts/datasets/general_nlp_benchmark/README.md
+++ b/scripts/datasets/general_nlp_benchmark/README.md
@@ -112,13 +112,13 @@ benchmarking. We select the classical datasets that are also used in
 
 | Dataset       | #Train    | #Test   | Columns         | Metrics         |
 |---------------|-----------|---------|-----------------|-----------------|
-| AG            | 120000  | 7600    | content, label  | acc             |
-| IMDB          | 25000   | 25000   | content, label  | acc             |
-| DBpedia       | 560000  | 70000   | content, label  | acc             |
-| Yelp2         | 560000  | 38000   | content, label  | acc             |
-| Yelp5         | 650000  | 50000   | content, label  | acc             |
-| Amazon2       | 3600000 | 400000  | content, label  | acc             |
-| Amazon5       | 3000000 | 650000  | content, label  | acc             |
+| AG            | 120,000   | 7,600   | content, label  | acc             |
+| IMDB          | 25,000    | 25,000  | content, label  | acc             |
+| DBpedia       | 560,000   | 70,000  | content, label  | acc             |
+| Yelp2         | 560,000   | 38,000  | content, label  | acc             |
+| Yelp5         | 650,000   | 50,000  | content, label  | acc             |
+| Amazon2       | 3,600,000 | 400,000 | content, label  | acc             |
+| Amazon5       | 3,000,000 | 650,000 | content, label  | acc             |
 
 To obtain the datasets, run:
 
diff --git a/scripts/datasets/pretrain_corpus/README.md b/scripts/datasets/pretrain_corpus/README.md
index 54c4d5c1e2..3f56dc3eeb 100644
--- a/scripts/datasets/pretrain_corpus/README.md
+++ b/scripts/datasets/pretrain_corpus/README.md
@@ -2,9 +2,11 @@
 
 We provide a series of shared scripts for downloading/preparing the text corpus for pretraining NLP models. This helps create a unified text corpus for studying the performance of different pretraining algorithms.
-When releasing the datasets, we follow the [FAIR principle](https://www.go-fair.org/fair-principles/),
+When picking the datasets to support, we follow the [FAIR principle](https://www.go-fair.org/fair-principles/),
 i.e., the dataset needs to be findable, accessible, interoperable, and reusable.
 
+For all scripts, you can either use `nlp_data SCRIPT_NAME` or call the script directly.
+
 ## Gutenberg BookCorpus
 
 Unfortunately, we are unable to provide the [Toronto BookCorpus dataset](https://yknzhu.wixsite.com/mbweb) due to licensing issues.
 
@@ -16,14 +18,14 @@ Thus, we utilize the [Project Gutenberg](https://www.gutenberg.org/) as an alternative.
 
 You can use the following command to download and prepare the Gutenberg corpus.
 
 ```bash
-python3 prepare_bookcorpus.py --dataset gutenberg
+python3 prepare_gutenberg.py --save_dir gutenberg
 ```
 
 Also, you should follow the [license](https://www.gutenberg.org/wiki/Gutenberg:The_Project_Gutenberg_License) for using the data.
 
 ## Wikipedia
 
-Please install [attardi/wikiextractor](https://github.com/attardi/wikiextractor) for preparing the data.
+We use the [attardi/wikiextractor](https://github.com/attardi/wikiextractor) package to prepare the data.
 
 ```bash
 # Download
@@ -33,7 +35,9 @@ python3 prepare_wikipedia.py --mode download --lang en --date latest -o ./
 python3 prepare_wikipedia.py --mode format -i [path-to-wiki.xml.bz2] -o ./
 ```
 
-The process of downloading and formatting is time consuming, and we offer an alternative solution to download the prepared raw text file from S3 bucket. This raw text file is in English and was dumped at 2020-06-20 being formated by the above very process (` --lang en --date 20200620`).
+Downloading and formatting the whole dump is time consuming, so we offer an alternative
+solution: downloading the prepared raw text file from our S3 bucket. This raw text file is in English,
+was dumped on 2020-06-20, and was formatted by the process above (`--lang en --date 20200620`).
 
 ```bash
 python3 prepare_wikipedia.py --mode download_prepared -o ./
diff --git a/scripts/datasets/pretrain_corpus/prepare_gutenberg.py b/scripts/datasets/pretrain_corpus/prepare_gutenberg.py
index b755a5801e..590310b62e 100644
--- a/scripts/datasets/pretrain_corpus/prepare_gutenberg.py
+++ b/scripts/datasets/pretrain_corpus/prepare_gutenberg.py
@@ -3,7 +3,7 @@
 import zipfile
 from gluonnlp.base import get_data_home_dir
 from gluonnlp.utils.misc import download, load_checksum_stats
-
+import shutil
 
 _CITATIONS = r"""
 @InProceedings{lahiri:2014:SRW,
@@ -59,11 +59,14 @@ def main(args):
     save_dir = args.dataset if args.save_dir is None else args.save_dir
     if not os.path.exists(save_dir):
         os.makedirs(save_dir, exist_ok=True)
+    print(f'Save to {save_dir}')
     with zipfile.ZipFile(target_download_location) as f:
         for name in f.namelist():
             if name.endswith('.txt'):
                 filename = os.path.basename(name)
-                f.extract(name, os.path.join(save_dir, filename))
+                with f.open(name) as in_file:
+                    with open(os.path.join(save_dir, filename.replace(' ', '_')), 'wb') as out_file:
+                        shutil.copyfileobj(in_file, out_file)
 
 
 def cli_main():
diff --git a/scripts/datasets/question_answering/README.md b/scripts/datasets/question_answering/README.md
index ac4fb68dbf..1336f69bb9 100644
--- a/scripts/datasets/question_answering/README.md
+++ b/scripts/datasets/question_answering/README.md
@@ -1,5 +1,6 @@
 # Question Answering
 
+
 ## SQuAD
 
 The SQuAD datasets are distributed under the [CC BY-SA 4.0](http://creativecommons.org/licenses/by-sa/4.0/legalcode) license.
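To fetch SQuAD with the shared scripts, either invocation below works, following the same two styles (direct call or `nlp_data`) that the other dataset scripts in this folder support:

```bash
# Download and prepare SQuAD 2.0 (use --version 1.1 for SQuAD 1.1)
python3 prepare_squad.py --version 2.0
# or, equivalently, via the unified data CLI
nlp_data prepare_squad --version 2.0
```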
@@ -39,7 +40,7 @@ python3 prepare_searchqa.py
 nlp_data prepare_searchqa
 ```
 
-Directory structure of the searchqa dataset will be as follows
+The directory structure of the SearchQA dataset will be as follows:
 ```
 searchqa
 ├── train.txt
@@ -48,9 +49,10 @@ searchqa
 ```
 
 ## TriviaQA
-[TriviaQA](https://nlp.cs.washington.edu/triviaqa/) is an open domain QA dataset. See more useful scripts in [Offical Github](https://github.com/mandarjoshi90/triviaqa)
+[TriviaQA](https://nlp.cs.washington.edu/triviaqa/) is an open-domain QA dataset.
+See more useful scripts in the [official GitHub repository](https://github.com/mandarjoshi90/triviaqa).
 
-Run the following command to download triviaqa
+Run the following command to download TriviaQA
 
 ```bash
 python3 prepare_triviaqa.py --version rc   # Download TriviaQA version 1.0 for RC (2.5G)
diff --git a/scripts/datasets/question_answering/prepare_searchqa.py b/scripts/datasets/question_answering/prepare_searchqa.py
index 51552834ba..eb5b9fe0a6 100644
--- a/scripts/datasets/question_answering/prepare_searchqa.py
+++ b/scripts/datasets/question_answering/prepare_searchqa.py
@@ -1,7 +1,7 @@
 import os
 import argparse
 from gluonnlp.utils.misc import download, load_checksum_stats
-from gluonnlp.base import get_data_home_dir
+from gluonnlp.base import get_data_home_dir, get_repo_url
 
 _CURR_DIR = os.path.realpath(os.path.dirname(os.path.realpath(__file__)))
 _BASE_DATASET_PATH = os.path.join(get_data_home_dir(), 'searchqa')
@@ -20,9 +20,9 @@
 """
 
 _URLS = {
-    'train': 's3://gluonnlp-numpy-data/datasets/question_answering/searchqa/train.txt',
-    'val': 's3://gluonnlp-numpy-data/datasets/question_answering/searchqa/val.txt',
-    'test': 's3://gluonnlp-numpy-data/datasets/question_answering/searchqa/test.txt'
+    'train': get_repo_url() + 'datasets/question_answering/searchqa/train.txt',
+    'val': get_repo_url() + 'datasets/question_answering/searchqa/val.txt',
+    'test': get_repo_url() + 'datasets/question_answering/searchqa/test.txt'
 }
 
diff --git a/scripts/datasets/question_answering/prepare_squad.py b/scripts/datasets/question_answering/prepare_squad.py
index fb9381fc46..b9bc49c696 100644
--- a/scripts/datasets/question_answering/prepare_squad.py
+++ b/scripts/datasets/question_answering/prepare_squad.py
@@ -1,5 +1,6 @@
 import os
 import argparse
+import shutil
 from gluonnlp.utils.misc import download, load_checksum_stats
 from gluonnlp.base import get_data_home_dir
 
@@ -58,14 +59,18 @@ def main(args):
     download(dev_url, path=os.path.join(args.cache_path, dev_file_name))
     if not os.path.exists(args.save_path):
         os.makedirs(args.save_path)
-    if not os.path.exists(os.path.join(args.save_path, train_file_name))\
+    if not os.path.exists(os.path.join(args.save_path, train_file_name)) \
             or (args.overwrite and args.save_path != args.cache_path):
-        os.symlink(os.path.join(args.cache_path, train_file_name),
-                   os.path.join(args.save_path, train_file_name))
-    if not os.path.exists(os.path.join(args.save_path, dev_file_name))\
+        os.link(os.path.join(args.cache_path, train_file_name),
+                os.path.join(args.save_path, train_file_name))
+    else:
+        print(f'Found {os.path.join(args.save_path, train_file_name)}...skip')
+    if not os.path.exists(os.path.join(args.save_path, dev_file_name)) \
             or (args.overwrite and args.save_path != args.cache_path):
-        os.symlink(os.path.join(args.cache_path, dev_file_name),
-                   os.path.join(args.save_path, dev_file_name))
+        os.link(os.path.join(args.cache_path, dev_file_name),
+                os.path.join(args.save_path, dev_file_name))
+    else:
+        print(f'Found {os.path.join(args.save_path,
dev_file_name)}...skip') def cli_main(): diff --git a/scripts/datasets/url_checksums/searchqa.txt b/scripts/datasets/url_checksums/searchqa.txt index 12ba03a7d5..11f518c92f 100644 --- a/scripts/datasets/url_checksums/searchqa.txt +++ b/scripts/datasets/url_checksums/searchqa.txt @@ -1,3 +1,3 @@ -s3://gluonnlp-numpy-data/datasets/question_answering/searchqa/train.txt c7e1eb8c34d0525547b91e18b3f8f4d855e35c16 1226681217 -s3://gluonnlp-numpy-data/datasets/question_answering/searchqa/test.txt 08a928e0f8c129d5b3ca43bf46df117e38be0c27 332064988 -s3://gluonnlp-numpy-data/datasets/question_answering/searchqa/val.txt c2f65d6b83c26188d5998ab96bc6a38c1a127fcc 170835902 +https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/question_answering/searchqa/train.txt c7e1eb8c34d0525547b91e18b3f8f4d855e35c16 1226681217 +https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/question_answering/searchqa/test.txt 08a928e0f8c129d5b3ca43bf46df117e38be0c27 332064988 +https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/question_answering/searchqa/val.txt c2f65d6b83c26188d5998ab96bc6a38c1a127fcc 170835902 diff --git a/scripts/machine_translation/README.md b/scripts/machine_translation/README.md index 4bafcb920c..4b729cc117 100644 --- a/scripts/machine_translation/README.md +++ b/scripts/machine_translation/README.md @@ -7,8 +7,8 @@ to generate the dataset. Then, run `train_transformer.py` to train the model. In the following, we give the training script for WMT2014 EN-DE task with yttm tokenizer. You may first run the following command in [datasets/machine_translation](../datasets/machine_translation). ```bash -bash ../datasets/machine_translation/wmt2014_ende_base.sh yttm (For transformer_base config) -bash ../datasets/machine_translation/wmt2014_ende.sh yttm (For transformer_wmt_en_de_big config) +bash ../datasets/machine_translation/wmt2014_ende_base.sh yttm # (For transformer_base config) +bash ../datasets/machine_translation/wmt2014_ende.sh yttm # (For transformer_wmt_en_de_big config) ``` Then, you can run the experiment. diff --git a/scripts/question_answering/commands/README.md b/scripts/question_answering/commands/README.md new file mode 100644 index 0000000000..6a8a835d19 --- /dev/null +++ b/scripts/question_answering/commands/README.md @@ -0,0 +1,8 @@ +# Commands For Training on SQuAD + +All commands are generated by parsing the template in [run_squad.template](run_squad.template). +To generate all commands, use the following code. 
+ +```bash +python3 generate_commands.py +``` diff --git a/scripts/question_answering/commands/generate_commands.py b/scripts/question_answering/commands/generate_commands.py new file mode 100644 index 0000000000..192d12c06b --- /dev/null +++ b/scripts/question_answering/commands/generate_commands.py @@ -0,0 +1,139 @@ +from gluonnlp.utils.config import CfgNode +import re + + +def base_cfg(): + cfg = CfgNode() + cfg.model_name = 'google_albert_base_v2' + cfg.version = 2.0 + cfg.batch_size = 4 + cfg.num_accumulated = 3 + cfg.epochs = 3 + cfg.lr = 2e-5 + cfg.warmup_ratio = 0.1 + cfg.wd = 0.01 + cfg.max_grad_norm = 0.1 + cfg.max_seq_length = 512 + cfg.layerwise_decay = -1 + return cfg + + +def albert_base_cfg(): + return base_cfg() + + +def albert_large_cfg(): + cfg = base_cfg() + cfg.model_name = 'google_albert_large_v2' + cfg.batch_size = 3 + cfg.num_accumulated = 4 + return cfg + + +def albert_xlarge_cfg(): + cfg = base_cfg() + cfg.model_name = 'google_albert_xlarge_v2' + cfg.batch_size = 1 + cfg.num_accumulated = 12 + return cfg + + +def albert_xxlarge_cfg(): + cfg = albert_xlarge_cfg() + cfg.model_name = 'google_albert_xxlarge_v2' + return cfg + + +def electra_base_cfg(): + cfg = base_cfg() + cfg.model_name = 'google_electra_base' + cfg.batch_size = 8 + cfg.num_accumulated = 1 + cfg.lr = 1e-4 + cfg.epochs = 2 + cfg.layerwise_decay = 0.8 + cfg.wd = 0 + return cfg + + +def electra_large_cfg(): + cfg = electra_base_cfg() + cfg.model_name = 'google_electra_large' + cfg.batch_size = 2 + cfg.num_accumulated = 4 + cfg.lr = 1e-5 + cfg.layerwise_decay = 0.9 + return cfg + + +def electra_small_cfg(): + cfg = electra_base_cfg() + cfg.model_name = 'google_electra_small' + cfg.batch_size = 8 + cfg.num_accumulated = 1 + cfg.lr = 3e-4 + cfg.epochs = 2 + cfg.layerwise_decay = 0.8 + return cfg + + +def mobilebert_cfg(): + cfg = base_cfg() + cfg.model_name = 'google_uncased_mobilebert' + cfg.batch_size = 8 + cfg.num_accumulated = 1 + cfg.lr = 4e-5 + cfg.epochs = 5 + cfg.max_seq_length = 384 + return cfg + + +def roberta_large_cfg(): + cfg = base_cfg() + cfg.model_name = 'fairseq_roberta_large' + cfg.batch_size = 2 + cfg.num_accumulated = 6 + cfg.epochs = 3 + cfg.lr = 3e-5 + cfg.warmup_ratio = 0.2 + cfg.wd = 0.01 + return cfg + + +def uncased_bert_base_cfg(): + cfg = base_cfg() + cfg.model_name = 'google_en_uncased_bert_base' + cfg.batch_size = 6 + cfg.num_accumulated = 2 + cfg.lr = 3e-5 + return cfg + + +def uncased_bert_large_cfg(): + cfg = uncased_bert_base_cfg() + cfg.model_name = 'google_en_uncased_bert_large' + cfg.batch_size = 2 + cfg.num_accumulated = 6 + return cfg + + +def gen_command(config, template_path, out_path): + print(f'Generating from "{template_path}" to "{out_path}"') + + def replace_fn(match): + return str(getattr(config, match.groups()[0])) + + with open(template_path, 'r') as in_f: + with open(out_path, 'w') as out_f: + dat = in_f.read() + updated_dat = re.sub(r'{{ (.+) }}', replace_fn, dat) + out_f.write(updated_dat) + + +if __name__ == '__main__': + for cfg_func in [albert_base_cfg, albert_large_cfg, albert_xlarge_cfg, albert_xxlarge_cfg, + electra_base_cfg, electra_large_cfg, electra_small_cfg, mobilebert_cfg, + roberta_large_cfg, uncased_bert_base_cfg, uncased_bert_large_cfg]: + prefix = cfg_func.__name__[:-len('_cfg')] + gen_command(cfg_func(), 'run_squad.template', + f'run_squad2_{prefix}.sh') diff --git a/scripts/question_answering/commands/run_squad.template b/scripts/question_answering/commands/run_squad.template new file mode 100644 index 0000000000..a67b23bce3 --- 
/dev/null +++ b/scripts/question_answering/commands/run_squad.template @@ -0,0 +1,42 @@ +# Generated by "generate_commands.py" + +USE_HOROVOD=${1:-0} # Horovod flag. 0 --> not use horovod, 1 --> use horovod +VERSION=${2:-2.0} # SQuAD Version +MODEL_NAME={{ model_name }} +BATCH_SIZE={{ batch_size }} +NUM_ACCUMULATED={{ num_accumulated }} +EPOCHS={{ epochs }} +LR={{ lr }} +WARMUP_RATIO={{ warmup_ratio }} +WD={{ wd }} +MAX_SEQ_LENGTH={{ max_seq_length }} +MAX_GRAD_NORM={{ max_grad_norm }} +LAYERWISE_DECAY={{ layerwise_decay }} + +# Prepare the Data +nlp_data prepare_squad --version ${VERSION} + +# Run the script +if [ ${USE_HOROVOD} -eq 0 ]; +then + RUN_COMMAND="python3 run_squad.py --gpus 0,1,2,3" +else + RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 run_squad.py --comm_backend horovod" +fi +${RUN_COMMAND} \ + --model_name ${MODEL_NAME} \ + --data_dir squad \ + --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \ + --version ${VERSION} \ + --do_eval \ + --do_train \ + --batch_size ${BATCH_SIZE} \ + --num_accumulated ${NUM_ACCUMULATED} \ + --layerwise_decay ${LAYERWISE_DECAY} \ + --epochs ${EPOCHS} \ + --lr ${LR} \ + --warmup_ratio ${WARMUP_RATIO} \ + --wd ${WD} \ + --max_seq_length ${MAX_SEQ_LENGTH} \ + --max_grad_norm ${MAX_GRAD_NORM} \ + --overwrite_cache diff --git a/scripts/question_answering/commands/run_squad2_albert_base.sh b/scripts/question_answering/commands/run_squad2_albert_base.sh index 69bee438f8..732b3abef8 100644 --- a/scripts/question_answering/commands/run_squad2_albert_base.sh +++ b/scripts/question_answering/commands/run_squad2_albert_base.sh @@ -1,25 +1,44 @@ -VERSION=2.0 # Either 2.0 or 1.1 +# Generated by "generate_commands.py" + +USE_HOROVOD=${1:-0} # Horovod flag. 0 --> not use horovod, 1 --> use horovod +VERSION=${2:-2.0} # SQuAD Version MODEL_NAME=google_albert_base_v2 +BATCH_SIZE=4 +NUM_ACCUMULATED=3 +EPOCHS=3 +LR=2e-05 +WARMUP_RATIO=0.1 +WD=0.01 +MAX_SEQ_LENGTH=512 +MAX_GRAD_NORM=0.1 +LAYERWISE_DECAY=-1 # Prepare the Data nlp_data prepare_squad --version ${VERSION} -# Run the script +RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py -python3 run_squad.py \ +# Run the script +if [ ${USE_HOROVOD} -eq 0 ]; +then + RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3" +else + RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod" +fi +${RUN_COMMAND} \ --model_name ${MODEL_NAME} \ --data_dir squad \ --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \ --version ${VERSION} \ --do_eval \ --do_train \ - --batch_size 4 \ - --num_accumulated 3 \ - --gpus 0,1,2,3 \ - --epochs 3 \ - --lr 2e-5 \ - --warmup_ratio 0.1 \ - --wd 0.01 \ - --max_seq_length 512 \ - --max_grad_norm 0.1 \ - --overwrite_cache \ + --batch_size ${BATCH_SIZE} \ + --num_accumulated ${NUM_ACCUMULATED} \ + --layerwise_decay ${LAYERWISE_DECAY} \ + --epochs ${EPOCHS} \ + --lr ${LR} \ + --warmup_ratio ${WARMUP_RATIO} \ + --wd ${WD} \ + --max_seq_length ${MAX_SEQ_LENGTH} \ + --max_grad_norm ${MAX_GRAD_NORM} \ + --overwrite_cache diff --git a/scripts/question_answering/commands/run_squad2_albert_large.sh b/scripts/question_answering/commands/run_squad2_albert_large.sh index f4c9d069c5..fb92b7cda9 100644 --- a/scripts/question_answering/commands/run_squad2_albert_large.sh +++ b/scripts/question_answering/commands/run_squad2_albert_large.sh @@ -1,25 +1,44 @@ -VERSION=2.0 # Either 2.0 or 1.1 +# Generated by "generate_commands.py" + +USE_HOROVOD=${1:-0} # Horovod flag. 
0 --> not use horovod, 1 --> use horovod +VERSION=${2:-2.0} # SQuAD Version MODEL_NAME=google_albert_large_v2 +BATCH_SIZE=3 +NUM_ACCUMULATED=4 +EPOCHS=3 +LR=2e-05 +WARMUP_RATIO=0.1 +WD=0.01 +MAX_SEQ_LENGTH=512 +MAX_GRAD_NORM=0.1 +LAYERWISE_DECAY=-1 # Prepare the Data nlp_data prepare_squad --version ${VERSION} -# Run the script +RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py -python3 run_squad.py \ +# Run the script +if [ ${USE_HOROVOD} -eq 0 ]; +then + RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3" +else + RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod" +fi +${RUN_COMMAND} \ --model_name ${MODEL_NAME} \ --data_dir squad \ --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \ --version ${VERSION} \ --do_eval \ --do_train \ - --batch_size 3 \ - --num_accumulated 4 \ - --gpus 0,1,2,3 \ - --epochs 3 \ - --lr 2e-5 \ - --warmup_ratio 0.1 \ - --wd 0.01 \ - --max_seq_length 512 \ - --max_grad_norm 0.1 \ - --overwrite_cache \ + --batch_size ${BATCH_SIZE} \ + --num_accumulated ${NUM_ACCUMULATED} \ + --layerwise_decay ${LAYERWISE_DECAY} \ + --epochs ${EPOCHS} \ + --lr ${LR} \ + --warmup_ratio ${WARMUP_RATIO} \ + --wd ${WD} \ + --max_seq_length ${MAX_SEQ_LENGTH} \ + --max_grad_norm ${MAX_GRAD_NORM} \ + --overwrite_cache diff --git a/scripts/question_answering/commands/run_squad2_albert_xlarge.sh b/scripts/question_answering/commands/run_squad2_albert_xlarge.sh index d14994422d..0bd28952d5 100644 --- a/scripts/question_answering/commands/run_squad2_albert_xlarge.sh +++ b/scripts/question_answering/commands/run_squad2_albert_xlarge.sh @@ -1,25 +1,44 @@ -VERSION=2.0 # Either 2.0 or 1.1 +# Generated by "generate_commands.py" + +USE_HOROVOD=${1:-0} # Horovod flag. 0 --> not use horovod, 1 --> use horovod +VERSION=${2:-2.0} # SQuAD Version MODEL_NAME=google_albert_xlarge_v2 +BATCH_SIZE=1 +NUM_ACCUMULATED=12 +EPOCHS=3 +LR=2e-05 +WARMUP_RATIO=0.1 +WD=0.01 +MAX_SEQ_LENGTH=512 +MAX_GRAD_NORM=0.1 +LAYERWISE_DECAY=-1 # Prepare the Data nlp_data prepare_squad --version ${VERSION} -# Run the script +RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py -python3 run_squad.py \ +# Run the script +if [ ${USE_HOROVOD} -eq 0 ]; +then + RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3" +else + RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod" +fi +${RUN_COMMAND} \ --model_name ${MODEL_NAME} \ --data_dir squad \ --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \ --version ${VERSION} \ --do_eval \ --do_train \ - --batch_size 1 \ - --num_accumulated 12 \ - --gpus 0,1,2,3 \ - --epochs 3 \ - --lr 2e-5 \ - --warmup_ratio 0.1 \ - --wd 0.01 \ - --max_seq_length 512 \ - --max_grad_norm 0.1 \ - --overwrite_cache \ + --batch_size ${BATCH_SIZE} \ + --num_accumulated ${NUM_ACCUMULATED} \ + --layerwise_decay ${LAYERWISE_DECAY} \ + --epochs ${EPOCHS} \ + --lr ${LR} \ + --warmup_ratio ${WARMUP_RATIO} \ + --wd ${WD} \ + --max_seq_length ${MAX_SEQ_LENGTH} \ + --max_grad_norm ${MAX_GRAD_NORM} \ + --overwrite_cache diff --git a/scripts/question_answering/commands/run_squad2_albert_xxlarge.sh b/scripts/question_answering/commands/run_squad2_albert_xxlarge.sh index fdb6e89658..9383cbc873 100644 --- a/scripts/question_answering/commands/run_squad2_albert_xxlarge.sh +++ b/scripts/question_answering/commands/run_squad2_albert_xxlarge.sh @@ -1,25 +1,44 @@ -VERSION=2.0 # Either 2.0 or 1.1 +# Generated by "generate_commands.py" + +USE_HOROVOD=${1:-0} # Horovod flag. 
0 --> not use horovod, 1 --> use horovod +VERSION=${2:-2.0} # SQuAD Version MODEL_NAME=google_albert_xxlarge_v2 +BATCH_SIZE=1 +NUM_ACCUMULATED=12 +EPOCHS=3 +LR=2e-05 +WARMUP_RATIO=0.1 +WD=0.01 +MAX_SEQ_LENGTH=512 +MAX_GRAD_NORM=0.1 +LAYERWISE_DECAY=-1 # Prepare the Data nlp_data prepare_squad --version ${VERSION} -# Run the script +RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py -python3 run_squad.py \ +# Run the script +if [ ${USE_HOROVOD} -eq 0 ]; +then + RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3" +else + RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod" +fi +${RUN_COMMAND} \ --model_name ${MODEL_NAME} \ --data_dir squad \ --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \ --version ${VERSION} \ --do_eval \ --do_train \ - --batch_size 1 \ - --num_accumulated 12 \ - --gpus 0,1,2,3 \ - --epochs 3 \ - --lr 2e-5 \ - --warmup_ratio 0.1 \ - --wd 0.01 \ - --max_seq_length 512 \ - --max_grad_norm 0.1 \ - --overwrite_cache \ + --batch_size ${BATCH_SIZE} \ + --num_accumulated ${NUM_ACCUMULATED} \ + --layerwise_decay ${LAYERWISE_DECAY} \ + --epochs ${EPOCHS} \ + --lr ${LR} \ + --warmup_ratio ${WARMUP_RATIO} \ + --wd ${WD} \ + --max_seq_length ${MAX_SEQ_LENGTH} \ + --max_grad_norm ${MAX_GRAD_NORM} \ + --overwrite_cache diff --git a/scripts/question_answering/commands/run_squad2_electra_base.sh b/scripts/question_answering/commands/run_squad2_electra_base.sh index a500a3ae50..16ee8cdb98 100644 --- a/scripts/question_answering/commands/run_squad2_electra_base.sh +++ b/scripts/question_answering/commands/run_squad2_electra_base.sh @@ -1,25 +1,44 @@ -VERSION=2.0 # Either 2.0 or 1.1 +# Generated by "generate_commands.py" + +USE_HOROVOD=${1:-0} # Horovod flag. 0 --> not use horovod, 1 --> use horovod +VERSION=${2:-2.0} # SQuAD Version MODEL_NAME=google_electra_base +BATCH_SIZE=8 +NUM_ACCUMULATED=1 +EPOCHS=2 +LR=0.0001 +WARMUP_RATIO=0.1 +WD=0 +MAX_SEQ_LENGTH=512 +MAX_GRAD_NORM=0.1 +LAYERWISE_DECAY=0.8 # Prepare the Data nlp_data prepare_squad --version ${VERSION} -# Run the script +RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py -python3 run_squad.py \ +# Run the script +if [ ${USE_HOROVOD} -eq 0 ]; +then + RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3" +else + RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod" +fi +${RUN_COMMAND} \ --model_name ${MODEL_NAME} \ --data_dir squad \ --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \ --version ${VERSION} \ --do_eval \ --do_train \ - --batch_size 8 \ - --num_accumulated 1 \ - --gpus 0,1,2,3 \ - --epochs 2 \ - --lr 1e-4 \ - --layerwise_decay 0.8 \ - --warmup_ratio 0.1 \ - --wd 0 \ - --max_seq_length 512 \ - --max_grad_norm 0.1 \ + --batch_size ${BATCH_SIZE} \ + --num_accumulated ${NUM_ACCUMULATED} \ + --layerwise_decay ${LAYERWISE_DECAY} \ + --epochs ${EPOCHS} \ + --lr ${LR} \ + --warmup_ratio ${WARMUP_RATIO} \ + --wd ${WD} \ + --max_seq_length ${MAX_SEQ_LENGTH} \ + --max_grad_norm ${MAX_GRAD_NORM} \ + --overwrite_cache diff --git a/scripts/question_answering/commands/run_squad2_electra_large.sh b/scripts/question_answering/commands/run_squad2_electra_large.sh index 61872f110b..815ec304e6 100644 --- a/scripts/question_answering/commands/run_squad2_electra_large.sh +++ b/scripts/question_answering/commands/run_squad2_electra_large.sh @@ -1,25 +1,44 @@ -VERSION=2.0 # Either 2.0 or 1.1 +# Generated by "generate_commands.py" + +USE_HOROVOD=${1:-0} # Horovod flag. 
0 --> not use horovod, 1 --> use horovod +VERSION=${2:-2.0} # SQuAD Version MODEL_NAME=google_electra_large +BATCH_SIZE=2 +NUM_ACCUMULATED=4 +EPOCHS=2 +LR=1e-05 +WARMUP_RATIO=0.1 +WD=0 +MAX_SEQ_LENGTH=512 +MAX_GRAD_NORM=0.1 +LAYERWISE_DECAY=0.9 # Prepare the Data nlp_data prepare_squad --version ${VERSION} -# Run the script +RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py -python3 run_squad.py \ +# Run the script +if [ ${USE_HOROVOD} -eq 0 ]; +then + RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3" +else + RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod" +fi +${RUN_COMMAND} \ --model_name ${MODEL_NAME} \ --data_dir squad \ --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \ --version ${VERSION} \ --do_eval \ --do_train \ - --batch_size 2 \ - --num_accumulated 4 \ - --gpus 0,1,2,3 \ - --epochs 2 \ - --lr 5e-5 \ - --layerwise_decay 0.9 \ - --warmup_ratio 0.1 \ - --wd 0 \ - --max_seq_length 512 \ - --max_grad_norm 0.1 \ + --batch_size ${BATCH_SIZE} \ + --num_accumulated ${NUM_ACCUMULATED} \ + --layerwise_decay ${LAYERWISE_DECAY} \ + --epochs ${EPOCHS} \ + --lr ${LR} \ + --warmup_ratio ${WARMUP_RATIO} \ + --wd ${WD} \ + --max_seq_length ${MAX_SEQ_LENGTH} \ + --max_grad_norm ${MAX_GRAD_NORM} \ + --overwrite_cache diff --git a/scripts/question_answering/commands/run_squad2_electra_small.sh b/scripts/question_answering/commands/run_squad2_electra_small.sh index e174258c17..d6228ef0bc 100644 --- a/scripts/question_answering/commands/run_squad2_electra_small.sh +++ b/scripts/question_answering/commands/run_squad2_electra_small.sh @@ -1,24 +1,44 @@ -VERSION=2.0 # Either 2.0 or 1.1 +# Generated by "generate_commands.py" + +USE_HOROVOD=${1:-0} # Horovod flag. 0 --> not use horovod, 1 --> use horovod +VERSION=${2:-2.0} # SQuAD Version MODEL_NAME=google_electra_small +BATCH_SIZE=8 +NUM_ACCUMULATED=1 +EPOCHS=2 +LR=0.0003 +WARMUP_RATIO=0.1 +WD=0 +MAX_SEQ_LENGTH=512 +MAX_GRAD_NORM=0.1 +LAYERWISE_DECAY=0.8 + # Prepare the Data nlp_data prepare_squad --version ${VERSION} -# Run the script +RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py -python3 run_squad.py \ +# Run the script +if [ ${USE_HOROVOD} -eq 0 ]; +then + RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3" +else + RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod" +fi +${RUN_COMMAND} \ --model_name ${MODEL_NAME} \ --data_dir squad \ --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \ --version ${VERSION} \ --do_eval \ --do_train \ - --batch_size 32 \ - --num_accumulated 1 \ - --gpus 0 \ - --epochs 2 \ - --lr 3e-4 \ - --layerwise_decay 0.8 \ - --warmup_ratio 0.1 \ - --wd 0 \ - --max_seq_length 512 \ - --max_grad_norm 0.1 \ + --batch_size ${BATCH_SIZE} \ + --num_accumulated ${NUM_ACCUMULATED} \ + --layerwise_decay ${LAYERWISE_DECAY} \ + --epochs ${EPOCHS} \ + --lr ${LR} \ + --warmup_ratio ${WARMUP_RATIO} \ + --wd ${WD} \ + --max_seq_length ${MAX_SEQ_LENGTH} \ + --max_grad_norm ${MAX_GRAD_NORM} \ + --overwrite_cache diff --git a/scripts/question_answering/commands/run_squad2_mobilebert.sh b/scripts/question_answering/commands/run_squad2_mobilebert.sh index cfeee56356..24fece841d 100644 --- a/scripts/question_answering/commands/run_squad2_mobilebert.sh +++ b/scripts/question_answering/commands/run_squad2_mobilebert.sh @@ -1,25 +1,44 @@ -VERSION=2.0 # Either 2.0 or 1.1 +# Generated by "generate_commands.py" + +USE_HOROVOD=${1:-0} # Horovod flag. 
0 --> not use horovod, 1 --> use horovod +VERSION=${2:-2.0} # SQuAD Version MODEL_NAME=google_uncased_mobilebert +BATCH_SIZE=8 +NUM_ACCUMULATED=1 +EPOCHS=5 +LR=4e-05 +WARMUP_RATIO=0.1 +WD=0.01 +MAX_SEQ_LENGTH=384 +MAX_GRAD_NORM=0.1 +LAYERWISE_DECAY=-1 # Prepare the Data nlp_data prepare_squad --version ${VERSION} -# Run the script +RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py -python3 run_squad.py \ +# Run the script +if [ ${USE_HOROVOD} -eq 0 ]; +then + RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3" +else + RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod" +fi +${RUN_COMMAND} \ --model_name ${MODEL_NAME} \ --data_dir squad \ --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \ --version ${VERSION} \ --do_eval \ --do_train \ - --batch_size 8 \ - --num_accumulated 1 \ - --gpus 0,1,2,3 \ - --epochs 5 \ - --lr 4e-5 \ - --warmup_steps 1400 \ - --wd 0.0 \ - --max_seq_length 384 \ - --max_grad_norm 0.1 \ - --overwrite_cache \ + --batch_size ${BATCH_SIZE} \ + --num_accumulated ${NUM_ACCUMULATED} \ + --layerwise_decay ${LAYERWISE_DECAY} \ + --epochs ${EPOCHS} \ + --lr ${LR} \ + --warmup_ratio ${WARMUP_RATIO} \ + --wd ${WD} \ + --max_seq_length ${MAX_SEQ_LENGTH} \ + --max_grad_norm ${MAX_GRAD_NORM} \ + --overwrite_cache diff --git a/scripts/question_answering/commands/run_squad2_roberta_large.sh b/scripts/question_answering/commands/run_squad2_roberta_large.sh index 3cdf2cb6ea..2bf51e6b6c 100644 --- a/scripts/question_answering/commands/run_squad2_roberta_large.sh +++ b/scripts/question_answering/commands/run_squad2_roberta_large.sh @@ -1,23 +1,44 @@ -VERSION=2.0 # Either 2.0 or 1.1 +# Generated by "generate_commands.py" + +USE_HOROVOD=${1:-0} # Horovod flag. 0 --> not use horovod, 1 --> use horovod +VERSION=${2:-2.0} # SQuAD Version MODEL_NAME=fairseq_roberta_large +BATCH_SIZE=2 +NUM_ACCUMULATED=6 +EPOCHS=3 +LR=3e-05 +WARMUP_RATIO=0.2 +WD=0.01 +MAX_SEQ_LENGTH=512 +MAX_GRAD_NORM=0.1 +LAYERWISE_DECAY=-1 # Prepare the Data nlp_data prepare_squad --version ${VERSION} +RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py + # Run the script -python3 run_squad.py \ +if [ ${USE_HOROVOD} -eq 0 ]; +then + RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3" +else + RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod" +fi +${RUN_COMMAND} \ --model_name ${MODEL_NAME} \ --data_dir squad \ --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \ --version ${VERSION} \ --do_eval \ --do_train \ - --batch_size 2 \ - --num_accumulated 6 \ - --gpus 0,1,2,3 \ - --epochs 3 \ - --lr 3e-5 \ - --warmup_ratio 0.2 \ - --wd 0.01 \ - --max_seq_length 512 \ - --max_grad_norm 0.1 \ + --batch_size ${BATCH_SIZE} \ + --num_accumulated ${NUM_ACCUMULATED} \ + --layerwise_decay ${LAYERWISE_DECAY} \ + --epochs ${EPOCHS} \ + --lr ${LR} \ + --warmup_ratio ${WARMUP_RATIO} \ + --wd ${WD} \ + --max_seq_length ${MAX_SEQ_LENGTH} \ + --max_grad_norm ${MAX_GRAD_NORM} \ + --overwrite_cache diff --git a/scripts/question_answering/commands/run_squad2_uncased_bert_base.sh b/scripts/question_answering/commands/run_squad2_uncased_bert_base.sh index f087860014..f2a0738282 100644 --- a/scripts/question_answering/commands/run_squad2_uncased_bert_base.sh +++ b/scripts/question_answering/commands/run_squad2_uncased_bert_base.sh @@ -1,25 +1,44 @@ -VERSION=2.0 # Either 2.0 or 1.1 +# Generated by "generate_commands.py" + +USE_HOROVOD=${1:-0} # Horovod flag. 
0 --> not use horovod, 1 --> use horovod +VERSION=${2:-2.0} # SQuAD Version MODEL_NAME=google_en_uncased_bert_base +BATCH_SIZE=6 +NUM_ACCUMULATED=2 +EPOCHS=3 +LR=3e-05 +WARMUP_RATIO=0.1 +WD=0.01 +MAX_SEQ_LENGTH=512 +MAX_GRAD_NORM=0.1 +LAYERWISE_DECAY=-1 # Prepare the Data nlp_data prepare_squad --version ${VERSION} -# Run the script +RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py -python3 run_squad.py \ +# Run the script +if [ ${USE_HOROVOD} -eq 0 ]; +then + RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3" +else + RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod" +fi +${RUN_COMMAND} \ --model_name ${MODEL_NAME} \ --data_dir squad \ --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \ --version ${VERSION} \ --do_eval \ --do_train \ - --batch_size 6 \ - --num_accumulated 2 \ - --gpus 0,1,2,3 \ - --epochs 3 \ - --lr 3e-5 \ - --warmup_ratio 0.1 \ - --wd 0.01 \ - --max_seq_length 512 \ - --max_grad_norm 0.1 \ - --overwrite_cache \ + --batch_size ${BATCH_SIZE} \ + --num_accumulated ${NUM_ACCUMULATED} \ + --layerwise_decay ${LAYERWISE_DECAY} \ + --epochs ${EPOCHS} \ + --lr ${LR} \ + --warmup_ratio ${WARMUP_RATIO} \ + --wd ${WD} \ + --max_seq_length ${MAX_SEQ_LENGTH} \ + --max_grad_norm ${MAX_GRAD_NORM} \ + --overwrite_cache diff --git a/scripts/question_answering/commands/run_squad2_uncased_bert_large.sh b/scripts/question_answering/commands/run_squad2_uncased_bert_large.sh index 0e80da7688..2f19c4c5e7 100644 --- a/scripts/question_answering/commands/run_squad2_uncased_bert_large.sh +++ b/scripts/question_answering/commands/run_squad2_uncased_bert_large.sh @@ -1,25 +1,44 @@ -VERSION=2.0 # Either 2.0 or 1.1 +# Generated by "generate_commands.py" + +USE_HOROVOD=${1:-0} # Horovod flag. 0 --> not use horovod, 1 --> use horovod +VERSION=${2:-2.0} # SQuAD Version MODEL_NAME=google_en_uncased_bert_large +BATCH_SIZE=2 +NUM_ACCUMULATED=6 +EPOCHS=3 +LR=3e-05 +WARMUP_RATIO=0.1 +WD=0.01 +MAX_SEQ_LENGTH=512 +MAX_GRAD_NORM=0.1 +LAYERWISE_DECAY=-1 # Prepare the Data nlp_data prepare_squad --version ${VERSION} -# Run the script +RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py -python3 run_squad.py \ +# Run the script +if [ ${USE_HOROVOD} -eq 0 ]; +then + RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3" +else + RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod" +fi +${RUN_COMMAND} \ --model_name ${MODEL_NAME} \ --data_dir squad \ --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \ --version ${VERSION} \ --do_eval \ --do_train \ - --batch_size 2 \ - --num_accumulated 6 \ - --gpus 0,1,2,3 \ - --epochs 3 \ - --lr 3e-5 \ - --warmup_ratio 0.1 \ - --wd 0.01 \ - --max_seq_length 512 \ - --max_grad_norm 0.1 \ - --overwrite_cache \ + --batch_size ${BATCH_SIZE} \ + --num_accumulated ${NUM_ACCUMULATED} \ + --layerwise_decay ${LAYERWISE_DECAY} \ + --epochs ${EPOCHS} \ + --lr ${LR} \ + --warmup_ratio ${WARMUP_RATIO} \ + --wd ${WD} \ + --max_seq_length ${MAX_SEQ_LENGTH} \ + --max_grad_norm ${MAX_GRAD_NORM} \ + --overwrite_cache diff --git a/src/gluonnlp/models/__init__.py b/src/gluonnlp/models/__init__.py index 490667c20c..93bbd1f3bf 100644 --- a/src/gluonnlp/models/__init__.py +++ b/src/gluonnlp/models/__init__.py @@ -54,7 +54,7 @@ def get_backbone(model_name: str, -------- >>> from gluonnlp.models import get_backbone - >>> model_cls, tokenizer, cfg, backbone_param_path, _ = get_backbone('google_en_cased_bert_base') + >>> model_cls, cfg, tokenizer, backbone_param_path, _ = get_backbone('google_en_cased_bert_base') >>> model 
= model_cls.from_cfg(cfg)
     >>> model.load_parameters(backbone_param_path)
     """
diff --git a/src/gluonnlp/utils/lazy_imports.py b/src/gluonnlp/utils/lazy_imports.py
index 8b26275b0e..82e2a2fd5e 100644
--- a/src/gluonnlp/utils/lazy_imports.py
+++ b/src/gluonnlp/utils/lazy_imports.py
@@ -25,7 +25,8 @@
            'try_import_fasttext',
            'try_import_langid',
            'try_import_boto3',
-           'try_import_jieba']
+           'try_import_jieba',
+           'try_import_tvm']
 
 
 def try_import_sentencepiece():
@@ -155,3 +156,12 @@ def try_import_jieba():
         raise ImportError('"jieba" is not installed. You must install jieba tokenizer. '
                           'You may try to use `pip install jieba`')
     return jieba
+
+
+def try_import_tvm():
+    try:
+        import tvm
+    except ImportError:
+        raise ImportError('"tvm" is not installed. You must install TVM to use this functionality. '
+                          'To install TVM, you may see the documentation at '
+                          'https://tvm.apache.org/ or try to use the GluonNLP docker.')
diff --git a/tests/test_models.py b/tests/test_models.py
index 3a41dcf656..c46bc252a1 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -16,9 +16,6 @@ def test_list_backbone_names():
 def test_get_backbone(name, ctx):
     with tempfile.TemporaryDirectory() as root, ctx:
         model_cls, cfg, tokenizer, local_params_path, _ = get_backbone(name, root=root)
-        if name == 'gpt2_1558M':
-            # skip gpt2 1558M due to the space
-            return
         net = model_cls.from_cfg(cfg)
         net.load_parameters(local_params_path)
         net.hybridize()
@@ -38,8 +35,9 @@ def test_get_backbone(name, ctx):
         elif 'bart' in name:
             out = net(inputs, valid_length, inputs, valid_length)
         elif 'gpt2' in name:
-            # Temporarily skip GPT-2 test
-            return
+            states = net.init_states(batch_size=batch_size, ctx=ctx)
+            out, new_states = net(inputs, states)
+            out_np = out.asnumpy()
         else:
             out = net(inputs, token_types, valid_length)
         mx.npx.waitall()
diff --git a/tools/batch/README.md b/tools/batch/README.md
index 1ba8cfc5e1..e95d2e4c6f 100644
--- a/tools/batch/README.md
+++ b/tools/batch/README.md
@@ -13,21 +13,33 @@ python3 submit-job.py \
     --wait
 ```
 
+## Updating the Docker for AWS Batch
+
+You may refer to the instructions in [GluonNLP Docker Support](../docker/README.md#ci-maintainer) for more information.
+
 ## Conversion Toolkits
-Following the instruction of [converting scripts](../../scripts/conversion_toolkits), several pre-trained models could be converted through the corresponding conversion tool as below command where `${MODEL_TYPE}` could be selected from `[albert, bert, electra, mobilebert, bart, robert, xmlr]`.
+Following the instructions in the [conversion scripts](../../scripts/conversion_toolkits),
+several pre-trained models can be converted with the corresponding conversion tool via the command below, where `${MODEL_TYPE}` is selected from `[albert, bert, electra, mobilebert, bart, robert, xmlr]`.
 ```bash
 bash run_batch_conversion ${MODEL_TYPE}
 ```
 
+
 ## Fine-tuning Downstream Tasks
 
 ### Question Answering
-We can quickly deploy an experiment via [squad fine-tuning scripts](../../scripts/question_answering#squad) as
+We can quickly run the SQuAD fine-tuning via the [SQuAD fine-tuning scripts](../../scripts/question_answering#squad) and an AWS Batch job.
+
+The code is given in [run_batch_squad.sh](run_batch_squad.sh).
 
 ```bash
-bash run_batch_squad.sh ${MODEL_NAME}
+# AWS Batch training without horovod on SQuAD 2.0
+bash run_batch_squad.sh
+
+# AWS Batch training with horovod on SQuAD 2.0
+bash run_batch_squad.sh 1 2.0 submit_squad_v2_horovod.log
 ```
 
-in which `${MODEL_NAME}` is the name of available pre-trained models listing as following:
+Internally, it will train the following models on the SQuAD 2.0 dataset:
 
 | MODEL_NAME         |
 |:------------------:|
 | uncased_bert_base  |
@@ -39,12 +51,5 @@ in which `${MODEL_NAME}` is the name of available pre-trained models listing as
 | electra_small      |
 | electra_base       |
 | electra_large      |
-| roberta_base       |
 | roberta_large      |
 | mobilebert         |
-
-### Machine Translation
-
-### Text Translation
-
-## Pre-trained Model Training
diff --git a/tools/batch/batch_states/compile_notebooks.sh b/tools/batch/batch_states/compile_notebooks.sh
index d993e2dcea..b82a4b14e0 100755
--- a/tools/batch/batch_states/compile_notebooks.sh
+++ b/tools/batch/batch_states/compile_notebooks.sh
@@ -1,11 +1,13 @@
 #!/bin/bash
 # Shell script for submitting AWS Batch jobs to compile notebooks
+set -ex
 
 prnumber=$1
 runnumber=$2
 remote=$3
 refs=$4
 
+
 compile_notebook () {
     local MDFILE=$1
     DIR=$(dirname $MDFILE)
@@ -19,7 +21,7 @@ compile_notebook () {
     python3 tools/batch/submit-job.py --region us-east-1 \
         --wait \
         --timeout 3600 \
-        --saved-output /gluon-nlp/docs/examples \
+        --saved-output docs/examples \
         --name GluonNLP-Docs-${refs}-${prnumber}-${runnumber} \
         --save-path ${runnumber}/gluon-nlp/docs/examples \
         --work-dir . \
@@ -27,7 +29,7 @@ compile_notebook () {
         --remote https://github.com/${remote} \
         --command "python3 -m pip install --quiet nbformat notedown jupyter_client ipykernel && \
                    python3 -m nltk.downloader perluniprops nonbreaking_prefixes punkt && \
-                   python3 /gluon-nlp/docs/md2ipynb.py ${MDFILE}" 2>&1 | tee $LOGNAME >/dev/null
+                   python3 docs/md2ipynb.py ${MDFILE}" 2>&1 | tee $LOGNAME >/dev/null
 
     BATCH_EXIT_CODE=$?
diff --git a/tools/batch/batch_states/test.sh b/tools/batch/batch_states/test.sh index c14c16bc67..86ac4f467a 100755 --- a/tools/batch/batch_states/test.sh +++ b/tools/batch/batch_states/test.sh @@ -1,9 +1,10 @@ #!/bin/bash # Shell script for installing dependencies and running test on AWS Batch +set -ex echo $PWD +SCRIPTPATH="$( cd "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" +REPODIR="$( readlink -f ${SCRIPTPATH}/../../../../gluon-nlp)" -python3 -m pip install --user --quiet -upgrade pip -python3 -m pip install --user --quiet setuptools pytest pytest-cov contextvars -python3 -m pip install --upgrade --quiet cython -python3 -m pytest --cov=/gluon-nlp --cov-config=/gluon-nlp/.coveragerc --cov-report=xml --durations=50 --device="gpu" --runslow /gluon-nlp/tests/ +python3 -m pip install --upgrade --user pytest pytest-cov contextvars +python3 -m pytest --cov=$REPODIR --cov-config=$REPODIR/.coveragerc --cov-report=xml --durations=50 --device="gpu" --runslow $REPODIR/tests/ diff --git a/tools/batch/batch_states/test_data_pipeline.sh b/tools/batch/batch_states/test_data_pipeline.sh index 86fafeaf9b..69a478a582 100644 --- a/tools/batch/batch_states/test_data_pipeline.sh +++ b/tools/batch/batch_states/test_data_pipeline.sh @@ -1,8 +1,9 @@ +#!/bin/bash +# Shell script for testing the data preprocessing on AWS Batch + set -ex echo $PWD -python3 -m pip install --pre "mxnet>=2.0.0b20200802" -f https://dist.mxnet.io/python - for MODEL in spm yttm do bash ../../../scripts/datasets/machine_translation/wmt2014_ende.sh ${MODEL} diff --git a/tools/batch/docker/Dockerfile.cpu b/tools/batch/docker/Dockerfile.cpu deleted file mode 100644 index ca5cb6029e..0000000000 --- a/tools/batch/docker/Dockerfile.cpu +++ /dev/null @@ -1,33 +0,0 @@ -FROM ubuntu:18.04 - -RUN apt-get update && apt-get install -y --no-install-recommends \ - build-essential \ - locales \ - cmake \ - wget \ - subversion \ - git \ - curl \ - vim \ - unzip \ - sudo \ - ca-certificates \ - libjpeg-dev \ - libpng-dev \ - libfreetype6-dev \ - libopenblas-dev \ - python3-dev \ - python3-pip \ - python3-setuptools \ - libxft-dev &&\ - rm -rf /var/lib/apt/lists/* - -RUN pip3 install --upgrade pip -RUN pip3 install --no-cache --upgrade \ - wheel \ - cmake \ - awscli -RUN git clone https://github.com/dmlc/gluon-nlp -WORKDIR gluon-nlp -ADD gluon_nlp_cpu_job.sh . -RUN chmod +x gluon_nlp_cpu_job.sh diff --git a/tools/batch/docker/Dockerfile.gpu b/tools/batch/docker/Dockerfile.gpu deleted file mode 100644 index 88ad1c86aa..0000000000 --- a/tools/batch/docker/Dockerfile.gpu +++ /dev/null @@ -1,33 +0,0 @@ -FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04 - -RUN apt-get update && apt-get install -y --no-install-recommends \ - build-essential \ - locales \ - cmake \ - wget \ - subversion \ - git \ - curl \ - vim \ - unzip \ - sudo \ - ca-certificates \ - libjpeg-dev \ - libpng-dev \ - libfreetype6-dev \ - libopenblas-dev \ - python3-dev \ - python3-pip \ - python3-setuptools \ - libxft-dev &&\ - rm -rf /var/lib/apt/lists/* - -RUN pip3 install --upgrade pip -RUN pip3 install --no-cache --upgrade \ - wheel \ - cmake \ - awscli -RUN git clone https://github.com/dmlc/gluon-nlp -WORKDIR gluon-nlp -ADD gluon_nlp_job.sh . -RUN chmod +x gluon_nlp_job.sh diff --git a/tools/batch/docker/README.md b/tools/batch/docker/README.md deleted file mode 100644 index 3675f82980..0000000000 --- a/tools/batch/docker/README.md +++ /dev/null @@ -1,25 +0,0 @@ -# Updating the Docker for AWS Batch. 
- -Our current batch job dockers are in 747303060528.dkr.ecr.us-east-1.amazonaws.com/gluon-nlp-1. To -update the docker: -- update the Dockerfile -- Make sure docker and docker-compose, as well as the docker python package are installed. -- Export the AWS account credentials as environment variables -- CD to the same folder as the Dockerfile and execute the following: - -``` -# this executes a command that logs into ECR. -$(aws ecr get-login --no-include-email --region us-east-1) - -# builds the Dockerfile as gluon-nlp-1 docker. -docker build -f Dockerfile.gpu -t gluon-nlp-1:gpu . -docker build -f Dockerfile.cpu -t gluon-nlp-1:cpu . - -# tags the recent build as gluon-nlp-1:latest, which AWS batch pulls from. -docker tag gluon-nlp-1:gpu 747303060528.dkr.ecr.us-east-1.amazonaws.com/gluon-nlp-1:latest -docker tag gluon-nlp-1:cpu 747303060528.dkr.ecr.us-east-1.amazonaws.com/gluon-nlp-1:cpu-latest - -# pushes the change -docker push 747303060528.dkr.ecr.us-east-1.amazonaws.com/gluon-nlp-1:latest -docker push 747303060528.dkr.ecr.us-east-1.amazonaws.com/gluon-nlp-1:cpu-latest -``` diff --git a/tools/batch/docker/gluon_nlp_cpu_job.sh b/tools/batch/docker/gluon_nlp_cpu_job.sh deleted file mode 100644 index 3045209c4f..0000000000 --- a/tools/batch/docker/gluon_nlp_cpu_job.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/bash -date -echo "Args: $@" -env -echo "jobId: $AWS_BATCH_JOB_ID" -echo "jobQueue: $AWS_BATCH_JQ_NAME" -echo "computeEnvironment: $AWS_BATCH_CE_NAME" - -SOURCE_REF=$1 -WORK_DIR=$2 -COMMAND=$3 -SAVED_OUTPUT=$4 -SAVE_PATH=$5 -REMOTE=$6 - -if [ ! -z $REMOTE ]; then - git remote set-url origin $REMOTE -fi; - -git fetch origin $SOURCE_REF:working -git checkout working -python3 -m pip install -U --quiet --pre "mxnet>=2.0.0b20200802" -f https://dist.mxnet.io/python -python3 -m pip install --quiet -e .[extras] - -cd $WORK_DIR -/bin/bash -o pipefail -c "$COMMAND" -COMMAND_EXIT_CODE=$? 
-if [[ -f $SAVED_OUTPUT ]]; then - aws s3 cp $SAVED_OUTPUT s3://gluon-nlp-dev/batch/$AWS_BATCH_JOB_ID/$SAVE_PATH; -elif [[ -d $SAVED_OUTPUT ]]; then - aws s3 cp --recursive $SAVED_OUTPUT s3://gluon-nlp-dev/batch/$AWS_BATCH_JOB_ID/$SAVE_PATH; -fi; -exit $COMMAND_EXIT_CODE diff --git a/tools/batch/run_batch_squad.sh b/tools/batch/run_batch_squad.sh index c3f9ba1dff..8349716c29 100644 --- a/tools/batch/run_batch_squad.sh +++ b/tools/batch/run_batch_squad.sh @@ -1,3 +1,9 @@ +set -ex + +USE_HOROVOD=${1:-0} +VERSION=${2:-2.0} +LOG_PATH=${3:-submit_squad_v2.log} + for MODEL_NAME in albert_base \ albert_large \ albert_xlarge \ @@ -18,5 +24,5 @@ do --name test_squad2_${MODEL_NAME} \ --work-dir scripts/question_answering \ --remote https://github.com/dmlc/gluon-nlp/ \ - --command 'bash commands/run_squad2_'${MODEL_NAME}'.sh | tee stdout.log' >> submit_squad_v2.log + --command "bash commands/run_squad2_${MODEL_NAME}.sh ${USE_HOROVOD} ${VERSION} | tee stdout.log" >> ${LOG_PATH} done diff --git a/tools/batch/submit-job.py b/tools/batch/submit-job.py index 76f7368d2b..9bbbc6fe81 100644 --- a/tools/batch/submit-job.py +++ b/tools/batch/submit-job.py @@ -8,6 +8,45 @@ import boto3 from botocore.compat import total_seconds +instance_type_info = { + 'g4dn.4x': { + 'job_definition': 'gluon-nlp-g4dn_4xlarge:5', + 'job_queue': 'g4dn' + }, + 'g4dn.8x': { + 'job_definition': 'gluon-nlp-g4dn_8xlarge:5', + 'job_queue': 'g4dn' + }, + 'g4dn.12x': { + 'job_definition': 'gluon-nlp-g4dn_12xlarge:5', + 'job_queue': 'g4dn-multi-gpu' + }, + 'p3.2x': { + 'job_definition': 'gluon-nlp-p3_2xlarge:5', + 'job_queue': 'p3' + }, + 'p3.8x': { + 'job_definition': 'gluon-nlp-p3_8xlarge:5', + 'job_queue': 'p3-4gpu' + }, + 'p3.16x': { + 'job_definition': 'gluon-nlp-p3_16xlarge:5', + 'job_queue': 'p3-8gpu' + }, + 'p3dn.24x': { + 'job_definition': 'gluon-nlp-p3_24xlarge:5', + 'job_queue': 'p3dn-8gpu' + }, + 'c5n.4x': { + 'job_definition': 'gluon-nlp-c5_4xlarge:3', + 'job_queue': 'c5n' + }, + 'c5n.18x': { + 'job_definition': 'gluon-nlp-c5_18xlarge:3', + 'job_queue': 'c5n' + } +} + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--profile', help='profile name of aws account.', type=str, @@ -16,9 +55,7 @@ default=None) parser.add_argument('--name', help='name of the job', type=str, default='dummy') parser.add_argument('--job-type', help='type of job to submit.', type=str, - choices=['g4dn.4x', 'g4dn.8x', 'g4dn.12x', 'g4dn.16x', - 'p3.2x', 'p3.8x', 'p3.16x', 'p3dn.24x', - 'c5n.18x', 'c5n.4x'], default='g4dn.4x') + choices=instance_type_info.keys(), default='g4dn.4x') parser.add_argument('--source-ref', help='ref in GluonNLP main github. e.g. 
master, refs/pull/500/head',
                    type=str, default='master')
@@ -76,41 +113,14 @@ def nowInMillis():
     return endTime
 
 
-job_definitions = {
-    'g4dn.4x': 'gluon-nlp-1-jobs:5',
-    'g4dn.8x': 'gluon-nlp-1-jobs:4',
-    'g4dn.12x': 'gluon-nlp-1-4gpu-jobs:1',
-    'g4dn.16x': 'gluon-nlp-1-jobs:3',
-    'p3.2x': 'gluon-nlp-1-jobs:11',
-    'p3.8x': 'gluon-nlp-1-4gpu-jobs:2',
-    'p3.16x': 'gluon-nlp-1-8gpu-jobs:1',
-    'p3dn.24x': 'gluon-nlp-1-8gpu-jobs:2',
-    'c5n.4x': 'gluon-nlp-1-cpu-jobs:3',
-    'c5n.18x': 'gluon-nlp-1-cpu-jobs:2',
-}
-
-job_queues = {
-    'g4dn.4x': 'g4dn',
-    'g4dn.8x': 'g4dn',
-    'g4dn.12x': 'g4dn-multi-gpu',
-    'g4dn.16x': 'g4dn',
-    'p3.2x': 'p3',
-    'p3.8x': 'p3-4gpu',
-    'p3.16x': 'p3-8gpu',
-    'p3dn.24x': 'p3dn-8gpu',
-    'c5n.4x': 'c5n',
-    'c5n.18x': 'c5n',
-}
-
-
 def main():
     spin = ['-', '/', '|', '\\', '-', '/', '|', '\\']
     logGroupName = '/aws/batch/job'
 
     jobName = re.sub('[^A-Za-z0-9_\-]', '', args.name)[:128]  # Enforce AWS Batch jobName rules
     jobType = args.job_type
-    jobQueue = job_queues[jobType]
-    jobDefinition = job_definitions[jobType]
+    jobQueue = instance_type_info[jobType]['job_queue']
+    jobDefinition = instance_type_info[jobType]['job_definition']
 
     command = args.command.split()
     wait = args.wait
diff --git a/tools/docker/README.md b/tools/docker/README.md
index 6b90b0121d..e9e200288d 100644
--- a/tools/docker/README.md
+++ b/tools/docker/README.md
@@ -4,12 +4,23 @@ With the prebuilt docker image, there is no need to worry about the operating sy
 You can launch a [JupyterLab](https://jupyterlab.readthedocs.io/en/stable/) development environment and try out GluonNLP on your problem.
 
+| Name | Description | Target User |
+|------|-------------|-------------|
+| `cpu-ci-latest` or `gpu-ci-latest` | Extends the CUDA image to include the basic functionality, e.g., the GluonNLP package, MXNet, PyTorch, and Horovod. This is the image used in the GluonNLP CI. | GluonNLP developers |
+| `cpu-latest` or `gpu-latest` | Has more functionality than the CI image, including a development environment powered by JupyterLab. Useful tools such as TensorBoard are pre-installed. | Users who want to solve NLP problems and also run distributed training with Horovod + GluonNLP. |
+
+
 ## Run Docker
 You can run the docker with the following commands.
 
 ```
+# On GPU machine
 docker pull gluonai/gluon-nlp:gpu-latest
 docker run --gpus all --rm -it -p 8888:8888 -p 8787:8787 -p 8786:8786 --shm-size=2g gluonai/gluon-nlp:gpu-latest
+
+# On CPU machine
+docker pull gluonai/gluon-nlp:cpu-latest
+docker run --rm -it -p 8888:8888 -p 8787:8787 -p 8786:8786 --shm-size=2g gluonai/gluon-nlp:cpu-latest
 ```
 
 Here, we open the ports 8888, 8787, 8786, which are used for connecting to JupyterLab.
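If the container runs on a remote instance, one common way to reach JupyterLab from a local browser is an SSH tunnel; the key path and hostname below are placeholders to substitute with your own:

```bash
# Forward the remote JupyterLab port 8888 to localhost:8888
# (placeholder key path and hostname; replace with your own values).
ssh -i ~/.ssh/my_key.pem -L 8888:localhost:8888 ubuntu@<remote-host>
# Then open http://localhost:8888 in a local browser.
```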
@@ -21,26 +32,86 @@ The folder structure of the docker image will be
 
 ```
 /workspace/
 ├── gluonnlp
-├── horovod
-├── mxnet
-├── notebooks
+├── tvm
 ├── data
 ```
 
 If you have a multi-GPU instance, e.g., [g4dn.12xlarge](https://aws.amazon.com/ec2/instance-types/g4/),
 [p2.8xlarge](https://aws.amazon.com/ec2/instance-types/p2/),
-[p3.8xlarge](https://aws.amazon.com/ec2/instance-types/p3/), you can try to run the following
-command to verify the installation of horovod + MXNet
+[p3.8xlarge](https://aws.amazon.com/ec2/instance-types/p3/), you can verify the installation
+of Horovod + MXNet by running the question answering script:
+
+```
+# Assume that you are in the parent folder of the gluon-nlp repository.
+
+cd gluon-nlp/scripts/question_answering
+docker run --gpus all --rm -it --shm-size=2g -v `pwd`:/workspace/data gluonai/gluon-nlp:gpu-latest \
+    bash -c 'cd /workspace/data && bash commands/run_squad2_albert_base.sh 1 2.0'
 ```
-docker run --gpus all --rm -it --shm-size=4g gluonai/gluon-nlp:gpu-latest \
-    horovodrun -np 2 python3 -m pytest /workspace/horovod/horovod/test/test_mxnet.py
+
+
+## Build the Images Yourself
+To build a docker image from the Dockerfile, you may use the following commands:
+
 ```
+# Build CPU Dockers
+docker build -f ubuntu18.04-cpu.Dockerfile --target ci -t gluonai/gluon-nlp:cpu-ci-latest .
+docker build -f ubuntu18.04-cpu.Dockerfile --target devel -t gluonai/gluon-nlp:cpu-latest .
 
+# Build GPU Dockers
+docker build -f ubuntu18.04-gpu.Dockerfile --target ci -t gluonai/gluon-nlp:gpu-ci-latest .
+docker build -f ubuntu18.04-gpu.Dockerfile --target devel -t gluonai/gluon-nlp:gpu-latest .
+```
 
-## Build your own Docker Image
-To build a docker image fom the dockerfile, you may use the following command:
+In addition, to build the GPU docker, you will need to install nvidia-docker2 and edit `/etc/docker/daemon.json` as follows:
 
 ```
-docker build -f ubuntu18.04-devel-gpu.Dockerfile -t gluonai/gluon-nlp:gpu-latest .
+{
+    "runtimes": {
+        "nvidia": {
+            "path": "nvidia-container-runtime",
+            "runtimeArgs": []
+        }
+    },
+    "default-runtime": "nvidia"
+}
+```
+
+After that, restart docker via `sudo systemctl restart docker.service`.
+
+For more details, you may refer to https://github.com/NVIDIA/nvidia-docker/issues/595. We need this additional setup
+because the horovod + MXNet integration identifies the library and include
+paths of MXNet by querying the MXNet runtime.
+
+### Developers of GluonNLP
+You may log in to your Docker Hub account and push the images to Docker Hub.
+```
+docker push gluonai/gluon-nlp:cpu-ci-latest
+docker push gluonai/gluon-nlp:cpu-latest
+
+docker push gluonai/gluon-nlp:gpu-ci-latest
+docker push gluonai/gluon-nlp:gpu-latest
+```
+
+### CI maintainer
+
+Our current batch job dockers are in 747303060528.dkr.ecr.us-east-1.amazonaws.com/gluon-nlp-1. To
+update the docker:
+- Update the Dockerfile as described above
+- Make sure docker and docker-compose, as well as the docker python package, are installed.
+- Export the AWS account credentials as environment variables
+- cd to the same folder as the Dockerfile and execute the following:
+
+```
+# this executes a command that logs into ECR.
+$(aws ecr get-login --no-include-email --region us-east-1)
+
+# tags the recent builds as the gluon-nlp-1 CI images, which AWS Batch pulls from.
+docker tag gluonai/gluon-nlp:gpu-ci-latest 747303060528.dkr.ecr.us-east-1.amazonaws.com/gluon-nlp-1:gpu-ci-latest
+docker tag gluonai/gluon-nlp:cpu-ci-latest 747303060528.dkr.ecr.us-east-1.amazonaws.com/gluon-nlp-1:cpu-ci-latest
+
+# Push the tagged images to ECR.
+docker push 747303060528.dkr.ecr.us-east-1.amazonaws.com/gluon-nlp-1:gpu-ci-latest
+docker push 747303060528.dkr.ecr.us-east-1.amazonaws.com/gluon-nlp-1:cpu-ci-latest
+```
diff --git a/tools/batch/docker/gluon_nlp_job.sh b/tools/docker/gluon_nlp_job.sh
similarity index 54%
rename from tools/batch/docker/gluon_nlp_job.sh
rename to tools/docker/gluon_nlp_job.sh
index 65bad7ccce..c2f54e371f 100755
--- a/tools/batch/docker/gluon_nlp_job.sh
+++ b/tools/docker/gluon_nlp_job.sh
@@ -1,4 +1,5 @@
 #!/bin/bash
+
 date
 echo "Args: $@"
 env
@@ -12,6 +13,7 @@ COMMAND=$3
 SAVED_OUTPUT=$4
 SAVE_PATH=$5
 REMOTE=$6
+DEVICE=${7:-gpu}
 
 if [ ! -z $REMOTE ]; then
     git remote set-url origin $REMOTE
@@ -19,7 +21,18 @@ fi;
 git fetch origin $SOURCE_REF:working
 git checkout working
-python3 -m pip install -U --quiet --pre "mxnet-cu102>=2.0.0b20200802" -f https://dist.mxnet.io/python
+
+if [ "$DEVICE" == "cpu" ]; then
+    python3 -m pip install -U --quiet --pre "mxnet>=2.0.0b20200802" -f https://dist.mxnet.io/python
+else
+    # Due to the issue described in https://forums.aws.amazon.com/thread.jspa?messageID=953912,
+    # we need to manually configure shm to ensure that Horovod is runnable.
+    # The reason a larger shm is needed is described in https://github.com/NVIDIA/nccl/issues/290.
+    umount shm
+    mount -t tmpfs -o rw,nosuid,nodev,noexec,relatime,size=2G shm /dev/shm
+    python3 -m pip install -U --quiet --pre "mxnet-cu102>=2.0.0b20200802" -f https://dist.mxnet.io/python
+fi
+
 python3 -m pip install --quiet -e .[extras]
 
 cd $WORK_DIR
diff --git a/tools/docker/install/install_horovod.sh b/tools/docker/install/install_horovod.sh
new file mode 100644
index 0000000000..a311ce2d95
--- /dev/null
+++ b/tools/docker/install/install_horovod.sh
@@ -0,0 +1,8 @@
+set -euo pipefail
+
+# Install Horovod
+HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_GPU_BROADCAST=NCCL HOROVOD_WITHOUT_GLOO=1 \
+HOROVOD_WITH_MPI=1 HOROVOD_WITH_MXNET=1 HOROVOD_WITH_PYTORCH=1 \
+HOROVOD_WITHOUT_TENSORFLOW=1 python3 -m pip install --no-cache-dir horovod==0.20.3 --user
+# Enable verbose NCCL logging by default to help debug Horovod
+echo NCCL_DEBUG=INFO >> /etc/nccl.conf
diff --git a/tools/docker/install/install_jupyter_lab.sh b/tools/docker/install/install_jupyter_lab.sh
new file mode 100644
index 0000000000..f6a67826cd
--- /dev/null
+++ b/tools/docker/install/install_jupyter_lab.sh
@@ -0,0 +1,23 @@
+set -euo pipefail
+
+# Install NodeJS + Tensorboard + TensorboardX
+
+curl -sL https://deb.nodesource.com/setup_14.x | bash - \
+    && apt-get install -y nodejs
+
+apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev
+
+python3 -m pip install --no-cache --upgrade \
+    soundfile==0.10.2 \
+    ipywidgets==7.5.1 \
+    jupyter_tensorboard==0.2.0 \
+    widgetsnbextension==3.5.1 \
+    tensorboard==2.1.1 \
+    tensorboardX==2.1 --user
+jupyter labextension install jupyterlab_tensorboard \
+    && jupyter nbextension enable --py widgetsnbextension \
+    && jupyter labextension install @jupyter-widgets/jupyterlab-manager
+
+# Set the default terminal shell to /bin/bash
+jupyter notebook --generate-config \
+    && echo "c.NotebookApp.terminado_settings = { 'shell_command': ['/bin/bash'] }" >> /root/.jupyter/jupyter_notebook_config.py
diff --git a/tools/docker/install/install_llvm.sh b/tools/docker/install/install_llvm.sh
new file mode 100644
index 0000000000..56f793b201
--- /dev/null
+++ 
b/tools/docker/install/install_llvm.sh @@ -0,0 +1,45 @@ +#!/bin/bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -e +set -u +set -o pipefail + +echo deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-9 main\ + >> /etc/apt/sources.list.d/llvm.list +echo deb-src http://apt.llvm.org/bionic/ llvm-toolchain-bionic-9 main\ + >> /etc/apt/sources.list.d/llvm.list + + +echo deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-10 main\ + >> /etc/apt/sources.list.d/llvm.list +echo deb-src http://apt.llvm.org/bionic/ llvm-toolchain-bionic-10 main\ + >> /etc/apt/sources.list.d/llvm.list + +echo deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-11 main\ + >> /etc/apt/sources.list.d/llvm.list +echo deb-src http://apt.llvm.org/bionic/ llvm-toolchain-bionic-11 main\ + >> /etc/apt/sources.list.d/llvm.list + +echo deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic main\ + >> /etc/apt/sources.list.d/llvm.list +echo deb-src http://apt.llvm.org/bionic/ llvm-toolchain-bionic main\ + >> /etc/apt/sources.list.d/llvm.list + +wget -q -O - http://apt.llvm.org/llvm-snapshot.gpg.key|apt-key add - +apt-get update && apt-get install -y llvm-9 llvm-10 llvm-11 clang-9 clang-10 clang-11 diff --git a/tools/docker/install/install_openmpi.sh b/tools/docker/install/install_openmpi.sh new file mode 100644 index 0000000000..42a764a740 --- /dev/null +++ b/tools/docker/install/install_openmpi.sh @@ -0,0 +1,21 @@ +set -euo pipefail + +mkdir /tmp/openmpi \ + && cd /tmp/openmpi \ + && curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \ + && tar zxf openmpi-4.0.1.tar.gz \ + && cd openmpi-4.0.1 \ + && ./configure --enable-orterun-prefix-by-default \ + && make -j $(nproc) all \ + && make install \ + && ldconfig \ + && rm -rf /tmp/openmpi + +# Create a wrapper for OpenMPI to allow running as root by default +mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \ + && echo '#!/bin/bash' > /usr/local/bin/mpirun \ + && echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \ + && chmod a+x /usr/local/bin/mpirun + +echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \ + && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf diff --git a/tools/docker/install/install_python_packages.sh b/tools/docker/install/install_python_packages.sh new file mode 100644 index 0000000000..879089acc4 --- /dev/null +++ b/tools/docker/install/install_python_packages.sh @@ -0,0 +1,28 @@ +set -euo pipefail + + +python3 -m pip --no-cache-dir install --upgrade \ + pip \ + setuptools \ + wheel + +# python-dateutil==2.8.0 to satisfy botocore associated with latest awscli +python3 -m pip install --no-cache --upgrade \ + numpy==1.19.1 \ + pandas==0.25.1 \ + cython \ + pytest \ + pytest-cov \ + Pillow \ + 
requests==2.22.0 \ + scikit-learn==0.20.4 \ + scipy==1.2.2 \ + urllib3==1.25.8 \ + python-dateutil==2.8.0 \ + sagemaker-experiments==0.* \ + PyYAML==5.3.1 \ + mpi4py==3.0.2 \ + jupyterlab==2.2.4 \ + contextvars \ + cmake \ + awscli --user diff --git a/tools/docker/install/install_tvm_cpu.sh b/tools/docker/install/install_tvm_cpu.sh new file mode 100644 index 0000000000..3dc27fd0f0 --- /dev/null +++ b/tools/docker/install/install_tvm_cpu.sh @@ -0,0 +1,40 @@ +#!/bin/bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -e +set -u +set -o pipefail + +cd ${WORKDIR} +git clone https://github.com/apache/incubator-tvm tvm --recursive +cd ${WORKDIR}/tvm +# checkout a hash-tag +git checkout 6d0351a7f0e23eb5428c59a976edd2bfb8207c0d + +echo set\(USE_LLVM llvm-config-10\) >> config.cmake +echo set\(USE_GRAPH_RUNTIME ON\) >> config.cmake +echo set\(USE_BLAS openblas\) >> config.cmake +mkdir -p build +cd build +cmake .. +make -j10 + +# install python binding +cd .. +cd python +python3 -m pip install -U -e . --user diff --git a/tools/docker/install/install_tvm_gpu.sh b/tools/docker/install/install_tvm_gpu.sh new file mode 100644 index 0000000000..f7f8cdfb8a --- /dev/null +++ b/tools/docker/install/install_tvm_gpu.sh @@ -0,0 +1,41 @@ +#!/bin/bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -e +set -u +set -o pipefail + +cd ${WORKDIR} +git clone https://github.com/apache/incubator-tvm tvm --recursive +cd ${WORKDIR}/tvm +# checkout a hash-tag +git checkout 6d0351a7f0e23eb5428c59a976edd2bfb8207c0d + +echo set\(USE_LLVM llvm-config-10\) >> config.cmake +echo set\(USE_CUDA ON\) >> config.cmake +echo set\(USE_CUDNN ON\) >> config.cmake +echo set\(USE_BLAS openblas\) >> config.cmake +mkdir -p build +cd build +cmake .. +make -j10 + +# install python binding +cd .. +cd python +python3 -m pip install -U -e . 
--user diff --git a/tools/docker/install/install_ubuntu18.04_core.sh b/tools/docker/install/install_ubuntu18.04_core.sh new file mode 100644 index 0000000000..404e00fb0e --- /dev/null +++ b/tools/docker/install/install_ubuntu18.04_core.sh @@ -0,0 +1,37 @@ +set -e +set -u +set -o pipefail + +export DEBIAN_FRONTEND=noninteractive + +apt-get update \ + && apt-get install -y --no-install-recommends \ + software-properties-common \ + build-essential \ + ca-certificates \ + curl \ + emacs \ + subversion \ + locales \ + cmake \ + git \ + libopencv-dev \ + htop \ + vim \ + wget \ + unzip \ + less \ + libopenblas-dev \ + gpg-agent \ + ninja-build \ + openssh-client \ + openssh-server \ + python3-dev \ + python3-pip \ + python3-setuptools \ + libxft-dev \ + zlib1g-dev \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +ln -s $(which python3) /usr/local/bin/python diff --git a/tools/docker/ubuntu18.04-cpu.Dockerfile b/tools/docker/ubuntu18.04-cpu.Dockerfile new file mode 100644 index 0000000000..a5a2114652 --- /dev/null +++ b/tools/docker/ubuntu18.04-cpu.Dockerfile @@ -0,0 +1,77 @@ +FROM ubuntu:18.04 as base + +LABEL maintainer="GluonNLP Team" +COPY install /install + +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib" \ + PYTHONIOENCODING=UTF-8 \ + LANG=C.UTF-8 \ + LC_ALL=C.UTF-8 + +ENV WORKDIR=/workspace +ENV SHELL=/bin/bash + +RUN mkdir -p ${WORKDIR} + + +RUN bash /install/install_ubuntu18.04_core.sh + +# Install Open MPI +RUN bash /install/install_openmpi.sh +ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH +ENV PATH=/usr/local/openmpi/bin/:/usr/local/bin:/root/.local/bin:$PATH + +# Install LLVM +RUN bash /install/install_llvm.sh + +# Install Python Packages +RUN bash /install/install_python_packages.sh + +# Install TVM +RUN bash /install/install_tvm_cpu.sh + +# Install MXNet +RUN python3 -m pip install -U --pre "mxnet>=2.0.0b20200926" -f https://dist.mxnet.io/python --user + +# Install PyTorch +RUN python3 -m pip install -U torch torchvision --user + +# Install Jupyter Lab +RUN bash /install/install_jupyter_lab.sh + +RUN mkdir -p ${WORKDIR}/data +RUN mkdir -p /.init +RUN cd ${WORKDIR} \ + && git clone https://github.com/dmlc/gluon-nlp \ + && cd gluon-nlp \ + && git checkout master \ + && python3 -m pip install -U -e ."[extras]" + + +# Stage-CI +FROM base as ci +WORKDIR ${WORKDIR}/gluon-nlp +ADD gluon_nlp_job.sh . 
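+# gluon_nlp_job.sh is the AWS Batch entry script (tools/docker/gluon_nlp_job.sh in this repo);
+# the CI stage bundles it so batch jobs can invoke it directly inside the image.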
+RUN chmod +x gluon_nlp_job.sh + + +# Stage-Devel +FROM base as devel +COPY start_jupyter.sh /start_jupyter.sh +COPY devel_entrypoint.sh /devel_entrypoint.sh +RUN chmod +x /devel_entrypoint.sh + +EXPOSE 8888 +EXPOSE 8787 +EXPOSE 8786 + +WORKDIR ${WORKDIR} + +# Add Tini +ARG TINI_VERSION=v0.19.0 +ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini +RUN chmod +x /tini +ENTRYPOINT [ "/tini", "--", "/devel_entrypoint.sh" ] +CMD ["/bin/bash"] diff --git a/tools/docker/ubuntu18.04-devel-gpu.Dockerfile b/tools/docker/ubuntu18.04-devel-gpu.Dockerfile deleted file mode 100644 index 43d1a740f9..0000000000 --- a/tools/docker/ubuntu18.04-devel-gpu.Dockerfile +++ /dev/null @@ -1,175 +0,0 @@ -FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04 - -LABEL maintainer="GluonNLP Team" - -ARG DEBIAN_FRONTEND=noninteractive - -ENV PYTHONDONTWRITEBYTECODE=1 \ - PYTHONUNBUFFERED=1 \ - LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib" \ - PYTHONIOENCODING=UTF-8 \ - LANG=C.UTF-8 \ - LC_ALL=C.UTF-8 - -ENV WORKDIR=/workspace -ENV SHELL=/bin/bash - -RUN apt-get update \ - && apt-get install -y --no-install-recommends \ - software-properties-common \ - build-essential \ - ca-certificates \ - curl \ - emacs \ - subversion \ - locales \ - cmake \ - git \ - libopencv-dev \ - htop \ - vim \ - wget \ - unzip \ - libopenblas-dev \ - ninja-build \ - openssh-client \ - openssh-server \ - python3-dev \ - python3-pip \ - python3-setuptools \ - libxft-dev \ - zlib1g-dev \ - && apt-get clean \ - && rm -rf /var/lib/apt/lists/* - -RUN python3 -m pip --no-cache-dir install --upgrade \ - pip \ - setuptools - -########################################################################### -# Horovod dependencies -########################################################################### - -# Install Open MPI -RUN mkdir /tmp/openmpi \ - && cd /tmp/openmpi \ - && curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \ - && tar zxf openmpi-4.0.1.tar.gz \ - && cd openmpi-4.0.1 \ - && ./configure --enable-orterun-prefix-by-default \ - && make -j $(nproc) all \ - && make install \ - && ldconfig \ - && rm -rf /tmp/openmpi - -# Create a wrapper for OpenMPI to allow running as root by default -RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \ - && echo '#!/bin/bash' > /usr/local/bin/mpirun \ - && echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \ - && chmod a+x /usr/local/bin/mpirun - -RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \ - && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf - -ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH -ENV PATH=/usr/local/openmpi/bin/:/usr/local/bin:/root/.local/bin:$PATH - -RUN ln -s $(which python3) /usr/local/bin/python - -RUN mkdir -p ${WORKDIR} - -# install PyYAML==5.1.2 to avoid conflict with latest awscli -# python-dateutil==2.8.0 to satisfy botocore associated with latest awscli -RUN pip3 install --no-cache --upgrade \ - wheel \ - numpy==1.19.1 \ - pandas==0.25.1 \ - pytest \ - Pillow \ - requests==2.22.0 \ - scikit-learn==0.20.4 \ - scipy==1.2.2 \ - urllib3==1.25.8 \ - python-dateutil==2.8.0 \ - sagemaker-experiments==0.* \ - PyYAML==5.3.1 \ - mpi4py==3.0.2 \ - jupyterlab==2.2.4 \ - cmake \ - awscli - -# Install MXNet -RUN mkdir -p ${WORKDIR}/mxnet \ - && cd ${WORKDIR}/mxnet \ - && git clone --single-branch --branch master --recursive https://github.com/apache/incubator-mxnet \ - && cd incubator-mxnet \ - && mkdir build \ - && cd build 
\ - && cmake -DMXNET_CUDA_ARCH="3.0;5.0;6.0;7.0" -GNinja -C ../config/linux_gpu.cmake .. \ - && cmake --build . \ - && cd ../python \ - && python3 -m pip install -U -e . --user - -# Install Horovod -# TODO Fix once https://github.com/horovod/horovod/pull/2155 gets merged -RUN mkdir ${WORKDIR}/horovod \ - && cd ${WORKDIR}/horovod \ - && git clone --single-branch --branch mx2-pr --recursive https://github.com/eric-haibin-lin/horovod \ - && cd horovod \ - && ldconfig /usr/local/cuda/targets/x86_64-linux/lib/stubs \ - && HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_GPU_BROADCAST=NCCL HOROVOD_WITHOUT_GLOO=1 \ - HOROVOD_WITH_MPI=1 HOROVOD_WITH_MXNET=1 HOROVOD_WITHOUT_PYTORCH=1 \ - HOROVOD_WITHOUT_TENSORFLOW=1 python3 setup.py install --user \ - && ldconfig - -RUN mkdir -p ${WORKDIR}/notebook -RUN mkdir -p ${WORKDIR}/data -RUN mkdir -p /.init -RUN cd ${WORKDIR} \ - && git clone https://github.com/dmlc/gluon-nlp \ - && cd gluon-nlp \ - && git checkout master \ - && python3 -m pip install -U -e ."[extras]" --user - -COPY start_jupyter.sh /start_jupyter.sh -COPY devel_entrypoint.sh /devel_entrypoint.sh -RUN chmod +x /devel_entrypoint.sh - -EXPOSE 8888 -EXPOSE 8787 -EXPOSE 8786 - -WORKDIR ${WORKDIR} - -# Debug horovod by default -RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf - -# Install NodeJS + Tensorboard + TensorboardX -RUN curl -sL https://deb.nodesource.com/setup_14.x | bash - \ - && apt-get install -y nodejs - -RUN apt-get update \ - && apt-get install -y --no-install-recommends \ - libsndfile1-dev - -RUN pip3 install --no-cache --upgrade \ - soundfile==0.10.2 \ - ipywidgets==7.5.1 \ - jupyter_tensorboard==0.2.0 \ - widgetsnbextension==3.5.1 \ - tensorboard==2.1.1 \ - tensorboardX==2.1 -RUN jupyter labextension install jupyterlab_tensorboard \ - && jupyter nbextension enable --py widgetsnbextension \ - && jupyter labextension install @jupyter-widgets/jupyterlab-manager - -# Revise default shell to /bin/bash -RUN jupyter notebook --generate-config \ - && echo "c.NotebookApp.terminado_settings = { 'shell_command': ['/bin/bash'] }" >> /root/.jupyter/jupyter_notebook_config.py - -# Add Tini -ARG TINI_VERSION=v0.19.0 -ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini -RUN chmod +x /tini -ENTRYPOINT [ "/tini", "--", "/devel_entrypoint.sh" ] -CMD ["/bin/bash"] diff --git a/tools/docker/ubuntu18.04-gpu.Dockerfile b/tools/docker/ubuntu18.04-gpu.Dockerfile new file mode 100644 index 0000000000..4ac5880e63 --- /dev/null +++ b/tools/docker/ubuntu18.04-gpu.Dockerfile @@ -0,0 +1,77 @@ +FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04 as base + +LABEL maintainer="GluonNLP Team" +COPY install /install + +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib" \ + PYTHONIOENCODING=UTF-8 \ + LANG=C.UTF-8 \ + LC_ALL=C.UTF-8 + +ENV WORKDIR=/workspace +ENV SHELL=/bin/bash + +RUN mkdir -p ${WORKDIR} + +RUN bash /install/install_ubuntu18.04_core.sh + +# Install Open MPI +RUN bash /install/install_openmpi.sh +ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH +ENV PATH=/usr/local/openmpi/bin/:/usr/local/bin:/root/.local/bin:$PATH + +# Install LLVM +RUN bash /install/install_llvm.sh + +# Install Python Packages +RUN bash /install/install_python_packages.sh + +# Install TVM +RUN bash /install/install_tvm_gpu.sh + +# Install MXNet +RUN python3 -m pip install -U --pre "mxnet-cu102>=2.0.0b20200926" -f https://dist.mxnet.io/python --user + +# Install PyTorch +RUN python3 -m pip install -U torch torchvision --user + +# Install Horovod +RUN bash 
/install/install_horovod.sh + +# Install Jupyter Lab +RUN bash /install/install_jupyter_lab.sh + +RUN mkdir -p ${WORKDIR}/data +RUN mkdir -p /.init +RUN cd ${WORKDIR} \ + && git clone https://github.com/dmlc/gluon-nlp \ + && cd gluon-nlp \ + && git checkout master \ + && python3 -m pip install -U -e ."[extras]" + +# Stage-CI +FROM base as ci +WORKDIR ${WORKDIR}/gluon-nlp +ADD gluon_nlp_job.sh . +RUN chmod +x gluon_nlp_job.sh + +# Stage-Devel +FROM base as devel +COPY start_jupyter.sh /start_jupyter.sh +COPY devel_entrypoint.sh /devel_entrypoint.sh +RUN chmod +x /devel_entrypoint.sh + +EXPOSE 8888 +EXPOSE 8787 +EXPOSE 8786 + +WORKDIR ${WORKDIR} + +# Add Tini +ARG TINI_VERSION=v0.19.0 +ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini +RUN chmod +x /tini +ENTRYPOINT [ "/tini", "--", "/devel_entrypoint.sh" ] +CMD ["/bin/bash"]
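+
+# Example usage (a sketch mirroring the README above; adjust tags and ports as needed):
+#   docker build -f ubuntu18.04-gpu.Dockerfile --target devel -t gluonai/gluon-nlp:gpu-latest .
+#   docker run --gpus all --rm -it -p 8888:8888 -p 8787:8787 -p 8786:8786 --shm-size=2g gluonai/gluon-nlp:gpu-latest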