This repository has been archived by the owner on Jan 15, 2024. It is now read-only.

[Fix][Docker] Fix the docker image + Fix pretrain_corpus document. #1378

Merged 115 commits on Oct 15, 2020.
Commits (115, all by sxjscience; the diff below reflects changes from 102 commits):
df1480f update (Sep 29, 2020)
120c4f4 Update ubuntu18.04-devel-gpu.Dockerfile (Sep 29, 2020)
9f0b129 fix the docker image (Sep 30, 2020)
47c1676 Update README.md (Sep 30, 2020)
be6aa35 Update ubuntu18.04-devel-gpu.Dockerfile (Sep 30, 2020)
07d9e0f Update README.md (Sep 30, 2020)
3d18977 fix readme (Oct 1, 2020)
146b826 Add CPU DockerFile (Oct 8, 2020)
487e88e update (Oct 8, 2020)
0fbecd4 update (Oct 8, 2020)
9b454bd Update ubuntu18.04-devel-gpu.Dockerfile (Oct 8, 2020)
0e6d40a update (Oct 9, 2020)
4d221cf prepare to add TVM to docker (Oct 9, 2020)
029cb05 try to update (Oct 10, 2020)
5a69ff8 Update ubuntu18.04-devel-gpu.Dockerfile (Oct 10, 2020)
35c3e1c Update ubuntu18.04-devel-gpu.Dockerfile (Oct 10, 2020)
fc66551 Update install_openmpi.sh (Oct 10, 2020)
2006d0b update (Oct 10, 2020)
8f0fa41 Create install_llvm.sh (Oct 10, 2020)
80bc071 Update ubuntu18.04-base-gpu.Dockerfile (Oct 10, 2020)
ee3d27b Update ubuntu18.04-base-gpu.Dockerfile (Oct 10, 2020)
5790d6b Update run_squad2_albert_base.sh (Oct 10, 2020)
ae8b2cc Update prepare_squad.py (Oct 10, 2020)
0555216 Update prepare_squad.py (Oct 10, 2020)
43d4198 Update prepare_squad.py (Oct 10, 2020)
4dc0024 fix (Oct 10, 2020)
8d8fbb7 Update README.md (Oct 11, 2020)
5aa0fcb update (Oct 11, 2020)
704117d update (Oct 11, 2020)
be03a49 Update README.md (Oct 11, 2020)
eb7d782 Update README.md (Oct 11, 2020)
515dd10 Update ubuntu18.04-devel-gpu.Dockerfile (Oct 11, 2020)
202d89f update (Oct 11, 2020)
633005e Update README.md (Oct 11, 2020)
8fd9db7 fix (Oct 11, 2020)
bc72cbe Update ubuntu18.04-base-cpu.Dockerfile (Oct 11, 2020)
0f6067b update (Oct 11, 2020)
2620dfd add tvm to lazy import (Oct 11, 2020)
2d58e0c update (Oct 11, 2020)
8234215 Update README.md (Oct 11, 2020)
7dada1d update (Oct 11, 2020)
9fbaf77 Update README.md (Oct 11, 2020)
c62639d Update run_squad2_albert_base.sh (Oct 11, 2020)
7e810ad update (Oct 11, 2020)
2cb007d update (Oct 11, 2020)
d52075d update (Oct 11, 2020)
028a0e5 update (Oct 11, 2020)
f448df5 update (Oct 11, 2020)
83e96c0 Update README.md (Oct 11, 2020)
ed80b9f Update install_ubuntu18.04_core.sh (Oct 11, 2020)
f8d09a0 update (Oct 11, 2020)
26ef33c update (Oct 11, 2020)
d33834b update (Oct 11, 2020)
a689265 fix (Oct 11, 2020)
9653d7a Update README.md (Oct 11, 2020)
0b8f37d Update run_batch_squad.sh (Oct 11, 2020)
8c38f98 update (Oct 11, 2020)
a605e3a Update run_batch_squad.sh (Oct 11, 2020)
36628ac Update run_batch_squad.sh (Oct 11, 2020)
d850924 update (Oct 12, 2020)
d629235 Update README.md (Oct 12, 2020)
ab0a183 fix (Oct 12, 2020)
74e2966 Update gluon_nlp_job.sh (Oct 12, 2020)
ab24028 update (Oct 12, 2020)
2f0c048 Update README.md (Oct 12, 2020)
296bc7e Update README.md (Oct 12, 2020)
cc62fde Update README.md (Oct 12, 2020)
0650674 update (Oct 12, 2020)
644618a Update README.md (Oct 12, 2020)
7b7f42f update (Oct 12, 2020)
0e169c9 Update install_python_packages.sh (Oct 12, 2020)
49d1453 Update install_llvm.sh (Oct 12, 2020)
c6c131d Update install_python_packages.sh (Oct 12, 2020)
efbd7f5 Update install_llvm.sh (Oct 12, 2020)
522fa85 update (Oct 12, 2020)
6d53466 Update install_ubuntu18.04_core.sh (Oct 12, 2020)
1fcf8a3 fix (Oct 12, 2020)
450d08e Update submit-job.py (Oct 13, 2020)
207d0d0 Update submit-job.py (Oct 13, 2020)
ad7dd82 Update README.md (Oct 13, 2020)
d751387 Update README.md (Oct 13, 2020)
73437bc Update prepare_gutenberg.py (Oct 13, 2020)
ae137d2 Delete gluon_nlp_cpu_job.sh (Oct 13, 2020)
7e8947a Update prepare_gutenberg.py (Oct 13, 2020)
c512fac Update prepare_gutenberg.py (Oct 13, 2020)
0ebfcd7 Update prepare_gutenberg.py (Oct 13, 2020)
cd4b24d Update conf.py (Oct 13, 2020)
19324d9 update (Oct 13, 2020)
6532042 Update generate_commands.py (Oct 13, 2020)
33e2575 fix readme (Oct 13, 2020)
8e439c4 use os.link for hard link (Oct 13, 2020)
276d6d1 Update README.md (Oct 13, 2020)
9127fde Update README.md (Oct 13, 2020)
5ff701d Update gluon_nlp_job.sh (Oct 13, 2020)
bc07886 Update __init__.py (Oct 13, 2020)
9233326 Update benchmark_utils.py (Oct 13, 2020)
6c604ea try to use multi-stage build (Oct 13, 2020)
fe4d089 Update benchmark_utils.py (Oct 14, 2020)
c381eae multi-stage build (Oct 14, 2020)
eadf268 Update README.md (Oct 14, 2020)
aadd03d Update README.md (Oct 14, 2020)
207d018 update (Oct 14, 2020)
2c9e84e Update submit-job.py (Oct 14, 2020)
bee78a6 fix documentation (Oct 14, 2020)
e9889ec fix (Oct 14, 2020)
f52fbf6 update (Oct 14, 2020)
bbe13f7 Update test.sh (Oct 14, 2020)
b8046a0 Update test.sh (Oct 14, 2020)
ce551c8 Update test.sh (Oct 14, 2020)
3ac97b6 Update test.sh (Oct 14, 2020)
d34d693 Update README.md (Oct 14, 2020)
899c613 Update test.sh (Oct 14, 2020)
c73d3ed fix (Oct 14, 2020)
42c8e41 Update README.md (Oct 14, 2020)
3e1a326 Update gluon_nlp_job.sh (Oct 14, 2020)
13 changes: 9 additions & 4 deletions README.md
@@ -34,16 +34,16 @@ First of all, install the latest MXNet. You may use the following commands:

```bash
# Install the version with CUDA 10.0
-python3 -m pip install -U --pre "mxnet-cu100>=2.0.0b20200802" -f https://dist.mxnet.io/python
+python3 -m pip install -U --pre "mxnet-cu100>=2.0.0b20200926" -f https://dist.mxnet.io/python

# Install the version with CUDA 10.1
-python3 -m pip install -U --pre "mxnet-cu101>=2.0.0b20200802" -f https://dist.mxnet.io/python
+python3 -m pip install -U --pre "mxnet-cu101>=2.0.0b20200926" -f https://dist.mxnet.io/python

# Install the version with CUDA 10.2
-python3 -m pip install -U --pre "mxnet-cu102>=2.0.0b20200802" -f https://dist.mxnet.io/python
+python3 -m pip install -U --pre "mxnet-cu102>=2.0.0b20200926" -f https://dist.mxnet.io/python

# Install the cpu-only version
-python3 -m pip install -U --pre "mxnet>=2.0.0b20200802" -f https://dist.mxnet.io/python
+python3 -m pip install -U --pre "mxnet>=2.0.0b20200926" -f https://dist.mxnet.io/python
```
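If you want to sanity-check the install afterwards, something like the following should work (a minimal sketch; it assumes the 2.0 nightlies still expose the 1.x `mx.context.num_gpus()` helper):

```python
import mxnet as mx

print(mx.__version__)         # expect 2.0.0b20200926 or newer
print(mx.context.num_gpus())  # > 0 means the CUDA build can see your GPUs
```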


@@ -92,8 +92,13 @@ You may go to [tests](tests) to see how to run the unittests.
You can use Docker to launch a JupyterLab development environment with GluonNLP installed.

```
+# GPU Instance
docker pull gluonai/gluon-nlp:gpu-latest
docker run --gpus all --rm -it -p 8888:8888 -p 8787:8787 -p 8786:8786 --shm-size=4g gluonai/gluon-nlp:gpu-latest
+
+# CPU Instance
+docker pull gluonai/gluon-nlp:cpu-latest
+docker run --rm -it -p 8888:8888 -p 8787:8787 -p 8786:8786 --shm-size=4g gluonai/gluon-nlp:cpu-latest
```

For more details, you can refer to the guidance in [tools/docker](tools/docker).
8 changes: 4 additions & 4 deletions docs/conf.py
@@ -234,10 +234,10 @@ def setup(app):
'auto_doc_ref': True
}, True)
app.add_transform(AutoStructify)
-app.add_javascript('google_analytics.js')
-app.add_javascript('hidebib.js')
-app.add_javascript('install-options.js')
-app.add_stylesheet('custom.css')
+app.add_js_file('google_analytics.js')
+app.add_js_file('hidebib.js')
+app.add_js_file('install-options.js')
+app.add_css_file('custom.css')


sphinx_gallery_conf = {
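Context for this hunk: Sphinx 1.8 renamed `add_javascript`/`add_stylesheet` to `add_js_file`/`add_css_file`, and the old names are removed in Sphinx 4. If the docs ever needed to build across both API generations, a tolerant shim could look like this (a sketch, not part of the PR):

```python
def add_static_assets(app):
    """Register JS/CSS via whichever API this Sphinx version provides."""
    # The fallback is evaluated lazily, so the deprecated names are only
    # touched on Sphinx < 1.8, where they still exist.
    add_js = getattr(app, 'add_js_file', None) or app.add_javascript
    add_css = getattr(app, 'add_css_file', None) or app.add_stylesheet
    for script in ('google_analytics.js', 'hidebib.js', 'install-options.js'):
        add_js(script)
    add_css('custom.css')
```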
9 changes: 1 addition & 8 deletions scripts/benchmarks/benchmark_utils.py
@@ -792,12 +792,9 @@ def train_step():
raise NotImplementedError
timeit.repeat(train_step, repeat=1, number=3)
mxnet.npx.waitall()
-for ctx in mx_all_contexts:
-    ctx.empty_cache()
runtimes = timeit.repeat(train_step, repeat=self._repeat, number=3)
mxnet.npx.waitall()
-for ctx in mx_all_contexts:
-    ctx.empty_cache()
+ctx.empty_cache()
mxnet.npx.waitall()
# Profile memory
if self._use_gpu:
@@ -844,8 +841,6 @@ def run(self):
infer_time = np.nan
infer_memory = np.nan
inference_result[model_name][workload] = (infer_time, infer_memory)
-for ctx in mx_all_contexts:
-    ctx.empty_cache()
mxnet.npx.waitall()
self.save_to_csv(inference_result, self._inference_out_csv_file)
if self._profile_train:
@@ -858,8 +853,6 @@
train_time = np.nan
train_memory = np.nan
train_result[model_name][workload] = (train_time, train_memory)
-for ctx in mx_all_contexts:
-    ctx.empty_cache()
mxnet.npx.waitall()
self.save_to_csv(train_result, self._train_out_csv_file)

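The pattern this diff consolidates (drain pending kernels, then release MXNet's cached GPU memory between the warm-up and the timed runs) generalizes; a minimal sketch, assuming MXNet 2.0's `npx` API and a single context:

```python
import timeit

import mxnet as mx
from mxnet import npx

def time_workload(step, repeat=3, ctx=None):
    """Time `step` after a warm-up, releasing cached GPU memory in between."""
    ctx = ctx or mx.context.current_context()
    timeit.repeat(step, repeat=1, number=3)  # warm-up: JIT/allocator effects
    npx.waitall()                            # block until async ops finish
    ctx.empty_cache()                        # drop cached memory blocks before timing
    runtimes = timeit.repeat(step, repeat=repeat, number=3)
    npx.waitall()
    return min(runtimes) / 3                 # per-call time of the best repeat
```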
14 changes: 7 additions & 7 deletions scripts/datasets/general_nlp_benchmark/README.md
@@ -112,13 +112,13 @@ benchmarking. We select the classical datasets that are also used in

| Dataset | #Train | #Test | Columns | Metrics |
|---------------|---------|---------|-----------------|-----------------|
-| AG            | 120000  | 7600    | content, label  | acc             |
-| IMDB          | 25000   | 25000   | content, label  | acc             |
-| DBpedia       | 560000  | 70000   | content, label  | acc             |
-| Yelp2         | 560000  | 38000   | content, label  | acc             |
-| Yelp5         | 650000  | 50000   | content, label  | acc             |
-| Amazon2       | 3600000 | 400000  | content, label  | acc             |
-| Amazon5       | 3000000 | 650000  | content, label  | acc             |
+| AG            | 120,000   | 7,600   | content, label  | acc           |
+| IMDB          | 25,000    | 25,000  | content, label  | acc           |
+| DBpedia       | 560,000   | 70,000  | content, label  | acc           |
+| Yelp2         | 560,000   | 38,000  | content, label  | acc           |
+| Yelp5         | 650,000   | 50,000  | content, label  | acc           |
+| Amazon2       | 3,600,000 | 400,000 | content, label  | acc           |
+| Amazon5       | 3,000,000 | 650,000 | content, label  | acc           |

To obtain the datasets, run:

12 changes: 8 additions & 4 deletions scripts/datasets/pretrain_corpus/README.md
@@ -2,9 +2,11 @@

We provide a series of shared scripts for downloading/preparing the text corpus for pretraining NLP models.
This helps create a unified text corpus for studying the performance of different pretraining algorithms.
-When releasing the datasets, we follow the [FAIR principle](https://www.go-fair.org/fair-principles/),
+When picking the datasets to support, we follow the [FAIR principle](https://www.go-fair.org/fair-principles/),
i.e., the dataset needs to be findable, accessible, interoperable, and reusable.

+For all scripts, you can either use `nlp_data SCRIPT_NAME` or call the script directly.

## Gutenberg BookCorpus
Unfortunately, we are unable to provide the [Toronto BookCorpus dataset](https://yknzhu.wixsite.com/mbweb) due to licensing issues.

@@ -16,14 +18,14 @@ Thus, we utilize the [Project Gutenberg](https://www.gutenberg.org/) as an alternative.
You can use the following command to download and prepare the Gutenberg corpus.

```bash
-python3 prepare_bookcorpus.py --dataset gutenberg
+python3 prepare_gutenberg.py --save_dir gutenberg
```

Also, you should follow the [license](https://www.gutenberg.org/wiki/Gutenberg:The_Project_Gutenberg_License) when using the data.

## Wikipedia

-Please install [attardi/wikiextractor](https://github.com/attardi/wikiextractor) for preparing the data.
+We use the [attardi/wikiextractor](https://github.com/attardi/wikiextractor) package to prepare the data.

```bash
# Download
@@ -33,7 +35,9 @@
python3 prepare_wikipedia.py --mode download --lang en --date latest -o ./
python3 prepare_wikipedia.py --mode format -i [path-to-wiki.xml.bz2] -o ./

```
-The process of downloading and formatting is time consuming, and we offer an alternative solution to download the prepared raw text file from S3 bucket. This raw text file is in English and was dumped at 2020-06-20 being formated by the above very process (` --lang en --date 20200620`).
+The process of downloading and formatting is time-consuming, so we offer an alternative:
+download the prepared raw text file from our S3 bucket. This raw text file is in English and
+was dumped on 2020-06-20, formatted by the process above (`--lang en --date 20200620`).

```bash
python3 prepare_wikipedia.py --mode download_prepared -o ./
7 changes: 5 additions & 2 deletions scripts/datasets/pretrain_corpus/prepare_gutenberg.py
@@ -3,7 +3,7 @@
import zipfile
from gluonnlp.base import get_data_home_dir
from gluonnlp.utils.misc import download, load_checksum_stats

+import shutil

_CITATIONS = r"""
@InProceedings{lahiri:2014:SRW,
@@ -59,11 +59,14 @@ def main(args):
    save_dir = args.dataset if args.save_dir is None else args.save_dir
    if not os.path.exists(save_dir):
        os.makedirs(save_dir, exist_ok=True)
+   print(f'Save to {save_dir}')
    with zipfile.ZipFile(target_download_location) as f:
        for name in f.namelist():
            if name.endswith('.txt'):
                filename = os.path.basename(name)
-               f.extract(name, os.path.join(save_dir, filename))
+               with f.open(name) as in_file:
+                   with open(os.path.join(save_dir, filename.replace(' ', '_')), 'wb') as out_file:
+                       shutil.copyfileobj(in_file, out_file)


def cli_main():
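For reference, the old `f.extract(name, ...)` call treats its second argument as a destination *directory*, so each book ended up nested under a directory named after itself; the new code streams the bytes out explicitly and also replaces spaces in filenames. The same pattern as a standalone sketch (the function name is mine):

```python
import os
import shutil
import zipfile

def extract_txt_files(zip_path, save_dir):
    """Flatten every .txt member of a zip archive into save_dir."""
    os.makedirs(save_dir, exist_ok=True)
    with zipfile.ZipFile(zip_path) as zf:
        for name in zf.namelist():
            if not name.endswith('.txt'):
                continue
            out_name = os.path.basename(name).replace(' ', '_')
            with zf.open(name) as src, \
                 open(os.path.join(save_dir, out_name), 'wb') as dst:
                shutil.copyfileobj(src, dst)  # stream; no whole-book buffering
```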
8 changes: 5 additions & 3 deletions scripts/datasets/question_answering/README.md
@@ -1,5 +1,6 @@
# Question Answering


## SQuAD
The SQuAD datasets are distributed under the [CC BY-SA 4.0](http://creativecommons.org/licenses/by-sa/4.0/legalcode) license.

@@ -39,7 +40,7 @@
python3 prepare_searchqa.py
nlp_data prepare_searchqa
```

-Directory structure of the searchqa dataset will be as follows
+Directory structure of the SearchQA dataset will be as follows
```
searchqa
├── train.txt
@@ -48,9 +49,10 @@
```

## TriviaQA
-[TriviaQA](https://nlp.cs.washington.edu/triviaqa/) is an open domain QA dataset. See more useful scripts in [Offical Github](https://github.com/mandarjoshi90/triviaqa)
+[TriviaQA](https://nlp.cs.washington.edu/triviaqa/) is an open-domain QA dataset.
+See more useful scripts in the [official GitHub repository](https://github.com/mandarjoshi90/triviaqa).

-Run the following command to download triviaqa
+Run the following command to download TriviaQA

```bash
python3 prepare_triviaqa.py --version rc # Download TriviaQA version 1.0 for RC (2.5G)
8 changes: 4 additions & 4 deletions scripts/datasets/question_answering/prepare_searchqa.py
@@ -1,7 +1,7 @@
import os
import argparse
from gluonnlp.utils.misc import download, load_checksum_stats
-from gluonnlp.base import get_data_home_dir
+from gluonnlp.base import get_data_home_dir, get_repo_url

_CURR_DIR = os.path.realpath(os.path.dirname(os.path.realpath(__file__)))
_BASE_DATASET_PATH = os.path.join(get_data_home_dir(), 'searchqa')
@@ -20,9 +20,9 @@
"""

_URLS = {
-    'train': 's3://gluonnlp-numpy-data/datasets/question_answering/searchqa/train.txt',
-    'val': 's3://gluonnlp-numpy-data/datasets/question_answering/searchqa/val.txt',
-    'test': 's3://gluonnlp-numpy-data/datasets/question_answering/searchqa/test.txt'
+    'train': get_repo_url() + 'datasets/question_answering/searchqa/train.txt',
+    'val': get_repo_url() + 'datasets/question_answering/searchqa/val.txt',
+    'test': get_repo_url() + 'datasets/question_answering/searchqa/test.txt'
}


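The change above replaces hard-coded `s3://` URIs with the shared repo-URL helper, so the hosting location is defined in one place. Roughly, the pattern is (a sketch; it assumes `get_repo_url()` returns an HTTPS base ending in `/`, which the checksum file below suggests):

```python
def build_split_urls(repo_url, dataset, splits=('train', 'val', 'test')):
    """Build per-split download URLs under the shared data repository."""
    base = f'{repo_url}datasets/question_answering/{dataset}/'
    return {split: f'{base}{split}.txt' for split in splits}

# e.g. build_split_urls('https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/', 'searchqa')
```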
17 changes: 11 additions & 6 deletions scripts/datasets/question_answering/prepare_squad.py
@@ -1,5 +1,6 @@
import os
import argparse
+import shutil
from gluonnlp.utils.misc import download, load_checksum_stats
from gluonnlp.base import get_data_home_dir

@@ -58,14 +59,18 @@ def main(args):
download(dev_url, path=os.path.join(args.cache_path, dev_file_name))
if not os.path.exists(args.save_path):
os.makedirs(args.save_path)
-   if not os.path.exists(os.path.join(args.save_path, train_file_name))\
+   if not os.path.exists(os.path.join(args.save_path, train_file_name)) \
            or (args.overwrite and args.save_path != args.cache_path):
-       os.symlink(os.path.join(args.cache_path, train_file_name),
-                  os.path.join(args.save_path, train_file_name))
+       os.link(os.path.join(args.cache_path, train_file_name),
+               os.path.join(args.save_path, train_file_name))
+   else:
+       print(f'Found {os.path.join(args.save_path, train_file_name)}...skip')
-   if not os.path.exists(os.path.join(args.save_path, dev_file_name))\
+   if not os.path.exists(os.path.join(args.save_path, dev_file_name)) \
            or (args.overwrite and args.save_path != args.cache_path):
-       os.symlink(os.path.join(args.cache_path, dev_file_name),
-                  os.path.join(args.save_path, dev_file_name))
+       os.link(os.path.join(args.cache_path, dev_file_name),
+               os.path.join(args.save_path, dev_file_name))
+   else:
+       print(f'Found {os.path.join(args.save_path, dev_file_name)}...skip')


def cli_main():
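Switching from `os.symlink` to `os.link` means the file placed in `save_path` remains valid even if the cached copy is later deleted, at the cost of requiring both paths to live on the same filesystem. The pattern as a standalone sketch (the helper name is mine, not GluonNLP's API):

```python
import os

def place_from_cache(cache_file, save_dir, overwrite=False):
    """Hard-link a cached download into save_dir, skipping existing files."""
    target = os.path.join(save_dir, os.path.basename(cache_file))
    if os.path.exists(target):
        if not overwrite:
            print(f'Found {target}...skip')
            return target
        os.remove(target)  # os.link raises FileExistsError otherwise
    os.link(cache_file, target)  # needs cache and target on one filesystem
    return target
```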
6 changes: 3 additions & 3 deletions scripts/datasets/url_checksums/searchqa.txt
@@ -1,3 +1,3 @@
-s3://gluonnlp-numpy-data/datasets/question_answering/searchqa/train.txt c7e1eb8c34d0525547b91e18b3f8f4d855e35c16 1226681217
-s3://gluonnlp-numpy-data/datasets/question_answering/searchqa/test.txt 08a928e0f8c129d5b3ca43bf46df117e38be0c27 332064988
-s3://gluonnlp-numpy-data/datasets/question_answering/searchqa/val.txt c2f65d6b83c26188d5998ab96bc6a38c1a127fcc 170835902
+https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/question_answering/searchqa/train.txt c7e1eb8c34d0525547b91e18b3f8f4d855e35c16 1226681217
+https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/question_answering/searchqa/test.txt 08a928e0f8c129d5b3ca43bf46df117e38be0c27 332064988
+https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/question_answering/searchqa/val.txt c2f65d6b83c26188d5998ab96bc6a38c1a127fcc 170835902
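Each row here appears to be `URL SHA-1 size-in-bytes`. A minimal sketch of how a download could be verified against such an entry (illustrative only, not GluonNLP's `load_checksum_stats` API):

```python
import hashlib

def matches_checksum(path, expected_sha1, expected_size):
    """Stream a file, checking both its SHA-1 digest and its byte size."""
    sha1, size = hashlib.sha1(), 0
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(1 << 20), b''):  # 1 MiB chunks
            sha1.update(chunk)
            size += len(chunk)
    return sha1.hexdigest() == expected_sha1 and size == expected_size
```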
8 changes: 8 additions & 0 deletions scripts/question_answering/commands/README.md
@@ -0,0 +1,8 @@
+# Commands For Training on SQuAD
+
+All commands are generated by parsing the template in [run_squad.template](run_squad.template).
+To generate all commands, run the following command.
+
+```bash
+python3 generate_commands.py
+```
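A sketch of what template-driven generation might look like (the template fields and model grid here are hypothetical; the real `generate_commands.py` defines its own):

```python
import itertools

# Hypothetical stand-in for the contents of run_squad.template.
TEMPLATE = 'python3 run_squad.py --model_name {model} --version {version}'

MODELS = ['google_albert_base_v2', 'google_electra_base']  # illustrative grid
VERSIONS = ['1.1', '2.0']

for model, version in itertools.product(MODELS, VERSIONS):
    print(TEMPLATE.format(model=model, version=version))
```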