From df1480f85b22592160dcd2ad74ba079aa968c8bd Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Tue, 29 Sep 2020 01:37:22 -0700 Subject: [PATCH 001/115] update --- README.md | 8 +++---- tools/docker/ubuntu18.04-devel-gpu.Dockerfile | 22 ++----------------- 2 files changed, 6 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index b1c73cfa12..d2cb3a8f5e 100644 --- a/README.md +++ b/README.md @@ -34,16 +34,16 @@ First of all, install the latest MXNet. You may use the following commands: ```bash # Install the version with CUDA 10.0 -python3 -m pip install -U --pre "mxnet-cu100>=2.0.0b20200802" -f https://dist.mxnet.io/python +python3 -m pip install -U --pre "mxnet-cu100>=2.0.0b20200928" -f https://dist.mxnet.io/python # Install the version with CUDA 10.1 -python3 -m pip install -U --pre "mxnet-cu101>=2.0.0b20200802" -f https://dist.mxnet.io/python +python3 -m pip install -U --pre "mxnet-cu101>=2.0.0b20200928" -f https://dist.mxnet.io/python # Install the version with CUDA 10.2 -python3 -m pip install -U --pre "mxnet-cu102>=2.0.0b20200802" -f https://dist.mxnet.io/python +python3 -m pip install -U --pre "mxnet-cu102>=2.0.0b20200928" -f https://dist.mxnet.io/python # Install the cpu-only version -python3 -m pip install -U --pre "mxnet>=2.0.0b20200802" -f https://dist.mxnet.io/python +python3 -m pip install -U --pre "mxnet>=2.0.0b20200928" -f https://dist.mxnet.io/python ``` diff --git a/tools/docker/ubuntu18.04-devel-gpu.Dockerfile b/tools/docker/ubuntu18.04-devel-gpu.Dockerfile index 43d1a740f9..175c9d83fd 100644 --- a/tools/docker/ubuntu18.04-devel-gpu.Dockerfile +++ b/tools/docker/ubuntu18.04-devel-gpu.Dockerfile @@ -99,28 +99,10 @@ RUN pip3 install --no-cache --upgrade \ awscli # Install MXNet -RUN mkdir -p ${WORKDIR}/mxnet \ - && cd ${WORKDIR}/mxnet \ - && git clone --single-branch --branch master --recursive https://github.com/apache/incubator-mxnet \ - && cd incubator-mxnet \ - && mkdir build \ - && cd build \ - && cmake 
-DMXNET_CUDA_ARCH="3.0;5.0;6.0;7.0" -GNinja -C ../config/linux_gpu.cmake .. \ - && cmake --build . \ - && cd ../python \ - && python3 -m pip install -U -e . --user +RUN python3 -m pip install -U --pre "mxnet-cu102>=2.0.0b20200928" -f https://dist.mxnet.io/python --user # Install Horovod -# TODO Fix once https://github.com/horovod/horovod/pull/2155 gets merged -RUN mkdir ${WORKDIR}/horovod \ - && cd ${WORKDIR}/horovod \ - && git clone --single-branch --branch mx2-pr --recursive https://github.com/eric-haibin-lin/horovod \ - && cd horovod \ - && ldconfig /usr/local/cuda/targets/x86_64-linux/lib/stubs \ - && HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_GPU_BROADCAST=NCCL HOROVOD_WITHOUT_GLOO=1 \ - HOROVOD_WITH_MPI=1 HOROVOD_WITH_MXNET=1 HOROVOD_WITHOUT_PYTORCH=1 \ - HOROVOD_WITHOUT_TENSORFLOW=1 python3 setup.py install --user \ - && ldconfig +RUN HOROVOD_GPU_OPERATIONS=NCCL python3 -m pip install --no-cache-dir horovod --user RUN mkdir -p ${WORKDIR}/notebook RUN mkdir -p ${WORKDIR}/data From 120c4f40dcce2e9bedbb79b0c174f84547261ec4 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Tue, 29 Sep 2020 11:53:20 -0700 Subject: [PATCH 002/115] Update ubuntu18.04-devel-gpu.Dockerfile --- tools/docker/ubuntu18.04-devel-gpu.Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/docker/ubuntu18.04-devel-gpu.Dockerfile b/tools/docker/ubuntu18.04-devel-gpu.Dockerfile index 175c9d83fd..ab3a951ebc 100644 --- a/tools/docker/ubuntu18.04-devel-gpu.Dockerfile +++ b/tools/docker/ubuntu18.04-devel-gpu.Dockerfile @@ -99,7 +99,7 @@ RUN pip3 install --no-cache --upgrade \ awscli # Install MXNet -RUN python3 -m pip install -U --pre "mxnet-cu102>=2.0.0b20200928" -f https://dist.mxnet.io/python --user +RUN python3 -m pip install -U --pre "mxnet-cu102>=2.0.0b20200926" -f https://dist.mxnet.io/python --user # Install Horovod RUN HOROVOD_GPU_OPERATIONS=NCCL python3 -m pip install --no-cache-dir horovod --user From 9f0b129d557294e047d62bed511061715371a05a Mon Sep 17 00:00:00 2001 
From: Xingjian Shi Date: Tue, 29 Sep 2020 21:16:05 -0700 Subject: [PATCH 003/115] fix the docker image --- README.md | 8 ++++---- tools/docker/ubuntu18.04-devel-gpu.Dockerfile | 4 +++- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index d2cb3a8f5e..feb20cba2a 100644 --- a/README.md +++ b/README.md @@ -34,16 +34,16 @@ First of all, install the latest MXNet. You may use the following commands: ```bash # Install the version with CUDA 10.0 -python3 -m pip install -U --pre "mxnet-cu100>=2.0.0b20200928" -f https://dist.mxnet.io/python +python3 -m pip install -U --pre "mxnet-cu100>=2.0.0b20200926" -f https://dist.mxnet.io/python # Install the version with CUDA 10.1 -python3 -m pip install -U --pre "mxnet-cu101>=2.0.0b20200928" -f https://dist.mxnet.io/python +python3 -m pip install -U --pre "mxnet-cu101>=2.0.0b20200926" -f https://dist.mxnet.io/python # Install the version with CUDA 10.2 -python3 -m pip install -U --pre "mxnet-cu102>=2.0.0b20200928" -f https://dist.mxnet.io/python +python3 -m pip install -U --pre "mxnet-cu102>=2.0.0b20200926" -f https://dist.mxnet.io/python # Install the cpu-only version -python3 -m pip install -U --pre "mxnet>=2.0.0b20200928" -f https://dist.mxnet.io/python +python3 -m pip install -U --pre "mxnet>=2.0.0b20200926" -f https://dist.mxnet.io/python ``` diff --git a/tools/docker/ubuntu18.04-devel-gpu.Dockerfile b/tools/docker/ubuntu18.04-devel-gpu.Dockerfile index ab3a951ebc..b1552d0fa3 100644 --- a/tools/docker/ubuntu18.04-devel-gpu.Dockerfile +++ b/tools/docker/ubuntu18.04-devel-gpu.Dockerfile @@ -102,7 +102,9 @@ RUN pip3 install --no-cache --upgrade \ RUN python3 -m pip install -U --pre "mxnet-cu102>=2.0.0b20200926" -f https://dist.mxnet.io/python --user # Install Horovod -RUN HOROVOD_GPU_OPERATIONS=NCCL python3 -m pip install --no-cache-dir horovod --user +RUN HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_GPU_BROADCAST=NCCL HOROVOD_WITHOUT_GLOO=1 \ + HOROVOD_WITH_MPI=1 HOROVOD_WITH_MXNET=1 
HOROVOD_WITHOUT_PYTORCH=1 \ + HOROVOD_WITHOUT_TENSORFLOW=1 python3 -m pip install --no-cache-dir horovod --user RUN mkdir -p ${WORKDIR}/notebook RUN mkdir -p ${WORKDIR}/data From 47c16769cc7997ae15d86afc921016cf83b0db66 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Wed, 30 Sep 2020 10:35:33 -0700 Subject: [PATCH 004/115] Update README.md --- tools/docker/README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/docker/README.md b/tools/docker/README.md index 6b90b0121d..29d9c65c80 100644 --- a/tools/docker/README.md +++ b/tools/docker/README.md @@ -44,3 +44,6 @@ To build a docker image fom the dockerfile, you may use the following command: ``` docker build -f ubuntu18.04-devel-gpu.Dockerfile -t gluonai/gluon-nlp:gpu-latest . ``` + +### Developers of GluonNLP +You may try to login to your dockerhub account and push the image to dockerhub. From be6aa35e2f7676d9865acad0bd20aa51202c93ac Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Wed, 30 Sep 2020 10:52:58 -0700 Subject: [PATCH 005/115] Update ubuntu18.04-devel-gpu.Dockerfile --- tools/docker/ubuntu18.04-devel-gpu.Dockerfile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/docker/ubuntu18.04-devel-gpu.Dockerfile b/tools/docker/ubuntu18.04-devel-gpu.Dockerfile index b1552d0fa3..590052dd5b 100644 --- a/tools/docker/ubuntu18.04-devel-gpu.Dockerfile +++ b/tools/docker/ubuntu18.04-devel-gpu.Dockerfile @@ -101,9 +101,12 @@ RUN pip3 install --no-cache --upgrade \ # Install MXNet RUN python3 -m pip install -U --pre "mxnet-cu102>=2.0.0b20200926" -f https://dist.mxnet.io/python --user +# Install PyTorch +RUN python3 -m pip install -U torch torchvision --user + # Install Horovod RUN HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_GPU_BROADCAST=NCCL HOROVOD_WITHOUT_GLOO=1 \ - HOROVOD_WITH_MPI=1 HOROVOD_WITH_MXNET=1 HOROVOD_WITHOUT_PYTORCH=1 \ + HOROVOD_WITH_MPI=1 HOROVOD_WITH_MXNET=1 HOROVOD_WITH_PYTORCH=1 \ HOROVOD_WITHOUT_TENSORFLOW=1 python3 -m pip install --no-cache-dir horovod --user RUN 
mkdir -p ${WORKDIR}/notebook From 07d9e0feb1cd509654c37c1e44021c425fa25af1 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Wed, 30 Sep 2020 12:03:56 -0700 Subject: [PATCH 006/115] Update README.md --- tools/docker/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/docker/README.md b/tools/docker/README.md index 29d9c65c80..e54fd9d299 100644 --- a/tools/docker/README.md +++ b/tools/docker/README.md @@ -29,8 +29,8 @@ The folder structure of the docker image will be If you have a multi-GPU instance, e.g., [g4dn.12xlarge](https://aws.amazon.com/ec2/instance-types/g4/), [p2.8xlarge](https://aws.amazon.com/ec2/instance-types/p2/), -[p3.8xlarge](https://aws.amazon.com/ec2/instance-types/p3/), you can try to run the following -command to verify the installation of horovod + MXNet +[p3.8xlarge](https://aws.amazon.com/ec2/instance-types/p3/), you can try to verify the installation +of horovod + MXNet by running the question answering script ``` docker run --gpus all --rm -it --shm-size=4g gluonai/gluon-nlp:gpu-latest \ From 3d18977dafab746fe011dcc9baae076d308c2b77 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Wed, 30 Sep 2020 19:18:26 -0700 Subject: [PATCH 007/115] fix readme --- scripts/datasets/pretrain_corpus/README.md | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/scripts/datasets/pretrain_corpus/README.md b/scripts/datasets/pretrain_corpus/README.md index 54c4d5c1e2..a0a0c8493a 100644 --- a/scripts/datasets/pretrain_corpus/README.md +++ b/scripts/datasets/pretrain_corpus/README.md @@ -2,9 +2,11 @@ We provide a series of shared scripts for downloading/preparing the text corpus for pretraining NLP models. This helps create a unified text corpus for studying the performance of different pretraining algorithms. 
-When releasing the datasets, we follow the [FAIR principle](https://www.go-fair.org/fair-principles/), +When picking the datasets to support, we follow the [FAIR principle](https://www.go-fair.org/fair-principles/), i.e., the dataset needs to be findable, accessible, interoperable, and reusable. +For all scripts, we can either use `nlp_data SCRIPT_NAME`, or directly call the script. + ## Gutenberg BookCorpus Unfortunately, we are unable to provide the [Toronto BookCorpus dataset](https://yknzhu.wixsite.com/mbweb) due to licensing issues. @@ -16,14 +18,14 @@ Thus, we utilize the [Project Gutenberg](https://www.gutenberg.org/) as an alter You can use the following command to download and prepare the Gutenberg corpus. ```bash -python3 prepare_bookcorpus.py --dataset gutenberg +python3 prepare_gutenberg.py --save_dir . ``` Also, you should follow the [license](https://www.gutenberg.org/wiki/Gutenberg:The_Project_Gutenberg_License) for using the data. ## Wikipedia -Please install [attardi/wikiextractor](https://github.com/attardi/wikiextractor) for preparing the data. +We used the [attardi/wikiextractor](https://github.com/attardi/wikiextractor) package for preparing the data. ```bash # Download @@ -33,7 +35,9 @@ python3 prepare_wikipedia.py --mode download --lang en --date latest -o ./ python3 prepare_wikipedia.py --mode format -i [path-to-wiki.xml.bz2] -o ./ ``` -The process of downloading and formatting is time consuming, and we offer an alternative solution to download the prepared raw text file from S3 bucket. This raw text file is in English and was dumped at 2020-06-20 being formated by the above very process (` --lang en --date 20200620`). +The process of downloading and formatting is time consuming, and we offer an alternative +solution to download the prepared raw text file from S3 bucket. This raw text file is in English and +was dumped at 2020-06-20 being formatted by the above process (` --lang en --date 20200620`). 
```bash python3 prepare_wikipedia.py --mode download_prepared -o ./ From 146b8268acb5ccfd6c639d40e0adcb5da5940aee Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Wed, 7 Oct 2020 22:04:29 -0700 Subject: [PATCH 008/115] Add CPU DockerFile --- tools/docker/README.md | 1 + tools/docker/ubuntu18.04-devel-cpu.Dockerfile | 115 ++++++++++++++++++ 2 files changed, 116 insertions(+) create mode 100644 tools/docker/ubuntu18.04-devel-cpu.Dockerfile diff --git a/tools/docker/README.md b/tools/docker/README.md index e54fd9d299..b9ed4abfe5 100644 --- a/tools/docker/README.md +++ b/tools/docker/README.md @@ -42,6 +42,7 @@ docker run --gpus all --rm -it --shm-size=4g gluonai/gluon-nlp:gpu-latest \ To build a docker image fom the dockerfile, you may use the following command: ``` +docker build -f ubuntu18.04-devel-cpu.Dockerfile -t gluonai/gluon-nlp:cpu-latest . docker build -f ubuntu18.04-devel-gpu.Dockerfile -t gluonai/gluon-nlp:gpu-latest . ``` diff --git a/tools/docker/ubuntu18.04-devel-cpu.Dockerfile b/tools/docker/ubuntu18.04-devel-cpu.Dockerfile new file mode 100644 index 0000000000..a338a32dea --- /dev/null +++ b/tools/docker/ubuntu18.04-devel-cpu.Dockerfile @@ -0,0 +1,115 @@ +FROM ubuntu:18.04 + +LABEL maintainer="GluonNLP Team" + +ENV WORKDIR=/workspace +ENV SHELL=/bin/bash + +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + software-properties-common \ + build-essential \ + ca-certificates \ + curl \ + emacs \ + subversion \ + locales \ + cmake \ + git \ + libopencv-dev \ + htop \ + vim \ + wget \ + unzip \ + libopenblas-dev \ + ninja-build \ + openssh-client \ + openssh-server \ + python3-dev \ + python3-pip \ + python3-setuptools \ + libxft-dev \ + zlib1g-dev \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +RUN python3 -m pip --no-cache-dir install --upgrade \ + pip \ + setuptools + +RUN ln -s $(which python3) /usr/local/bin/python + +# Install MXNet +RUN python3 -m pip install -U --pre "mxnet>=2.0.0b20200926" -f 
https://dist.mxnet.io/python --user + +# Install PyTorch +RUN python3 -m pip install -U torch torchvision --user + +RUN pip3 install --no-cache --upgrade \ + wheel \ + numpy==1.19.1 \ + pandas==0.25.1 \ + pytest \ + Pillow \ + requests==2.22.0 \ + scikit-learn==0.20.4 \ + scipy==1.2.2 \ + urllib3==1.25.8 \ + python-dateutil==2.8.0 \ + sagemaker-experiments==0.* \ + PyYAML==5.3.1 \ + mpi4py==3.0.2 \ + jupyterlab==2.2.4 \ + cmake \ + awscli + +RUN mkdir -p ${WORKDIR}/notebook +RUN mkdir -p ${WORKDIR}/data +RUN cd ${WORKDIR} \ + && git clone https://github.com/dmlc/gluon-nlp \ + && cd gluon-nlp \ + && git checkout master \ + && python3 -m pip install -U -e ."[extras]" --user + +COPY start_jupyter.sh /start_jupyter.sh +COPY devel_entrypoint.sh /devel_entrypoint.sh +RUN chmod +x /devel_entrypoint.sh + +EXPOSE 8888 +EXPOSE 8787 +EXPOSE 8786 + +WORKDIR ${WORKDIR} + +# Debug horovod by default +RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf + +# Install NodeJS + Tensorboard + TensorboardX +RUN curl -sL https://deb.nodesource.com/setup_14.x | bash - \ + && apt-get install -y nodejs + +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + libsndfile1-dev + +RUN pip3 install --no-cache --upgrade \ + soundfile==0.10.2 \ + ipywidgets==7.5.1 \ + jupyter_tensorboard==0.2.0 \ + widgetsnbextension==3.5.1 \ + tensorboard==2.1.1 \ + tensorboardX==2.1 +RUN jupyter labextension install jupyterlab_tensorboard \ + && jupyter nbextension enable --py widgetsnbextension \ + && jupyter labextension install @jupyter-widgets/jupyterlab-manager + +# Revise default shell to /bin/bash +RUN jupyter notebook --generate-config \ + && echo "c.NotebookApp.terminado_settings = { 'shell_command': ['/bin/bash'] }" >> /root/.jupyter/jupyter_notebook_config.py + +# Add Tini +ARG TINI_VERSION=v0.19.0 +ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini +RUN chmod +x /tini +ENTRYPOINT [ "/tini", "--", "/devel_entrypoint.sh" ] +CMD ["/bin/bash"] From 
487e88e05534a7827d90a1c3ec6c9653402802a0 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Thu, 8 Oct 2020 14:50:01 -0700 Subject: [PATCH 009/115] update --- tools/docker/install/jupyter_lab_dev.sh | 21 ++++++++++++++ tools/docker/ubuntu18.04-devel-gpu.Dockerfile | 28 +++---------------- 2 files changed, 25 insertions(+), 24 deletions(-) create mode 100644 tools/docker/install/jupyter_lab_dev.sh diff --git a/tools/docker/install/jupyter_lab_dev.sh b/tools/docker/install/jupyter_lab_dev.sh new file mode 100644 index 0000000000..b7664637b5 --- /dev/null +++ b/tools/docker/install/jupyter_lab_dev.sh @@ -0,0 +1,21 @@ +# Install NodeJS + Tensorboard + TensorboardX + +curl -sL https://deb.nodesource.com/setup_14.x | bash - \ + && apt-get install -y nodejs + +apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev + +pip3 install --no-cache --upgrade \ + soundfile==0.10.2 \ + ipywidgets==7.5.1 \ + jupyter_tensorboard==0.2.0 \ + widgetsnbextension==3.5.1 \ + tensorboard==2.1.1 \ + tensorboardX==2.1 +jupyter labextension install jupyterlab_tensorboard \ + && jupyter nbextension enable --py widgetsnbextension \ + && jupyter labextension install @jupyter-widgets/jupyterlab-manager + +# Revise default shell to /bin/bash +jupyter notebook --generate-config \ + && echo "c.NotebookApp.terminado_settings = { 'shell_command': ['/bin/bash'] }" >> /root/.jupyter/jupyter_notebook_config.py diff --git a/tools/docker/ubuntu18.04-devel-gpu.Dockerfile b/tools/docker/ubuntu18.04-devel-gpu.Dockerfile index 590052dd5b..66a112c9bb 100644 --- a/tools/docker/ubuntu18.04-devel-gpu.Dockerfile +++ b/tools/docker/ubuntu18.04-devel-gpu.Dockerfile @@ -108,6 +108,8 @@ RUN python3 -m pip install -U torch torchvision --user RUN HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_GPU_BROADCAST=NCCL HOROVOD_WITHOUT_GLOO=1 \ HOROVOD_WITH_MPI=1 HOROVOD_WITH_MXNET=1 HOROVOD_WITH_PYTORCH=1 \ HOROVOD_WITHOUT_TENSORFLOW=1 python3 -m pip install --no-cache-dir horovod --user +# Debug horovod by default +RUN 
echo NCCL_DEBUG=INFO >> /etc/nccl.conf RUN mkdir -p ${WORKDIR}/notebook RUN mkdir -p ${WORKDIR}/data @@ -120,6 +122,7 @@ RUN cd ${WORKDIR} \ COPY start_jupyter.sh /start_jupyter.sh COPY devel_entrypoint.sh /devel_entrypoint.sh +COPY install /install RUN chmod +x /devel_entrypoint.sh EXPOSE 8888 @@ -128,31 +131,8 @@ EXPOSE 8786 WORKDIR ${WORKDIR} -# Debug horovod by default -RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf - # Install NodeJS + Tensorboard + TensorboardX -RUN curl -sL https://deb.nodesource.com/setup_14.x | bash - \ - && apt-get install -y nodejs - -RUN apt-get update \ - && apt-get install -y --no-install-recommends \ - libsndfile1-dev - -RUN pip3 install --no-cache --upgrade \ - soundfile==0.10.2 \ - ipywidgets==7.5.1 \ - jupyter_tensorboard==0.2.0 \ - widgetsnbextension==3.5.1 \ - tensorboard==2.1.1 \ - tensorboardX==2.1 -RUN jupyter labextension install jupyterlab_tensorboard \ - && jupyter nbextension enable --py widgetsnbextension \ - && jupyter labextension install @jupyter-widgets/jupyterlab-manager - -# Revise default shell to /bin/bash -RUN jupyter notebook --generate-config \ - && echo "c.NotebookApp.terminado_settings = { 'shell_command': ['/bin/bash'] }" >> /root/.jupyter/jupyter_notebook_config.py +RUN source /install/jupyter_lab_dev.sh # Add Tini ARG TINI_VERSION=v0.19.0 From 0fbecd42f63eed2f7093559b0a46243c780db09d Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Thu, 8 Oct 2020 16:08:23 -0700 Subject: [PATCH 010/115] update --- tools/docker/README.md | 6 + tools/docker/ubuntu18.04-base-gpu.Dockerfile | 112 ++++++++++++++++++ tools/docker/ubuntu18.04-devel-gpu.Dockerfile | 2 +- 3 files changed, 119 insertions(+), 1 deletion(-) create mode 100644 tools/docker/ubuntu18.04-base-gpu.Dockerfile diff --git a/tools/docker/README.md b/tools/docker/README.md index b9ed4abfe5..f9167c5149 100644 --- a/tools/docker/README.md +++ b/tools/docker/README.md @@ -4,6 +4,12 @@ With the prebuilt docker image, there is no need to worry about the operating sy 
You can launch a [JupyterLab](https://jupyterlab.readthedocs.io/en/stable/) development environment and try out to use GluonNLP to solve your problem. +| Name | Description | Target User | +|------|-------------|-------------| +| `base` | Extends the CUDA image to include the basic functionalities, e.g., GluonNLP package, MXNet, PyTorch, Horovod. You can directly use the docker to run distributed training jobs. | Users that are willing to use GluonNLP to train models. | +| `devel` | Extends the base image to include a development platform powered by Jupyter Lab. Some useful functionalities like Tensorboard are pre-installed. | Users that are willing to analyze NLP data and build models with GluonNLP. | + + ## Run Docker You can run the docker with the following command. diff --git a/tools/docker/ubuntu18.04-base-gpu.Dockerfile b/tools/docker/ubuntu18.04-base-gpu.Dockerfile new file mode 100644 index 0000000000..6a551f1145 --- /dev/null +++ b/tools/docker/ubuntu18.04-base-gpu.Dockerfile @@ -0,0 +1,112 @@ +FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04 + +LABEL maintainer="GluonNLP Team" + +ARG DEBIAN_FRONTEND=noninteractive + +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib" \ + PYTHONIOENCODING=UTF-8 \ + LANG=C.UTF-8 \ + LC_ALL=C.UTF-8 + +ENV WORKDIR=/workspace +ENV SHELL=/bin/bash + +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + software-properties-common \ + build-essential \ + ca-certificates \ + curl \ + emacs \ + subversion \ + locales \ + cmake \ + git \ + libopencv-dev \ + htop \ + vim \ + wget \ + unzip \ + libopenblas-dev \ + ninja-build \ + openssh-client \ + openssh-server \ + python3-dev \ + python3-pip \ + python3-setuptools \ + libxft-dev \ + zlib1g-dev \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +RUN python3 -m pip --no-cache-dir install --upgrade \ + pip \ + setuptools + +########################################################################### +# 
Horovod dependencies +########################################################################### + +# Install Open MPI +RUN mkdir /tmp/openmpi \ + && cd /tmp/openmpi \ + && curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \ + && tar zxf openmpi-4.0.1.tar.gz \ + && cd openmpi-4.0.1 \ + && ./configure --enable-orterun-prefix-by-default \ + && make -j $(nproc) all \ + && make install \ + && ldconfig \ + && rm -rf /tmp/openmpi + +# Create a wrapper for OpenMPI to allow running as root by default +RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \ + && echo '#!/bin/bash' > /usr/local/bin/mpirun \ + && echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \ + && chmod a+x /usr/local/bin/mpirun + +RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \ + && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf + +ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH +ENV PATH=/usr/local/openmpi/bin/:/usr/local/bin:/root/.local/bin:$PATH + +RUN ln -s $(which python3) /usr/local/bin/python + +RUN mkdir -p ${WORKDIR} + +# install PyYAML==5.1.2 to avoid conflict with latest awscli +# python-dateutil==2.8.0 to satisfy botocore associated with latest awscli +RUN pip3 install --no-cache --upgrade \ + wheel \ + numpy==1.19.1 \ + pandas==0.25.1 \ + pytest \ + Pillow \ + requests==2.22.0 \ + scikit-learn==0.20.4 \ + scipy==1.2.2 \ + urllib3==1.25.8 \ + python-dateutil==2.8.0 \ + sagemaker-experiments==0.* \ + PyYAML==5.3.1 \ + mpi4py==3.0.2 \ + jupyterlab==2.2.4 \ + cmake \ + awscli + +# Install MXNet +RUN python3 -m pip install -U --pre "mxnet-cu102>=2.0.0b20200926" -f https://dist.mxnet.io/python --user + +# Install PyTorch +RUN python3 -m pip install -U torch torchvision --user + +# Install Horovod +RUN HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_GPU_BROADCAST=NCCL HOROVOD_WITHOUT_GLOO=1 \ + HOROVOD_WITH_MPI=1 HOROVOD_WITH_MXNET=1 HOROVOD_WITH_PYTORCH=1 \ + 
HOROVOD_WITHOUT_TENSORFLOW=1 python3 -m pip install --no-cache-dir horovod --user +# Debug horovod by default +RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf diff --git a/tools/docker/ubuntu18.04-devel-gpu.Dockerfile b/tools/docker/ubuntu18.04-devel-gpu.Dockerfile index 66a112c9bb..2051ee8090 100644 --- a/tools/docker/ubuntu18.04-devel-gpu.Dockerfile +++ b/tools/docker/ubuntu18.04-devel-gpu.Dockerfile @@ -107,7 +107,7 @@ RUN python3 -m pip install -U torch torchvision --user # Install Horovod RUN HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_GPU_BROADCAST=NCCL HOROVOD_WITHOUT_GLOO=1 \ HOROVOD_WITH_MPI=1 HOROVOD_WITH_MXNET=1 HOROVOD_WITH_PYTORCH=1 \ - HOROVOD_WITHOUT_TENSORFLOW=1 python3 -m pip install --no-cache-dir horovod --user + HOROVOD_WITHOUT_TENSORFLOW=1 python3 -m pip install --no-cache-dir horovod==0.20.3 --user # Debug horovod by default RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf From 9b454bd4aed2d3f7d5fb86488be3c19efacae13c Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Thu, 8 Oct 2020 16:15:17 -0700 Subject: [PATCH 011/115] Update ubuntu18.04-devel-gpu.Dockerfile --- tools/docker/ubuntu18.04-devel-gpu.Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/docker/ubuntu18.04-devel-gpu.Dockerfile b/tools/docker/ubuntu18.04-devel-gpu.Dockerfile index 2051ee8090..f881730a29 100644 --- a/tools/docker/ubuntu18.04-devel-gpu.Dockerfile +++ b/tools/docker/ubuntu18.04-devel-gpu.Dockerfile @@ -105,7 +105,7 @@ RUN python3 -m pip install -U --pre "mxnet-cu102>=2.0.0b20200926" -f https://dis RUN python3 -m pip install -U torch torchvision --user # Install Horovod -RUN HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_GPU_BROADCAST=NCCL HOROVOD_WITHOUT_GLOO=1 \ +RUN PYTHON_EXECUTABLE=python3 HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_GPU_BROADCAST=NCCL HOROVOD_WITHOUT_GLOO=1 \ HOROVOD_WITH_MPI=1 HOROVOD_WITH_MXNET=1 HOROVOD_WITH_PYTORCH=1 \ HOROVOD_WITHOUT_TENSORFLOW=1 python3 -m pip install --no-cache-dir horovod==0.20.3 --user # Debug horovod by default From 
0e6d40a9e2239ba134b61da1353291a9acd58d95 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Fri, 9 Oct 2020 16:33:24 -0700 Subject: [PATCH 012/115] update --- tests/test_models.py | 8 +++----- tools/docker/README.md | 20 +++++++++++++++++++ tools/docker/ubuntu18.04-devel-gpu.Dockerfile | 2 +- 3 files changed, 24 insertions(+), 6 deletions(-) diff --git a/tests/test_models.py b/tests/test_models.py index 3a41dcf656..c46bc252a1 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -16,9 +16,6 @@ def test_list_backbone_names(): def test_get_backbone(name, ctx): with tempfile.TemporaryDirectory() as root, ctx: model_cls, cfg, tokenizer, local_params_path, _ = get_backbone(name, root=root) - if name == 'gpt2_1558M': - # skip gpt2 1558M due to the space - return net = model_cls.from_cfg(cfg) net.load_parameters(local_params_path) net.hybridize() @@ -38,8 +35,9 @@ def test_get_backbone(name, ctx): elif 'bart' in name: out = net(inputs, valid_length, inputs, valid_length) elif 'gpt2' in name: - # Temporarily skip GPT-2 test - return + states = net.init_states(batch_size=batch_size, ctx=ctx) + out, new_states = net(inputs, states) + out_np = out.asnumpy() else: out = net(inputs, token_types, valid_length) mx.npx.waitall() diff --git a/tools/docker/README.md b/tools/docker/README.md index f9167c5149..986e63d592 100644 --- a/tools/docker/README.md +++ b/tools/docker/README.md @@ -52,5 +52,25 @@ docker build -f ubuntu18.04-devel-cpu.Dockerfile -t gluonai/gluon-nlp:cpu-latest docker build -f ubuntu18.04-devel-gpu.Dockerfile -t gluonai/gluon-nlp:gpu-latest . ``` +In addition, to build the GPU docker, you will need to install the nvidia-docker2 and edit `/etc/docker/daemon.json` like the following: + +``` +{ + "runtimes": { + "nvidia": { + "path": "nvidia-container-runtime", + "runtimeArgs": [] + } + }, + "default-runtime": "nvidia" +} +``` + +After that, restart docker via `sudo systemctl restart docker.service`. 
+ +For more details, you may refer to https://github.com/NVIDIA/nvidia-docker/issues/595. We need this additional setup +because the horovod+mxnet integration identifies the library and include +path of MXNet by querying th MXNet runtime. + ### Developers of GluonNLP You may try to login to your dockerhub account and push the image to dockerhub. diff --git a/tools/docker/ubuntu18.04-devel-gpu.Dockerfile b/tools/docker/ubuntu18.04-devel-gpu.Dockerfile index f881730a29..2051ee8090 100644 --- a/tools/docker/ubuntu18.04-devel-gpu.Dockerfile +++ b/tools/docker/ubuntu18.04-devel-gpu.Dockerfile @@ -105,7 +105,7 @@ RUN python3 -m pip install -U --pre "mxnet-cu102>=2.0.0b20200926" -f https://dis RUN python3 -m pip install -U torch torchvision --user # Install Horovod -RUN PYTHON_EXECUTABLE=python3 HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_GPU_BROADCAST=NCCL HOROVOD_WITHOUT_GLOO=1 \ +RUN HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_GPU_BROADCAST=NCCL HOROVOD_WITHOUT_GLOO=1 \ HOROVOD_WITH_MPI=1 HOROVOD_WITH_MXNET=1 HOROVOD_WITH_PYTORCH=1 \ HOROVOD_WITHOUT_TENSORFLOW=1 python3 -m pip install --no-cache-dir horovod==0.20.3 --user # Debug horovod by default From 4d221cfc2396058cec3ad31b8972adc4510baf98 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Fri, 9 Oct 2020 16:34:43 -0700 Subject: [PATCH 013/115] prepare to add TVM to docker --- tools/docker/install/install_tvm_cpu.sh | 35 ++++++++++++++++++++++++ tools/docker/install/install_tvm_gpu.sh | 36 +++++++++++++++++++++++++ 2 files changed, 71 insertions(+) create mode 100644 tools/docker/install/install_tvm_cpu.sh create mode 100644 tools/docker/install/install_tvm_gpu.sh diff --git a/tools/docker/install/install_tvm_cpu.sh b/tools/docker/install/install_tvm_cpu.sh new file mode 100644 index 0000000000..b11c9791fb --- /dev/null +++ b/tools/docker/install/install_tvm_cpu.sh @@ -0,0 +1,35 @@ +#!/bin/bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -e +set -u +set -o pipefail + +cd /usr +git clone https://github.com/apache/incubator-tvm tvm --recursive +cd /usr/tvm +# checkout a hash-tag +git checkout 4b13bf668edc7099b38d463e5db94ebc96c80470 + +echo set\(USE_LLVM llvm-config-8\) >> config.cmake +echo set\(USE_GRAPH_RUNTIME ON\) >> config.cmake +echo set\(USE_BLAS openblas\) >> config.cmake +mkdir -p build +cd build +cmake .. +make -j10 diff --git a/tools/docker/install/install_tvm_gpu.sh b/tools/docker/install/install_tvm_gpu.sh new file mode 100644 index 0000000000..2dbf8e1739 --- /dev/null +++ b/tools/docker/install/install_tvm_gpu.sh @@ -0,0 +1,36 @@ +#!/bin/bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -e +set -u +set -o pipefail + +cd /usr +git clone https://github.com/apache/incubator-tvm tvm --recursive +cd /usr/tvm +# checkout a hash-tag +git checkout 4b13bf668edc7099b38d463e5db94ebc96c80470 + +echo set\(USE_LLVM llvm-config-8\) >> config.cmake +echo set\(USE_CUDA ON\) >> config.cmake +echo set\(USE_CUDNN ON\) >> config.cmake +echo set\(USE_BLAS openblas\) >> config.cmake +mkdir -p build +cd build +cmake .. +make -j10 From 029cb05a7ee036e4a8855546cb9834730b632e9c Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Fri, 9 Oct 2020 17:29:42 -0700 Subject: [PATCH 014/115] try to update --- tools/docker/install/install_horovod.sh | 8 ++ ...yter_lab_dev.sh => install_jupyter_lab.sh} | 2 + tools/docker/install/install_openmpi.sh | 21 ++++ .../docker/install/install_python_packages.sh | 22 +++++ .../install/install_ubuntu18.04_core.sh | 37 +++++++ tools/docker/ubuntu18.04-devel-gpu.Dockerfile | 96 ++----------------- 6 files changed, 99 insertions(+), 87 deletions(-) create mode 100644 tools/docker/install/install_horovod.sh rename tools/docker/install/{jupyter_lab_dev.sh => install_jupyter_lab.sh} (97%) create mode 100644 tools/docker/install/install_openmpi.sh create mode 100644 tools/docker/install/install_python_packages.sh create mode 100644 tools/docker/install/install_ubuntu18.04_core.sh diff --git a/tools/docker/install/install_horovod.sh b/tools/docker/install/install_horovod.sh new file mode 100644 index 0000000000..a311ce2d95 --- /dev/null +++ b/tools/docker/install/install_horovod.sh @@ -0,0 +1,8 @@ +set -euo pipefail + +# Install Horovod 
+HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_GPU_BROADCAST=NCCL HOROVOD_WITHOUT_GLOO=1 \ +HOROVOD_WITH_MPI=1 HOROVOD_WITH_MXNET=1 HOROVOD_WITH_PYTORCH=1 \ +HOROVOD_WITHOUT_TENSORFLOW=1 python3 -m pip install --no-cache-dir horovod==0.20.3 --user +# Debug horovod by default +echo NCCL_DEBUG=INFO >> /etc/nccl.conf diff --git a/tools/docker/install/jupyter_lab_dev.sh b/tools/docker/install/install_jupyter_lab.sh similarity index 97% rename from tools/docker/install/jupyter_lab_dev.sh rename to tools/docker/install/install_jupyter_lab.sh index b7664637b5..79633218fb 100644 --- a/tools/docker/install/jupyter_lab_dev.sh +++ b/tools/docker/install/install_jupyter_lab.sh @@ -1,3 +1,5 @@ +set -euo pipefail + # Install NodeJS + Tensorboard + TensorboardX curl -sL https://deb.nodesource.com/setup_14.x | bash - \ diff --git a/tools/docker/install/install_openmpi.sh b/tools/docker/install/install_openmpi.sh new file mode 100644 index 0000000000..72ee6b3c59 --- /dev/null +++ b/tools/docker/install/install_openmpi.sh @@ -0,0 +1,21 @@ +set -euo pipefail + +RUN mkdir /tmp/openmpi \ + && cd /tmp/openmpi \ + && curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \ + && tar zxf openmpi-4.0.1.tar.gz \ + && cd openmpi-4.0.1 \ + && ./configure --enable-orterun-prefix-by-default \ + && make -j $(nproc) all \ + && make install \ + && ldconfig \ + && rm -rf /tmp/openmpi + +# Create a wrapper for OpenMPI to allow running as root by default +RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \ + && echo '#!/bin/bash' > /usr/local/bin/mpirun \ + && echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \ + && chmod a+x /usr/local/bin/mpirun + +RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \ + && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf diff --git a/tools/docker/install/install_python_packages.sh b/tools/docker/install/install_python_packages.sh new file mode 100644 
index 0000000000..96dededd35 --- /dev/null +++ b/tools/docker/install/install_python_packages.sh @@ -0,0 +1,22 @@ +set -euo pipefail + + +# install PyYAML==5.1.2 to avoid conflict with latest awscli +# python-dateutil==2.8.0 to satisfy botocore associated with latest awscli +pip3 install --no-cache --upgrade \ + wheel \ + numpy==1.19.1 \ + pandas==0.25.1 \ + pytest \ + Pillow \ + requests==2.22.0 \ + scikit-learn==0.20.4 \ + scipy==1.2.2 \ + urllib3==1.25.8 \ + python-dateutil==2.8.0 \ + sagemaker-experiments==0.* \ + PyYAML==5.3.1 \ + mpi4py==3.0.2 \ + jupyterlab==2.2.4 \ + cmake \ + awscli diff --git a/tools/docker/install/install_ubuntu18.04_core.sh b/tools/docker/install/install_ubuntu18.04_core.sh new file mode 100644 index 0000000000..7278b702f9 --- /dev/null +++ b/tools/docker/install/install_ubuntu18.04_core.sh @@ -0,0 +1,37 @@ +set -e +set -u +set -o pipefail + +apt-get update \ + && apt-get install -y --no-install-recommends \ + software-properties-common \ + build-essential \ + ca-certificates \ + curl \ + emacs \ + subversion \ + locales \ + cmake \ + git \ + libopencv-dev \ + htop \ + vim \ + wget \ + unzip \ + libopenblas-dev \ + ninja-build \ + openssh-client \ + openssh-server \ + python3-dev \ + python3-pip \ + python3-setuptools \ + libxft-dev \ + zlib1g-dev \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +python3 -m pip --no-cache-dir install --upgrade \ + pip \ + setuptools + +ln -s $(which python3) /usr/local/bin/python diff --git a/tools/docker/ubuntu18.04-devel-gpu.Dockerfile b/tools/docker/ubuntu18.04-devel-gpu.Dockerfile index 2051ee8090..05fba5c6a0 100644 --- a/tools/docker/ubuntu18.04-devel-gpu.Dockerfile +++ b/tools/docker/ubuntu18.04-devel-gpu.Dockerfile @@ -14,89 +14,16 @@ ENV PYTHONDONTWRITEBYTECODE=1 \ ENV WORKDIR=/workspace ENV SHELL=/bin/bash -RUN apt-get update \ - && apt-get install -y --no-install-recommends \ - software-properties-common \ - build-essential \ - ca-certificates \ - curl \ - emacs \ - subversion \ - 
locales \ - cmake \ - git \ - libopencv-dev \ - htop \ - vim \ - wget \ - unzip \ - libopenblas-dev \ - ninja-build \ - openssh-client \ - openssh-server \ - python3-dev \ - python3-pip \ - python3-setuptools \ - libxft-dev \ - zlib1g-dev \ - && apt-get clean \ - && rm -rf /var/lib/apt/lists/* - -RUN python3 -m pip --no-cache-dir install --upgrade \ - pip \ - setuptools - -########################################################################### -# Horovod dependencies -########################################################################### +RUN mkdir -p ${WORKDIR} -# Install Open MPI -RUN mkdir /tmp/openmpi \ - && cd /tmp/openmpi \ - && curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \ - && tar zxf openmpi-4.0.1.tar.gz \ - && cd openmpi-4.0.1 \ - && ./configure --enable-orterun-prefix-by-default \ - && make -j $(nproc) all \ - && make install \ - && ldconfig \ - && rm -rf /tmp/openmpi - -# Create a wrapper for OpenMPI to allow running as root by default -RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \ - && echo '#!/bin/bash' > /usr/local/bin/mpirun \ - && echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \ - && chmod a+x /usr/local/bin/mpirun - -RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \ - && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf +RUN /install/install_ubuntu18.04_core.sh +# Install Open MPI +RUN /install/install_openmpi.sh ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH ENV PATH=/usr/local/openmpi/bin/:/usr/local/bin:/root/.local/bin:$PATH -RUN ln -s $(which python3) /usr/local/bin/python - -RUN mkdir -p ${WORKDIR} - -# install PyYAML==5.1.2 to avoid conflict with latest awscli -# python-dateutil==2.8.0 to satisfy botocore associated with latest awscli -RUN pip3 install --no-cache --upgrade \ - wheel \ - numpy==1.19.1 \ - pandas==0.25.1 \ - pytest \ - Pillow \ - requests==2.22.0 \ - 
scikit-learn==0.20.4 \ - scipy==1.2.2 \ - urllib3==1.25.8 \ - python-dateutil==2.8.0 \ - sagemaker-experiments==0.* \ - PyYAML==5.3.1 \ - mpi4py==3.0.2 \ - jupyterlab==2.2.4 \ - cmake \ - awscli +RUN /install/install_python_packages.sh # Install MXNet RUN python3 -m pip install -U --pre "mxnet-cu102>=2.0.0b20200926" -f https://dist.mxnet.io/python --user @@ -105,11 +32,10 @@ RUN python3 -m pip install -U --pre "mxnet-cu102>=2.0.0b20200926" -f https://dis RUN python3 -m pip install -U torch torchvision --user # Install Horovod -RUN HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_GPU_BROADCAST=NCCL HOROVOD_WITHOUT_GLOO=1 \ - HOROVOD_WITH_MPI=1 HOROVOD_WITH_MXNET=1 HOROVOD_WITH_PYTORCH=1 \ - HOROVOD_WITHOUT_TENSORFLOW=1 python3 -m pip install --no-cache-dir horovod==0.20.3 --user -# Debug horovod by default -RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf +RUN /install/install_horovod.sh + +# Install Jupyter Lab +RUN /install/install_jupyter_lab.sh RUN mkdir -p ${WORKDIR}/notebook RUN mkdir -p ${WORKDIR}/data @@ -122,7 +48,6 @@ RUN cd ${WORKDIR} \ COPY start_jupyter.sh /start_jupyter.sh COPY devel_entrypoint.sh /devel_entrypoint.sh -COPY install /install RUN chmod +x /devel_entrypoint.sh EXPOSE 8888 @@ -131,9 +56,6 @@ EXPOSE 8786 WORKDIR ${WORKDIR} -# Install NodeJS + Tensorboard + TensorboardX -RUN source /install/jupyter_lab_dev.sh - # Add Tini ARG TINI_VERSION=v0.19.0 ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini From 5a69ff848c749b7965a0af76fa0673098fc901a1 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Fri, 9 Oct 2020 18:32:57 -0700 Subject: [PATCH 015/115] Update ubuntu18.04-devel-gpu.Dockerfile --- tools/docker/ubuntu18.04-devel-gpu.Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/docker/ubuntu18.04-devel-gpu.Dockerfile b/tools/docker/ubuntu18.04-devel-gpu.Dockerfile index 05fba5c6a0..d660cea3f0 100644 --- a/tools/docker/ubuntu18.04-devel-gpu.Dockerfile +++ b/tools/docker/ubuntu18.04-devel-gpu.Dockerfile @@ -1,6 +1,7 @@ 
FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04 LABEL maintainer="GluonNLP Team" +COPY install /install ARG DEBIAN_FRONTEND=noninteractive From 35c3e1c581493e3eee3acec971547d88587a6d50 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Fri, 9 Oct 2020 18:34:14 -0700 Subject: [PATCH 016/115] Update ubuntu18.04-devel-gpu.Dockerfile --- tools/docker/ubuntu18.04-devel-gpu.Dockerfile | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/docker/ubuntu18.04-devel-gpu.Dockerfile b/tools/docker/ubuntu18.04-devel-gpu.Dockerfile index d660cea3f0..9ad3be01b5 100644 --- a/tools/docker/ubuntu18.04-devel-gpu.Dockerfile +++ b/tools/docker/ubuntu18.04-devel-gpu.Dockerfile @@ -17,14 +17,14 @@ ENV SHELL=/bin/bash RUN mkdir -p ${WORKDIR} -RUN /install/install_ubuntu18.04_core.sh +RUN bash /install/install_ubuntu18.04_core.sh # Install Open MPI -RUN /install/install_openmpi.sh +RUN bash /install/install_openmpi.sh ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH ENV PATH=/usr/local/openmpi/bin/:/usr/local/bin:/root/.local/bin:$PATH -RUN /install/install_python_packages.sh +RUN bash /install/install_python_packages.sh # Install MXNet RUN python3 -m pip install -U --pre "mxnet-cu102>=2.0.0b20200926" -f https://dist.mxnet.io/python --user @@ -33,10 +33,10 @@ RUN python3 -m pip install -U --pre "mxnet-cu102>=2.0.0b20200926" -f https://dis RUN python3 -m pip install -U torch torchvision --user # Install Horovod -RUN /install/install_horovod.sh +RUN bash /install/install_horovod.sh # Install Jupyter Lab -RUN /install/install_jupyter_lab.sh +RUN bash /install/install_jupyter_lab.sh RUN mkdir -p ${WORKDIR}/notebook RUN mkdir -p ${WORKDIR}/data From fc665513b780816a195fdf9276935bdf7ca61fdb Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Fri, 9 Oct 2020 18:42:55 -0700 Subject: [PATCH 017/115] Update install_openmpi.sh --- tools/docker/install/install_openmpi.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git 
a/tools/docker/install/install_openmpi.sh b/tools/docker/install/install_openmpi.sh index 72ee6b3c59..42a764a740 100644 --- a/tools/docker/install/install_openmpi.sh +++ b/tools/docker/install/install_openmpi.sh @@ -1,6 +1,6 @@ set -euo pipefail -RUN mkdir /tmp/openmpi \ +mkdir /tmp/openmpi \ && cd /tmp/openmpi \ && curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \ && tar zxf openmpi-4.0.1.tar.gz \ @@ -12,10 +12,10 @@ RUN mkdir /tmp/openmpi \ && rm -rf /tmp/openmpi # Create a wrapper for OpenMPI to allow running as root by default -RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \ +mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \ && echo '#!/bin/bash' > /usr/local/bin/mpirun \ && echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \ && chmod a+x /usr/local/bin/mpirun -RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \ +echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \ && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf From 2006d0bcc23ceefa0e7fde67b9db0b2b26aaaf9e Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Fri, 9 Oct 2020 19:40:57 -0700 Subject: [PATCH 018/115] update --- tools/docker/README.md | 4 ++-- tools/docker/install/install_tvm_cpu.sh | 7 ++++++- tools/docker/install/install_tvm_gpu.sh | 7 ++++++- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/tools/docker/README.md b/tools/docker/README.md index 986e63d592..2013a6e285 100644 --- a/tools/docker/README.md +++ b/tools/docker/README.md @@ -48,8 +48,8 @@ docker run --gpus all --rm -it --shm-size=4g gluonai/gluon-nlp:gpu-latest \ To build a docker image fom the dockerfile, you may use the following command: ``` -docker build -f ubuntu18.04-devel-cpu.Dockerfile -t gluonai/gluon-nlp:cpu-latest . -docker build -f ubuntu18.04-devel-gpu.Dockerfile -t gluonai/gluon-nlp:gpu-latest . 
+docker build -f ubuntu18.04-devel-cpu.Dockerfile -t gluonai/gluon-nlp:devel-cpu-latest . +docker build -f ubuntu18.04-devel-gpu.Dockerfile -t gluonai/gluon-nlp:devel-gpu-latest . ``` In addition, to build the GPU docker, you will need to install the nvidia-docker2 and edit `/etc/docker/daemon.json` like the following: diff --git a/tools/docker/install/install_tvm_cpu.sh b/tools/docker/install/install_tvm_cpu.sh index b11c9791fb..5598095090 100644 --- a/tools/docker/install/install_tvm_cpu.sh +++ b/tools/docker/install/install_tvm_cpu.sh @@ -24,7 +24,7 @@ cd /usr git clone https://github.com/apache/incubator-tvm tvm --recursive cd /usr/tvm # checkout a hash-tag -git checkout 4b13bf668edc7099b38d463e5db94ebc96c80470 +git checkout 6d0351a7f0e23eb5428c59a976edd2bfb8207c0d echo set\(USE_LLVM llvm-config-8\) >> config.cmake echo set\(USE_GRAPH_RUNTIME ON\) >> config.cmake @@ -33,3 +33,8 @@ mkdir -p build cd build cmake .. make -j10 + +# install python binding +cd .. +cd python +python3 -m pip install -U -e . --user diff --git a/tools/docker/install/install_tvm_gpu.sh b/tools/docker/install/install_tvm_gpu.sh index 2dbf8e1739..f00ed64039 100644 --- a/tools/docker/install/install_tvm_gpu.sh +++ b/tools/docker/install/install_tvm_gpu.sh @@ -24,7 +24,7 @@ cd /usr git clone https://github.com/apache/incubator-tvm tvm --recursive cd /usr/tvm # checkout a hash-tag -git checkout 4b13bf668edc7099b38d463e5db94ebc96c80470 +git checkout 6d0351a7f0e23eb5428c59a976edd2bfb8207c0d echo set\(USE_LLVM llvm-config-8\) >> config.cmake echo set\(USE_CUDA ON\) >> config.cmake @@ -34,3 +34,8 @@ mkdir -p build cd build cmake .. make -j10 + +# install python binding +cd .. +cd python +python3 -m pip install -U -e . 
--user From 8f0fa41fa9618ab1cd52874efe4adaa9209e6d6d Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Fri, 9 Oct 2020 19:52:06 -0700 Subject: [PATCH 019/115] Create install_llvm.sh --- tools/docker/install/install_llvm.sh | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 tools/docker/install/install_llvm.sh diff --git a/tools/docker/install/install_llvm.sh b/tools/docker/install/install_llvm.sh new file mode 100644 index 0000000000..7ca627b5ca --- /dev/null +++ b/tools/docker/install/install_llvm.sh @@ -0,0 +1,5 @@ +set -euo pipefail + +wget https://apt.llvm.org/llvm.sh +chmod +x llvm.sh +./llvm.sh 8 # Fix version From 80bc0719346e18102a0c72593fecc0e4e62e62b0 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Fri, 9 Oct 2020 20:25:17 -0700 Subject: [PATCH 020/115] Update ubuntu18.04-base-gpu.Dockerfile --- tools/docker/ubuntu18.04-base-gpu.Dockerfile | 88 ++------------------ 1 file changed, 6 insertions(+), 82 deletions(-) diff --git a/tools/docker/ubuntu18.04-base-gpu.Dockerfile b/tools/docker/ubuntu18.04-base-gpu.Dockerfile index 6a551f1145..c7c30f7d7e 100644 --- a/tools/docker/ubuntu18.04-base-gpu.Dockerfile +++ b/tools/docker/ubuntu18.04-base-gpu.Dockerfile @@ -1,6 +1,7 @@ FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04 LABEL maintainer="GluonNLP Team" +COPY install /install ARG DEBIAN_FRONTEND=noninteractive @@ -14,89 +15,16 @@ ENV PYTHONDONTWRITEBYTECODE=1 \ ENV WORKDIR=/workspace ENV SHELL=/bin/bash -RUN apt-get update \ - && apt-get install -y --no-install-recommends \ - software-properties-common \ - build-essential \ - ca-certificates \ - curl \ - emacs \ - subversion \ - locales \ - cmake \ - git \ - libopencv-dev \ - htop \ - vim \ - wget \ - unzip \ - libopenblas-dev \ - ninja-build \ - openssh-client \ - openssh-server \ - python3-dev \ - python3-pip \ - python3-setuptools \ - libxft-dev \ - zlib1g-dev \ - && apt-get clean \ - && rm -rf /var/lib/apt/lists/* - -RUN python3 -m pip --no-cache-dir install --upgrade \ - pip \ - setuptools +RUN 
mkdir -p ${WORKDIR} -########################################################################### -# Horovod dependencies -########################################################################### +RUN bash /install/install_ubuntu18.04_core.sh # Install Open MPI -RUN mkdir /tmp/openmpi \ - && cd /tmp/openmpi \ - && curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \ - && tar zxf openmpi-4.0.1.tar.gz \ - && cd openmpi-4.0.1 \ - && ./configure --enable-orterun-prefix-by-default \ - && make -j $(nproc) all \ - && make install \ - && ldconfig \ - && rm -rf /tmp/openmpi - -# Create a wrapper for OpenMPI to allow running as root by default -RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \ - && echo '#!/bin/bash' > /usr/local/bin/mpirun \ - && echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \ - && chmod a+x /usr/local/bin/mpirun - -RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \ - && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf - +RUN bash /install/install_openmpi.sh ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH ENV PATH=/usr/local/openmpi/bin/:/usr/local/bin:/root/.local/bin:$PATH -RUN ln -s $(which python3) /usr/local/bin/python - -RUN mkdir -p ${WORKDIR} - -# install PyYAML==5.1.2 to avoid conflict with latest awscli -# python-dateutil==2.8.0 to satisfy botocore associated with latest awscli -RUN pip3 install --no-cache --upgrade \ - wheel \ - numpy==1.19.1 \ - pandas==0.25.1 \ - pytest \ - Pillow \ - requests==2.22.0 \ - scikit-learn==0.20.4 \ - scipy==1.2.2 \ - urllib3==1.25.8 \ - python-dateutil==2.8.0 \ - sagemaker-experiments==0.* \ - PyYAML==5.3.1 \ - mpi4py==3.0.2 \ - jupyterlab==2.2.4 \ - cmake \ - awscli +RUN bash /install/install_python_packages.sh # Install MXNet RUN python3 -m pip install -U --pre "mxnet-cu102>=2.0.0b20200926" -f https://dist.mxnet.io/python --user @@ -105,8 +33,4 @@ RUN python3 -m 
pip install -U --pre "mxnet-cu102>=2.0.0b20200926" -f https://dis RUN python3 -m pip install -U torch torchvision --user # Install Horovod -RUN HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_GPU_BROADCAST=NCCL HOROVOD_WITHOUT_GLOO=1 \ - HOROVOD_WITH_MPI=1 HOROVOD_WITH_MXNET=1 HOROVOD_WITH_PYTORCH=1 \ - HOROVOD_WITHOUT_TENSORFLOW=1 python3 -m pip install --no-cache-dir horovod --user -# Debug horovod by default -RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf +RUN bash /install/install_horovod.sh From ee3d27b068ab4ee03ceddfdc3d681d6163aaa933 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Fri, 9 Oct 2020 20:26:44 -0700 Subject: [PATCH 021/115] Update ubuntu18.04-base-gpu.Dockerfile --- tools/docker/ubuntu18.04-base-gpu.Dockerfile | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tools/docker/ubuntu18.04-base-gpu.Dockerfile b/tools/docker/ubuntu18.04-base-gpu.Dockerfile index c7c30f7d7e..6a5712e09c 100644 --- a/tools/docker/ubuntu18.04-base-gpu.Dockerfile +++ b/tools/docker/ubuntu18.04-base-gpu.Dockerfile @@ -34,3 +34,15 @@ RUN python3 -m pip install -U torch torchvision --user # Install Horovod RUN bash /install/install_horovod.sh + +RUN mkdir -p ${WORKDIR}/notebook +RUN mkdir -p ${WORKDIR}/data +RUN mkdir -p /.init +RUN cd ${WORKDIR} \ + && git clone https://github.com/dmlc/gluon-nlp \ + && cd gluon-nlp \ + && git checkout master \ + && python3 -m pip install -U -e ."[extras]" --user + +WORKDIR ${WORKDIR} +s \ No newline at end of file From 5790d6b628c0e3208f3ca127903bff5bf817c92c Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Fri, 9 Oct 2020 20:54:35 -0700 Subject: [PATCH 022/115] Update run_squad2_albert_base.sh --- .../commands/run_squad2_albert_base.sh | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/scripts/question_answering/commands/run_squad2_albert_base.sh b/scripts/question_answering/commands/run_squad2_albert_base.sh index 69bee438f8..86bccc007b 100644 --- a/scripts/question_answering/commands/run_squad2_albert_base.sh 
+++ b/scripts/question_answering/commands/run_squad2_albert_base.sh @@ -1,3 +1,5 @@ +USE_HOROVOD=${1:-0} # Horovod flag. Do not use horovod by default + VERSION=2.0 # Either 2.0 or 1.1 MODEL_NAME=google_albert_base_v2 @@ -6,8 +8,14 @@ nlp_data prepare_squad --version ${VERSION} # Run the script -python3 run_squad.py \ - --model_name ${MODEL_NAME} \ +if [ ${USE_HOROVOD} -eq 0 ]; +then + RUN_COMMAND="python3 run_squad.py --gpus 0,1,2,3" +else + RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 run_squad.py --comm_backend horovod" +fi + +${RUN_COMMAND} --model_name ${MODEL_NAME} \ --data_dir squad \ --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \ --version ${VERSION} \ @@ -15,11 +23,10 @@ python3 run_squad.py \ --do_train \ --batch_size 4 \ --num_accumulated 3 \ - --gpus 0,1,2,3 \ --epochs 3 \ --lr 2e-5 \ --warmup_ratio 0.1 \ --wd 0.01 \ --max_seq_length 512 \ --max_grad_norm 0.1 \ - --overwrite_cache \ + --overwrite_cache From ae8b2cc4ac68f8efd8ab1007de86dec5b1fb19ef Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Fri, 9 Oct 2020 21:08:41 -0700 Subject: [PATCH 023/115] Update prepare_squad.py --- scripts/datasets/question_answering/prepare_squad.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/scripts/datasets/question_answering/prepare_squad.py b/scripts/datasets/question_answering/prepare_squad.py index fb9381fc46..d4cd77de96 100644 --- a/scripts/datasets/question_answering/prepare_squad.py +++ b/scripts/datasets/question_answering/prepare_squad.py @@ -45,7 +45,6 @@ def get_parser(): parser.add_argument('--save-path', type=str, default='squad') parser.add_argument('--cache-path', type=str, default=_BASE_DATASET_PATH, help='The path to download the dataset.') - parser.add_argument('--overwrite', action='store_true') return parser @@ -58,14 +57,16 @@ def main(args): download(dev_url, path=os.path.join(args.cache_path, dev_file_name)) if not os.path.exists(args.save_path): os.makedirs(args.save_path) - if not 
os.path.exists(os.path.join(args.save_path, train_file_name))\ - or (args.overwrite and args.save_path != args.cache_path): + if not os.path.exists(os.path.join(args.save_path, train_file_name)): os.symlink(os.path.join(args.cache_path, train_file_name), os.path.join(args.save_path, train_file_name)) - if not os.path.exists(os.path.join(args.save_path, dev_file_name))\ - or (args.overwrite and args.save_path != args.cache_path): + else: + print(f'Found {os.path.join(args.save_path, train_file_name)}') + if not os.path.exists(os.path.join(args.save_path, dev_file_name)): os.symlink(os.path.join(args.cache_path, dev_file_name), os.path.join(args.save_path, dev_file_name)) + else: + print(f'Found {os.path.join(args.save_path, dev_file_name)}') def cli_main(): From 0555216027f1a0f8105c06474c97d5326b2390f5 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Fri, 9 Oct 2020 21:13:47 -0700 Subject: [PATCH 024/115] Update prepare_squad.py --- .../question_answering/prepare_squad.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/scripts/datasets/question_answering/prepare_squad.py b/scripts/datasets/question_answering/prepare_squad.py index d4cd77de96..5b891bb059 100644 --- a/scripts/datasets/question_answering/prepare_squad.py +++ b/scripts/datasets/question_answering/prepare_squad.py @@ -1,5 +1,6 @@ import os import argparse +import shutil from gluonnlp.utils.misc import download, load_checksum_stats from gluonnlp.base import get_data_home_dir @@ -45,6 +46,7 @@ def get_parser(): parser.add_argument('--save-path', type=str, default='squad') parser.add_argument('--cache-path', type=str, default=_BASE_DATASET_PATH, help='The path to download the dataset.') + parser.add_argument('--overwrite', action='store_true') return parser @@ -61,12 +63,25 @@ def main(args): os.symlink(os.path.join(args.cache_path, train_file_name), os.path.join(args.save_path, train_file_name)) else: - print(f'Found {os.path.join(args.save_path, train_file_name)}') 
- if not os.path.exists(os.path.join(args.save_path, dev_file_name)): + print(f'Found {os.path.join(args.save_path, train_file_name)}...') + if args.overwrite and args.save_path != args.cache_path: + print('Overwrite!') + shutil.copyfile(os.path.join(args.cache_path, train_file_name), + os.path.join(args.save_path, train_file_name)) + else: + print('Skip!') + if not os.path.exists(os.path.join(args.save_path, dev_file_name))\ + or (args.overwrite and args.save_path != args.cache_path): os.symlink(os.path.join(args.cache_path, dev_file_name), os.path.join(args.save_path, dev_file_name)) else: print(f'Found {os.path.join(args.save_path, dev_file_name)}') + if args.overwrite and args.save_path != args.cache_path: + print('Overwrite!') + shutil.copyfile(os.path.join(args.cache_path, dev_file_name), + os.path.join(args.save_path, dev_file_name)) + else: + print('Skip!') def cli_main(): From 43d4198d9064cc56f559501216be97e271c7d104 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Fri, 9 Oct 2020 21:15:49 -0700 Subject: [PATCH 025/115] Update prepare_squad.py --- .../question_answering/prepare_squad.py | 29 ++++++------------- 1 file changed, 9 insertions(+), 20 deletions(-) diff --git a/scripts/datasets/question_answering/prepare_squad.py b/scripts/datasets/question_answering/prepare_squad.py index 5b891bb059..292fe1fae2 100644 --- a/scripts/datasets/question_answering/prepare_squad.py +++ b/scripts/datasets/question_answering/prepare_squad.py @@ -59,29 +59,18 @@ def main(args): download(dev_url, path=os.path.join(args.cache_path, dev_file_name)) if not os.path.exists(args.save_path): os.makedirs(args.save_path) - if not os.path.exists(os.path.join(args.save_path, train_file_name)): - os.symlink(os.path.join(args.cache_path, train_file_name), - os.path.join(args.save_path, train_file_name)) + if not os.path.exists(os.path.join(args.save_path, train_file_name)) \ + or (args.overwrite and args.save_path != args.cache_path): + shutil.copyfile(os.path.join(args.cache_path, 
train_file_name), + os.path.join(args.save_path, train_file_name)) else: - print(f'Found {os.path.join(args.save_path, train_file_name)}...') - if args.overwrite and args.save_path != args.cache_path: - print('Overwrite!') - shutil.copyfile(os.path.join(args.cache_path, train_file_name), - os.path.join(args.save_path, train_file_name)) - else: - print('Skip!') - if not os.path.exists(os.path.join(args.save_path, dev_file_name))\ + print(f'Found {os.path.join(args.save_path, train_file_name)}...skip') + if not os.path.exists(os.path.join(args.save_path, dev_file_name)) \ or (args.overwrite and args.save_path != args.cache_path): - os.symlink(os.path.join(args.cache_path, dev_file_name), - os.path.join(args.save_path, dev_file_name)) + shutil.copyfile(os.path.join(args.cache_path, dev_file_name), + os.path.join(args.save_path, dev_file_name)) else: - print(f'Found {os.path.join(args.save_path, dev_file_name)}') - if args.overwrite and args.save_path != args.cache_path: - print('Overwrite!') - shutil.copyfile(os.path.join(args.cache_path, dev_file_name), - os.path.join(args.save_path, dev_file_name)) - else: - print('Skip!') + print(f'Found {os.path.join(args.save_path, dev_file_name)}...skip') def cli_main(): From 4dc00242a0c80d4021664bab0b84c88a9f62b935 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Sat, 10 Oct 2020 16:38:47 -0700 Subject: [PATCH 026/115] fix --- scripts/datasets/question_answering/prepare_searchqa.py | 8 ++++---- scripts/datasets/url_checksums/searchqa.txt | 6 +++--- tools/docker/README.md | 5 ++++- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/scripts/datasets/question_answering/prepare_searchqa.py b/scripts/datasets/question_answering/prepare_searchqa.py index 51552834ba..eb5b9fe0a6 100644 --- a/scripts/datasets/question_answering/prepare_searchqa.py +++ b/scripts/datasets/question_answering/prepare_searchqa.py @@ -1,7 +1,7 @@ import os import argparse from gluonnlp.utils.misc import download, load_checksum_stats -from 
gluonnlp.base import get_data_home_dir +from gluonnlp.base import get_data_home_dir, get_repo_url _CURR_DIR = os.path.realpath(os.path.dirname(os.path.realpath(__file__))) _BASE_DATASET_PATH = os.path.join(get_data_home_dir(), 'searchqa') @@ -20,9 +20,9 @@ """ _URLS = { - 'train': 's3://gluonnlp-numpy-data/datasets/question_answering/searchqa/train.txt', - 'val': 's3://gluonnlp-numpy-data/datasets/question_answering/searchqa/val.txt', - 'test': 's3://gluonnlp-numpy-data/datasets/question_answering/searchqa/test.txt' + 'train': get_repo_url() + 'datasets/question_answering/searchqa/train.txt', + 'val': get_repo_url() + 'datasets/question_answering/searchqa/val.txt', + 'test': get_repo_url() + 'datasets/question_answering/searchqa/test.txt' } diff --git a/scripts/datasets/url_checksums/searchqa.txt b/scripts/datasets/url_checksums/searchqa.txt index 12ba03a7d5..11f518c92f 100644 --- a/scripts/datasets/url_checksums/searchqa.txt +++ b/scripts/datasets/url_checksums/searchqa.txt @@ -1,3 +1,3 @@ -s3://gluonnlp-numpy-data/datasets/question_answering/searchqa/train.txt c7e1eb8c34d0525547b91e18b3f8f4d855e35c16 1226681217 -s3://gluonnlp-numpy-data/datasets/question_answering/searchqa/test.txt 08a928e0f8c129d5b3ca43bf46df117e38be0c27 332064988 -s3://gluonnlp-numpy-data/datasets/question_answering/searchqa/val.txt c2f65d6b83c26188d5998ab96bc6a38c1a127fcc 170835902 +https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/question_answering/searchqa/train.txt c7e1eb8c34d0525547b91e18b3f8f4d855e35c16 1226681217 +https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/question_answering/searchqa/test.txt 08a928e0f8c129d5b3ca43bf46df117e38be0c27 332064988 +https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/question_answering/searchqa/val.txt c2f65d6b83c26188d5998ab96bc6a38c1a127fcc 170835902 diff --git a/tools/docker/README.md b/tools/docker/README.md index 2013a6e285..f0b79f1c62 100644 --- a/tools/docker/README.md +++ b/tools/docker/README.md @@ 
-39,7 +39,7 @@ If you have a multi-GPU instance, e.g., [g4dn.12xlarge](https://aws.amazon.com/e of horovod + MXNet by running the question answering script ``` -docker run --gpus all --rm -it --shm-size=4g gluonai/gluon-nlp:gpu-latest \ +docker run --gpus all --rm -it --shm-size=4g gluonai/gluon-nlp:devel-gpu-latest \ horovodrun -np 2 python3 -m pytest /workspace/horovod/horovod/test/test_mxnet.py ``` @@ -74,3 +74,6 @@ path of MXNet by querying th MXNet runtime. ### Developers of GluonNLP You may try to login to your dockerhub account and push the image to dockerhub. +``` +docker push gluonai/gluon-nlp:devel-gpu-latest +``` From 8d8fbb76e4bfe7a2a5db427df7903250c9156a8a Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Sat, 10 Oct 2020 17:47:00 -0700 Subject: [PATCH 027/115] Update README.md --- scripts/datasets/general_nlp_benchmark/README.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/scripts/datasets/general_nlp_benchmark/README.md b/scripts/datasets/general_nlp_benchmark/README.md index 84dc9f5524..0f49b258dc 100644 --- a/scripts/datasets/general_nlp_benchmark/README.md +++ b/scripts/datasets/general_nlp_benchmark/README.md @@ -112,13 +112,13 @@ benchmarking. 
We select the classical datasets that are also used in

| Dataset | #Train | #Test | Columns | Metrics |
|---------------|---------|---------|-----------------|-----------------|
-| AG | 120000 | 7600 | content, label | acc |
-| IMDB | 25000 | 25000 | content, label | acc |
-| DBpedia | 560000 | 70000 | content, label | acc |
-| Yelp2 | 560000 | 38000 | content, label | acc |
-| Yelp5 | 650000 | 50000 | content, label | acc |
-| Amazon2 | 3600000 | 400000 | content, label | acc |
-| Amazon5 | 3000000 | 650000 | content, label | acc |
+| AG | 120,000 | 7,600 | content, label | acc |
+| IMDB | 25,000 | 25,000 | content, label | acc |
+| DBpedia | 560,000 | 70,000 | content, label | acc |
+| Yelp2 | 560,000 | 38,000 | content, label | acc |
+| Yelp5 | 650,000 | 50,000 | content, label | acc |
+| Amazon2 | 3,600,000 | 400,000 | content, label | acc |
+| Amazon5 | 3,000,000 | 650,000 | content, label | acc |

To obtain the datasets, run:

From 5aa0fcbea901aa2dddede7c079c93142b24940ad Mon Sep 17 00:00:00 2001
From: Xingjian Shi
Date: Sat, 10 Oct 2020 19:10:28 -0700
Subject: [PATCH 028/115] update

---
 tools/docker/README.md | 8 ++++----
 tools/docker/ubuntu18.04-base-gpu.Dockerfile | 6 +++---
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/tools/docker/README.md b/tools/docker/README.md
index f0b79f1c62..7da1492604 100644
--- a/tools/docker/README.md
+++ b/tools/docker/README.md
@@ -39,7 +39,7 @@ If you have a multi-GPU instance, e.g., [g4dn.12xlarge](https://aws.amazon.com/e
 of horovod + MXNet by running the question answering script

```
-docker run --gpus all --rm -it --shm-size=4g gluonai/gluon-nlp:devel-gpu-latest \
+docker run --gpus all --rm -it --shm-size=4g gluonai/gluon-nlp:gpu-latest \
 horovodrun -np 2 python3 -m pytest /workspace/horovod/horovod/test/test_mxnet.py
```

@@ -48,8 +48,8 @@ docker run --gpus all --rm -it --shm-size=4g gluonai/gluon-nlp:devel-gpu-latest

To build a docker image fom the dockerfile, you may use the following command:

```
-docker build -f ubuntu18.04-devel-cpu.Dockerfile -t gluonai/gluon-nlp:devel-cpu-latest . -docker build -f ubuntu18.04-devel-gpu.Dockerfile -t gluonai/gluon-nlp:devel-gpu-latest . +docker build -f ubuntu18.04-devel-cpu.Dockerfile -t gluonai/gluon-nlp:cpu-latest . +docker build -f ubuntu18.04-devel-gpu.Dockerfile -t gluonai/gluon-nlp:gpu-latest . ``` In addition, to build the GPU docker, you will need to install the nvidia-docker2 and edit `/etc/docker/daemon.json` like the following: @@ -75,5 +75,5 @@ path of MXNet by querying th MXNet runtime. ### Developers of GluonNLP You may try to login to your dockerhub account and push the image to dockerhub. ``` -docker push gluonai/gluon-nlp:devel-gpu-latest +docker push gluonai/gluon-nlp:gpu-latest ``` diff --git a/tools/docker/ubuntu18.04-base-gpu.Dockerfile b/tools/docker/ubuntu18.04-base-gpu.Dockerfile index 6a5712e09c..ef31e10aaf 100644 --- a/tools/docker/ubuntu18.04-base-gpu.Dockerfile +++ b/tools/docker/ubuntu18.04-base-gpu.Dockerfile @@ -35,6 +35,9 @@ RUN python3 -m pip install -U torch torchvision --user # Install Horovod RUN bash /install/install_horovod.sh +# Install Jupyter Lab +RUN bash /install/install_jupyter_lab.sh + RUN mkdir -p ${WORKDIR}/notebook RUN mkdir -p ${WORKDIR}/data RUN mkdir -p /.init @@ -43,6 +46,3 @@ RUN cd ${WORKDIR} \ && cd gluon-nlp \ && git checkout master \ && python3 -m pip install -U -e ."[extras]" --user - -WORKDIR ${WORKDIR} -s \ No newline at end of file From 704117d685e1f9255e78018e0b0f125f2898ee7b Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Sat, 10 Oct 2020 19:14:41 -0700 Subject: [PATCH 029/115] update --- tools/docker/README.md | 3 --- tools/docker/ubuntu18.04-base-gpu.Dockerfile | 1 - tools/docker/ubuntu18.04-devel-gpu.Dockerfile | 1 - 3 files changed, 5 deletions(-) diff --git a/tools/docker/README.md b/tools/docker/README.md index 7da1492604..e22d11608d 100644 --- a/tools/docker/README.md +++ b/tools/docker/README.md @@ -27,9 +27,6 @@ The folder structure of the docker 
image will be ``` /workspace/ ├── gluonnlp -├── horovod -├── mxnet -├── notebooks ├── data ``` diff --git a/tools/docker/ubuntu18.04-base-gpu.Dockerfile b/tools/docker/ubuntu18.04-base-gpu.Dockerfile index ef31e10aaf..076eee0f7a 100644 --- a/tools/docker/ubuntu18.04-base-gpu.Dockerfile +++ b/tools/docker/ubuntu18.04-base-gpu.Dockerfile @@ -38,7 +38,6 @@ RUN bash /install/install_horovod.sh # Install Jupyter Lab RUN bash /install/install_jupyter_lab.sh -RUN mkdir -p ${WORKDIR}/notebook RUN mkdir -p ${WORKDIR}/data RUN mkdir -p /.init RUN cd ${WORKDIR} \ diff --git a/tools/docker/ubuntu18.04-devel-gpu.Dockerfile b/tools/docker/ubuntu18.04-devel-gpu.Dockerfile index 9ad3be01b5..4568eee75d 100644 --- a/tools/docker/ubuntu18.04-devel-gpu.Dockerfile +++ b/tools/docker/ubuntu18.04-devel-gpu.Dockerfile @@ -38,7 +38,6 @@ RUN bash /install/install_horovod.sh # Install Jupyter Lab RUN bash /install/install_jupyter_lab.sh -RUN mkdir -p ${WORKDIR}/notebook RUN mkdir -p ${WORKDIR}/data RUN mkdir -p /.init RUN cd ${WORKDIR} \ From be03a49e753c1263f6cb0b17c5c0be6af0411547 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Sat, 10 Oct 2020 19:19:06 -0700 Subject: [PATCH 030/115] Update README.md --- tools/docker/README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/docker/README.md b/tools/docker/README.md index e22d11608d..e64026bd66 100644 --- a/tools/docker/README.md +++ b/tools/docker/README.md @@ -45,7 +45,10 @@ docker run --gpus all --rm -it --shm-size=4g gluonai/gluon-nlp:gpu-latest \ To build a docker image fom the dockerfile, you may use the following command: ``` +docker build -f ubuntu18.04-base-cpu.Dockerfile -t gluonai/gluon-nlp:base-cpu-latest . docker build -f ubuntu18.04-devel-cpu.Dockerfile -t gluonai/gluon-nlp:cpu-latest . + +docker build -f ubuntu18.04-base-gpu.Dockerfile -t gluonai/gluon-nlp:base-gpu-latest . docker build -f ubuntu18.04-devel-gpu.Dockerfile -t gluonai/gluon-nlp:gpu-latest . 
``` From eb7d78277a09262f498c37ad5dbce86497698dcc Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Sat, 10 Oct 2020 19:19:32 -0700 Subject: [PATCH 031/115] Update README.md --- tools/docker/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/docker/README.md b/tools/docker/README.md index e64026bd66..c6b0551647 100644 --- a/tools/docker/README.md +++ b/tools/docker/README.md @@ -45,10 +45,10 @@ docker run --gpus all --rm -it --shm-size=4g gluonai/gluon-nlp:gpu-latest \ To build a docker image fom the dockerfile, you may use the following command: ``` -docker build -f ubuntu18.04-base-cpu.Dockerfile -t gluonai/gluon-nlp:base-cpu-latest . +docker build -f ubuntu18.04-base-cpu.Dockerfile -t gluonai/gluon-nlp:cpu-base-latest . docker build -f ubuntu18.04-devel-cpu.Dockerfile -t gluonai/gluon-nlp:cpu-latest . -docker build -f ubuntu18.04-base-gpu.Dockerfile -t gluonai/gluon-nlp:base-gpu-latest . +docker build -f ubuntu18.04-base-gpu.Dockerfile -t gluonai/gluon-nlp:gpu-base-latest . docker build -f ubuntu18.04-devel-gpu.Dockerfile -t gluonai/gluon-nlp:gpu-latest . 
``` From 515dd10382aed7f08c9e423e2a3c731173d5d2ad Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Sat, 10 Oct 2020 19:28:15 -0700 Subject: [PATCH 032/115] Update ubuntu18.04-devel-gpu.Dockerfile --- tools/docker/ubuntu18.04-devel-gpu.Dockerfile | 46 +------------------ 1 file changed, 1 insertion(+), 45 deletions(-) diff --git a/tools/docker/ubuntu18.04-devel-gpu.Dockerfile b/tools/docker/ubuntu18.04-devel-gpu.Dockerfile index 4568eee75d..ef7b70de11 100644 --- a/tools/docker/ubuntu18.04-devel-gpu.Dockerfile +++ b/tools/docker/ubuntu18.04-devel-gpu.Dockerfile @@ -1,50 +1,6 @@ -FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04 +FROM gluonai/gluon-nlp:gpu-base-latest LABEL maintainer="GluonNLP Team" -COPY install /install - -ARG DEBIAN_FRONTEND=noninteractive - -ENV PYTHONDONTWRITEBYTECODE=1 \ - PYTHONUNBUFFERED=1 \ - LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib" \ - PYTHONIOENCODING=UTF-8 \ - LANG=C.UTF-8 \ - LC_ALL=C.UTF-8 - -ENV WORKDIR=/workspace -ENV SHELL=/bin/bash - -RUN mkdir -p ${WORKDIR} - -RUN bash /install/install_ubuntu18.04_core.sh - -# Install Open MPI -RUN bash /install/install_openmpi.sh -ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH -ENV PATH=/usr/local/openmpi/bin/:/usr/local/bin:/root/.local/bin:$PATH - -RUN bash /install/install_python_packages.sh - -# Install MXNet -RUN python3 -m pip install -U --pre "mxnet-cu102>=2.0.0b20200926" -f https://dist.mxnet.io/python --user - -# Install PyTorch -RUN python3 -m pip install -U torch torchvision --user - -# Install Horovod -RUN bash /install/install_horovod.sh - -# Install Jupyter Lab -RUN bash /install/install_jupyter_lab.sh - -RUN mkdir -p ${WORKDIR}/data -RUN mkdir -p /.init -RUN cd ${WORKDIR} \ - && git clone https://github.com/dmlc/gluon-nlp \ - && cd gluon-nlp \ - && git checkout master \ - && python3 -m pip install -U -e ."[extras]" --user COPY start_jupyter.sh /start_jupyter.sh COPY devel_entrypoint.sh /devel_entrypoint.sh From 202d89f659bea6855a49b1f2bf21cab266fa3b14 Mon Sep 
17 00:00:00 2001 From: Xingjian Shi Date: Sat, 10 Oct 2020 20:30:46 -0700 Subject: [PATCH 033/115] update --- tools/docker/README.md | 2 +- tools/docker/ubuntu18.04-base-cpu.Dockerfile | 39 ++++++++++++++ tools/docker/ubuntu18.04-base-gpu.Dockerfile | 55 +++++--------------- 3 files changed, 54 insertions(+), 42 deletions(-) create mode 100644 tools/docker/ubuntu18.04-base-cpu.Dockerfile diff --git a/tools/docker/README.md b/tools/docker/README.md index c6b0551647..d86fd79216 100644 --- a/tools/docker/README.md +++ b/tools/docker/README.md @@ -6,7 +6,7 @@ and try out to use GluonNLP to solve your problem. | Name | Description | Target User | |------|-------------|-------------| -| `base` | Extends the CUDA image to include the basic functionalities, e.g., GluonNLP package, MXNet, PyTorch, Horovod. You can directly use the docker to run distributed training jobs. | Users that are willing to use GluonNLP to train models. | +| `base` | Extends the CUDA image to include the basic functionalities, e.g., GluonNLP package, MXNet, PyTorch, Horovod. You can directly use the docker to run distributed training jobs. | Users that are willing to use GluonNLP to train models. For example, you can use the docker image for distributed training. | | `devel` | Extends the base image to include a development platform powered by Jupyter Lab. Some useful functionalities like Tensorboard are pre-installed. | Users that are willing to analyze NLP data and build models with GluonNLP. 
| diff --git a/tools/docker/ubuntu18.04-base-cpu.Dockerfile b/tools/docker/ubuntu18.04-base-cpu.Dockerfile new file mode 100644 index 0000000000..1bb632b4af --- /dev/null +++ b/tools/docker/ubuntu18.04-base-cpu.Dockerfile @@ -0,0 +1,39 @@ +FROM ubuntu:18.04 + +LABEL maintainer="GluonNLP Team" +COPY install /install + +ARG DEBIAN_FRONTEND=noninteractive + +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib" \ + PYTHONIOENCODING=UTF-8 \ + LANG=C.UTF-8 \ + LC_ALL=C.UTF-8 + +ENV WORKDIR=/workspace +ENV SHELL=/bin/bash + +RUN mkdir -p ${WORKDIR} + +RUN bash /install/install_ubuntu18.04_core.sh + +RUN bash /install/install_python_packages.sh + +# Install MXNet +RUN python3 -m pip install -U --pre "mxnet>=2.0.0b20200926" -f https://dist.mxnet.io/python --user + +# Install PyTorch +RUN python3 -m pip install -U torch torchvision --user + +# Install Jupyter Lab +RUN bash /install/install_jupyter_lab.sh + +RUN mkdir -p ${WORKDIR}/data +RUN mkdir -p /.init +RUN cd ${WORKDIR} \ + && git clone https://github.com/dmlc/gluon-nlp \ + && cd gluon-nlp \ + && git checkout master \ + && python3 -m pip install -U -e ."[extras]" --user diff --git a/tools/docker/ubuntu18.04-base-gpu.Dockerfile b/tools/docker/ubuntu18.04-base-gpu.Dockerfile index 076eee0f7a..961b89b7fd 100644 --- a/tools/docker/ubuntu18.04-base-gpu.Dockerfile +++ b/tools/docker/ubuntu18.04-base-gpu.Dockerfile @@ -1,47 +1,20 @@ -FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04 +FROM gluonai/gluon-nlp:cspu-base-latest LABEL maintainer="GluonNLP Team" -COPY install /install -ARG DEBIAN_FRONTEND=noninteractive +COPY start_jupyter.sh /start_jupyter.sh +COPY devel_entrypoint.sh /devel_entrypoint.sh +RUN chmod +x /devel_entrypoint.sh -ENV PYTHONDONTWRITEBYTECODE=1 \ - PYTHONUNBUFFERED=1 \ - LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib" \ - PYTHONIOENCODING=UTF-8 \ - LANG=C.UTF-8 \ - LC_ALL=C.UTF-8 +EXPOSE 8888 +EXPOSE 8787 +EXPOSE 8786 -ENV WORKDIR=/workspace -ENV 
SHELL=/bin/bash +WORKDIR ${WORKDIR} -RUN mkdir -p ${WORKDIR} - -RUN bash /install/install_ubuntu18.04_core.sh - -# Install Open MPI -RUN bash /install/install_openmpi.sh -ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH -ENV PATH=/usr/local/openmpi/bin/:/usr/local/bin:/root/.local/bin:$PATH - -RUN bash /install/install_python_packages.sh - -# Install MXNet -RUN python3 -m pip install -U --pre "mxnet-cu102>=2.0.0b20200926" -f https://dist.mxnet.io/python --user - -# Install PyTorch -RUN python3 -m pip install -U torch torchvision --user - -# Install Horovod -RUN bash /install/install_horovod.sh - -# Install Jupyter Lab -RUN bash /install/install_jupyter_lab.sh - -RUN mkdir -p ${WORKDIR}/data -RUN mkdir -p /.init -RUN cd ${WORKDIR} \ - && git clone https://github.com/dmlc/gluon-nlp \ - && cd gluon-nlp \ - && git checkout master \ - && python3 -m pip install -U -e ."[extras]" --user +# Add Tini +ARG TINI_VERSION=v0.19.0 +ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini +RUN chmod +x /tini +ENTRYPOINT [ "/tini", "--", "/devel_entrypoint.sh" ] +CMD ["/bin/bash"] From 633005e95a7b806cf3c1a84756204d0fb066b857 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Sat, 10 Oct 2020 20:33:16 -0700 Subject: [PATCH 034/115] Update README.md --- tools/docker/README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/docker/README.md b/tools/docker/README.md index d86fd79216..0826981b3d 100644 --- a/tools/docker/README.md +++ b/tools/docker/README.md @@ -45,9 +45,11 @@ docker run --gpus all --rm -it --shm-size=4g gluonai/gluon-nlp:gpu-latest \ To build a docker image fom the dockerfile, you may use the following command: ``` +# Build CPU Dockers docker build -f ubuntu18.04-base-cpu.Dockerfile -t gluonai/gluon-nlp:cpu-base-latest . docker build -f ubuntu18.04-devel-cpu.Dockerfile -t gluonai/gluon-nlp:cpu-latest . +# Build GPU Dockers docker build -f ubuntu18.04-base-gpu.Dockerfile -t gluonai/gluon-nlp:gpu-base-latest . 
docker build -f ubuntu18.04-devel-gpu.Dockerfile -t gluonai/gluon-nlp:gpu-latest . ``` @@ -75,5 +77,8 @@ path of MXNet by querying th MXNet runtime. ### Developers of GluonNLP You may try to login to your dockerhub account and push the image to dockerhub. ``` +docker push gluonai/gluon-nlp:cpu-base-latest +docker push gluonai/gluon-nlp:cpu-latest +docker push gluonai/gluon-nlp:gpu-base-latest docker push gluonai/gluon-nlp:gpu-latest ``` From 8fd9db78a59729a1a0a64d3a5da198c437979811 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Sat, 10 Oct 2020 21:33:21 -0700 Subject: [PATCH 035/115] fix --- tools/docker/README.md | 2 +- tools/docker/ubuntu18.04-base-gpu.Dockerfile | 55 +++++++++++++++----- 2 files changed, 42 insertions(+), 15 deletions(-) diff --git a/tools/docker/README.md b/tools/docker/README.md index 0826981b3d..79e40ae225 100644 --- a/tools/docker/README.md +++ b/tools/docker/README.md @@ -6,7 +6,7 @@ and try out to use GluonNLP to solve your problem. | Name | Description | Target User | |------|-------------|-------------| -| `base` | Extends the CUDA image to include the basic functionalities, e.g., GluonNLP package, MXNet, PyTorch, Horovod. You can directly use the docker to run distributed training jobs. | Users that are willing to use GluonNLP to train models. For example, you can use the docker image for distributed training. | +| `base` | Extends the CUDA image to include the basic functionalities, e.g., GluonNLP package, MXNet, PyTorch, Horovod. You can directly configure other docker objects based on this basic docker | For developers who are willing to | | `devel` | Extends the base image to include a development platform powered by Jupyter Lab. Some useful functionalities like Tensorboard are pre-installed. | Users that are willing to analyze NLP data and build models with GluonNLP. 
| diff --git a/tools/docker/ubuntu18.04-base-gpu.Dockerfile b/tools/docker/ubuntu18.04-base-gpu.Dockerfile index 961b89b7fd..076eee0f7a 100644 --- a/tools/docker/ubuntu18.04-base-gpu.Dockerfile +++ b/tools/docker/ubuntu18.04-base-gpu.Dockerfile @@ -1,20 +1,47 @@ -FROM gluonai/gluon-nlp:cspu-base-latest +FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04 LABEL maintainer="GluonNLP Team" +COPY install /install -COPY start_jupyter.sh /start_jupyter.sh -COPY devel_entrypoint.sh /devel_entrypoint.sh -RUN chmod +x /devel_entrypoint.sh +ARG DEBIAN_FRONTEND=noninteractive -EXPOSE 8888 -EXPOSE 8787 -EXPOSE 8786 +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib" \ + PYTHONIOENCODING=UTF-8 \ + LANG=C.UTF-8 \ + LC_ALL=C.UTF-8 -WORKDIR ${WORKDIR} +ENV WORKDIR=/workspace +ENV SHELL=/bin/bash -# Add Tini -ARG TINI_VERSION=v0.19.0 -ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini -RUN chmod +x /tini -ENTRYPOINT [ "/tini", "--", "/devel_entrypoint.sh" ] -CMD ["/bin/bash"] +RUN mkdir -p ${WORKDIR} + +RUN bash /install/install_ubuntu18.04_core.sh + +# Install Open MPI +RUN bash /install/install_openmpi.sh +ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH +ENV PATH=/usr/local/openmpi/bin/:/usr/local/bin:/root/.local/bin:$PATH + +RUN bash /install/install_python_packages.sh + +# Install MXNet +RUN python3 -m pip install -U --pre "mxnet-cu102>=2.0.0b20200926" -f https://dist.mxnet.io/python --user + +# Install PyTorch +RUN python3 -m pip install -U torch torchvision --user + +# Install Horovod +RUN bash /install/install_horovod.sh + +# Install Jupyter Lab +RUN bash /install/install_jupyter_lab.sh + +RUN mkdir -p ${WORKDIR}/data +RUN mkdir -p /.init +RUN cd ${WORKDIR} \ + && git clone https://github.com/dmlc/gluon-nlp \ + && cd gluon-nlp \ + && git checkout master \ + && python3 -m pip install -U -e ."[extras]" --user From bc72cbe001201ca4b387d558f4745c4e6e8e8253 Mon Sep 17 00:00:00 2001 
From: Xingjian Shi Date: Sat, 10 Oct 2020 21:38:06 -0700 Subject: [PATCH 036/115] Update ubuntu18.04-base-cpu.Dockerfile --- tools/docker/ubuntu18.04-base-cpu.Dockerfile | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/docker/ubuntu18.04-base-cpu.Dockerfile b/tools/docker/ubuntu18.04-base-cpu.Dockerfile index 1bb632b4af..9e04237026 100644 --- a/tools/docker/ubuntu18.04-base-cpu.Dockerfile +++ b/tools/docker/ubuntu18.04-base-cpu.Dockerfile @@ -17,8 +17,14 @@ ENV SHELL=/bin/bash RUN mkdir -p ${WORKDIR} + RUN bash /install/install_ubuntu18.04_core.sh +# Install Open MPI +RUN bash /install/install_openmpi.sh +ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH +ENV PATH=/usr/local/openmpi/bin/:/usr/local/bin:/root/.local/bin:$PATH + RUN bash /install/install_python_packages.sh # Install MXNet From 0f6067bfbffe5fcd4c5c8f769826ca15b9d9bd40 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Sat, 10 Oct 2020 21:43:48 -0700 Subject: [PATCH 037/115] update --- tools/docker/install/install_ubuntu18.04_core.sh | 2 ++ tools/docker/ubuntu18.04-base-cpu.Dockerfile | 2 -- tools/docker/ubuntu18.04-base-gpu.Dockerfile | 2 -- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/tools/docker/install/install_ubuntu18.04_core.sh b/tools/docker/install/install_ubuntu18.04_core.sh index 7278b702f9..145061b115 100644 --- a/tools/docker/install/install_ubuntu18.04_core.sh +++ b/tools/docker/install/install_ubuntu18.04_core.sh @@ -2,6 +2,8 @@ set -e set -u set -o pipefail +export DEBIAN_FRONTEND=noninteractive + apt-get update \ && apt-get install -y --no-install-recommends \ software-properties-common \ diff --git a/tools/docker/ubuntu18.04-base-cpu.Dockerfile b/tools/docker/ubuntu18.04-base-cpu.Dockerfile index 9e04237026..66a1c6dae9 100644 --- a/tools/docker/ubuntu18.04-base-cpu.Dockerfile +++ b/tools/docker/ubuntu18.04-base-cpu.Dockerfile @@ -3,8 +3,6 @@ FROM ubuntu:18.04 LABEL maintainer="GluonNLP Team" COPY install /install -ARG 
DEBIAN_FRONTEND=noninteractive - ENV PYTHONDONTWRITEBYTECODE=1 \ PYTHONUNBUFFERED=1 \ LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib" \ diff --git a/tools/docker/ubuntu18.04-base-gpu.Dockerfile b/tools/docker/ubuntu18.04-base-gpu.Dockerfile index 076eee0f7a..1ebd8c954e 100644 --- a/tools/docker/ubuntu18.04-base-gpu.Dockerfile +++ b/tools/docker/ubuntu18.04-base-gpu.Dockerfile @@ -3,8 +3,6 @@ FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04 LABEL maintainer="GluonNLP Team" COPY install /install -ARG DEBIAN_FRONTEND=noninteractive - ENV PYTHONDONTWRITEBYTECODE=1 \ PYTHONUNBUFFERED=1 \ LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib" \ From 2620dfda6b3f0046173581c3814983a8e3fd3625 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Sat, 10 Oct 2020 21:58:30 -0700 Subject: [PATCH 038/115] add tvm to lazy import --- src/gluonnlp/utils/lazy_imports.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/gluonnlp/utils/lazy_imports.py b/src/gluonnlp/utils/lazy_imports.py index 8b26275b0e..82e2a2fd5e 100644 --- a/src/gluonnlp/utils/lazy_imports.py +++ b/src/gluonnlp/utils/lazy_imports.py @@ -25,7 +25,8 @@ 'try_import_fasttext', 'try_import_langid', 'try_import_boto3', - 'try_import_jieba'] + 'try_import_jieba', + 'try_import_tvm'] def try_import_sentencepiece(): @@ -155,3 +156,12 @@ def try_import_jieba(): raise ImportError('"jieba" is not installed. You must install jieba tokenizer. ' 'You may try to use `pip install jieba`') return jieba + + +def try_import_tvm(): + try: + import tvm + except ImportError: + raise ImportError('"tvm" is not installed. You must install TVM to use the functionality. 
' + 'To install TVM, you may see the documentation in ' + 'https://tvm.apache.org/ or try to use the docker of GluonNLP.') From 2d58e0c16be9c6c8e77a0b924fbdbd2f7a298c2e Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Sat, 10 Oct 2020 22:09:12 -0700 Subject: [PATCH 039/115] update --- tools/{batch => }/docker/gluon_nlp_cpu_job.sh | 0 tools/{batch => }/docker/gluon_nlp_job.sh | 0 tools/docker/ubuntu18.04-base-cpu.Dockerfile | 6 +- tools/docker/ubuntu18.04-base-gpu.Dockerfile | 6 +- tools/docker/ubuntu18.04-devel-cpu.Dockerfile | 97 +------------------ 5 files changed, 7 insertions(+), 102 deletions(-) rename tools/{batch => }/docker/gluon_nlp_cpu_job.sh (100%) rename tools/{batch => }/docker/gluon_nlp_job.sh (100%) diff --git a/tools/batch/docker/gluon_nlp_cpu_job.sh b/tools/docker/gluon_nlp_cpu_job.sh similarity index 100% rename from tools/batch/docker/gluon_nlp_cpu_job.sh rename to tools/docker/gluon_nlp_cpu_job.sh diff --git a/tools/batch/docker/gluon_nlp_job.sh b/tools/docker/gluon_nlp_job.sh similarity index 100% rename from tools/batch/docker/gluon_nlp_job.sh rename to tools/docker/gluon_nlp_job.sh diff --git a/tools/docker/ubuntu18.04-base-cpu.Dockerfile b/tools/docker/ubuntu18.04-base-cpu.Dockerfile index 66a1c6dae9..912a3076d1 100644 --- a/tools/docker/ubuntu18.04-base-cpu.Dockerfile +++ b/tools/docker/ubuntu18.04-base-cpu.Dockerfile @@ -26,10 +26,10 @@ ENV PATH=/usr/local/openmpi/bin/:/usr/local/bin:/root/.local/bin:$PATH RUN bash /install/install_python_packages.sh # Install MXNet -RUN python3 -m pip install -U --pre "mxnet>=2.0.0b20200926" -f https://dist.mxnet.io/python --user +RUN python3 -m pip install -U --pre "mxnet>=2.0.0b20200926" -f https://dist.mxnet.io/python # Install PyTorch -RUN python3 -m pip install -U torch torchvision --user +RUN python3 -m pip install -U torch torchvision # Install Jupyter Lab RUN bash /install/install_jupyter_lab.sh @@ -40,4 +40,4 @@ RUN cd ${WORKDIR} \ && git clone https://github.com/dmlc/gluon-nlp \ && cd gluon-nlp 
\ && git checkout master \ - && python3 -m pip install -U -e ."[extras]" --user + && python3 -m pip install -U -e ."[extras]" diff --git a/tools/docker/ubuntu18.04-base-gpu.Dockerfile b/tools/docker/ubuntu18.04-base-gpu.Dockerfile index 1ebd8c954e..06e1780e35 100644 --- a/tools/docker/ubuntu18.04-base-gpu.Dockerfile +++ b/tools/docker/ubuntu18.04-base-gpu.Dockerfile @@ -25,10 +25,10 @@ ENV PATH=/usr/local/openmpi/bin/:/usr/local/bin:/root/.local/bin:$PATH RUN bash /install/install_python_packages.sh # Install MXNet -RUN python3 -m pip install -U --pre "mxnet-cu102>=2.0.0b20200926" -f https://dist.mxnet.io/python --user +RUN python3 -m pip install -U --pre "mxnet-cu102>=2.0.0b20200926" -f https://dist.mxnet.io/python # Install PyTorch -RUN python3 -m pip install -U torch torchvision --user +RUN python3 -m pip install -U torch torchvision # Install Horovod RUN bash /install/install_horovod.sh @@ -42,4 +42,4 @@ RUN cd ${WORKDIR} \ && git clone https://github.com/dmlc/gluon-nlp \ && cd gluon-nlp \ && git checkout master \ - && python3 -m pip install -U -e ."[extras]" --user + && python3 -m pip install -U -e ."[extras]" diff --git a/tools/docker/ubuntu18.04-devel-cpu.Dockerfile b/tools/docker/ubuntu18.04-devel-cpu.Dockerfile index a338a32dea..47a587ebf0 100644 --- a/tools/docker/ubuntu18.04-devel-cpu.Dockerfile +++ b/tools/docker/ubuntu18.04-devel-cpu.Dockerfile @@ -1,76 +1,7 @@ -FROM ubuntu:18.04 +FROM gluonai/gluon-nlp:cpu-base-latest LABEL maintainer="GluonNLP Team" -ENV WORKDIR=/workspace -ENV SHELL=/bin/bash - -RUN apt-get update \ - && apt-get install -y --no-install-recommends \ - software-properties-common \ - build-essential \ - ca-certificates \ - curl \ - emacs \ - subversion \ - locales \ - cmake \ - git \ - libopencv-dev \ - htop \ - vim \ - wget \ - unzip \ - libopenblas-dev \ - ninja-build \ - openssh-client \ - openssh-server \ - python3-dev \ - python3-pip \ - python3-setuptools \ - libxft-dev \ - zlib1g-dev \ - && apt-get clean \ - && rm -rf 
/var/lib/apt/lists/* - -RUN python3 -m pip --no-cache-dir install --upgrade \ - pip \ - setuptools - -RUN ln -s $(which python3) /usr/local/bin/python - -# Install MXNet -RUN python3 -m pip install -U --pre "mxnet>=2.0.0b20200926" -f https://dist.mxnet.io/python --user - -# Install PyTorch -RUN python3 -m pip install -U torch torchvision --user - -RUN pip3 install --no-cache --upgrade \ - wheel \ - numpy==1.19.1 \ - pandas==0.25.1 \ - pytest \ - Pillow \ - requests==2.22.0 \ - scikit-learn==0.20.4 \ - scipy==1.2.2 \ - urllib3==1.25.8 \ - python-dateutil==2.8.0 \ - sagemaker-experiments==0.* \ - PyYAML==5.3.1 \ - mpi4py==3.0.2 \ - jupyterlab==2.2.4 \ - cmake \ - awscli - -RUN mkdir -p ${WORKDIR}/notebook -RUN mkdir -p ${WORKDIR}/data -RUN cd ${WORKDIR} \ - && git clone https://github.com/dmlc/gluon-nlp \ - && cd gluon-nlp \ - && git checkout master \ - && python3 -m pip install -U -e ."[extras]" --user - COPY start_jupyter.sh /start_jupyter.sh COPY devel_entrypoint.sh /devel_entrypoint.sh RUN chmod +x /devel_entrypoint.sh @@ -81,32 +12,6 @@ EXPOSE 8786 WORKDIR ${WORKDIR} -# Debug horovod by default -RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf - -# Install NodeJS + Tensorboard + TensorboardX -RUN curl -sL https://deb.nodesource.com/setup_14.x | bash - \ - && apt-get install -y nodejs - -RUN apt-get update \ - && apt-get install -y --no-install-recommends \ - libsndfile1-dev - -RUN pip3 install --no-cache --upgrade \ - soundfile==0.10.2 \ - ipywidgets==7.5.1 \ - jupyter_tensorboard==0.2.0 \ - widgetsnbextension==3.5.1 \ - tensorboard==2.1.1 \ - tensorboardX==2.1 -RUN jupyter labextension install jupyterlab_tensorboard \ - && jupyter nbextension enable --py widgetsnbextension \ - && jupyter labextension install @jupyter-widgets/jupyterlab-manager - -# Revise default shell to /bin/bash -RUN jupyter notebook --generate-config \ - && echo "c.NotebookApp.terminado_settings = { 'shell_command': ['/bin/bash'] }" >> /root/.jupyter/jupyter_notebook_config.py - # Add Tini ARG 
TINI_VERSION=v0.19.0 ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini From 8234215d170053b4294d6ca67c1fab86876cf47e Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Sat, 10 Oct 2020 22:10:46 -0700 Subject: [PATCH 040/115] Update README.md --- tools/docker/README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/docker/README.md b/tools/docker/README.md index 79e40ae225..a490941f23 100644 --- a/tools/docker/README.md +++ b/tools/docker/README.md @@ -6,8 +6,9 @@ and try out to use GluonNLP to solve your problem. | Name | Description | Target User | |------|-------------|-------------| -| `base` | Extends the CUDA image to include the basic functionalities, e.g., GluonNLP package, MXNet, PyTorch, Horovod. You can directly configure other docker objects based on this basic docker | For developers who are willing to | -| `devel` | Extends the base image to include a development platform powered by Jupyter Lab. Some useful functionalities like Tensorboard are pre-installed. | Users that are willing to analyze NLP data and build models with GluonNLP. | +| `base` | Extends the CUDA image to include the basic functionalities, e.g., GluonNLP package, MXNet, PyTorch, Horovod. You can directly configure other docker images based on this basic docker | The basic docker | +| `ci` | Image used in GluonNLP CI | GluonNLP Developers | +| `devel` | Extends the base image to include a development platform powered by Jupyter Lab. Some useful functionalities like Tensorboard are pre-installed. | Users that are willing to solve NLP problems and also do distributed training with Horovod + GluonNLP. 
| ## Run Docker From 7dada1d3170786306c0894afb0ad72dbaf391f40 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Sat, 10 Oct 2020 22:19:28 -0700 Subject: [PATCH 041/115] update --- tools/docker/ubuntu18.04-ci-cpu.Dockerfile | 7 +++++++ tools/docker/ubuntu18.04-ci-gpu.Dockerfile | 7 +++++++ 2 files changed, 14 insertions(+) create mode 100644 tools/docker/ubuntu18.04-ci-cpu.Dockerfile create mode 100644 tools/docker/ubuntu18.04-ci-gpu.Dockerfile diff --git a/tools/docker/ubuntu18.04-ci-cpu.Dockerfile b/tools/docker/ubuntu18.04-ci-cpu.Dockerfile new file mode 100644 index 0000000000..f8cd878422 --- /dev/null +++ b/tools/docker/ubuntu18.04-ci-cpu.Dockerfile @@ -0,0 +1,7 @@ +FROM gluonai/gluon-nlp:cpu-base-latest + +LABEL maintainer="GluonNLP Team" + +WORKDIR ${WORKSPACE}/gluon-nlp +ADD gluon_nlp_cpu_job.sh . +RUN chmod +x gluon_nlp_cpu_job.sh diff --git a/tools/docker/ubuntu18.04-ci-gpu.Dockerfile b/tools/docker/ubuntu18.04-ci-gpu.Dockerfile new file mode 100644 index 0000000000..7519fdaec4 --- /dev/null +++ b/tools/docker/ubuntu18.04-ci-gpu.Dockerfile @@ -0,0 +1,7 @@ +FROM gluonai/gluon-nlp:gpu-base-latest + +LABEL maintainer="GluonNLP Team" + +WORKDIR ${WORKSPACE}/gluon-nlp +ADD gluon_nlp_job.sh . +RUN chmod +x gluon_nlp_job.sh From 9fbaf77abd9fcfa6a33f444873018b282204c1a6 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Sat, 10 Oct 2020 22:20:53 -0700 Subject: [PATCH 042/115] Update README.md --- tools/docker/README.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/docker/README.md b/tools/docker/README.md index a490941f23..cbdbee8cae 100644 --- a/tools/docker/README.md +++ b/tools/docker/README.md @@ -46,12 +46,14 @@ docker run --gpus all --rm -it --shm-size=4g gluonai/gluon-nlp:gpu-latest \ To build a docker image fom the dockerfile, you may use the following command: ``` -# Build CPU Dockers +# Build Base Dockers docker build -f ubuntu18.04-base-cpu.Dockerfile -t gluonai/gluon-nlp:cpu-base-latest . 
+docker build -f ubuntu18.04-ci-cpu.Dockerfile -t gluonai/gluon-nlp:cpu-ci-latest . docker build -f ubuntu18.04-devel-cpu.Dockerfile -t gluonai/gluon-nlp:cpu-latest . # Build GPU Dockers docker build -f ubuntu18.04-base-gpu.Dockerfile -t gluonai/gluon-nlp:gpu-base-latest . +docker build -f ubuntu18.04-ci-gpu.Dockerfile -t gluonai/gluon-nlp:gpu-ci-latest . docker build -f ubuntu18.04-devel-gpu.Dockerfile -t gluonai/gluon-nlp:gpu-latest . ``` @@ -79,7 +81,10 @@ path of MXNet by querying th MXNet runtime. You may try to login to your dockerhub account and push the image to dockerhub. ``` docker push gluonai/gluon-nlp:cpu-base-latest +docker push gluonai/gluon-nlp:cpu-ci-latest docker push gluonai/gluon-nlp:cpu-latest + docker push gluonai/gluon-nlp:gpu-base-latest +docker push gluonai/gluon-nlp:gpu-ci-latest docker push gluonai/gluon-nlp:gpu-latest ``` From c62639da9aeaed213e9652822d57fa587c0c8b00 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Sat, 10 Oct 2020 22:37:20 -0700 Subject: [PATCH 043/115] Update run_squad2_albert_base.sh --- .../commands/run_squad2_albert_base.sh | 32 ------------------- 1 file changed, 32 deletions(-) diff --git a/scripts/question_answering/commands/run_squad2_albert_base.sh b/scripts/question_answering/commands/run_squad2_albert_base.sh index 86bccc007b..e69de29bb2 100644 --- a/scripts/question_answering/commands/run_squad2_albert_base.sh +++ b/scripts/question_answering/commands/run_squad2_albert_base.sh @@ -1,32 +0,0 @@ -USE_HOROVOD=${1:-0} # Horovod flag. 
Do not use horovod by default - -VERSION=2.0 # Either 2.0 or 1.1 -MODEL_NAME=google_albert_base_v2 - -# Prepare the Data -nlp_data prepare_squad --version ${VERSION} - -# Run the script - -if [ ${USE_HOROVOD} -eq 0 ]; -then - RUN_COMMAND="python3 run_squad.py --gpus 0,1,2,3" -else - RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 run_squad.py --comm_backend horovod" -fi - -${RUN_COMMAND} --model_name ${MODEL_NAME} \ - --data_dir squad \ - --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \ - --version ${VERSION} \ - --do_eval \ - --do_train \ - --batch_size 4 \ - --num_accumulated 3 \ - --epochs 3 \ - --lr 2e-5 \ - --warmup_ratio 0.1 \ - --wd 0.01 \ - --max_seq_length 512 \ - --max_grad_norm 0.1 \ - --overwrite_cache From 7e810ad9c714720922d66c69f12d5ad5a284bd2f Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Sat, 10 Oct 2020 22:43:01 -0700 Subject: [PATCH 044/115] update --- scripts/question_answering/commands/README.md | 8 + .../commands/generate_commands.py | 140 ++++++++++++++++++ .../commands/run_squad.template.sh | 42 ++++++ .../commands/run_squad2_albert_base.sh | 40 +++++ .../commands/run_squad2_albert_large.sh | 39 +++-- .../commands/run_squad2_albert_xlarge.sh | 39 +++-- .../commands/run_squad2_albert_xxlarge.sh | 39 +++-- .../commands/run_squad2_electra_base.sh | 39 +++-- .../commands/run_squad2_electra_large.sh | 39 +++-- .../commands/run_squad2_electra_small.sh | 40 +++-- .../commands/run_squad2_mobilebert.sh | 39 +++-- .../commands/run_squad2_roberta_large.sh | 37 +++-- .../commands/run_squad2_uncased_bert_base.sh | 39 +++-- .../commands/run_squad2_uncased_bert_large.sh | 39 +++-- 14 files changed, 501 insertions(+), 118 deletions(-) create mode 100644 scripts/question_answering/commands/README.md create mode 100644 scripts/question_answering/commands/generate_commands.py create mode 100644 scripts/question_answering/commands/run_squad.template.sh diff --git a/scripts/question_answering/commands/README.md 
b/scripts/question_answering/commands/README.md new file mode 100644 index 0000000000..626fc0a786 --- /dev/null +++ b/scripts/question_answering/commands/README.md @@ -0,0 +1,8 @@ +# Commands For Training on SQuAD + +All commands are generated by parsing the template in [run_squad.template.sh](run_squad.template.sh). +To generate all commands, use the following code. + +```bash +python3 generate_commands.py +``` diff --git a/scripts/question_answering/commands/generate_commands.py b/scripts/question_answering/commands/generate_commands.py new file mode 100644 index 0000000000..6ed26bd1e9 --- /dev/null +++ b/scripts/question_answering/commands/generate_commands.py @@ -0,0 +1,140 @@ +from gluonnlp.utils.config import CfgNode +import re + + +def base_cfg(): + cfg = CfgNode() + cfg.model_name = 'google_albert_base_v2' + cfg.version = 2.0 + cfg.batch_size = 4 + cfg.num_accumulated = 3 + cfg.epochs = 3 + cfg.lr = 2e-5 + cfg.warmup_ratio = 0.1 + cfg.wd = 0.01 + cfg.max_grad_norm = 0.1 + cfg.max_seq_length = 512 + cfg.layerwise_decay = -1 + return cfg + + +def albert_base_cfg(): + return base_cfg() + + +def albert_large_cfg(): + cfg = base_cfg() + cfg.model_name = 'google_albert_large_v2' + cfg.batch_size = 3 + cfg.num_accumulated = 4 + return cfg + + +def albert_xlarge_cfg(): + cfg = base_cfg() + cfg.model_name = 'google_albert_xlarge_v2' + cfg.batch_size = 1 + cfg.num_accumulated = 12 + return cfg + + +def albert_xxlarge_cfg(): + cfg = albert_xlarge_cfg() + cfg.model_name = 'google_albert_xxlarge_v2' + return cfg + + +def electra_base_cfg(): + cfg = base_cfg() + cfg.model_name = 'google_electra_base' + cfg.batch_size = 8 + cfg.num_accumulated = 1 + cfg.lr = 1e-4 + cfg.epochs = 2 + cfg.layerwise_decay = 0.8 + cfg.wd = 0 + return cfg + + +def electra_large_cfg(): + cfg = electra_base_cfg() + cfg.model_name = 'google_electra_large' + cfg.batch_size = 2 + cfg.num_accumulated = 4 + cfg.lr = 1e-5 + cfg.layerwise_decay = 0.9 + return cfg + + +def electra_small_cfg(): + cfg = 
electra_base_cfg() + cfg.model_name = 'google_electra_small' + cfg.batch_size = 8 + cfg.num_accumulated = 1 + cfg.lr = 3e-4 + cfg.epochs = 2 + cfg.layerwise_decay = 0.8 + return cfg + + +def mobilebert_cfg(): + cfg = base_cfg() + cfg.model_name = 'google_uncased_mobilebert' + cfg.batch_size = 8 + cfg.num_accumulated = 1 + cfg.lr = 4e-5 + cfg.epochs = 5 + cfg.max_seq_length = 384 + return cfg + + +def roberta_large_cfg(): + cfg = base_cfg() + cfg.model_name = 'fairseq_roberta_large' + cfg.batch_size = 2 + cfg.num_accumulated = 6 + cfg.epochs = 3 + cfg.lr = 3e-5 + cfg.warmup_ratio = 0.2 + cfg.wd = 0.01 + return cfg + + +def uncased_bert_base_cfg(): + cfg = base_cfg() + cfg.model_name = 'google_en_uncased_bert_base' + cfg.batch_size = 6 + cfg.num_accumulated = 2 + cfg.lr = 3e-5 + return cfg + + +def uncased_bert_large_cfg(): + cfg = uncased_bert_base_cfg() + cfg.model_name = 'google_en_uncased_bert_large' + cfg.batch_size = 2 + cfg.num_accumulated = 6 + return cfg + + +def gen_command(config, template_path, out_path): + print(f'Generating from "{template_path}" to "{out_path}"') + + def replace_fn(match): + return str(getattr(config, match.groups()[0])) + + with open(template_path, 'r') as in_f: + with open(out_path, 'w') as out_f: + dat = in_f.read() + updated_dat = re.sub(r'{{ (.+) }}', replace_fn, dat) + out_f.write(updated_dat) + + +if __name__ == '__main__': + for squad_version in [1.1, 2.0]: + for cfg_func in [albert_base_cfg, albert_large_cfg, albert_xlarge_cfg, albert_xxlarge_cfg, + electra_base_cfg, electra_large_cfg, electra_small_cfg, mobilebert_cfg, + roberta_large_cfg, uncased_bert_base_cfg, uncased_bert_large_cfg]: + prefix = cfg_func.__name__[:-len('_cfg')] + gen_command(cfg_func(), 'run_squad.template.sh', + f'run_squad2_{prefix}.sh') diff --git a/scripts/question_answering/commands/run_squad.template.sh b/scripts/question_answering/commands/run_squad.template.sh new file mode 100644 index 0000000000..f8e1d3eb7f --- /dev/null +++ 
b/scripts/question_answering/commands/run_squad.template.sh @@ -0,0 +1,42 @@ +# Generated by "generate_commands.py" + +USE_HOROVOD=${1:-0} # Horovod flag. 0 --> not use horovod, 1 --> use horovod +VERSION=${2:-2.0} # SQuAD Version +MODEL_NAME={{ model_name }} +BATCH_SIZE={{ batch_size }} +NUM_ACCUMULATED={{ num_accumulated }} +EPOCHS={{ epochs }} +LR={{ lr }} +WARMUP_RATIO={{ warmup_ratio }} +WD={{ wd }} +MAX_SEQ_LENGTH={{ max_seq_length }} +MAX_GRAD_NORM={{ max_grad_norm }} +LAYERWISE_DECAY={{ layerwise_decay }} + +# Prepare the Data +nlp_data prepare_squad --version ${VERSION} + +# Run the script +if [ ${USE_HOROVOD} -eq 0 ]; +then + RUN_COMMAND="python3 run_squad.py --gpus 0,1,2,3" +else + RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 run_squad.py --comm_backend horovod" +fi +python3 run_squad.py \ + --model_name ${MODEL_NAME} \ + --data_dir squad \ + --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \ + --version ${VERSION} \ + --do_eval \ + --do_train \ + --batch_size ${BATCH_SIZE} \ + --num_accumulated ${NUM_ACCUMULATED} \ + --layerwise_decay ${LAYERWISE_DECAY} \ + --epochs ${EPOCHS} \ + --lr ${LR} \ + --warmup_ratio ${WARMUP_RATIO} \ + --wd ${WD} \ + --max_seq_length ${MAX_SEQ_LENGTH} \ + --max_grad_norm ${MAX_GRAD_NORM} \ + --overwrite_cache diff --git a/scripts/question_answering/commands/run_squad2_albert_base.sh b/scripts/question_answering/commands/run_squad2_albert_base.sh index e69de29bb2..f0090fd25b 100644 --- a/scripts/question_answering/commands/run_squad2_albert_base.sh +++ b/scripts/question_answering/commands/run_squad2_albert_base.sh @@ -0,0 +1,40 @@ +USE_HOROVOD=${1:-0} # Horovod flag. 
Do not use horovod by default +VERSION=${2:-2.0} # Version +MODEL_NAME=google_albert_base_v2 +BATCH_SIZE=4 +NUM_ACCUMULATED=3 +EPOCHS=3 +LR=2e-05 +WARMUP_RATIO=0.1 +WD=0.01 +MAX_SEQ_LENGTH=512 +MAX_GRAD_NORM=0.1 +LAYERWISE_DECAY=-1 + +# Prepare the Data +nlp_data prepare_squad --version ${VERSION} + +# Run the script +if [ ${USE_HOROVOD} -eq 0 ]; +then + RUN_COMMAND="python3 run_squad.py --gpus 0,1,2,3" +else + RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 run_squad.py --comm_backend horovod" +fi +python3 run_squad.py \ + --model_name ${MODEL_NAME} \ + --data_dir squad \ + --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \ + --version ${VERSION} \ + --do_eval \ + --do_train \ + --batch_size ${BATCH_SIZE} \ + --num_accumulated ${NUM_ACCUMULATED} \ + --layerwise_decay ${LAYERWISE_DECAY} \ + --epochs ${EPOCHS} \ + --lr ${LR} \ + --warmup_ratio ${WARMUP_RATIO} \ + --wd ${WD} \ + --max_seq_length ${MAX_SEQ_LENGTH} \ + --max_grad_norm ${MAX_GRAD_NORM} \ + --overwrite_cache diff --git a/scripts/question_answering/commands/run_squad2_albert_large.sh b/scripts/question_answering/commands/run_squad2_albert_large.sh index f4c9d069c5..c3e157ef56 100644 --- a/scripts/question_answering/commands/run_squad2_albert_large.sh +++ b/scripts/question_answering/commands/run_squad2_albert_large.sh @@ -1,11 +1,26 @@ -VERSION=2.0 # Either 2.0 or 1.1 +USE_HOROVOD=${1:-0} # Horovod flag. 
Do not use horovod by default +VERSION=${2:-2.0} # Version MODEL_NAME=google_albert_large_v2 +BATCH_SIZE=3 +NUM_ACCUMULATED=4 +EPOCHS=3 +LR=2e-05 +WARMUP_RATIO=0.1 +WD=0.01 +MAX_SEQ_LENGTH=512 +MAX_GRAD_NORM=0.1 +LAYERWISE_DECAY=-1 # Prepare the Data nlp_data prepare_squad --version ${VERSION} # Run the script - +if [ ${USE_HOROVOD} -eq 0 ]; +then + RUN_COMMAND="python3 run_squad.py --gpus 0,1,2,3" +else + RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 run_squad.py --comm_backend horovod" +fi python3 run_squad.py \ --model_name ${MODEL_NAME} \ --data_dir squad \ @@ -13,13 +28,13 @@ python3 run_squad.py \ --version ${VERSION} \ --do_eval \ --do_train \ - --batch_size 3 \ - --num_accumulated 4 \ - --gpus 0,1,2,3 \ - --epochs 3 \ - --lr 2e-5 \ - --warmup_ratio 0.1 \ - --wd 0.01 \ - --max_seq_length 512 \ - --max_grad_norm 0.1 \ - --overwrite_cache \ + --batch_size ${BATCH_SIZE} \ + --num_accumulated ${NUM_ACCUMULATED} \ + --layerwise_decay ${LAYERWISE_DECAY} \ + --epochs ${EPOCHS} \ + --lr ${LR} \ + --warmup_ratio ${WARMUP_RATIO} \ + --wd ${WD} \ + --max_seq_length ${MAX_SEQ_LENGTH} \ + --max_grad_norm ${MAX_GRAD_NORM} \ + --overwrite_cache diff --git a/scripts/question_answering/commands/run_squad2_albert_xlarge.sh b/scripts/question_answering/commands/run_squad2_albert_xlarge.sh index d14994422d..e7810248b3 100644 --- a/scripts/question_answering/commands/run_squad2_albert_xlarge.sh +++ b/scripts/question_answering/commands/run_squad2_albert_xlarge.sh @@ -1,11 +1,26 @@ -VERSION=2.0 # Either 2.0 or 1.1 +USE_HOROVOD=${1:-0} # Horovod flag. 
Do not use horovod by default +VERSION=${2:-2.0} # Version MODEL_NAME=google_albert_xlarge_v2 +BATCH_SIZE=1 +NUM_ACCUMULATED=12 +EPOCHS=3 +LR=2e-05 +WARMUP_RATIO=0.1 +WD=0.01 +MAX_SEQ_LENGTH=512 +MAX_GRAD_NORM=0.1 +LAYERWISE_DECAY=-1 # Prepare the Data nlp_data prepare_squad --version ${VERSION} # Run the script - +if [ ${USE_HOROVOD} -eq 0 ]; +then + RUN_COMMAND="python3 run_squad.py --gpus 0,1,2,3" +else + RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 run_squad.py --comm_backend horovod" +fi python3 run_squad.py \ --model_name ${MODEL_NAME} \ --data_dir squad \ @@ -13,13 +28,13 @@ python3 run_squad.py \ --version ${VERSION} \ --do_eval \ --do_train \ - --batch_size 1 \ - --num_accumulated 12 \ - --gpus 0,1,2,3 \ - --epochs 3 \ - --lr 2e-5 \ - --warmup_ratio 0.1 \ - --wd 0.01 \ - --max_seq_length 512 \ - --max_grad_norm 0.1 \ - --overwrite_cache \ + --batch_size ${BATCH_SIZE} \ + --num_accumulated ${NUM_ACCUMULATED} \ + --layerwise_decay ${LAYERWISE_DECAY} \ + --epochs ${EPOCHS} \ + --lr ${LR} \ + --warmup_ratio ${WARMUP_RATIO} \ + --wd ${WD} \ + --max_seq_length ${MAX_SEQ_LENGTH} \ + --max_grad_norm ${MAX_GRAD_NORM} \ + --overwrite_cache diff --git a/scripts/question_answering/commands/run_squad2_albert_xxlarge.sh b/scripts/question_answering/commands/run_squad2_albert_xxlarge.sh index fdb6e89658..9154028a38 100644 --- a/scripts/question_answering/commands/run_squad2_albert_xxlarge.sh +++ b/scripts/question_answering/commands/run_squad2_albert_xxlarge.sh @@ -1,11 +1,26 @@ -VERSION=2.0 # Either 2.0 or 1.1 +USE_HOROVOD=${1:-0} # Horovod flag. 
Do not use horovod by default +VERSION=${2:-2.0} # Version MODEL_NAME=google_albert_xxlarge_v2 +BATCH_SIZE=1 +NUM_ACCUMULATED=12 +EPOCHS=3 +LR=2e-05 +WARMUP_RATIO=0.1 +WD=0.01 +MAX_SEQ_LENGTH=512 +MAX_GRAD_NORM=0.1 +LAYERWISE_DECAY=-1 # Prepare the Data nlp_data prepare_squad --version ${VERSION} # Run the script - +if [ ${USE_HOROVOD} -eq 0 ]; +then + RUN_COMMAND="python3 run_squad.py --gpus 0,1,2,3" +else + RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 run_squad.py --comm_backend horovod" +fi python3 run_squad.py \ --model_name ${MODEL_NAME} \ --data_dir squad \ @@ -13,13 +28,13 @@ python3 run_squad.py \ --version ${VERSION} \ --do_eval \ --do_train \ - --batch_size 1 \ - --num_accumulated 12 \ - --gpus 0,1,2,3 \ - --epochs 3 \ - --lr 2e-5 \ - --warmup_ratio 0.1 \ - --wd 0.01 \ - --max_seq_length 512 \ - --max_grad_norm 0.1 \ - --overwrite_cache \ + --batch_size ${BATCH_SIZE} \ + --num_accumulated ${NUM_ACCUMULATED} \ + --layerwise_decay ${LAYERWISE_DECAY} \ + --epochs ${EPOCHS} \ + --lr ${LR} \ + --warmup_ratio ${WARMUP_RATIO} \ + --wd ${WD} \ + --max_seq_length ${MAX_SEQ_LENGTH} \ + --max_grad_norm ${MAX_GRAD_NORM} \ + --overwrite_cache diff --git a/scripts/question_answering/commands/run_squad2_electra_base.sh b/scripts/question_answering/commands/run_squad2_electra_base.sh index a500a3ae50..53f94456d4 100644 --- a/scripts/question_answering/commands/run_squad2_electra_base.sh +++ b/scripts/question_answering/commands/run_squad2_electra_base.sh @@ -1,11 +1,26 @@ -VERSION=2.0 # Either 2.0 or 1.1 +USE_HOROVOD=${1:-0} # Horovod flag. 
Do not use horovod by default +VERSION=${2:-2.0} # Version MODEL_NAME=google_electra_base +BATCH_SIZE=8 +NUM_ACCUMULATED=1 +EPOCHS=2 +LR=0.0001 +WARMUP_RATIO=0.1 +WD=0 +MAX_SEQ_LENGTH=512 +MAX_GRAD_NORM=0.1 +LAYERWISE_DECAY=0.8 # Prepare the Data nlp_data prepare_squad --version ${VERSION} # Run the script - +if [ ${USE_HOROVOD} -eq 0 ]; +then + RUN_COMMAND="python3 run_squad.py --gpus 0,1,2,3" +else + RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 run_squad.py --comm_backend horovod" +fi python3 run_squad.py \ --model_name ${MODEL_NAME} \ --data_dir squad \ @@ -13,13 +28,13 @@ python3 run_squad.py \ --version ${VERSION} \ --do_eval \ --do_train \ - --batch_size 8 \ - --num_accumulated 1 \ - --gpus 0,1,2,3 \ - --epochs 2 \ - --lr 1e-4 \ - --layerwise_decay 0.8 \ - --warmup_ratio 0.1 \ - --wd 0 \ - --max_seq_length 512 \ - --max_grad_norm 0.1 \ + --batch_size ${BATCH_SIZE} \ + --num_accumulated ${NUM_ACCUMULATED} \ + --layerwise_decay ${LAYERWISE_DECAY} \ + --epochs ${EPOCHS} \ + --lr ${LR} \ + --warmup_ratio ${WARMUP_RATIO} \ + --wd ${WD} \ + --max_seq_length ${MAX_SEQ_LENGTH} \ + --max_grad_norm ${MAX_GRAD_NORM} \ + --overwrite_cache diff --git a/scripts/question_answering/commands/run_squad2_electra_large.sh b/scripts/question_answering/commands/run_squad2_electra_large.sh index 61872f110b..79f4c002db 100644 --- a/scripts/question_answering/commands/run_squad2_electra_large.sh +++ b/scripts/question_answering/commands/run_squad2_electra_large.sh @@ -1,11 +1,26 @@ -VERSION=2.0 # Either 2.0 or 1.1 +USE_HOROVOD=${1:-0} # Horovod flag. 
Do not use horovod by default +VERSION=${2:-2.0} # Version MODEL_NAME=google_electra_large +BATCH_SIZE=2 +NUM_ACCUMULATED=4 +EPOCHS=2 +LR=1e-05 +WARMUP_RATIO=0.1 +WD=0 +MAX_SEQ_LENGTH=512 +MAX_GRAD_NORM=0.1 +LAYERWISE_DECAY=0.9 # Prepare the Data nlp_data prepare_squad --version ${VERSION} # Run the script - +if [ ${USE_HOROVOD} -eq 0 ]; +then + RUN_COMMAND="python3 run_squad.py --gpus 0,1,2,3" +else + RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 run_squad.py --comm_backend horovod" +fi python3 run_squad.py \ --model_name ${MODEL_NAME} \ --data_dir squad \ @@ -13,13 +28,13 @@ python3 run_squad.py \ --version ${VERSION} \ --do_eval \ --do_train \ - --batch_size 2 \ - --num_accumulated 4 \ - --gpus 0,1,2,3 \ - --epochs 2 \ - --lr 5e-5 \ - --layerwise_decay 0.9 \ - --warmup_ratio 0.1 \ - --wd 0 \ - --max_seq_length 512 \ - --max_grad_norm 0.1 \ + --batch_size ${BATCH_SIZE} \ + --num_accumulated ${NUM_ACCUMULATED} \ + --layerwise_decay ${LAYERWISE_DECAY} \ + --epochs ${EPOCHS} \ + --lr ${LR} \ + --warmup_ratio ${WARMUP_RATIO} \ + --wd ${WD} \ + --max_seq_length ${MAX_SEQ_LENGTH} \ + --max_grad_norm ${MAX_GRAD_NORM} \ + --overwrite_cache diff --git a/scripts/question_answering/commands/run_squad2_electra_small.sh b/scripts/question_answering/commands/run_squad2_electra_small.sh index e174258c17..ac120ec278 100644 --- a/scripts/question_answering/commands/run_squad2_electra_small.sh +++ b/scripts/question_answering/commands/run_squad2_electra_small.sh @@ -1,10 +1,26 @@ -VERSION=2.0 # Either 2.0 or 1.1 +USE_HOROVOD=${1:-0} # Horovod flag. 
Do not use horovod by default +VERSION=${2:-2.0} # Version MODEL_NAME=google_electra_small +BATCH_SIZE=8 +NUM_ACCUMULATED=1 +EPOCHS=2 +LR=0.0003 +WARMUP_RATIO=0.1 +WD=0 +MAX_SEQ_LENGTH=512 +MAX_GRAD_NORM=0.1 +LAYERWISE_DECAY=0.8 + # Prepare the Data nlp_data prepare_squad --version ${VERSION} # Run the script - +if [ ${USE_HOROVOD} -eq 0 ]; +then + RUN_COMMAND="python3 run_squad.py --gpus 0,1,2,3" +else + RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 run_squad.py --comm_backend horovod" +fi python3 run_squad.py \ --model_name ${MODEL_NAME} \ --data_dir squad \ @@ -12,13 +28,13 @@ python3 run_squad.py \ --version ${VERSION} \ --do_eval \ --do_train \ - --batch_size 32 \ - --num_accumulated 1 \ - --gpus 0 \ - --epochs 2 \ - --lr 3e-4 \ - --layerwise_decay 0.8 \ - --warmup_ratio 0.1 \ - --wd 0 \ - --max_seq_length 512 \ - --max_grad_norm 0.1 \ + --batch_size ${BATCH_SIZE} \ + --num_accumulated ${NUM_ACCUMULATED} \ + --layerwise_decay ${LAYERWISE_DECAY} \ + --epochs ${EPOCHS} \ + --lr ${LR} \ + --warmup_ratio ${WARMUP_RATIO} \ + --wd ${WD} \ + --max_seq_length ${MAX_SEQ_LENGTH} \ + --max_grad_norm ${MAX_GRAD_NORM} \ + --overwrite_cache diff --git a/scripts/question_answering/commands/run_squad2_mobilebert.sh b/scripts/question_answering/commands/run_squad2_mobilebert.sh index cfeee56356..13ffeb0246 100644 --- a/scripts/question_answering/commands/run_squad2_mobilebert.sh +++ b/scripts/question_answering/commands/run_squad2_mobilebert.sh @@ -1,11 +1,26 @@ -VERSION=2.0 # Either 2.0 or 1.1 +USE_HOROVOD=${1:-0} # Horovod flag. 
Do not use horovod by default +VERSION=${2:-2.0} # Version MODEL_NAME=google_uncased_mobilebert +BATCH_SIZE=8 +NUM_ACCUMULATED=1 +EPOCHS=5 +LR=4e-05 +WARMUP_RATIO=0.1 +WD=0.01 +MAX_SEQ_LENGTH=384 +MAX_GRAD_NORM=0.1 +LAYERWISE_DECAY=-1 # Prepare the Data nlp_data prepare_squad --version ${VERSION} # Run the script - +if [ ${USE_HOROVOD} -eq 0 ]; +then + RUN_COMMAND="python3 run_squad.py --gpus 0,1,2,3" +else + RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 run_squad.py --comm_backend horovod" +fi python3 run_squad.py \ --model_name ${MODEL_NAME} \ --data_dir squad \ @@ -13,13 +28,13 @@ python3 run_squad.py \ --version ${VERSION} \ --do_eval \ --do_train \ - --batch_size 8 \ - --num_accumulated 1 \ - --gpus 0,1,2,3 \ - --epochs 5 \ - --lr 4e-5 \ - --warmup_steps 1400 \ - --wd 0.0 \ - --max_seq_length 384 \ - --max_grad_norm 0.1 \ - --overwrite_cache \ + --batch_size ${BATCH_SIZE} \ + --num_accumulated ${NUM_ACCUMULATED} \ + --layerwise_decay ${LAYERWISE_DECAY} \ + --epochs ${EPOCHS} \ + --lr ${LR} \ + --warmup_ratio ${WARMUP_RATIO} \ + --wd ${WD} \ + --max_seq_length ${MAX_SEQ_LENGTH} \ + --max_grad_norm ${MAX_GRAD_NORM} \ + --overwrite_cache diff --git a/scripts/question_answering/commands/run_squad2_roberta_large.sh b/scripts/question_answering/commands/run_squad2_roberta_large.sh index 3cdf2cb6ea..cdea52c0d6 100644 --- a/scripts/question_answering/commands/run_squad2_roberta_large.sh +++ b/scripts/question_answering/commands/run_squad2_roberta_large.sh @@ -1,10 +1,26 @@ -VERSION=2.0 # Either 2.0 or 1.1 +USE_HOROVOD=${1:-0} # Horovod flag. 
Do not use horovod by default +VERSION=${2:-2.0} # Version MODEL_NAME=fairseq_roberta_large +BATCH_SIZE=2 +NUM_ACCUMULATED=6 +EPOCHS=3 +LR=3e-05 +WARMUP_RATIO=0.2 +WD=0.01 +MAX_SEQ_LENGTH=512 +MAX_GRAD_NORM=0.1 +LAYERWISE_DECAY=-1 # Prepare the Data nlp_data prepare_squad --version ${VERSION} # Run the script +if [ ${USE_HOROVOD} -eq 0 ]; +then + RUN_COMMAND="python3 run_squad.py --gpus 0,1,2,3" +else + RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 run_squad.py --comm_backend horovod" +fi python3 run_squad.py \ --model_name ${MODEL_NAME} \ --data_dir squad \ @@ -12,12 +28,13 @@ python3 run_squad.py \ --version ${VERSION} \ --do_eval \ --do_train \ - --batch_size 2 \ - --num_accumulated 6 \ - --gpus 0,1,2,3 \ - --epochs 3 \ - --lr 3e-5 \ - --warmup_ratio 0.2 \ - --wd 0.01 \ - --max_seq_length 512 \ - --max_grad_norm 0.1 \ + --batch_size ${BATCH_SIZE} \ + --num_accumulated ${NUM_ACCUMULATED} \ + --layerwise_decay ${LAYERWISE_DECAY} \ + --epochs ${EPOCHS} \ + --lr ${LR} \ + --warmup_ratio ${WARMUP_RATIO} \ + --wd ${WD} \ + --max_seq_length ${MAX_SEQ_LENGTH} \ + --max_grad_norm ${MAX_GRAD_NORM} \ + --overwrite_cache diff --git a/scripts/question_answering/commands/run_squad2_uncased_bert_base.sh b/scripts/question_answering/commands/run_squad2_uncased_bert_base.sh index f087860014..7755755e5d 100644 --- a/scripts/question_answering/commands/run_squad2_uncased_bert_base.sh +++ b/scripts/question_answering/commands/run_squad2_uncased_bert_base.sh @@ -1,11 +1,26 @@ -VERSION=2.0 # Either 2.0 or 1.1 +USE_HOROVOD=${1:-0} # Horovod flag. 
Do not use horovod by default +VERSION=${2:-2.0} # Version MODEL_NAME=google_en_uncased_bert_base +BATCH_SIZE=6 +NUM_ACCUMULATED=2 +EPOCHS=3 +LR=3e-05 +WARMUP_RATIO=0.1 +WD=0.01 +MAX_SEQ_LENGTH=512 +MAX_GRAD_NORM=0.1 +LAYERWISE_DECAY=-1 # Prepare the Data nlp_data prepare_squad --version ${VERSION} # Run the script - +if [ ${USE_HOROVOD} -eq 0 ]; +then + RUN_COMMAND="python3 run_squad.py --gpus 0,1,2,3" +else + RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 run_squad.py --comm_backend horovod" +fi python3 run_squad.py \ --model_name ${MODEL_NAME} \ --data_dir squad \ @@ -13,13 +28,13 @@ python3 run_squad.py \ --version ${VERSION} \ --do_eval \ --do_train \ - --batch_size 6 \ - --num_accumulated 2 \ - --gpus 0,1,2,3 \ - --epochs 3 \ - --lr 3e-5 \ - --warmup_ratio 0.1 \ - --wd 0.01 \ - --max_seq_length 512 \ - --max_grad_norm 0.1 \ - --overwrite_cache \ + --batch_size ${BATCH_SIZE} \ + --num_accumulated ${NUM_ACCUMULATED} \ + --layerwise_decay ${LAYERWISE_DECAY} \ + --epochs ${EPOCHS} \ + --lr ${LR} \ + --warmup_ratio ${WARMUP_RATIO} \ + --wd ${WD} \ + --max_seq_length ${MAX_SEQ_LENGTH} \ + --max_grad_norm ${MAX_GRAD_NORM} \ + --overwrite_cache diff --git a/scripts/question_answering/commands/run_squad2_uncased_bert_large.sh b/scripts/question_answering/commands/run_squad2_uncased_bert_large.sh index 0e80da7688..54754c3522 100644 --- a/scripts/question_answering/commands/run_squad2_uncased_bert_large.sh +++ b/scripts/question_answering/commands/run_squad2_uncased_bert_large.sh @@ -1,11 +1,26 @@ -VERSION=2.0 # Either 2.0 or 1.1 +USE_HOROVOD=${1:-0} # Horovod flag. 
Do not use horovod by default +VERSION=${2:-2.0} # Version MODEL_NAME=google_en_uncased_bert_large +BATCH_SIZE=2 +NUM_ACCUMULATED=6 +EPOCHS=3 +LR=3e-05 +WARMUP_RATIO=0.1 +WD=0.01 +MAX_SEQ_LENGTH=512 +MAX_GRAD_NORM=0.1 +LAYERWISE_DECAY=-1 # Prepare the Data nlp_data prepare_squad --version ${VERSION} # Run the script - +if [ ${USE_HOROVOD} -eq 0 ]; +then + RUN_COMMAND="python3 run_squad.py --gpus 0,1,2,3" +else + RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 run_squad.py --comm_backend horovod" +fi python3 run_squad.py \ --model_name ${MODEL_NAME} \ --data_dir squad \ @@ -13,13 +28,13 @@ python3 run_squad.py \ --version ${VERSION} \ --do_eval \ --do_train \ - --batch_size 2 \ - --num_accumulated 6 \ - --gpus 0,1,2,3 \ - --epochs 3 \ - --lr 3e-5 \ - --warmup_ratio 0.1 \ - --wd 0.01 \ - --max_seq_length 512 \ - --max_grad_norm 0.1 \ - --overwrite_cache \ + --batch_size ${BATCH_SIZE} \ + --num_accumulated ${NUM_ACCUMULATED} \ + --layerwise_decay ${LAYERWISE_DECAY} \ + --epochs ${EPOCHS} \ + --lr ${LR} \ + --warmup_ratio ${WARMUP_RATIO} \ + --wd ${WD} \ + --max_seq_length ${MAX_SEQ_LENGTH} \ + --max_grad_norm ${MAX_GRAD_NORM} \ + --overwrite_cache From 2cb007d187a47a916350effe3c9c16045cb1993a Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Sat, 10 Oct 2020 22:49:46 -0700 Subject: [PATCH 045/115] update --- tools/batch/README.md | 36 +++++++++++++++++++++++++++++------- tools/docker/README.md | 8 ++++++-- 2 files changed, 35 insertions(+), 9 deletions(-) diff --git a/tools/batch/README.md b/tools/batch/README.md index 1ba8cfc5e1..786c2da09c 100644 --- a/tools/batch/README.md +++ b/tools/batch/README.md @@ -13,11 +13,39 @@ python3 submit-job.py \ --wait ``` +# Updating the Docker for AWS Batch. + +Our current batch job dockers are in 747303060528.dkr.ecr.us-east-1.amazonaws.com/gluon-nlp-1. To +update the docker: +- Update the Dockerfile +- Make sure docker and docker-compose, as well as the docker python package are installed. 
+- Export the AWS account credentials as environment variables +- CD to the same folder as the Dockerfile and execute the following: + +``` +# this executes a command that logs into ECR. +$(aws ecr get-login --no-include-email --region us-east-1) + +# builds the Dockerfile as gluon-nlp-1 docker. +docker build -f ../docker/ubuntu18.04-ci-gpu.Dockerfile -t gluon-nlp-1:gpu . +docker build -f ../docker/ubuntu18.04-ci-cpu.Dockerfile -t gluon-nlp-1:cpu . + +# tags the recent build as gluon-nlp-1:latest, which AWS batch pulls from. +docker tag gluon-nlp-1:gpu 747303060528.dkr.ecr.us-east-1.amazonaws.com/gluon-nlp-1:latest +docker tag gluon-nlp-1:cpu 747303060528.dkr.ecr.us-east-1.amazonaws.com/gluon-nlp-1:cpu-latest + +# pushes the change +docker push 747303060528.dkr.ecr.us-east-1.amazonaws.com/gluon-nlp-1:latest +docker push 747303060528.dkr.ecr.us-east-1.amazonaws.com/gluon-nlp-1:cpu-latest +``` + ## Conversion Toolkits -Following the instruction of [converting scripts](../../scripts/conversion_toolkits), several pre-trained models could be converted through the corresponding conversion tool as below command where `${MODEL_TYPE}` could be selected from `[albert, bert, electra, mobilebert, bart, robert, xmlr]`. +Following the instruction of [converting scripts](../../scripts/conversion_toolkits), +several pre-trained models could be converted through the corresponding conversion tool as below command where `${MODEL_TYPE}` could be selected from `[albert, bert, electra, mobilebert, bart, robert, xmlr]`. 
```bash bash run_batch_conversion ${MODEL_TYPE} ``` + ## Fine-tuning Downstream Tasks ### Question Answering @@ -42,9 +70,3 @@ in which `${MODEL_NAME}` is the name of available pre-trained models listing as | roberta_base | | roberta_large | | mobilebert | - -### Machine Translation - -### Text Translation - -## Pre-trained Model Training diff --git a/tools/docker/README.md b/tools/docker/README.md index cbdbee8cae..affab52d00 100644 --- a/tools/docker/README.md +++ b/tools/docker/README.md @@ -37,8 +37,12 @@ If you have a multi-GPU instance, e.g., [g4dn.12xlarge](https://aws.amazon.com/e of horovod + MXNet by running the question answering script ``` -docker run --gpus all --rm -it --shm-size=4g gluonai/gluon-nlp:gpu-latest \ - horovodrun -np 2 python3 -m pytest /workspace/horovod/horovod/test/test_mxnet.py +# Assume that you are currently in GluonNLP + +cd gluon-nlp/scripts/question_answering + +docker run --gpus all --rm -it --shm-size=4g -v `pwd`:/workspace/data gluonai/gluon-nlp:gpu-latest \ + bash commands/run_squad2_albert_base.sh ``` From d52075d75d1435399f975b3f3efb37268035d49a Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Sat, 10 Oct 2020 22:51:51 -0700 Subject: [PATCH 046/115] update --- tools/docker/ubuntu18.04-ci-cpu.Dockerfile | 2 +- tools/docker/ubuntu18.04-ci-gpu.Dockerfile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/docker/ubuntu18.04-ci-cpu.Dockerfile b/tools/docker/ubuntu18.04-ci-cpu.Dockerfile index f8cd878422..6c1284047b 100644 --- a/tools/docker/ubuntu18.04-ci-cpu.Dockerfile +++ b/tools/docker/ubuntu18.04-ci-cpu.Dockerfile @@ -2,6 +2,6 @@ FROM gluonai/gluon-nlp:cpu-base-latest LABEL maintainer="GluonNLP Team" -WORKDIR ${WORKSPACE}/gluon-nlp +WORKDIR ${WORKDIR}/gluon-nlp ADD gluon_nlp_cpu_job.sh . 
RUN chmod +x gluon_nlp_cpu_job.sh diff --git a/tools/docker/ubuntu18.04-ci-gpu.Dockerfile b/tools/docker/ubuntu18.04-ci-gpu.Dockerfile index 7519fdaec4..9c99d4b4a9 100644 --- a/tools/docker/ubuntu18.04-ci-gpu.Dockerfile +++ b/tools/docker/ubuntu18.04-ci-gpu.Dockerfile @@ -2,6 +2,6 @@ FROM gluonai/gluon-nlp:gpu-base-latest LABEL maintainer="GluonNLP Team" -WORKDIR ${WORKSPACE}/gluon-nlp +WORKDIR ${WORKDIR}/gluon-nlp ADD gluon_nlp_job.sh . RUN chmod +x gluon_nlp_job.sh From 028a0e5ef85641c55910cf5c7b01b06b9aab703a Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Sat, 10 Oct 2020 22:57:39 -0700 Subject: [PATCH 047/115] update --- scripts/question_answering/commands/run_squad.template.sh | 8 +++++--- tools/docker/README.md | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/scripts/question_answering/commands/run_squad.template.sh b/scripts/question_answering/commands/run_squad.template.sh index f8e1d3eb7f..eb6621aaf5 100644 --- a/scripts/question_answering/commands/run_squad.template.sh +++ b/scripts/question_answering/commands/run_squad.template.sh @@ -16,14 +16,16 @@ LAYERWISE_DECAY={{ layerwise_decay }} # Prepare the Data nlp_data prepare_squad --version ${VERSION} +RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py + # Run the script if [ ${USE_HOROVOD} -eq 0 ]; then - RUN_COMMAND="python3 run_squad.py --gpus 0,1,2,3" + RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3" else - RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 run_squad.py --comm_backend horovod" + RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod" fi -python3 run_squad.py \ +${RUN_COMMAND} \ --model_name ${MODEL_NAME} \ --data_dir squad \ --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \ diff --git a/tools/docker/README.md b/tools/docker/README.md index affab52d00..7030aee006 100644 --- a/tools/docker/README.md +++ b/tools/docker/README.md @@ -42,7 +42,7 @@ of horovod + MXNet by running the question answering script cd 
gluon-nlp/scripts/question_answering docker run --gpus all --rm -it --shm-size=4g -v `pwd`:/workspace/data gluonai/gluon-nlp:gpu-latest \ - bash commands/run_squad2_albert_base.sh + bash /workspace/data/commands/run_squad2_albert_base.sh ``` From f448df52d5d5d5d48689825bfa899343fa107d54 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Sat, 10 Oct 2020 22:58:57 -0700 Subject: [PATCH 048/115] update --- scripts/question_answering/commands/README.md | 2 +- .../commands/generate_commands.py | 2 +- .../{run_squad.template.sh => run_squad.template} | 0 .../commands/run_squad2_albert_base.sh | 14 +++++++++----- .../commands/run_squad2_albert_large.sh | 14 +++++++++----- .../commands/run_squad2_albert_xlarge.sh | 14 +++++++++----- .../commands/run_squad2_albert_xxlarge.sh | 14 +++++++++----- .../commands/run_squad2_electra_base.sh | 14 +++++++++----- .../commands/run_squad2_electra_large.sh | 14 +++++++++----- .../commands/run_squad2_electra_small.sh | 14 +++++++++----- .../commands/run_squad2_mobilebert.sh | 14 +++++++++----- .../commands/run_squad2_roberta_large.sh | 14 +++++++++----- .../commands/run_squad2_uncased_bert_base.sh | 14 +++++++++----- .../commands/run_squad2_uncased_bert_large.sh | 14 +++++++++----- 14 files changed, 101 insertions(+), 57 deletions(-) rename scripts/question_answering/commands/{run_squad.template.sh => run_squad.template} (100%) diff --git a/scripts/question_answering/commands/README.md b/scripts/question_answering/commands/README.md index 626fc0a786..6a8a835d19 100644 --- a/scripts/question_answering/commands/README.md +++ b/scripts/question_answering/commands/README.md @@ -1,6 +1,6 @@ # Commands For Training on SQuAD -All commands are generated by parsing the template in [run_squad.template.sh](run_squad.template.sh). +All commands are generated by parsing the template in [run_squad.template](run_squad.template). To generate all commands, use the following code. 
```bash diff --git a/scripts/question_answering/commands/generate_commands.py b/scripts/question_answering/commands/generate_commands.py index 6ed26bd1e9..d8d3f0c6ec 100644 --- a/scripts/question_answering/commands/generate_commands.py +++ b/scripts/question_answering/commands/generate_commands.py @@ -136,5 +136,5 @@ def replace_fn(match): electra_base_cfg, electra_large_cfg, electra_small_cfg, mobilebert_cfg, roberta_large_cfg, uncased_bert_base_cfg, uncased_bert_large_cfg]: prefix = cfg_func.__name__[:-len('_cfg')] - gen_command(cfg_func(), 'run_squad.template.sh', + gen_command(cfg_func(), 'run_squad.template', f'run_squad2_{prefix}.sh') diff --git a/scripts/question_answering/commands/run_squad.template.sh b/scripts/question_answering/commands/run_squad.template similarity index 100% rename from scripts/question_answering/commands/run_squad.template.sh rename to scripts/question_answering/commands/run_squad.template diff --git a/scripts/question_answering/commands/run_squad2_albert_base.sh b/scripts/question_answering/commands/run_squad2_albert_base.sh index f0090fd25b..732b3abef8 100644 --- a/scripts/question_answering/commands/run_squad2_albert_base.sh +++ b/scripts/question_answering/commands/run_squad2_albert_base.sh @@ -1,5 +1,7 @@ -USE_HOROVOD=${1:-0} # Horovod flag. Do not use horovod by default -VERSION=${2:-2.0} # Version +# Generated by "generate_commands.py" + +USE_HOROVOD=${1:-0} # Horovod flag. 
0 --> not use horovod, 1 --> use horovod +VERSION=${2:-2.0} # SQuAD Version MODEL_NAME=google_albert_base_v2 BATCH_SIZE=4 NUM_ACCUMULATED=3 @@ -14,14 +16,16 @@ LAYERWISE_DECAY=-1 # Prepare the Data nlp_data prepare_squad --version ${VERSION} +RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py + # Run the script if [ ${USE_HOROVOD} -eq 0 ]; then - RUN_COMMAND="python3 run_squad.py --gpus 0,1,2,3" + RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3" else - RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 run_squad.py --comm_backend horovod" + RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod" fi -python3 run_squad.py \ +${RUN_COMMAND} \ --model_name ${MODEL_NAME} \ --data_dir squad \ --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \ diff --git a/scripts/question_answering/commands/run_squad2_albert_large.sh b/scripts/question_answering/commands/run_squad2_albert_large.sh index c3e157ef56..fb92b7cda9 100644 --- a/scripts/question_answering/commands/run_squad2_albert_large.sh +++ b/scripts/question_answering/commands/run_squad2_albert_large.sh @@ -1,5 +1,7 @@ -USE_HOROVOD=${1:-0} # Horovod flag. Do not use horovod by default -VERSION=${2:-2.0} # Version +# Generated by "generate_commands.py" + +USE_HOROVOD=${1:-0} # Horovod flag. 
0 --> not use horovod, 1 --> use horovod +VERSION=${2:-2.0} # SQuAD Version MODEL_NAME=google_albert_large_v2 BATCH_SIZE=3 NUM_ACCUMULATED=4 @@ -14,14 +16,16 @@ LAYERWISE_DECAY=-1 # Prepare the Data nlp_data prepare_squad --version ${VERSION} +RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py + # Run the script if [ ${USE_HOROVOD} -eq 0 ]; then - RUN_COMMAND="python3 run_squad.py --gpus 0,1,2,3" + RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3" else - RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 run_squad.py --comm_backend horovod" + RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod" fi -python3 run_squad.py \ +${RUN_COMMAND} \ --model_name ${MODEL_NAME} \ --data_dir squad \ --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \ diff --git a/scripts/question_answering/commands/run_squad2_albert_xlarge.sh b/scripts/question_answering/commands/run_squad2_albert_xlarge.sh index e7810248b3..0bd28952d5 100644 --- a/scripts/question_answering/commands/run_squad2_albert_xlarge.sh +++ b/scripts/question_answering/commands/run_squad2_albert_xlarge.sh @@ -1,5 +1,7 @@ -USE_HOROVOD=${1:-0} # Horovod flag. Do not use horovod by default -VERSION=${2:-2.0} # Version +# Generated by "generate_commands.py" + +USE_HOROVOD=${1:-0} # Horovod flag. 
0 --> not use horovod, 1 --> use horovod +VERSION=${2:-2.0} # SQuAD Version MODEL_NAME=google_albert_xlarge_v2 BATCH_SIZE=1 NUM_ACCUMULATED=12 @@ -14,14 +16,16 @@ LAYERWISE_DECAY=-1 # Prepare the Data nlp_data prepare_squad --version ${VERSION} +RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py + # Run the script if [ ${USE_HOROVOD} -eq 0 ]; then - RUN_COMMAND="python3 run_squad.py --gpus 0,1,2,3" + RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3" else - RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 run_squad.py --comm_backend horovod" + RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod" fi -python3 run_squad.py \ +${RUN_COMMAND} \ --model_name ${MODEL_NAME} \ --data_dir squad \ --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \ diff --git a/scripts/question_answering/commands/run_squad2_albert_xxlarge.sh b/scripts/question_answering/commands/run_squad2_albert_xxlarge.sh index 9154028a38..9383cbc873 100644 --- a/scripts/question_answering/commands/run_squad2_albert_xxlarge.sh +++ b/scripts/question_answering/commands/run_squad2_albert_xxlarge.sh @@ -1,5 +1,7 @@ -USE_HOROVOD=${1:-0} # Horovod flag. Do not use horovod by default -VERSION=${2:-2.0} # Version +# Generated by "generate_commands.py" + +USE_HOROVOD=${1:-0} # Horovod flag. 
0 --> not use horovod, 1 --> use horovod +VERSION=${2:-2.0} # SQuAD Version MODEL_NAME=google_albert_xxlarge_v2 BATCH_SIZE=1 NUM_ACCUMULATED=12 @@ -14,14 +16,16 @@ LAYERWISE_DECAY=-1 # Prepare the Data nlp_data prepare_squad --version ${VERSION} +RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py + # Run the script if [ ${USE_HOROVOD} -eq 0 ]; then - RUN_COMMAND="python3 run_squad.py --gpus 0,1,2,3" + RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3" else - RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 run_squad.py --comm_backend horovod" + RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod" fi -python3 run_squad.py \ +${RUN_COMMAND} \ --model_name ${MODEL_NAME} \ --data_dir squad \ --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \ diff --git a/scripts/question_answering/commands/run_squad2_electra_base.sh b/scripts/question_answering/commands/run_squad2_electra_base.sh index 53f94456d4..16ee8cdb98 100644 --- a/scripts/question_answering/commands/run_squad2_electra_base.sh +++ b/scripts/question_answering/commands/run_squad2_electra_base.sh @@ -1,5 +1,7 @@ -USE_HOROVOD=${1:-0} # Horovod flag. Do not use horovod by default -VERSION=${2:-2.0} # Version +# Generated by "generate_commands.py" + +USE_HOROVOD=${1:-0} # Horovod flag. 
0 --> not use horovod, 1 --> use horovod +VERSION=${2:-2.0} # SQuAD Version MODEL_NAME=google_electra_base BATCH_SIZE=8 NUM_ACCUMULATED=1 @@ -14,14 +16,16 @@ LAYERWISE_DECAY=0.8 # Prepare the Data nlp_data prepare_squad --version ${VERSION} +RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py + # Run the script if [ ${USE_HOROVOD} -eq 0 ]; then - RUN_COMMAND="python3 run_squad.py --gpus 0,1,2,3" + RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3" else - RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 run_squad.py --comm_backend horovod" + RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod" fi -python3 run_squad.py \ +${RUN_COMMAND} \ --model_name ${MODEL_NAME} \ --data_dir squad \ --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \ diff --git a/scripts/question_answering/commands/run_squad2_electra_large.sh b/scripts/question_answering/commands/run_squad2_electra_large.sh index 79f4c002db..815ec304e6 100644 --- a/scripts/question_answering/commands/run_squad2_electra_large.sh +++ b/scripts/question_answering/commands/run_squad2_electra_large.sh @@ -1,5 +1,7 @@ -USE_HOROVOD=${1:-0} # Horovod flag. Do not use horovod by default -VERSION=${2:-2.0} # Version +# Generated by "generate_commands.py" + +USE_HOROVOD=${1:-0} # Horovod flag. 
0 --> not use horovod, 1 --> use horovod +VERSION=${2:-2.0} # SQuAD Version MODEL_NAME=google_electra_large BATCH_SIZE=2 NUM_ACCUMULATED=4 @@ -14,14 +16,16 @@ LAYERWISE_DECAY=0.9 # Prepare the Data nlp_data prepare_squad --version ${VERSION} +RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py + # Run the script if [ ${USE_HOROVOD} -eq 0 ]; then - RUN_COMMAND="python3 run_squad.py --gpus 0,1,2,3" + RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3" else - RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 run_squad.py --comm_backend horovod" + RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod" fi -python3 run_squad.py \ +${RUN_COMMAND} \ --model_name ${MODEL_NAME} \ --data_dir squad \ --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \ diff --git a/scripts/question_answering/commands/run_squad2_electra_small.sh b/scripts/question_answering/commands/run_squad2_electra_small.sh index ac120ec278..d6228ef0bc 100644 --- a/scripts/question_answering/commands/run_squad2_electra_small.sh +++ b/scripts/question_answering/commands/run_squad2_electra_small.sh @@ -1,5 +1,7 @@ -USE_HOROVOD=${1:-0} # Horovod flag. Do not use horovod by default -VERSION=${2:-2.0} # Version +# Generated by "generate_commands.py" + +USE_HOROVOD=${1:-0} # Horovod flag. 
0 --> not use horovod, 1 --> use horovod +VERSION=${2:-2.0} # SQuAD Version MODEL_NAME=google_electra_small BATCH_SIZE=8 NUM_ACCUMULATED=1 @@ -14,14 +16,16 @@ LAYERWISE_DECAY=0.8 # Prepare the Data nlp_data prepare_squad --version ${VERSION} +RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py + # Run the script if [ ${USE_HOROVOD} -eq 0 ]; then - RUN_COMMAND="python3 run_squad.py --gpus 0,1,2,3" + RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3" else - RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 run_squad.py --comm_backend horovod" + RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod" fi -python3 run_squad.py \ +${RUN_COMMAND} \ --model_name ${MODEL_NAME} \ --data_dir squad \ --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \ diff --git a/scripts/question_answering/commands/run_squad2_mobilebert.sh b/scripts/question_answering/commands/run_squad2_mobilebert.sh index 13ffeb0246..24fece841d 100644 --- a/scripts/question_answering/commands/run_squad2_mobilebert.sh +++ b/scripts/question_answering/commands/run_squad2_mobilebert.sh @@ -1,5 +1,7 @@ -USE_HOROVOD=${1:-0} # Horovod flag. Do not use horovod by default -VERSION=${2:-2.0} # Version +# Generated by "generate_commands.py" + +USE_HOROVOD=${1:-0} # Horovod flag. 
0 --> not use horovod, 1 --> use horovod +VERSION=${2:-2.0} # SQuAD Version MODEL_NAME=google_uncased_mobilebert BATCH_SIZE=8 NUM_ACCUMULATED=1 @@ -14,14 +16,16 @@ LAYERWISE_DECAY=-1 # Prepare the Data nlp_data prepare_squad --version ${VERSION} +RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py + # Run the script if [ ${USE_HOROVOD} -eq 0 ]; then - RUN_COMMAND="python3 run_squad.py --gpus 0,1,2,3" + RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3" else - RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 run_squad.py --comm_backend horovod" + RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod" fi -python3 run_squad.py \ +${RUN_COMMAND} \ --model_name ${MODEL_NAME} \ --data_dir squad \ --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \ diff --git a/scripts/question_answering/commands/run_squad2_roberta_large.sh b/scripts/question_answering/commands/run_squad2_roberta_large.sh index cdea52c0d6..2bf51e6b6c 100644 --- a/scripts/question_answering/commands/run_squad2_roberta_large.sh +++ b/scripts/question_answering/commands/run_squad2_roberta_large.sh @@ -1,5 +1,7 @@ -USE_HOROVOD=${1:-0} # Horovod flag. Do not use horovod by default -VERSION=${2:-2.0} # Version +# Generated by "generate_commands.py" + +USE_HOROVOD=${1:-0} # Horovod flag. 
0 --> not use horovod, 1 --> use horovod +VERSION=${2:-2.0} # SQuAD Version MODEL_NAME=fairseq_roberta_large BATCH_SIZE=2 NUM_ACCUMULATED=6 @@ -14,14 +16,16 @@ LAYERWISE_DECAY=-1 # Prepare the Data nlp_data prepare_squad --version ${VERSION} +RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py + # Run the script if [ ${USE_HOROVOD} -eq 0 ]; then - RUN_COMMAND="python3 run_squad.py --gpus 0,1,2,3" + RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3" else - RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 run_squad.py --comm_backend horovod" + RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod" fi -python3 run_squad.py \ +${RUN_COMMAND} \ --model_name ${MODEL_NAME} \ --data_dir squad \ --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \ diff --git a/scripts/question_answering/commands/run_squad2_uncased_bert_base.sh b/scripts/question_answering/commands/run_squad2_uncased_bert_base.sh index 7755755e5d..f2a0738282 100644 --- a/scripts/question_answering/commands/run_squad2_uncased_bert_base.sh +++ b/scripts/question_answering/commands/run_squad2_uncased_bert_base.sh @@ -1,5 +1,7 @@ -USE_HOROVOD=${1:-0} # Horovod flag. Do not use horovod by default -VERSION=${2:-2.0} # Version +# Generated by "generate_commands.py" + +USE_HOROVOD=${1:-0} # Horovod flag. 
0 --> not use horovod, 1 --> use horovod +VERSION=${2:-2.0} # SQuAD Version MODEL_NAME=google_en_uncased_bert_base BATCH_SIZE=6 NUM_ACCUMULATED=2 @@ -14,14 +16,16 @@ LAYERWISE_DECAY=-1 # Prepare the Data nlp_data prepare_squad --version ${VERSION} +RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py + # Run the script if [ ${USE_HOROVOD} -eq 0 ]; then - RUN_COMMAND="python3 run_squad.py --gpus 0,1,2,3" + RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3" else - RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 run_squad.py --comm_backend horovod" + RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod" fi -python3 run_squad.py \ +${RUN_COMMAND} \ --model_name ${MODEL_NAME} \ --data_dir squad \ --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \ diff --git a/scripts/question_answering/commands/run_squad2_uncased_bert_large.sh b/scripts/question_answering/commands/run_squad2_uncased_bert_large.sh index 54754c3522..2f19c4c5e7 100644 --- a/scripts/question_answering/commands/run_squad2_uncased_bert_large.sh +++ b/scripts/question_answering/commands/run_squad2_uncased_bert_large.sh @@ -1,5 +1,7 @@ -USE_HOROVOD=${1:-0} # Horovod flag. Do not use horovod by default -VERSION=${2:-2.0} # Version +# Generated by "generate_commands.py" + +USE_HOROVOD=${1:-0} # Horovod flag. 
0 --> not use horovod, 1 --> use horovod +VERSION=${2:-2.0} # SQuAD Version MODEL_NAME=google_en_uncased_bert_large BATCH_SIZE=2 NUM_ACCUMULATED=6 @@ -14,14 +16,16 @@ LAYERWISE_DECAY=-1 # Prepare the Data nlp_data prepare_squad --version ${VERSION} +RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py + # Run the script if [ ${USE_HOROVOD} -eq 0 ]; then - RUN_COMMAND="python3 run_squad.py --gpus 0,1,2,3" + RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3" else - RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 run_squad.py --comm_backend horovod" + RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod" fi -python3 run_squad.py \ +${RUN_COMMAND} \ --model_name ${MODEL_NAME} \ --data_dir squad \ --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \ From 83e96c06cee766062bab319ed87b2df6bfade642 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Sat, 10 Oct 2020 23:00:03 -0700 Subject: [PATCH 049/115] Update README.md --- tools/docker/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/docker/README.md b/tools/docker/README.md index 7030aee006..982812ac29 100644 --- a/tools/docker/README.md +++ b/tools/docker/README.md @@ -42,7 +42,7 @@ of horovod + MXNet by running the question answering script cd gluon-nlp/scripts/question_answering docker run --gpus all --rm -it --shm-size=4g -v `pwd`:/workspace/data gluonai/gluon-nlp:gpu-latest \ - bash /workspace/data/commands/run_squad2_albert_base.sh + cd /workspace/data && bash commands/run_squad2_albert_base.sh ``` From ed80b9fa7e783e8cbc298add747c62107ca6c362 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Sat, 10 Oct 2020 23:01:24 -0700 Subject: [PATCH 050/115] Update install_ubuntu18.04_core.sh --- tools/docker/install/install_ubuntu18.04_core.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/docker/install/install_ubuntu18.04_core.sh b/tools/docker/install/install_ubuntu18.04_core.sh index 145061b115..18cce66c7f 100644 --- 
a/tools/docker/install/install_ubuntu18.04_core.sh +++ b/tools/docker/install/install_ubuntu18.04_core.sh @@ -20,6 +20,7 @@ apt-get update \ vim \ wget \ unzip \ + less \ libopenblas-dev \ ninja-build \ openssh-client \ From f8d09a0e52ac8fbae9f22ebbd901b62d753ba2a2 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Sat, 10 Oct 2020 23:05:21 -0700 Subject: [PATCH 051/115] update --- tools/docker/README.md | 2 +- tools/docker/ubuntu18.04-base-gpu.Dockerfile | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/docker/README.md b/tools/docker/README.md index 982812ac29..d9fb27e539 100644 --- a/tools/docker/README.md +++ b/tools/docker/README.md @@ -41,7 +41,7 @@ of horovod + MXNet by running the question answering script cd gluon-nlp/scripts/question_answering -docker run --gpus all --rm -it --shm-size=4g -v `pwd`:/workspace/data gluonai/gluon-nlp:gpu-latest \ +docker run --gpus all --rm -it --shm-size=4g -v `pwd`:/workspace/data gluonai/gluon-nlp:gpu-base-latest \ cd /workspace/data && bash commands/run_squad2_albert_base.sh ``` diff --git a/tools/docker/ubuntu18.04-base-gpu.Dockerfile b/tools/docker/ubuntu18.04-base-gpu.Dockerfile index 06e1780e35..a2a91276e0 100644 --- a/tools/docker/ubuntu18.04-base-gpu.Dockerfile +++ b/tools/docker/ubuntu18.04-base-gpu.Dockerfile @@ -43,3 +43,4 @@ RUN cd ${WORKDIR} \ && cd gluon-nlp \ && git checkout master \ && python3 -m pip install -U -e ."[extras]" +CMD ["/bin/bash"] From 26ef33cdaa2199d9cc56465eb995ef275f982c46 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Sat, 10 Oct 2020 23:30:43 -0700 Subject: [PATCH 052/115] update --- scripts/question_answering/commands/run_squad.template | 6 ++---- tools/docker/README.md | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/scripts/question_answering/commands/run_squad.template b/scripts/question_answering/commands/run_squad.template index eb6621aaf5..a67b23bce3 100644 --- a/scripts/question_answering/commands/run_squad.template +++ 
b/scripts/question_answering/commands/run_squad.template @@ -16,14 +16,12 @@ LAYERWISE_DECAY={{ layerwise_decay }} # Prepare the Data nlp_data prepare_squad --version ${VERSION} -RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py - # Run the script if [ ${USE_HOROVOD} -eq 0 ]; then - RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3" + RUN_COMMAND="python3 run_squad.py --gpus 0,1,2,3" else - RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod" + RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 run_squad.py --comm_backend horovod" fi ${RUN_COMMAND} \ --model_name ${MODEL_NAME} \ diff --git a/tools/docker/README.md b/tools/docker/README.md index d9fb27e539..982812ac29 100644 --- a/tools/docker/README.md +++ b/tools/docker/README.md @@ -41,7 +41,7 @@ of horovod + MXNet by running the question answering script cd gluon-nlp/scripts/question_answering -docker run --gpus all --rm -it --shm-size=4g -v `pwd`:/workspace/data gluonai/gluon-nlp:gpu-base-latest \ +docker run --gpus all --rm -it --shm-size=4g -v `pwd`:/workspace/data gluonai/gluon-nlp:gpu-latest \ cd /workspace/data && bash commands/run_squad2_albert_base.sh ``` From d33834be074e956ab43d5660aff8b8992e0d26ca Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Sat, 10 Oct 2020 23:49:09 -0700 Subject: [PATCH 053/115] update --- tools/batch/submit-job.py | 2 +- tools/docker/README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/batch/submit-job.py b/tools/batch/submit-job.py index 76f7368d2b..7c6345a4c4 100644 --- a/tools/batch/submit-job.py +++ b/tools/batch/submit-job.py @@ -79,7 +79,7 @@ def nowInMillis(): job_definitions = { 'g4dn.4x': 'gluon-nlp-1-jobs:5', 'g4dn.8x': 'gluon-nlp-1-jobs:4', - 'g4dn.12x': 'gluon-nlp-1-4gpu-jobs:1', + 'g4dn.12x': 'gluon-nlp-1-4gpu-jobs:3', 'g4dn.16x': 'gluon-nlp-1-jobs:3', 'p3.2x': 'gluon-nlp-1-jobs:11', 'p3.8x': 'gluon-nlp-1-4gpu-jobs:2', diff --git a/tools/docker/README.md b/tools/docker/README.md index 
982812ac29..b89aadd99d 100644 --- a/tools/docker/README.md +++ b/tools/docker/README.md @@ -42,7 +42,7 @@ of horovod + MXNet by running the question answering script cd gluon-nlp/scripts/question_answering docker run --gpus all --rm -it --shm-size=4g -v `pwd`:/workspace/data gluonai/gluon-nlp:gpu-latest \ - cd /workspace/data && bash commands/run_squad2_albert_base.sh + bash -c 'cd /workspace/data && bash commands/run_squad2_albert_base.sh 1 2.0' ``` From a689265d0be6fcc27740fcfbaad8b0ce04151088 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Sat, 10 Oct 2020 23:49:44 -0700 Subject: [PATCH 054/115] fix --- scripts/datasets/question_answering/README.md | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/scripts/datasets/question_answering/README.md b/scripts/datasets/question_answering/README.md index ac4fb68dbf..10328c1407 100644 --- a/scripts/datasets/question_answering/README.md +++ b/scripts/datasets/question_answering/README.md @@ -1,5 +1,14 @@ # Question Answering +| Datasets | #Train | #Valid | #Test | Leaderboard | Answerable | Long Context | +|-----------|--------|--------|-------|-------------|------------|--------------| +| SQuAD 2.0 | +| SQuAD 1.1 | +| SearchQA | +| TriviaQA | +| HotpotQA | + + ## SQuAD SQuAD datasets is distributed under the [CC BY-SA 4.0](http://creativecommons.org/licenses/by-sa/4.0/legalcode) license. @@ -39,7 +48,7 @@ python3 prepare_searchqa.py nlp_data prepare_searchqa ``` -Directory structure of the searchqa dataset will be as follows +Directory structure of the SearchQA dataset will be as follows ``` searchqa ├── train.txt @@ -48,9 +57,10 @@ searchqa ``` ## TriviaQA -[TriviaQA](https://nlp.cs.washington.edu/triviaqa/) is an open domain QA dataset. See more useful scripts in [Offical Github](https://github.com/mandarjoshi90/triviaqa) +[TriviaQA](https://nlp.cs.washington.edu/triviaqa/) is an open domain QA dataset. 
+See more useful scripts in [Offical Github](https://github.com/mandarjoshi90/triviaqa). -Run the following command to download triviaqa +Run the following command to download TriviaQA ```bash python3 prepare_triviaqa.py --version rc # Download TriviaQA version 1.0 for RC (2.5G) From 9653d7a5a09ea94321ac780198041b78548528c2 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Sat, 10 Oct 2020 23:56:06 -0700 Subject: [PATCH 055/115] Update README.md --- tools/batch/README.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tools/batch/README.md b/tools/batch/README.md index 786c2da09c..f62e404923 100644 --- a/tools/batch/README.md +++ b/tools/batch/README.md @@ -49,13 +49,15 @@ bash run_batch_conversion ${MODEL_TYPE} ## Fine-tuning Downstream Tasks ### Question Answering -We can quickly deploy an experiment via [squad fine-tuning scripts](../../scripts/question_answering#squad) as +We can quickly run the squad finetuning via [squad fine-tuning scripts](../../scripts/question_answering#squad) and the AWS Batch job. 
+ +The code is given in [run_batch_squad.sh](run_batch_squad.sh) ```bash -bash run_batch_squad.sh ${MODEL_NAME} +bash run_batch_squad.sh ``` -in which `${MODEL_NAME}` is the name of available pre-trained models listing as following: +Internally, it will train the following models on SQuAD 2.0 dataset: | MODEL_NAME | |:------------------:| | uncased_bert_base | From 0b8f37ddc697727700b6fa9475bca9f835037099 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Sat, 10 Oct 2020 23:58:51 -0700 Subject: [PATCH 056/115] Update run_batch_squad.sh --- tools/batch/run_batch_squad.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/batch/run_batch_squad.sh b/tools/batch/run_batch_squad.sh index c3f9ba1dff..1f948db62f 100644 --- a/tools/batch/run_batch_squad.sh +++ b/tools/batch/run_batch_squad.sh @@ -1,3 +1,7 @@ +USE_HOROVOD=${1:-0} +VERSION=${2:-2.0} +LOG_PATH=${3:-submit_squad_v2.log} + for MODEL_NAME in albert_base \ albert_large \ albert_xlarge \ @@ -18,5 +22,5 @@ do --name test_squad2_${MODEL_NAME} \ --work-dir scripts/question_answering \ --remote https://github.com/dmlc/gluon-nlp/ \ - --command 'bash commands/run_squad2_'${MODEL_NAME}'.sh | tee stdout.log' >> submit_squad_v2.log + --command 'bash commands/run_squad2_'${MODEL_NAME}'.sh ${USE_HOROVOD} ${VERSION} | tee stdout.log' >> ${LOG_PATH} done From 8c38f9828a9936be6db5b37afac002ffabfd3347 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Sun, 11 Oct 2020 00:01:36 -0700 Subject: [PATCH 057/115] update --- tools/batch/README.md | 6 ++++++ tools/batch/run_batch_squad.sh | 4 ++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/tools/batch/README.md b/tools/batch/README.md index f62e404923..9664e9fcf3 100644 --- a/tools/batch/README.md +++ b/tools/batch/README.md @@ -54,9 +54,15 @@ We can quickly run the squad finetuning via [squad fine-tuning scripts](../../sc The code is given in [run_batch_squad.sh](run_batch_squad.sh) ```bash +# AWS Batch training without horovod on SQuAD 2.0 
bash run_batch_squad.sh + +# AWS Batch training with horovod on SQuAD 2.0 +bash run_batch_squad.sh 1 2.0 submit_squad_v2_horovod.log ``` + + Internally, it will train the following models on SQuAD 2.0 dataset: | MODEL_NAME | |:------------------:| diff --git a/tools/batch/run_batch_squad.sh b/tools/batch/run_batch_squad.sh index 1f948db62f..169c07b8bb 100644 --- a/tools/batch/run_batch_squad.sh +++ b/tools/batch/run_batch_squad.sh @@ -16,11 +16,11 @@ for MODEL_NAME in albert_base \ do python3 submit-job.py \ --region us-east-1 \ - --source-ref master \ + --source-ref fix_docker \ --job-type g4dn.12x \ --save-path temp \ --name test_squad2_${MODEL_NAME} \ --work-dir scripts/question_answering \ - --remote https://github.com/dmlc/gluon-nlp/ \ + --remote https://github.com/sxjscience/gluon-nlp/ \ --command 'bash commands/run_squad2_'${MODEL_NAME}'.sh ${USE_HOROVOD} ${VERSION} | tee stdout.log' >> ${LOG_PATH} done From a605e3a173499abf0e86c3ed9d6c8470a765c8d6 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Sun, 11 Oct 2020 04:15:16 -0700 Subject: [PATCH 058/115] Update run_batch_squad.sh --- tools/batch/run_batch_squad.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/batch/run_batch_squad.sh b/tools/batch/run_batch_squad.sh index 169c07b8bb..a0ca2f329b 100644 --- a/tools/batch/run_batch_squad.sh +++ b/tools/batch/run_batch_squad.sh @@ -1,3 +1,5 @@ +set -e + USE_HOROVOD=${1:-0} VERSION=${2:-2.0} LOG_PATH=${3:-submit_squad_v2.log} @@ -22,5 +24,5 @@ do --name test_squad2_${MODEL_NAME} \ --work-dir scripts/question_answering \ --remote https://github.com/sxjscience/gluon-nlp/ \ - --command 'bash commands/run_squad2_'${MODEL_NAME}'.sh ${USE_HOROVOD} ${VERSION} | tee stdout.log' >> ${LOG_PATH} + --command "bash commands/run_squad2_${MODEL_NAME}.sh ${USE_HOROVOD} ${VERSION} | tee stdout.log" >> ${LOG_PATH} done From 36628acf296ef231a3469a0f5c372b8da6e0ab6a Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Sun, 11 Oct 2020 04:17:20 -0700 Subject: 
[PATCH 059/115] Update run_batch_squad.sh --- tools/batch/run_batch_squad.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/batch/run_batch_squad.sh b/tools/batch/run_batch_squad.sh index a0ca2f329b..a35fa693f9 100644 --- a/tools/batch/run_batch_squad.sh +++ b/tools/batch/run_batch_squad.sh @@ -1,4 +1,4 @@ -set -e +set -exs USE_HOROVOD=${1:-0} VERSION=${2:-2.0} From d850924c29d03a15d63052e34364c81ea384026f Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Sun, 11 Oct 2020 20:27:20 -0700 Subject: [PATCH 060/115] update --- tools/batch/README.md | 6 ++---- tools/batch/run_batch_squad.sh | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/tools/batch/README.md b/tools/batch/README.md index 9664e9fcf3..24b764b62b 100644 --- a/tools/batch/README.md +++ b/tools/batch/README.md @@ -26,12 +26,10 @@ update the docker: # this executes a command that logs into ECR. $(aws ecr get-login --no-include-email --region us-east-1) -# builds the Dockerfile as gluon-nlp-1 docker. -docker build -f ../docker/ubuntu18.04-ci-gpu.Dockerfile -t gluon-nlp-1:gpu . -docker build -f ../docker/ubuntu18.04-ci-cpu.Dockerfile -t gluon-nlp-1:cpu . +# builds the docker image use the command in docker # tags the recent build as gluon-nlp-1:latest, which AWS batch pulls from. 
-docker tag gluon-nlp-1:gpu 747303060528.dkr.ecr.us-east-1.amazonaws.com/gluon-nlp-1:latest +docker tag gluon-nlp:gpu-ci-latest 747303060528.dkr.ecr.us-east-1.amazonaws.com/gluon-nlp-1:latest docker tag gluon-nlp-1:cpu 747303060528.dkr.ecr.us-east-1.amazonaws.com/gluon-nlp-1:cpu-latest # pushes the change diff --git a/tools/batch/run_batch_squad.sh b/tools/batch/run_batch_squad.sh index a35fa693f9..2d0d8bf0d9 100644 --- a/tools/batch/run_batch_squad.sh +++ b/tools/batch/run_batch_squad.sh @@ -1,4 +1,4 @@ -set -exs +set -ex USE_HOROVOD=${1:-0} VERSION=${2:-2.0} From d62923525a3efda9a6859f44c5b57f5b77009b8a Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Sun, 11 Oct 2020 20:28:14 -0700 Subject: [PATCH 061/115] Update README.md --- tools/batch/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/batch/README.md b/tools/batch/README.md index 24b764b62b..8f9fc0be73 100644 --- a/tools/batch/README.md +++ b/tools/batch/README.md @@ -29,8 +29,8 @@ $(aws ecr get-login --no-include-email --region us-east-1) # builds the docker image use the command in docker # tags the recent build as gluon-nlp-1:latest, which AWS batch pulls from. 
-docker tag gluon-nlp:gpu-ci-latest 747303060528.dkr.ecr.us-east-1.amazonaws.com/gluon-nlp-1:latest -docker tag gluon-nlp-1:cpu 747303060528.dkr.ecr.us-east-1.amazonaws.com/gluon-nlp-1:cpu-latest +docker tag gluonai/gluon-nlp:gpu-ci-latest 747303060528.dkr.ecr.us-east-1.amazonaws.com/gluon-nlp-1:latest +docker tag gluonai/gluon-nlp:cpu-ci-latest 747303060528.dkr.ecr.us-east-1.amazonaws.com/gluon-nlp-1:cpu-latest # pushes the change docker push 747303060528.dkr.ecr.us-east-1.amazonaws.com/gluon-nlp-1:latest From ab0a18356c384a682d4080a4095aa91621e0591c Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Sun, 11 Oct 2020 22:19:02 -0700 Subject: [PATCH 062/115] fix --- tools/batch/README.md | 1 - tools/batch/submit-job.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/batch/README.md b/tools/batch/README.md index 8f9fc0be73..7c22e2839c 100644 --- a/tools/batch/README.md +++ b/tools/batch/README.md @@ -73,6 +73,5 @@ Internally, it will train the following models on SQuAD 2.0 dataset: | electra_small | | electra_base | | electra_large | -| roberta_base | | roberta_large | | mobilebert | diff --git a/tools/batch/submit-job.py b/tools/batch/submit-job.py index 7c6345a4c4..2829cdd0c9 100644 --- a/tools/batch/submit-job.py +++ b/tools/batch/submit-job.py @@ -79,7 +79,7 @@ def nowInMillis(): job_definitions = { 'g4dn.4x': 'gluon-nlp-1-jobs:5', 'g4dn.8x': 'gluon-nlp-1-jobs:4', - 'g4dn.12x': 'gluon-nlp-1-4gpu-jobs:3', + 'g4dn.12x': 'gluon-nlp-g4-12dn:4', #'gluon-nlp-1-4gpu-jobs:3', 'g4dn.16x': 'gluon-nlp-1-jobs:3', 'p3.2x': 'gluon-nlp-1-jobs:11', 'p3.8x': 'gluon-nlp-1-4gpu-jobs:2', From 74e296607fe60dc1327de2f13667a05053f46d94 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Sun, 11 Oct 2020 22:32:08 -0700 Subject: [PATCH 063/115] Update gluon_nlp_job.sh --- tools/docker/gluon_nlp_job.sh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tools/docker/gluon_nlp_job.sh b/tools/docker/gluon_nlp_job.sh index 65bad7ccce..76ed223aa4 100755 --- 
a/tools/docker/gluon_nlp_job.sh +++ b/tools/docker/gluon_nlp_job.sh @@ -1,4 +1,11 @@ #!/bin/bash +set -x + +# Due to the issue in https://forums.aws.amazon.com/thread.jspa?messageID=953912 +# We need to manually configure the shm to ensure that Horovod is runnable. +umount shm +mount -t tmpfs -o rw,nosuid,nodev,noexec,relatime,size=2G shm /dev/shm + date echo "Args: $@" env From ab24028629da730034f94256717bae9de00dc816 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Sun, 11 Oct 2020 22:42:01 -0700 Subject: [PATCH 064/115] update --- tools/docker/gluon_nlp_job.sh | 18 ++++++++++++------ tools/docker/ubuntu18.04-ci-cpu.Dockerfile | 4 ++-- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/tools/docker/gluon_nlp_job.sh b/tools/docker/gluon_nlp_job.sh index 76ed223aa4..e2866a411a 100755 --- a/tools/docker/gluon_nlp_job.sh +++ b/tools/docker/gluon_nlp_job.sh @@ -1,11 +1,6 @@ #!/bin/bash set -x -# Due to the issue in https://forums.aws.amazon.com/thread.jspa?messageID=953912 -# We need to manually configure the shm to ensure that Horovod is runnable. -umount shm -mount -t tmpfs -o rw,nosuid,nodev,noexec,relatime,size=2G shm /dev/shm - date echo "Args: $@" env @@ -19,6 +14,7 @@ COMMAND=$3 SAVED_OUTPUT=$4 SAVE_PATH=$5 REMOTE=$6 +DEVICE=${7:-gpu} if [ ! -z $REMOTE ]; then git remote set-url origin $REMOTE @@ -26,7 +22,17 @@ fi; git fetch origin $SOURCE_REF:working git checkout working -python3 -m pip install -U --quiet --pre "mxnet-cu102>=2.0.0b20200802" -f https://dist.mxnet.io/python + +if [ $DEVICE == "cpu" ]; then + python3 -m pip install -U --quiet --pre "mxnet>=2.0.0b20200802" -f https://dist.mxnet.io/python +else + # Due to the issue in https://forums.aws.amazon.com/thread.jspa?messageID=953912 + # We need to manually configure the shm to ensure that Horovod is runnable. 
+ umount shm + mount -t tmpfs -o rw,nosuid,nodev,noexec,relatime,size=2G shm /dev/shm + python3 -m pip install -U --quiet --pre "mxnet-cu102>=2.0.0b20200802" -f https://dist.mxnet.io/python +fi + python3 -m pip install --quiet -e .[extras] cd $WORK_DIR diff --git a/tools/docker/ubuntu18.04-ci-cpu.Dockerfile b/tools/docker/ubuntu18.04-ci-cpu.Dockerfile index 6c1284047b..eef8f74d17 100644 --- a/tools/docker/ubuntu18.04-ci-cpu.Dockerfile +++ b/tools/docker/ubuntu18.04-ci-cpu.Dockerfile @@ -3,5 +3,5 @@ FROM gluonai/gluon-nlp:cpu-base-latest LABEL maintainer="GluonNLP Team" WORKDIR ${WORKDIR}/gluon-nlp -ADD gluon_nlp_cpu_job.sh . -RUN chmod +x gluon_nlp_cpu_job.sh +ADD gluon_nlp_job.sh . +RUN chmod +x gluon_nlp_job.sh From 2f0c048fd95b2cdb98c08cf3a2c89d2ff87aef78 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Sun, 11 Oct 2020 23:13:16 -0700 Subject: [PATCH 065/115] Update README.md --- tools/batch/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/batch/README.md b/tools/batch/README.md index 7c22e2839c..d0b8cc9ef1 100644 --- a/tools/batch/README.md +++ b/tools/batch/README.md @@ -29,12 +29,12 @@ $(aws ecr get-login --no-include-email --region us-east-1) # builds the docker image use the command in docker # tags the recent build as gluon-nlp-1:latest, which AWS batch pulls from. 
-docker tag gluonai/gluon-nlp:gpu-ci-latest 747303060528.dkr.ecr.us-east-1.amazonaws.com/gluon-nlp-1:latest -docker tag gluonai/gluon-nlp:cpu-ci-latest 747303060528.dkr.ecr.us-east-1.amazonaws.com/gluon-nlp-1:cpu-latest +docker tag gluonai/gluon-nlp:gpu-ci-latest 747303060528.dkr.ecr.us-east-1.amazonaws.com/gluon-nlp-1:gpu-ci-latest +docker tag gluonai/gluon-nlp:cpu-ci-latest 747303060528.dkr.ecr.us-east-1.amazonaws.com/gluon-nlp-1:cpu-ci-latest # pushes the change -docker push 747303060528.dkr.ecr.us-east-1.amazonaws.com/gluon-nlp-1:latest -docker push 747303060528.dkr.ecr.us-east-1.amazonaws.com/gluon-nlp-1:cpu-latest +docker push 747303060528.dkr.ecr.us-east-1.amazonaws.com/gluon-nlp-1:gpu-ci-latest +docker push 747303060528.dkr.ecr.us-east-1.amazonaws.com/gluon-nlp-1:cpu-ci-latest ``` ## Conversion Toolkits From 296bc7ed22bfc820f862647fc28bb2790e0b62d9 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Sun, 11 Oct 2020 23:15:10 -0700 Subject: [PATCH 066/115] Update README.md --- tools/docker/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/docker/README.md b/tools/docker/README.md index b89aadd99d..73b3482c37 100644 --- a/tools/docker/README.md +++ b/tools/docker/README.md @@ -6,9 +6,9 @@ and try out to use GluonNLP to solve your problem. | Name | Description | Target User | |------|-------------|-------------| -| `base` | Extends the CUDA image to include the basic functionalities, e.g., GluonNLP package, MXNet, PyTorch, Horovod. You can directly configure other docker images based on this basic docker | The basic docker | -| `ci` | Image used in GluonNLP CI | GluonNLP Developers | -| `devel` | Extends the base image to include a development platform powered by Jupyter Lab. Some useful functionalities like Tensorboard are pre-installed. | Users that are willing to solve NLP problems and also do distributed training with Horovod + GluonNLP. 
| +| `cpu-base-latest` or `gpu-base-latest` | Extends the CUDA image to include the basic functionalities, e.g., GluonNLP package, MXNet, PyTorch, Horovod. You can directly configure other docker images based on this basic docker | The basic docker | +| `cpu-ci-latest` or `gpu-ci-latest` | Image used in GluonNLP CI | GluonNLP Developers | +| `cpu-latest` or `gpu-latest` | Extends the base image to include a development platform powered by Jupyter Lab. Some useful functionalities like Tensorboard are pre-installed. | Users that are willing to solve NLP problems and also do distributed training with Horovod + GluonNLP. | ## Run Docker From cc62fde4282ce27561aa0fdc4d7f323ed991f96b Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Sun, 11 Oct 2020 23:17:04 -0700 Subject: [PATCH 067/115] Update README.md --- tools/docker/README.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/docker/README.md b/tools/docker/README.md index 73b3482c37..661e12898c 100644 --- a/tools/docker/README.md +++ b/tools/docker/README.md @@ -15,8 +15,13 @@ and try out to use GluonNLP to solve your problem. You can run the docker with the following command. ``` +# On GPU machine docker pull gluonai/gluon-nlp:gpu-latest docker run --gpus all --rm -it -p 8888:8888 -p 8787:8787 -p 8786:8786 --shm-size=2g gluonai/gluon-nlp:gpu-latest + +# On CPU machine +docker pull gluonai/gluon-nlp:cpu-latest +docker run --rm -it -p 8888:8888 -p 8787:8787 -p 8786:8786 --shm-size=2g gluonai/gluon-nlp:cpu-latest ``` Here, we open the ports 8888, 8787, 8786, which are used for connecting to JupyterLab. 
@@ -41,7 +46,7 @@ of horovod + MXNet by running the question answering script cd gluon-nlp/scripts/question_answering -docker run --gpus all --rm -it --shm-size=4g -v `pwd`:/workspace/data gluonai/gluon-nlp:gpu-latest \ +docker run --gpus all --rm -it --shm-size=2g -v `pwd`:/workspace/data gluonai/gluon-nlp:gpu-latest \ bash -c 'cd /workspace/data && bash commands/run_squad2_albert_base.sh 1 2.0' ``` From 0650674854925ea5bd61152a6ed4d8d08226e1e9 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Mon, 12 Oct 2020 00:36:34 -0700 Subject: [PATCH 068/115] update --- tools/docker/install/install_tvm_cpu.sh | 4 ++-- tools/docker/install/install_tvm_gpu.sh | 4 ++-- tools/docker/ubuntu18.04-base-cpu.Dockerfile | 7 +++++++ tools/docker/ubuntu18.04-base-gpu.Dockerfile | 7 +++++++ 4 files changed, 18 insertions(+), 4 deletions(-) diff --git a/tools/docker/install/install_tvm_cpu.sh b/tools/docker/install/install_tvm_cpu.sh index 5598095090..37b6187329 100644 --- a/tools/docker/install/install_tvm_cpu.sh +++ b/tools/docker/install/install_tvm_cpu.sh @@ -20,9 +20,9 @@ set -e set -u set -o pipefail -cd /usr +cd ${WORKDIR} git clone https://github.com/apache/incubator-tvm tvm --recursive -cd /usr/tvm +cd ${WORKDIR}/tvm # checkout a hash-tag git checkout 6d0351a7f0e23eb5428c59a976edd2bfb8207c0d diff --git a/tools/docker/install/install_tvm_gpu.sh b/tools/docker/install/install_tvm_gpu.sh index f00ed64039..6364bf4bf7 100644 --- a/tools/docker/install/install_tvm_gpu.sh +++ b/tools/docker/install/install_tvm_gpu.sh @@ -20,9 +20,9 @@ set -e set -u set -o pipefail -cd /usr +cd ${WORKDIR} git clone https://github.com/apache/incubator-tvm tvm --recursive -cd /usr/tvm +cd ${WORKDIR}/tvm # checkout a hash-tag git checkout 6d0351a7f0e23eb5428c59a976edd2bfb8207c0d diff --git a/tools/docker/ubuntu18.04-base-cpu.Dockerfile b/tools/docker/ubuntu18.04-base-cpu.Dockerfile index 912a3076d1..c072e06b55 100644 --- a/tools/docker/ubuntu18.04-base-cpu.Dockerfile +++ 
b/tools/docker/ubuntu18.04-base-cpu.Dockerfile @@ -23,8 +23,15 @@ RUN bash /install/install_openmpi.sh ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH ENV PATH=/usr/local/openmpi/bin/:/usr/local/bin:/root/.local/bin:$PATH +# Install LLVM +RUN bash /install/install_llvm.sh + +# Install Python Packages RUN bash /install/install_python_packages.sh +# Install TVM +RUN bash /install/install_tvm_gpu.sh + # Install MXNet RUN python3 -m pip install -U --pre "mxnet>=2.0.0b20200926" -f https://dist.mxnet.io/python diff --git a/tools/docker/ubuntu18.04-base-gpu.Dockerfile b/tools/docker/ubuntu18.04-base-gpu.Dockerfile index a2a91276e0..81f1be771e 100644 --- a/tools/docker/ubuntu18.04-base-gpu.Dockerfile +++ b/tools/docker/ubuntu18.04-base-gpu.Dockerfile @@ -22,8 +22,15 @@ RUN bash /install/install_openmpi.sh ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH ENV PATH=/usr/local/openmpi/bin/:/usr/local/bin:/root/.local/bin:$PATH +# Install LLVM +RUN bash /install/install_llvm.sh + +# Install Python Packages RUN bash /install/install_python_packages.sh +# Install TVM +RUN bash /install/install_tvm_gpu.sh + # Install MXNet RUN python3 -m pip install -U --pre "mxnet-cu102>=2.0.0b20200926" -f https://dist.mxnet.io/python From 644618a0cd55c330650250ff21ffcb52ec610d3e Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Mon, 12 Oct 2020 00:39:59 -0700 Subject: [PATCH 069/115] Update README.md --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index feb20cba2a..c795778acc 100644 --- a/README.md +++ b/README.md @@ -92,8 +92,13 @@ You may go to [tests](tests) to see how to run the unittests. You can use Docker to launch a JupyterLab development environment with GluonNLP installed. 
``` +# GPU Instance docker pull gluonai/gluon-nlp:gpu-latest docker run --gpus all --rm -it -p 8888:8888 -p 8787:8787 -p 8786:8786 --shm-size=4g gluonai/gluon-nlp:gpu-latest + +# CPU Instance +docker pull gluonai/gluon-nlp:cpu-latest +docker run --rm -it -p 8888:8888 -p 8787:8787 -p 8786:8786 --shm-size=4g gluonai/gluon-nlp:cpu-latest ``` For more details, you can refer to the guidance in [tools/docker](tools/docker). From 7b7f42f21c776c723787b221339d4021ac34f2b3 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Mon, 12 Oct 2020 01:01:18 -0700 Subject: [PATCH 070/115] update --- tools/docker/install/install_llvm.sh | 2 +- tools/docker/install/install_tvm_cpu.sh | 2 +- tools/docker/install/install_tvm_gpu.sh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/docker/install/install_llvm.sh b/tools/docker/install/install_llvm.sh index 7ca627b5ca..c52c679920 100644 --- a/tools/docker/install/install_llvm.sh +++ b/tools/docker/install/install_llvm.sh @@ -2,4 +2,4 @@ set -euo pipefail wget https://apt.llvm.org/llvm.sh chmod +x llvm.sh -./llvm.sh 8 # Fix version +./llvm.sh 10 # Fix version diff --git a/tools/docker/install/install_tvm_cpu.sh b/tools/docker/install/install_tvm_cpu.sh index 37b6187329..3dc27fd0f0 100644 --- a/tools/docker/install/install_tvm_cpu.sh +++ b/tools/docker/install/install_tvm_cpu.sh @@ -26,7 +26,7 @@ cd ${WORKDIR}/tvm # checkout a hash-tag git checkout 6d0351a7f0e23eb5428c59a976edd2bfb8207c0d -echo set\(USE_LLVM llvm-config-8\) >> config.cmake +echo set\(USE_LLVM llvm-config-10\) >> config.cmake echo set\(USE_GRAPH_RUNTIME ON\) >> config.cmake echo set\(USE_BLAS openblas\) >> config.cmake mkdir -p build diff --git a/tools/docker/install/install_tvm_gpu.sh b/tools/docker/install/install_tvm_gpu.sh index 6364bf4bf7..f7f8cdfb8a 100644 --- a/tools/docker/install/install_tvm_gpu.sh +++ b/tools/docker/install/install_tvm_gpu.sh @@ -26,7 +26,7 @@ cd ${WORKDIR}/tvm # checkout a hash-tag git checkout 
6d0351a7f0e23eb5428c59a976edd2bfb8207c0d -echo set\(USE_LLVM llvm-config-8\) >> config.cmake +echo set\(USE_LLVM llvm-config-10\) >> config.cmake echo set\(USE_CUDA ON\) >> config.cmake echo set\(USE_CUDNN ON\) >> config.cmake echo set\(USE_BLAS openblas\) >> config.cmake From 0e169c9feb099a22ccd170ee481375f2f2790e82 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Mon, 12 Oct 2020 01:19:48 -0700 Subject: [PATCH 071/115] Update install_python_packages.sh --- tools/docker/install/install_python_packages.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/docker/install/install_python_packages.sh b/tools/docker/install/install_python_packages.sh index 96dededd35..b18534647f 100644 --- a/tools/docker/install/install_python_packages.sh +++ b/tools/docker/install/install_python_packages.sh @@ -1,10 +1,9 @@ set -euo pipefail +pip3 install --no-cache --upgrade wheel -# install PyYAML==5.1.2 to avoid conflict with latest awscli # python-dateutil==2.8.0 to satisfy botocore associated with latest awscli pip3 install --no-cache --upgrade \ - wheel \ numpy==1.19.1 \ pandas==0.25.1 \ pytest \ From 49d14533d3683be9f50b267be6c76817e73956e6 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Mon, 12 Oct 2020 01:23:35 -0700 Subject: [PATCH 072/115] Update install_llvm.sh --- tools/docker/install/install_llvm.sh | 48 +++++++++++++++++++++++++--- 1 file changed, 44 insertions(+), 4 deletions(-) diff --git a/tools/docker/install/install_llvm.sh b/tools/docker/install/install_llvm.sh index c52c679920..292fdde044 100644 --- a/tools/docker/install/install_llvm.sh +++ b/tools/docker/install/install_llvm.sh @@ -1,5 +1,45 @@ -set -euo pipefail +#!/bin/bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. -wget https://apt.llvm.org/llvm.sh -chmod +x llvm.sh -./llvm.sh 10 # Fix version +set -e +set -u +set -o pipefail + +echo deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-9 main\ + >> /etc/apt/sources.list.d/llvm.list +echo deb-src http://apt.llvm.org/bionic/ llvm-toolchain-bionic-9 main\ + >> /etc/apt/sources.list.d/llvm.list + + +echo deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-10 main\ + >> /etc/apt/sources.list.d/llvm.list +echo deb-src http://apt.llvm.org/bionic/ llvm-toolchain-bionic-10 main\ + >> /etc/apt/sources.list.d/llvm.list + +echo deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-11 main\ + >> /etc/apt/sources.list.d/llvm.list +echo deb-src http://apt.llvm.org/bionic/ llvm-toolchain-bionic-11 main\ + >> /etc/apt/sources.list.d/llvm.list + +echo deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic main\ + >> /etc/apt/sources.list.d/llvm.list +echo deb-src http://apt.llvm.org/bionic/ llvm-toolchain-bionic main\ + >> /etc/apt/sources.list.d/llvm.list + +wget -q -O - http://apt.llvm.org/llvm-snapshot.gpg.key|sudo apt-key add - +apt-get update && apt-get install -y llvm-9 llvm-10 llvm-11 clang-9 clang-10 clang-11 From c6c131d8a72f1a5fb385784cec70f9241dfae6e4 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Mon, 12 Oct 2020 01:59:35 -0700 Subject: [PATCH 073/115] Update install_python_packages.sh --- tools/docker/install/install_python_packages.sh | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/docker/install/install_python_packages.sh b/tools/docker/install/install_python_packages.sh index b18534647f..eb9232426e 100644 --- a/tools/docker/install/install_python_packages.sh +++ b/tools/docker/install/install_python_packages.sh @@ -18,4 +18,4 @@ pip3 install --no-cache --upgrade \ mpi4py==3.0.2 \ jupyterlab==2.2.4 \ cmake \ - awscli + awscli --user From efbd7f598b5a072b77a367e5403bfbc9a288b1a1 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Mon, 12 Oct 2020 02:09:36 -0700 Subject: [PATCH 074/115] Update install_llvm.sh --- tools/docker/install/install_llvm.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/docker/install/install_llvm.sh b/tools/docker/install/install_llvm.sh index 292fdde044..56f793b201 100644 --- a/tools/docker/install/install_llvm.sh +++ b/tools/docker/install/install_llvm.sh @@ -41,5 +41,5 @@ echo deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic main\ echo deb-src http://apt.llvm.org/bionic/ llvm-toolchain-bionic main\ >> /etc/apt/sources.list.d/llvm.list -wget -q -O - http://apt.llvm.org/llvm-snapshot.gpg.key|sudo apt-key add - +wget -q -O - http://apt.llvm.org/llvm-snapshot.gpg.key|apt-key add - apt-get update && apt-get install -y llvm-9 llvm-10 llvm-11 clang-9 clang-10 clang-11 From 522fa851efe8f6aa211a2d90de339fbdb62e07aa Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Mon, 12 Oct 2020 02:18:51 -0700 Subject: [PATCH 075/115] update --- tools/docker/install/install_jupyter_lab.sh | 2 +- tools/docker/ubuntu18.04-base-cpu.Dockerfile | 4 ++-- tools/docker/ubuntu18.04-base-gpu.Dockerfile | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/docker/install/install_jupyter_lab.sh b/tools/docker/install/install_jupyter_lab.sh index 79633218fb..5b6f4b5a84 100644 --- a/tools/docker/install/install_jupyter_lab.sh +++ b/tools/docker/install/install_jupyter_lab.sh @@ -13,7 +13,7 @@ pip3 install --no-cache --upgrade \ 
jupyter_tensorboard==0.2.0 \ widgetsnbextension==3.5.1 \ tensorboard==2.1.1 \ - tensorboardX==2.1 + tensorboardX==2.1 --user jupyter labextension install jupyterlab_tensorboard \ && jupyter nbextension enable --py widgetsnbextension \ && jupyter labextension install @jupyter-widgets/jupyterlab-manager diff --git a/tools/docker/ubuntu18.04-base-cpu.Dockerfile b/tools/docker/ubuntu18.04-base-cpu.Dockerfile index c072e06b55..c4b7c35269 100644 --- a/tools/docker/ubuntu18.04-base-cpu.Dockerfile +++ b/tools/docker/ubuntu18.04-base-cpu.Dockerfile @@ -33,10 +33,10 @@ RUN bash /install/install_python_packages.sh RUN bash /install/install_tvm_gpu.sh # Install MXNet -RUN python3 -m pip install -U --pre "mxnet>=2.0.0b20200926" -f https://dist.mxnet.io/python +RUN python3 -m pip install -U --pre "mxnet>=2.0.0b20200926" -f https://dist.mxnet.io/python --user # Install PyTorch -RUN python3 -m pip install -U torch torchvision +RUN python3 -m pip install -U torch torchvision --user # Install Jupyter Lab RUN bash /install/install_jupyter_lab.sh diff --git a/tools/docker/ubuntu18.04-base-gpu.Dockerfile b/tools/docker/ubuntu18.04-base-gpu.Dockerfile index 81f1be771e..ae89fb27b9 100644 --- a/tools/docker/ubuntu18.04-base-gpu.Dockerfile +++ b/tools/docker/ubuntu18.04-base-gpu.Dockerfile @@ -32,10 +32,10 @@ RUN bash /install/install_python_packages.sh RUN bash /install/install_tvm_gpu.sh # Install MXNet -RUN python3 -m pip install -U --pre "mxnet-cu102>=2.0.0b20200926" -f https://dist.mxnet.io/python +RUN python3 -m pip install -U --pre "mxnet-cu102>=2.0.0b20200926" -f https://dist.mxnet.io/python --user # Install PyTorch -RUN python3 -m pip install -U torch torchvision +RUN python3 -m pip install -U torch torchvision --user # Install Horovod RUN bash /install/install_horovod.sh From 6d53466894ece82fca07659623cf02ca8de858fd Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Mon, 12 Oct 2020 02:31:13 -0700 Subject: [PATCH 076/115] Update install_ubuntu18.04_core.sh --- 
tools/docker/install/install_ubuntu18.04_core.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/docker/install/install_ubuntu18.04_core.sh b/tools/docker/install/install_ubuntu18.04_core.sh index 18cce66c7f..1a57e4af5a 100644 --- a/tools/docker/install/install_ubuntu18.04_core.sh +++ b/tools/docker/install/install_ubuntu18.04_core.sh @@ -22,6 +22,7 @@ apt-get update \ unzip \ less \ libopenblas-dev \ + gpg-agent \ ninja-build \ openssh-client \ openssh-server \ From 1fcf8a3009d5f260c6da5937d43ead9af7975421 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Mon, 12 Oct 2020 02:52:37 -0700 Subject: [PATCH 077/115] fix --- tools/docker/ubuntu18.04-base-cpu.Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/docker/ubuntu18.04-base-cpu.Dockerfile b/tools/docker/ubuntu18.04-base-cpu.Dockerfile index c4b7c35269..8f46f1d81f 100644 --- a/tools/docker/ubuntu18.04-base-cpu.Dockerfile +++ b/tools/docker/ubuntu18.04-base-cpu.Dockerfile @@ -30,7 +30,7 @@ RUN bash /install/install_llvm.sh RUN bash /install/install_python_packages.sh # Install TVM -RUN bash /install/install_tvm_gpu.sh +RUN bash /install/install_tvm_cpu.sh # Install MXNet RUN python3 -m pip install -U --pre "mxnet>=2.0.0b20200926" -f https://dist.mxnet.io/python --user From 450d08e88cca6c5c9ef3c69fa382541723ed2711 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Mon, 12 Oct 2020 20:17:27 -0700 Subject: [PATCH 078/115] Update submit-job.py --- tools/batch/submit-job.py | 74 ++++++++++++++++++++++----------------- 1 file changed, 42 insertions(+), 32 deletions(-) diff --git a/tools/batch/submit-job.py b/tools/batch/submit-job.py index 2829cdd0c9..a656672ba4 100644 --- a/tools/batch/submit-job.py +++ b/tools/batch/submit-job.py @@ -8,6 +8,45 @@ import boto3 from botocore.compat import total_seconds +instance_type_info = { + 'g4dn.4x': { + 'job_definition': 'gluon-nlp-g4dn_4xlarge:3', + 'job_queue': 'g4dn' + }, + 'g4dn.8x': { + 'job_definition': 'gluon-nlp-g4dn_8xlarge:3', + 
'job_queue': 'g4dn' + }, + 'g4dn.12x': { + 'job_definition': 'gluon-nlp-g4dn_12xlarge:3', + 'job_queue': 'g4dn-multi-gpu' + }, + 'p3.2x': { + 'job_definition': 'gluon-nlp-p3_2xlarge:3', + 'job_queue': 'p3' + }, + 'p3.8x': { + 'job_definition': 'gluon-nlp-p3_8xlarge:3', + 'job_queue': 'p3-4gpu' + }, + 'p3.16x': { + 'job_definition': 'gluon-nlp-p3_16xlarge:3', + 'job_queue': 'p3-8gpu' + }, + 'p3dn.24x': { + 'job_definition': 'gluon-nlp-p3_24xlarge:3', + 'job_queue': 'p3dn-8gpu' + }, + 'c5n.4x': { + 'job_definition': 'gluon-nlp-c5_4xlarge:1', + 'job_queue': 'c5n' + }, + 'c5n.18x': { + 'job_definition': 'gluon-nlp-c5_18xlarge:1', + 'job_queue': 'c5n' + } +} + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--profile', help='profile name of aws account.', type=str, @@ -16,9 +55,7 @@ default=None) parser.add_argument('--name', help='name of the job', type=str, default='dummy') parser.add_argument('--job-type', help='type of job to submit.', type=str, - choices=['g4dn.4x', 'g4dn.8x', 'g4dn.12x', 'g4dn.16x', - 'p3.2x', 'p3.8x', 'p3.16x', 'p3dn.24x', - 'c5n.18x', 'c5n.4x'], default='g4dn.4x') + choices=instance_type_info.keys(), default='g4dn.4x') parser.add_argument('--source-ref', help='ref in GluonNLP main github. e.g. 
master, refs/pull/500/head', type=str, default='master') @@ -76,41 +113,14 @@ def nowInMillis(): return endTime -job_definitions = { - 'g4dn.4x': 'gluon-nlp-1-jobs:5', - 'g4dn.8x': 'gluon-nlp-1-jobs:4', - 'g4dn.12x': 'gluon-nlp-g4-12dn:4', #'gluon-nlp-1-4gpu-jobs:3', - 'g4dn.16x': 'gluon-nlp-1-jobs:3', - 'p3.2x': 'gluon-nlp-1-jobs:11', - 'p3.8x': 'gluon-nlp-1-4gpu-jobs:2', - 'p3.16x': 'gluon-nlp-1-8gpu-jobs:1', - 'p3dn.24x': 'gluon-nlp-1-8gpu-jobs:2', - 'c5n.4x': 'gluon-nlp-1-cpu-jobs:3', - 'c5n.18x': 'gluon-nlp-1-cpu-jobs:2', -} - -job_queues = { - 'g4dn.4x': 'g4dn', - 'g4dn.8x': 'g4dn', - 'g4dn.12x': 'g4dn-multi-gpu', - 'g4dn.16x': 'g4dn', - 'p3.2x': 'p3', - 'p3.8x': 'p3-4gpu', - 'p3.16x': 'p3-8gpu', - 'p3dn.24x': 'p3dn-8gpu', - 'c5n.4x': 'c5n', - 'c5n.18x': 'c5n', -} - - def main(): spin = ['-', '/', '|', '\\', '-', '/', '|', '\\'] logGroupName = '/aws/batch/job' jobName = re.sub('[^A-Za-z0-9_\-]', '', args.name)[:128] # Enforce AWS Batch jobName rules jobType = args.job_type - jobQueue = job_queues[jobType] - jobDefinition = job_definitions[jobType] + jobQueue = instance_type_info[jobType]['job_queue'] + jobDefinition = instance_type_info[jobType]['job_definition'] command = args.command.split() wait = args.wait From 207d0d05f2f4bd0341db3ea795debd7ef4b2b3a6 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Mon, 12 Oct 2020 22:15:28 -0700 Subject: [PATCH 079/115] Update submit-job.py --- tools/batch/submit-job.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tools/batch/submit-job.py b/tools/batch/submit-job.py index a656672ba4..2a5b3b802d 100644 --- a/tools/batch/submit-job.py +++ b/tools/batch/submit-job.py @@ -10,39 +10,39 @@ instance_type_info = { 'g4dn.4x': { - 'job_definition': 'gluon-nlp-g4dn_4xlarge:3', + 'job_definition': 'gluon-nlp-g4dn_4xlarge:4', 'job_queue': 'g4dn' }, 'g4dn.8x': { - 'job_definition': 'gluon-nlp-g4dn_8xlarge:3', + 'job_definition': 'gluon-nlp-g4dn_8xlarge:4', 'job_queue': 'g4dn' }, 'g4dn.12x': { 
- 'job_definition': 'gluon-nlp-g4dn_12xlarge:3', + 'job_definition': 'gluon-nlp-g4dn_12xlarge:4', 'job_queue': 'g4dn-multi-gpu' }, 'p3.2x': { - 'job_definition': 'gluon-nlp-p3_2xlarge:3', + 'job_definition': 'gluon-nlp-p3_2xlarge:4', 'job_queue': 'p3' }, 'p3.8x': { - 'job_definition': 'gluon-nlp-p3_8xlarge:3', + 'job_definition': 'gluon-nlp-p3_8xlarge:4', 'job_queue': 'p3-4gpu' }, 'p3.16x': { - 'job_definition': 'gluon-nlp-p3_16xlarge:3', + 'job_definition': 'gluon-nlp-p3_16xlarge:4', 'job_queue': 'p3-8gpu' }, 'p3dn.24x': { - 'job_definition': 'gluon-nlp-p3_24xlarge:3', + 'job_definition': 'gluon-nlp-p3_24xlarge:4', 'job_queue': 'p3dn-8gpu' }, 'c5n.4x': { - 'job_definition': 'gluon-nlp-c5_4xlarge:1', + 'job_definition': 'gluon-nlp-c5_4xlarge:2', 'job_queue': 'c5n' }, 'c5n.18x': { - 'job_definition': 'gluon-nlp-c5_18xlarge:1', + 'job_definition': 'gluon-nlp-c5_18xlarge:2', 'job_queue': 'c5n' } } From ad7dd82e4a55dce55dc7051767ea2bb7e83e9eff Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Mon, 12 Oct 2020 22:28:12 -0700 Subject: [PATCH 080/115] Update README.md --- scripts/datasets/question_answering/README.md | 8 -------- 1 file changed, 8 deletions(-) diff --git a/scripts/datasets/question_answering/README.md b/scripts/datasets/question_answering/README.md index 10328c1407..1336f69bb9 100644 --- a/scripts/datasets/question_answering/README.md +++ b/scripts/datasets/question_answering/README.md @@ -1,13 +1,5 @@ # Question Answering -| Datasets | #Train | #Valid | #Test | Leaderboard | Answerable | Long Context | -|-----------|--------|--------|-------|-------------|------------|--------------| -| SQuAD 2.0 | -| SQuAD 1.1 | -| SearchQA | -| TriviaQA | -| HotpotQA | - ## SQuAD SQuAD datasets is distributed under the [CC BY-SA 4.0](http://creativecommons.org/licenses/by-sa/4.0/legalcode) license. 
From d7513876993b020c41dbf42593eece676a6cfdcc Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Mon, 12 Oct 2020 22:30:55 -0700 Subject: [PATCH 081/115] Update README.md --- scripts/datasets/pretrain_corpus/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/datasets/pretrain_corpus/README.md b/scripts/datasets/pretrain_corpus/README.md index a0a0c8493a..3f56dc3eeb 100644 --- a/scripts/datasets/pretrain_corpus/README.md +++ b/scripts/datasets/pretrain_corpus/README.md @@ -18,7 +18,7 @@ Thus, we utilize the [Project Gutenberg](https://www.gutenberg.org/) as an alter You can use the following command to download and prepare the Gutenberg corpus. ```bash -python3 prepare_gutenberg.py --save_dir . +python3 prepare_gutenberg.py --save_dir gutenberg ``` Also, you should follow the [license](https://www.gutenberg.org/wiki/Gutenberg:The_Project_Gutenberg_License) for using the data. From 73437bca7396e621e7373c1552118c0ef71634ee Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Mon, 12 Oct 2020 22:32:57 -0700 Subject: [PATCH 082/115] Update prepare_gutenberg.py --- scripts/datasets/pretrain_corpus/prepare_gutenberg.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/datasets/pretrain_corpus/prepare_gutenberg.py b/scripts/datasets/pretrain_corpus/prepare_gutenberg.py index b755a5801e..a807ed5989 100644 --- a/scripts/datasets/pretrain_corpus/prepare_gutenberg.py +++ b/scripts/datasets/pretrain_corpus/prepare_gutenberg.py @@ -63,6 +63,7 @@ def main(args): for name in f.namelist(): if name.endswith('.txt'): filename = os.path.basename(name) + print(name, filename) f.extract(name, os.path.join(save_dir, filename)) From ae137d2967eb11597278abd91baf13c76868def2 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Mon, 12 Oct 2020 23:09:09 -0700 Subject: [PATCH 083/115] Delete gluon_nlp_cpu_job.sh --- tools/docker/gluon_nlp_cpu_job.sh | 33 ------------------------------- 1 file changed, 33 deletions(-) delete mode 100644 
tools/docker/gluon_nlp_cpu_job.sh diff --git a/tools/docker/gluon_nlp_cpu_job.sh b/tools/docker/gluon_nlp_cpu_job.sh deleted file mode 100644 index 3045209c4f..0000000000 --- a/tools/docker/gluon_nlp_cpu_job.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/bash -date -echo "Args: $@" -env -echo "jobId: $AWS_BATCH_JOB_ID" -echo "jobQueue: $AWS_BATCH_JQ_NAME" -echo "computeEnvironment: $AWS_BATCH_CE_NAME" - -SOURCE_REF=$1 -WORK_DIR=$2 -COMMAND=$3 -SAVED_OUTPUT=$4 -SAVE_PATH=$5 -REMOTE=$6 - -if [ ! -z $REMOTE ]; then - git remote set-url origin $REMOTE -fi; - -git fetch origin $SOURCE_REF:working -git checkout working -python3 -m pip install -U --quiet --pre "mxnet>=2.0.0b20200802" -f https://dist.mxnet.io/python -python3 -m pip install --quiet -e .[extras] - -cd $WORK_DIR -/bin/bash -o pipefail -c "$COMMAND" -COMMAND_EXIT_CODE=$? -if [[ -f $SAVED_OUTPUT ]]; then - aws s3 cp $SAVED_OUTPUT s3://gluon-nlp-dev/batch/$AWS_BATCH_JOB_ID/$SAVE_PATH; -elif [[ -d $SAVED_OUTPUT ]]; then - aws s3 cp --recursive $SAVED_OUTPUT s3://gluon-nlp-dev/batch/$AWS_BATCH_JOB_ID/$SAVE_PATH; -fi; -exit $COMMAND_EXIT_CODE From 7e8947a6dd38506ba61b6381203203951d5a6b42 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Mon, 12 Oct 2020 23:36:23 -0700 Subject: [PATCH 084/115] Update prepare_gutenberg.py --- scripts/datasets/pretrain_corpus/prepare_gutenberg.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/datasets/pretrain_corpus/prepare_gutenberg.py b/scripts/datasets/pretrain_corpus/prepare_gutenberg.py index a807ed5989..6bf4b16825 100644 --- a/scripts/datasets/pretrain_corpus/prepare_gutenberg.py +++ b/scripts/datasets/pretrain_corpus/prepare_gutenberg.py @@ -63,8 +63,7 @@ def main(args): for name in f.namelist(): if name.endswith('.txt'): filename = os.path.basename(name) - print(name, filename) - f.extract(name, os.path.join(save_dir, filename)) + f.extract(name, os.path.join(save_dir, filename.replace(' ', '_'))) def cli_main(): From 
c512fac23c8dd9f61dc96e95d531ed3c5cbd590e Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Mon, 12 Oct 2020 23:42:40 -0700 Subject: [PATCH 085/115] Update prepare_gutenberg.py --- scripts/datasets/pretrain_corpus/prepare_gutenberg.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/datasets/pretrain_corpus/prepare_gutenberg.py b/scripts/datasets/pretrain_corpus/prepare_gutenberg.py index 6bf4b16825..f3d8925c5e 100644 --- a/scripts/datasets/pretrain_corpus/prepare_gutenberg.py +++ b/scripts/datasets/pretrain_corpus/prepare_gutenberg.py @@ -3,7 +3,7 @@ import zipfile from gluonnlp.base import get_data_home_dir from gluonnlp.utils.misc import download, load_checksum_stats - +import shutil _CITATIONS = r""" @InProceedings{lahiri:2014:SRW, @@ -63,7 +63,9 @@ def main(args): for name in f.namelist(): if name.endswith('.txt'): filename = os.path.basename(name) - f.extract(name, os.path.join(save_dir, filename.replace(' ', '_'))) + with f.open(name) as in_file: + with open(os.path.join(save_dir, filename.replace(' ', '_')), 'wb') as out_file: + shutil.copyfileobj(in_file, out_file) def cli_main(): From 0ebfcd7c9a52930a77eb70ba12c447d386d9929d Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Mon, 12 Oct 2020 23:46:07 -0700 Subject: [PATCH 086/115] Update prepare_gutenberg.py --- scripts/datasets/pretrain_corpus/prepare_gutenberg.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/datasets/pretrain_corpus/prepare_gutenberg.py b/scripts/datasets/pretrain_corpus/prepare_gutenberg.py index f3d8925c5e..590310b62e 100644 --- a/scripts/datasets/pretrain_corpus/prepare_gutenberg.py +++ b/scripts/datasets/pretrain_corpus/prepare_gutenberg.py @@ -59,6 +59,7 @@ def main(args): save_dir = args.dataset if args.save_dir is None else args.save_dir if not os.path.exists(save_dir): os.makedirs(save_dir, exist_ok=True) + print(f'Save to {save_dir}') with zipfile.ZipFile(target_download_location) as f: for name in f.namelist(): if name.endswith('.txt'): 
From cd4b24d113d53adda7fca2cfc90f51133b48f328 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Tue, 13 Oct 2020 00:14:07 -0700 Subject: [PATCH 087/115] Update conf.py --- docs/conf.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 916edd0a99..9d4c32eefa 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -234,10 +234,10 @@ def setup(app): 'auto_doc_ref': True }, True) app.add_transform(AutoStructify) - app.add_javascript('google_analytics.js') - app.add_javascript('hidebib.js') - app.add_javascript('install-options.js') - app.add_stylesheet('custom.css') + app.add_js_file('google_analytics.js') + app.add_js_file('hidebib.js') + app.add_js_file('install-options.js') + app.add_css_file('custom.css') sphinx_gallery_conf = { From 19324d9013e428e80eee806750f88d9f0c024b7a Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Tue, 13 Oct 2020 00:16:49 -0700 Subject: [PATCH 088/115] update --- tools/batch/docker/Dockerfile.cpu | 33 ------------------------------- tools/batch/docker/Dockerfile.gpu | 33 ------------------------------- tools/batch/docker/README.md | 25 ----------------------- tools/batch/run_batch_squad.sh | 4 ++-- 4 files changed, 2 insertions(+), 93 deletions(-) delete mode 100644 tools/batch/docker/Dockerfile.cpu delete mode 100644 tools/batch/docker/Dockerfile.gpu delete mode 100644 tools/batch/docker/README.md diff --git a/tools/batch/docker/Dockerfile.cpu b/tools/batch/docker/Dockerfile.cpu deleted file mode 100644 index ca5cb6029e..0000000000 --- a/tools/batch/docker/Dockerfile.cpu +++ /dev/null @@ -1,33 +0,0 @@ -FROM ubuntu:18.04 - -RUN apt-get update && apt-get install -y --no-install-recommends \ - build-essential \ - locales \ - cmake \ - wget \ - subversion \ - git \ - curl \ - vim \ - unzip \ - sudo \ - ca-certificates \ - libjpeg-dev \ - libpng-dev \ - libfreetype6-dev \ - libopenblas-dev \ - python3-dev \ - python3-pip \ - python3-setuptools \ - libxft-dev &&\ - rm -rf /var/lib/apt/lists/* - 
-RUN pip3 install --upgrade pip -RUN pip3 install --no-cache --upgrade \ - wheel \ - cmake \ - awscli -RUN git clone https://github.com/dmlc/gluon-nlp -WORKDIR gluon-nlp -ADD gluon_nlp_cpu_job.sh . -RUN chmod +x gluon_nlp_cpu_job.sh diff --git a/tools/batch/docker/Dockerfile.gpu b/tools/batch/docker/Dockerfile.gpu deleted file mode 100644 index 88ad1c86aa..0000000000 --- a/tools/batch/docker/Dockerfile.gpu +++ /dev/null @@ -1,33 +0,0 @@ -FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04 - -RUN apt-get update && apt-get install -y --no-install-recommends \ - build-essential \ - locales \ - cmake \ - wget \ - subversion \ - git \ - curl \ - vim \ - unzip \ - sudo \ - ca-certificates \ - libjpeg-dev \ - libpng-dev \ - libfreetype6-dev \ - libopenblas-dev \ - python3-dev \ - python3-pip \ - python3-setuptools \ - libxft-dev &&\ - rm -rf /var/lib/apt/lists/* - -RUN pip3 install --upgrade pip -RUN pip3 install --no-cache --upgrade \ - wheel \ - cmake \ - awscli -RUN git clone https://github.com/dmlc/gluon-nlp -WORKDIR gluon-nlp -ADD gluon_nlp_job.sh . -RUN chmod +x gluon_nlp_job.sh diff --git a/tools/batch/docker/README.md b/tools/batch/docker/README.md deleted file mode 100644 index 3675f82980..0000000000 --- a/tools/batch/docker/README.md +++ /dev/null @@ -1,25 +0,0 @@ -# Updating the Docker for AWS Batch. - -Our current batch job dockers are in 747303060528.dkr.ecr.us-east-1.amazonaws.com/gluon-nlp-1. To -update the docker: -- update the Dockerfile -- Make sure docker and docker-compose, as well as the docker python package are installed. -- Export the AWS account credentials as environment variables -- CD to the same folder as the Dockerfile and execute the following: - -``` -# this executes a command that logs into ECR. -$(aws ecr get-login --no-include-email --region us-east-1) - -# builds the Dockerfile as gluon-nlp-1 docker. -docker build -f Dockerfile.gpu -t gluon-nlp-1:gpu . -docker build -f Dockerfile.cpu -t gluon-nlp-1:cpu . 
- -# tags the recent build as gluon-nlp-1:latest, which AWS batch pulls from. -docker tag gluon-nlp-1:gpu 747303060528.dkr.ecr.us-east-1.amazonaws.com/gluon-nlp-1:latest -docker tag gluon-nlp-1:cpu 747303060528.dkr.ecr.us-east-1.amazonaws.com/gluon-nlp-1:cpu-latest - -# pushes the change -docker push 747303060528.dkr.ecr.us-east-1.amazonaws.com/gluon-nlp-1:latest -docker push 747303060528.dkr.ecr.us-east-1.amazonaws.com/gluon-nlp-1:cpu-latest -``` diff --git a/tools/batch/run_batch_squad.sh b/tools/batch/run_batch_squad.sh index 2d0d8bf0d9..8349716c29 100644 --- a/tools/batch/run_batch_squad.sh +++ b/tools/batch/run_batch_squad.sh @@ -18,11 +18,11 @@ for MODEL_NAME in albert_base \ do python3 submit-job.py \ --region us-east-1 \ - --source-ref fix_docker \ + --source-ref master \ --job-type g4dn.12x \ --save-path temp \ --name test_squad2_${MODEL_NAME} \ --work-dir scripts/question_answering \ - --remote https://github.com/sxjscience/gluon-nlp/ \ + --remote https://github.com/dmlc/gluon-nlp/ \ --command "bash commands/run_squad2_${MODEL_NAME}.sh ${USE_HOROVOD} ${VERSION} | tee stdout.log" >> ${LOG_PATH} done From 6532042c548ba3c70826034b379c11de8b1f7fba Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Tue, 13 Oct 2020 10:48:53 -0700 Subject: [PATCH 089/115] Update generate_commands.py --- .../commands/generate_commands.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/scripts/question_answering/commands/generate_commands.py b/scripts/question_answering/commands/generate_commands.py index d8d3f0c6ec..192d12c06b 100644 --- a/scripts/question_answering/commands/generate_commands.py +++ b/scripts/question_answering/commands/generate_commands.py @@ -131,10 +131,9 @@ def replace_fn(match): if __name__ == '__main__': - for squad_version in [1.1, 2.0]: - for cfg_func in [albert_base_cfg, albert_large_cfg, albert_xlarge_cfg, albert_xxlarge_cfg, - electra_base_cfg, electra_large_cfg, electra_small_cfg, mobilebert_cfg, - roberta_large_cfg, 
uncased_bert_base_cfg, uncased_bert_large_cfg]: - prefix = cfg_func.__name__[:-len('_cfg')] - gen_command(cfg_func(), 'run_squad.template', - f'run_squad2_{prefix}.sh') + for cfg_func in [albert_base_cfg, albert_large_cfg, albert_xlarge_cfg, albert_xxlarge_cfg, + electra_base_cfg, electra_large_cfg, electra_small_cfg, mobilebert_cfg, + roberta_large_cfg, uncased_bert_base_cfg, uncased_bert_large_cfg]: + prefix = cfg_func.__name__[:-len('_cfg')] + gen_command(cfg_func(), 'run_squad.template', + f'run_squad2_{prefix}.sh') From 33e2575a3084c9a7664c4bcbebf421fba20fdc63 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Tue, 13 Oct 2020 10:52:13 -0700 Subject: [PATCH 090/115] fix readme --- tools/batch/README.md | 24 +----------------------- 1 file changed, 1 insertion(+), 23 deletions(-) diff --git a/tools/batch/README.md b/tools/batch/README.md index d0b8cc9ef1..2206177420 100644 --- a/tools/batch/README.md +++ b/tools/batch/README.md @@ -15,27 +15,7 @@ python3 submit-job.py \ # Updating the Docker for AWS Batch. -Our current batch job dockers are in 747303060528.dkr.ecr.us-east-1.amazonaws.com/gluon-nlp-1. To -update the docker: -- Update the Dockerfile -- Make sure docker and docker-compose, as well as the docker python package are installed. -- Export the AWS account credentials as environment variables -- CD to the same folder as the Dockerfile and execute the following: - -``` -# this executes a command that logs into ECR. -$(aws ecr get-login --no-include-email --region us-east-1) - -# builds the docker image use the command in docker - -# tags the recent build as gluon-nlp-1:latest, which AWS batch pulls from. 
-docker tag gluonai/gluon-nlp:gpu-ci-latest 747303060528.dkr.ecr.us-east-1.amazonaws.com/gluon-nlp-1:gpu-ci-latest -docker tag gluonai/gluon-nlp:cpu-ci-latest 747303060528.dkr.ecr.us-east-1.amazonaws.com/gluon-nlp-1:cpu-ci-latest - -# pushes the change -docker push 747303060528.dkr.ecr.us-east-1.amazonaws.com/gluon-nlp-1:gpu-ci-latest -docker push 747303060528.dkr.ecr.us-east-1.amazonaws.com/gluon-nlp-1:cpu-ci-latest -``` +You may refer to the instruction in [Docker](../docker) for more information. ## Conversion Toolkits Following the instruction of [converting scripts](../../scripts/conversion_toolkits), @@ -59,8 +39,6 @@ bash run_batch_squad.sh bash run_batch_squad.sh 1 2.0 submit_squad_v2_horovod.log ``` - - Internally, it will train the following models on SQuAD 2.0 dataset: | MODEL_NAME | |:------------------:| From 8e439c4968de3e257177b98060ad94d76e1828d9 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Tue, 13 Oct 2020 11:24:36 -0700 Subject: [PATCH 091/115] use os.link for hard link --- scripts/datasets/question_answering/prepare_squad.py | 8 ++++---- tools/docker/README.md | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/datasets/question_answering/prepare_squad.py b/scripts/datasets/question_answering/prepare_squad.py index 292fe1fae2..b9bc49c696 100644 --- a/scripts/datasets/question_answering/prepare_squad.py +++ b/scripts/datasets/question_answering/prepare_squad.py @@ -61,14 +61,14 @@ def main(args): os.makedirs(args.save_path) if not os.path.exists(os.path.join(args.save_path, train_file_name)) \ or (args.overwrite and args.save_path != args.cache_path): - shutil.copyfile(os.path.join(args.cache_path, train_file_name), - os.path.join(args.save_path, train_file_name)) + os.link(os.path.join(args.cache_path, train_file_name), + os.path.join(args.save_path, train_file_name)) else: print(f'Found {os.path.join(args.save_path, train_file_name)}...skip') if not os.path.exists(os.path.join(args.save_path, dev_file_name)) \ or 
(args.overwrite and args.save_path != args.cache_path): - shutil.copyfile(os.path.join(args.cache_path, dev_file_name), - os.path.join(args.save_path, dev_file_name)) + os.link(os.path.join(args.cache_path, dev_file_name), + os.path.join(args.save_path, dev_file_name)) else: print(f'Found {os.path.join(args.save_path, dev_file_name)}...skip') diff --git a/tools/docker/README.md b/tools/docker/README.md index 661e12898c..a8c68b0332 100644 --- a/tools/docker/README.md +++ b/tools/docker/README.md @@ -52,7 +52,7 @@ docker run --gpus all --rm -it --shm-size=2g -v `pwd`:/workspace/data gluonai/gl ## Build your own Docker Image -To build a docker image fom the dockerfile, you may use the following command: +To build a docker image from the dockerfile, you may use the following command: ``` # Build Base Dockers From 276d6d1d6b7856ef4e28ba3dc5f67630e23d6cf9 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Tue, 13 Oct 2020 11:25:01 -0700 Subject: [PATCH 092/115] Update README.md --- tools/docker/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/docker/README.md b/tools/docker/README.md index a8c68b0332..df9d6c29e9 100644 --- a/tools/docker/README.md +++ b/tools/docker/README.md @@ -84,7 +84,7 @@ After that, restart docker via `sudo systemctl restart docker.service`. For more details, you may refer to https://github.com/NVIDIA/nvidia-docker/issues/595. We need this additional setup because the horovod+mxnet integration identifies the library and include -path of MXNet by querying th MXNet runtime. +path of MXNet by querying the MXNet runtime. ### Developers of GluonNLP You may try to login to your dockerhub account and push the image to dockerhub. 
From 9127fde34e3e6c2f5ad451cd0b32aba19aa8fbf8 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Tue, 13 Oct 2020 12:08:35 -0700 Subject: [PATCH 093/115] Update README.md --- tools/docker/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/docker/README.md b/tools/docker/README.md index df9d6c29e9..7131db858a 100644 --- a/tools/docker/README.md +++ b/tools/docker/README.md @@ -33,6 +33,7 @@ The folder structure of the docker image will be ``` /workspace/ ├── gluonnlp +├── tvm ├── data ``` From 5ff701db2a3b8af159aa9f23d7a22e2475d3e642 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Tue, 13 Oct 2020 12:21:59 -0700 Subject: [PATCH 094/115] Update gluon_nlp_job.sh --- tools/docker/gluon_nlp_job.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/docker/gluon_nlp_job.sh b/tools/docker/gluon_nlp_job.sh index e2866a411a..f692c3fc16 100755 --- a/tools/docker/gluon_nlp_job.sh +++ b/tools/docker/gluon_nlp_job.sh @@ -28,6 +28,7 @@ if [ $DEVICE == "cpu" ]; then else # Due to the issue in https://forums.aws.amazon.com/thread.jspa?messageID=953912 # We need to manually configure the shm to ensure that Horovod is runnable. 
+ # The reason that we need a larger shm is described in https://github.com/NVIDIA/nccl/issues/290 umount shm mount -t tmpfs -o rw,nosuid,nodev,noexec,relatime,size=2G shm /dev/shm python3 -m pip install -U --quiet --pre "mxnet-cu102>=2.0.0b20200802" -f https://dist.mxnet.io/python From bc078860690d5eb8559a3ea5b36614bb374d6cd3 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Tue, 13 Oct 2020 13:23:09 -0700 Subject: [PATCH 095/115] Update __init__.py --- src/gluonnlp/models/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gluonnlp/models/__init__.py b/src/gluonnlp/models/__init__.py index 490667c20c..93bbd1f3bf 100644 --- a/src/gluonnlp/models/__init__.py +++ b/src/gluonnlp/models/__init__.py @@ -54,7 +54,7 @@ def get_backbone(model_name: str, -------- >>> from gluonnlp.models import get_backbone - >>> model_cls, tokenizer, cfg, backbone_param_path, _ = get_backbone('google_en_cased_bert_base') + >>> model_cls, cfg, tokenizer, backbone_param_path, _ = get_backbone('google_en_cased_bert_base') >>> model = model_cls.from_cfg(cfg) >>> model.load_parameters(backbone_param_path) """ From 92333268d2cc97d1f65596b0b8150c0717010a97 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Tue, 13 Oct 2020 14:28:44 -0700 Subject: [PATCH 096/115] Update benchmark_utils.py --- scripts/benchmarks/benchmark_utils.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/scripts/benchmarks/benchmark_utils.py b/scripts/benchmarks/benchmark_utils.py index c022caff87..1d40d056b7 100644 --- a/scripts/benchmarks/benchmark_utils.py +++ b/scripts/benchmarks/benchmark_utils.py @@ -792,12 +792,10 @@ def train_step(): raise NotImplementedError timeit.repeat(train_step, repeat=1, number=3) mxnet.npx.waitall() - for ctx in mx_all_contexts: - ctx.empty_cache() + ctx.empty_cache() runtimes = timeit.repeat(train_step, repeat=self._repeat, number=3) mxnet.npx.waitall() - for ctx in mx_all_contexts: - ctx.empty_cache() + ctx.empty_cache() 
mxnet.npx.waitall() # Profile memory if self._use_gpu: @@ -844,8 +842,6 @@ def run(self): infer_time = np.nan infer_memory = np.nan inference_result[model_name][workload] = (infer_time, infer_memory) - for ctx in mx_all_contexts: - ctx.empty_cache() mxnet.npx.waitall() self.save_to_csv(inference_result, self._inference_out_csv_file) if self._profile_train: @@ -858,8 +854,6 @@ def run(self): train_time = np.nan train_memory = np.nan train_result[model_name][workload] = (train_time, train_memory) - for ctx in mx_all_contexts: - ctx.empty_cache() mxnet.npx.waitall() self.save_to_csv(train_result, self._train_out_csv_file) From 6c604ea83c0e79e672173c65379f884de1aa45cb Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Tue, 13 Oct 2020 16:06:51 -0700 Subject: [PATCH 097/115] try to use multi-stage build --- tools/docker/ubuntu18.04-gpu.Dockerfile | 77 +++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 tools/docker/ubuntu18.04-gpu.Dockerfile diff --git a/tools/docker/ubuntu18.04-gpu.Dockerfile b/tools/docker/ubuntu18.04-gpu.Dockerfile new file mode 100644 index 0000000000..4ac5880e63 --- /dev/null +++ b/tools/docker/ubuntu18.04-gpu.Dockerfile @@ -0,0 +1,77 @@ +FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04 as base + +LABEL maintainer="GluonNLP Team" +COPY install /install + +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib" \ + PYTHONIOENCODING=UTF-8 \ + LANG=C.UTF-8 \ + LC_ALL=C.UTF-8 + +ENV WORKDIR=/workspace +ENV SHELL=/bin/bash + +RUN mkdir -p ${WORKDIR} + +RUN bash /install/install_ubuntu18.04_core.sh + +# Install Open MPI +RUN bash /install/install_openmpi.sh +ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH +ENV PATH=/usr/local/openmpi/bin/:/usr/local/bin:/root/.local/bin:$PATH + +# Install LLVM +RUN bash /install/install_llvm.sh + +# Install Python Packages +RUN bash /install/install_python_packages.sh + +# Install TVM +RUN bash /install/install_tvm_gpu.sh + +# Install 
MXNet +RUN python3 -m pip install -U --pre "mxnet-cu102>=2.0.0b20200926" -f https://dist.mxnet.io/python --user + +# Install PyTorch +RUN python3 -m pip install -U torch torchvision --user + +# Install Horovod +RUN bash /install/install_horovod.sh + +# Install Jupyter Lab +RUN bash /install/install_jupyter_lab.sh + +RUN mkdir -p ${WORKDIR}/data +RUN mkdir -p /.init +RUN cd ${WORKDIR} \ + && git clone https://github.com/dmlc/gluon-nlp \ + && cd gluon-nlp \ + && git checkout master \ + && python3 -m pip install -U -e ."[extras]" + +# Stage-CI +FROM base as ci +WORKDIR ${WORKDIR}/gluon-nlp +ADD gluon_nlp_job.sh . +RUN chmod +x gluon_nlp_job.sh + +# Stage-Devel +FROM base as devel +COPY start_jupyter.sh /start_jupyter.sh +COPY devel_entrypoint.sh /devel_entrypoint.sh +RUN chmod +x /devel_entrypoint.sh + +EXPOSE 8888 +EXPOSE 8787 +EXPOSE 8786 + +WORKDIR ${WORKDIR} + +# Add Tini +ARG TINI_VERSION=v0.19.0 +ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini +RUN chmod +x /tini +ENTRYPOINT [ "/tini", "--", "/devel_entrypoint.sh" ] +CMD ["/bin/bash"] From fe4d089b49cb6786d1520bc39b94fc2e3ab6849a Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Tue, 13 Oct 2020 17:56:02 -0700 Subject: [PATCH 098/115] Update benchmark_utils.py --- scripts/benchmarks/benchmark_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/benchmarks/benchmark_utils.py b/scripts/benchmarks/benchmark_utils.py index 1d40d056b7..7475d4b5d5 100644 --- a/scripts/benchmarks/benchmark_utils.py +++ b/scripts/benchmarks/benchmark_utils.py @@ -792,7 +792,6 @@ def train_step(): raise NotImplementedError timeit.repeat(train_step, repeat=1, number=3) mxnet.npx.waitall() - ctx.empty_cache() runtimes = timeit.repeat(train_step, repeat=self._repeat, number=3) mxnet.npx.waitall() ctx.empty_cache() From c381eaeebafb5649470a0be0f91a2a51921b1d92 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Tue, 13 Oct 2020 18:03:18 -0700 Subject: [PATCH 099/115] multi-stage build --- 
tools/docker/README.md | 17 +++--- tools/docker/ubuntu18.04-base-gpu.Dockerfile | 53 ------------------- tools/docker/ubuntu18.04-ci-cpu.Dockerfile | 7 --- tools/docker/ubuntu18.04-ci-gpu.Dockerfile | 7 --- ....Dockerfile => ubuntu18.04-cpu.Dockerfile} | 29 +++++++++- tools/docker/ubuntu18.04-devel-cpu.Dockerfile | 20 ------- tools/docker/ubuntu18.04-devel-gpu.Dockerfile | 20 ------- 7 files changed, 34 insertions(+), 119 deletions(-) delete mode 100644 tools/docker/ubuntu18.04-base-gpu.Dockerfile delete mode 100644 tools/docker/ubuntu18.04-ci-cpu.Dockerfile delete mode 100644 tools/docker/ubuntu18.04-ci-gpu.Dockerfile rename tools/docker/{ubuntu18.04-base-cpu.Dockerfile => ubuntu18.04-cpu.Dockerfile} (68%) delete mode 100644 tools/docker/ubuntu18.04-devel-cpu.Dockerfile delete mode 100644 tools/docker/ubuntu18.04-devel-gpu.Dockerfile diff --git a/tools/docker/README.md b/tools/docker/README.md index 7131db858a..8e664b00df 100644 --- a/tools/docker/README.md +++ b/tools/docker/README.md @@ -6,9 +6,8 @@ and try out to use GluonNLP to solve your problem. | Name | Description | Target User | |------|-------------|-------------| -| `cpu-base-latest` or `gpu-base-latest` | Extends the CUDA image to include the basic functionalities, e.g., GluonNLP package, MXNet, PyTorch, Horovod. You can directly configure other docker images based on this basic docker | The basic docker | -| `cpu-ci-latest` or `gpu-ci-latest` | Image used in GluonNLP CI | GluonNLP Developers | -| `cpu-latest` or `gpu-latest` | Extends the base image to include a development platform powered by Jupyter Lab. Some useful functionalities like Tensorboard are pre-installed. | Users that are willing to solve NLP problems and also do distributed training with Horovod + GluonNLP. | +| `cpu-ci-latest` or `gpu-ci-latest` | Extends the CUDA image to include the basic functionalities, e.g., GluonNLP package, MXNet, PyTorch, Horovod. 
This is the image used in GluonNLP CI | GluonNLP Developers | +| `cpu-latest` or `gpu-latest` | It has more functionality than the CI image, including the a development platform powered by Jupyter Lab. Some useful functionalities like Tensorboard are pre-installed. | Users that are willing to solve NLP problems and also do distributed training with Horovod + GluonNLP. | ## Run Docker @@ -57,14 +56,12 @@ To build a docker image from the dockerfile, you may use the following command: ``` # Build Base Dockers -docker build -f ubuntu18.04-base-cpu.Dockerfile -t gluonai/gluon-nlp:cpu-base-latest . -docker build -f ubuntu18.04-ci-cpu.Dockerfile -t gluonai/gluon-nlp:cpu-ci-latest . -docker build -f ubuntu18.04-devel-cpu.Dockerfile -t gluonai/gluon-nlp:cpu-latest . +docker build -f ubuntu18.04-cpu.Dockerfile --target ci -t gluonai/gluon-nlp:cpu-ci-latest . +docker build -f ubuntu18.04-cpu.Dockerfile --target devel -t gluonai/gluon-nlp:cpu-latest . # Build GPU Dockers -docker build -f ubuntu18.04-base-gpu.Dockerfile -t gluonai/gluon-nlp:gpu-base-latest . -docker build -f ubuntu18.04-ci-gpu.Dockerfile -t gluonai/gluon-nlp:gpu-ci-latest . -docker build -f ubuntu18.04-devel-gpu.Dockerfile -t gluonai/gluon-nlp:gpu-latest . +docker build -f ubuntu18.04-gpu.Dockerfile --target ci -t gluonai/gluon-nlp:gpu-ci-latest . +docker build -f ubuntu18.04-gpu.Dockerfile --target devel -t gluonai/gluon-nlp:gpu-latest . ``` In addition, to build the GPU docker, you will need to install the nvidia-docker2 and edit `/etc/docker/daemon.json` like the following: @@ -90,11 +87,9 @@ path of MXNet by querying the MXNet runtime. ### Developers of GluonNLP You may try to login to your dockerhub account and push the image to dockerhub. 
``` -docker push gluonai/gluon-nlp:cpu-base-latest docker push gluonai/gluon-nlp:cpu-ci-latest docker push gluonai/gluon-nlp:cpu-latest -docker push gluonai/gluon-nlp:gpu-base-latest docker push gluonai/gluon-nlp:gpu-ci-latest docker push gluonai/gluon-nlp:gpu-latest ``` diff --git a/tools/docker/ubuntu18.04-base-gpu.Dockerfile b/tools/docker/ubuntu18.04-base-gpu.Dockerfile deleted file mode 100644 index ae89fb27b9..0000000000 --- a/tools/docker/ubuntu18.04-base-gpu.Dockerfile +++ /dev/null @@ -1,53 +0,0 @@ -FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04 - -LABEL maintainer="GluonNLP Team" -COPY install /install - -ENV PYTHONDONTWRITEBYTECODE=1 \ - PYTHONUNBUFFERED=1 \ - LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib" \ - PYTHONIOENCODING=UTF-8 \ - LANG=C.UTF-8 \ - LC_ALL=C.UTF-8 - -ENV WORKDIR=/workspace -ENV SHELL=/bin/bash - -RUN mkdir -p ${WORKDIR} - -RUN bash /install/install_ubuntu18.04_core.sh - -# Install Open MPI -RUN bash /install/install_openmpi.sh -ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH -ENV PATH=/usr/local/openmpi/bin/:/usr/local/bin:/root/.local/bin:$PATH - -# Install LLVM -RUN bash /install/install_llvm.sh - -# Install Python Packages -RUN bash /install/install_python_packages.sh - -# Install TVM -RUN bash /install/install_tvm_gpu.sh - -# Install MXNet -RUN python3 -m pip install -U --pre "mxnet-cu102>=2.0.0b20200926" -f https://dist.mxnet.io/python --user - -# Install PyTorch -RUN python3 -m pip install -U torch torchvision --user - -# Install Horovod -RUN bash /install/install_horovod.sh - -# Install Jupyter Lab -RUN bash /install/install_jupyter_lab.sh - -RUN mkdir -p ${WORKDIR}/data -RUN mkdir -p /.init -RUN cd ${WORKDIR} \ - && git clone https://github.com/dmlc/gluon-nlp \ - && cd gluon-nlp \ - && git checkout master \ - && python3 -m pip install -U -e ."[extras]" -CMD ["/bin/bash"] diff --git a/tools/docker/ubuntu18.04-ci-cpu.Dockerfile b/tools/docker/ubuntu18.04-ci-cpu.Dockerfile deleted file mode 100644 index 
eef8f74d17..0000000000 --- a/tools/docker/ubuntu18.04-ci-cpu.Dockerfile +++ /dev/null @@ -1,7 +0,0 @@ -FROM gluonai/gluon-nlp:cpu-base-latest - -LABEL maintainer="GluonNLP Team" - -WORKDIR ${WORKDIR}/gluon-nlp -ADD gluon_nlp_job.sh . -RUN chmod +x gluon_nlp_job.sh diff --git a/tools/docker/ubuntu18.04-ci-gpu.Dockerfile b/tools/docker/ubuntu18.04-ci-gpu.Dockerfile deleted file mode 100644 index 9c99d4b4a9..0000000000 --- a/tools/docker/ubuntu18.04-ci-gpu.Dockerfile +++ /dev/null @@ -1,7 +0,0 @@ -FROM gluonai/gluon-nlp:gpu-base-latest - -LABEL maintainer="GluonNLP Team" - -WORKDIR ${WORKDIR}/gluon-nlp -ADD gluon_nlp_job.sh . -RUN chmod +x gluon_nlp_job.sh diff --git a/tools/docker/ubuntu18.04-base-cpu.Dockerfile b/tools/docker/ubuntu18.04-cpu.Dockerfile similarity index 68% rename from tools/docker/ubuntu18.04-base-cpu.Dockerfile rename to tools/docker/ubuntu18.04-cpu.Dockerfile index 8f46f1d81f..a5a2114652 100644 --- a/tools/docker/ubuntu18.04-base-cpu.Dockerfile +++ b/tools/docker/ubuntu18.04-cpu.Dockerfile @@ -1,4 +1,4 @@ -FROM ubuntu:18.04 +FROM ubuntu:18.04 as base LABEL maintainer="GluonNLP Team" COPY install /install @@ -48,3 +48,30 @@ RUN cd ${WORKDIR} \ && cd gluon-nlp \ && git checkout master \ && python3 -m pip install -U -e ."[extras]" + + +# Stage-CI +FROM base as ci +WORKDIR ${WORKDIR}/gluon-nlp +ADD gluon_nlp_job.sh . 
+RUN chmod +x gluon_nlp_job.sh + + +# Stage-Devel +FROM base as devel +COPY start_jupyter.sh /start_jupyter.sh +COPY devel_entrypoint.sh /devel_entrypoint.sh +RUN chmod +x /devel_entrypoint.sh + +EXPOSE 8888 +EXPOSE 8787 +EXPOSE 8786 + +WORKDIR ${WORKDIR} + +# Add Tini +ARG TINI_VERSION=v0.19.0 +ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini +RUN chmod +x /tini +ENTRYPOINT [ "/tini", "--", "/devel_entrypoint.sh" ] +CMD ["/bin/bash"] diff --git a/tools/docker/ubuntu18.04-devel-cpu.Dockerfile b/tools/docker/ubuntu18.04-devel-cpu.Dockerfile deleted file mode 100644 index 47a587ebf0..0000000000 --- a/tools/docker/ubuntu18.04-devel-cpu.Dockerfile +++ /dev/null @@ -1,20 +0,0 @@ -FROM gluonai/gluon-nlp:cpu-base-latest - -LABEL maintainer="GluonNLP Team" - -COPY start_jupyter.sh /start_jupyter.sh -COPY devel_entrypoint.sh /devel_entrypoint.sh -RUN chmod +x /devel_entrypoint.sh - -EXPOSE 8888 -EXPOSE 8787 -EXPOSE 8786 - -WORKDIR ${WORKDIR} - -# Add Tini -ARG TINI_VERSION=v0.19.0 -ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini -RUN chmod +x /tini -ENTRYPOINT [ "/tini", "--", "/devel_entrypoint.sh" ] -CMD ["/bin/bash"] diff --git a/tools/docker/ubuntu18.04-devel-gpu.Dockerfile b/tools/docker/ubuntu18.04-devel-gpu.Dockerfile deleted file mode 100644 index ef7b70de11..0000000000 --- a/tools/docker/ubuntu18.04-devel-gpu.Dockerfile +++ /dev/null @@ -1,20 +0,0 @@ -FROM gluonai/gluon-nlp:gpu-base-latest - -LABEL maintainer="GluonNLP Team" - -COPY start_jupyter.sh /start_jupyter.sh -COPY devel_entrypoint.sh /devel_entrypoint.sh -RUN chmod +x /devel_entrypoint.sh - -EXPOSE 8888 -EXPOSE 8787 -EXPOSE 8786 - -WORKDIR ${WORKDIR} - -# Add Tini -ARG TINI_VERSION=v0.19.0 -ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini -RUN chmod +x /tini -ENTRYPOINT [ "/tini", "--", "/devel_entrypoint.sh" ] -CMD ["/bin/bash"] From eadf26845e42047f693cb607891e51e37a574f1d Mon Sep 17 00:00:00 2001 From: 
Xingjian Shi Date: Tue, 13 Oct 2020 18:10:52 -0700 Subject: [PATCH 100/115] Update README.md --- tools/docker/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/docker/README.md b/tools/docker/README.md index 8e664b00df..576a39f14b 100644 --- a/tools/docker/README.md +++ b/tools/docker/README.md @@ -55,7 +55,7 @@ docker run --gpus all --rm -it --shm-size=2g -v `pwd`:/workspace/data gluonai/gl To build a docker image from the dockerfile, you may use the following command: ``` -# Build Base Dockers +# Build CPU Dockers docker build -f ubuntu18.04-cpu.Dockerfile --target ci -t gluonai/gluon-nlp:cpu-ci-latest . docker build -f ubuntu18.04-cpu.Dockerfile --target devel -t gluonai/gluon-nlp:cpu-latest . From aadd03d85f1cec219ca4383a988392eba591a08d Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Tue, 13 Oct 2020 18:11:35 -0700 Subject: [PATCH 101/115] Update README.md --- tools/docker/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/docker/README.md b/tools/docker/README.md index 576a39f14b..4ea21bddcf 100644 --- a/tools/docker/README.md +++ b/tools/docker/README.md @@ -51,7 +51,7 @@ docker run --gpus all --rm -it --shm-size=2g -v `pwd`:/workspace/data gluonai/gl ``` -## Build your own Docker Image +## Build by yourself To build a docker image from the dockerfile, you may use the following command: ``` From 207d01876664f5a1450ae0aeced18e19b773f1a5 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Tue, 13 Oct 2020 18:33:24 -0700 Subject: [PATCH 102/115] update --- tools/batch/README.md | 2 +- tools/docker/README.md | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/tools/batch/README.md b/tools/batch/README.md index 2206177420..e95d2e4c6f 100644 --- a/tools/batch/README.md +++ b/tools/batch/README.md @@ -15,7 +15,7 @@ python3 submit-job.py \ # Updating the Docker for AWS Batch. -You may refer to the instruction in [Docker](../docker) for more information. 
+You may refer to the instruction in [GluonNLP Docker Support](../docker/README.md#ci-maintainer) for more information. ## Conversion Toolkits Following the instruction of [converting scripts](../../scripts/conversion_toolkits), diff --git a/tools/docker/README.md b/tools/docker/README.md index 4ea21bddcf..e9e200288d 100644 --- a/tools/docker/README.md +++ b/tools/docker/README.md @@ -93,3 +93,25 @@ docker push gluonai/gluon-nlp:cpu-latest docker push gluonai/gluon-nlp:gpu-ci-latest docker push gluonai/gluon-nlp:gpu-latest ``` + +### CI maintainer + +Our current batch job dockers are in 747303060528.dkr.ecr.us-east-1.amazonaws.com/gluon-nlp-1. To +update the docker: +- Update the Dockerfile as described above +- Make sure docker and docker-compose, as well as the docker python package are installed. +- Export the AWS account credentials as environment variables +- CD to the same folder as the Dockerfile and execute the following: + +``` +# this executes a command that logs into ECR. +$(aws ecr get-login --no-include-email --region us-east-1) + +# tags the recent build as gluon-nlp-1:latest, which AWS batch pulls from. 
+docker tag gluonai/gluon-nlp:gpu-ci-latest 747303060528.dkr.ecr.us-east-1.amazonaws.com/gluon-nlp-1:gpu-ci-latest +docker tag gluonai/gluon-nlp:cpu-ci-latest 747303060528.dkr.ecr.us-east-1.amazonaws.com/gluon-nlp-1:cpu-ci-latest + +# pushes the change +docker push 747303060528.dkr.ecr.us-east-1.amazonaws.com/gluon-nlp-1:gpu-ci-latest +docker push 747303060528.dkr.ecr.us-east-1.amazonaws.com/gluon-nlp-1:cpu-ci-latest +``` From 2c9e84eed325e29cf2f0bbedb04c66b19d6cf3a5 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Wed, 14 Oct 2020 12:06:40 -0700 Subject: [PATCH 103/115] Update submit-job.py --- tools/batch/submit-job.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tools/batch/submit-job.py b/tools/batch/submit-job.py index 2a5b3b802d..9bbbc6fe81 100644 --- a/tools/batch/submit-job.py +++ b/tools/batch/submit-job.py @@ -10,39 +10,39 @@ instance_type_info = { 'g4dn.4x': { - 'job_definition': 'gluon-nlp-g4dn_4xlarge:4', + 'job_definition': 'gluon-nlp-g4dn_4xlarge:5', 'job_queue': 'g4dn' }, 'g4dn.8x': { - 'job_definition': 'gluon-nlp-g4dn_8xlarge:4', + 'job_definition': 'gluon-nlp-g4dn_8xlarge:5', 'job_queue': 'g4dn' }, 'g4dn.12x': { - 'job_definition': 'gluon-nlp-g4dn_12xlarge:4', + 'job_definition': 'gluon-nlp-g4dn_12xlarge:5', 'job_queue': 'g4dn-multi-gpu' }, 'p3.2x': { - 'job_definition': 'gluon-nlp-p3_2xlarge:4', + 'job_definition': 'gluon-nlp-p3_2xlarge:5', 'job_queue': 'p3' }, 'p3.8x': { - 'job_definition': 'gluon-nlp-p3_8xlarge:4', + 'job_definition': 'gluon-nlp-p3_8xlarge:5', 'job_queue': 'p3-4gpu' }, 'p3.16x': { - 'job_definition': 'gluon-nlp-p3_16xlarge:4', + 'job_definition': 'gluon-nlp-p3_16xlarge:5', 'job_queue': 'p3-8gpu' }, 'p3dn.24x': { - 'job_definition': 'gluon-nlp-p3_24xlarge:4', + 'job_definition': 'gluon-nlp-p3_24xlarge:5', 'job_queue': 'p3dn-8gpu' }, 'c5n.4x': { - 'job_definition': 'gluon-nlp-c5_4xlarge:2', + 'job_definition': 'gluon-nlp-c5_4xlarge:3', 'job_queue': 'c5n' }, 'c5n.18x': { - 
'job_definition': 'gluon-nlp-c5_18xlarge:2', + 'job_definition': 'gluon-nlp-c5_18xlarge:3', 'job_queue': 'c5n' } } From bee78a62a452d6e55ff7f6c813966b025c87dfc4 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Wed, 14 Oct 2020 12:14:52 -0700 Subject: [PATCH 104/115] fix documentation --- tools/batch/batch_states/compile_notebooks.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tools/batch/batch_states/compile_notebooks.sh b/tools/batch/batch_states/compile_notebooks.sh index d993e2dcea..f5dcce24c7 100755 --- a/tools/batch/batch_states/compile_notebooks.sh +++ b/tools/batch/batch_states/compile_notebooks.sh @@ -6,6 +6,7 @@ runnumber=$2 remote=$3 refs=$4 + compile_notebook () { local MDFILE=$1 DIR=$(dirname $MDFILE) @@ -19,15 +20,15 @@ compile_notebook () { python3 tools/batch/submit-job.py --region us-east-1 \ --wait \ --timeout 3600 \ - --saved-output /gluon-nlp/docs/examples \ + --saved-output ./examples \ --name GluonNLP-Docs-${refs}-${prnumber}-${runnumber} \ --save-path ${runnumber}/gluon-nlp/docs/examples \ - --work-dir . \ + --work-dir docs \ --source-ref ${refs} \ --remote https://github.com/${remote} \ --command "python3 -m pip install --quiet nbformat notedown jupyter_client ipykernel && \ python3 -m nltk.downloader perluniprops nonbreaking_prefixes punkt && \ - python3 /gluon-nlp/docs/md2ipynb.py ${MDFILE}" 2>&1 | tee $LOGNAME >/dev/null + python3 md2ipynb.py ${MDFILE}" 2>&1 | tee $LOGNAME >/dev/null BATCH_EXIT_CODE=$? 
From e9889ec32b288a01fca0fe8236d5bfe5b2e662c7 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Wed, 14 Oct 2020 12:22:48 -0700 Subject: [PATCH 105/115] fix --- tools/batch/batch_states/compile_notebooks.sh | 1 + tools/batch/batch_states/test.sh | 3 ++- tools/batch/batch_states/test_data_pipeline.sh | 3 +++ 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/batch/batch_states/compile_notebooks.sh b/tools/batch/batch_states/compile_notebooks.sh index f5dcce24c7..df02734f75 100755 --- a/tools/batch/batch_states/compile_notebooks.sh +++ b/tools/batch/batch_states/compile_notebooks.sh @@ -1,5 +1,6 @@ #!/bin/bash # Shell script for submitting AWS Batch jobs to compile notebooks +set -ex prnumber=$1 runnumber=$2 diff --git a/tools/batch/batch_states/test.sh b/tools/batch/batch_states/test.sh index c14c16bc67..da0ba1b5dd 100755 --- a/tools/batch/batch_states/test.sh +++ b/tools/batch/batch_states/test.sh @@ -1,9 +1,10 @@ #!/bin/bash # Shell script for installing dependencies and running test on AWS Batch +set -ex echo $PWD python3 -m pip install --user --quiet -upgrade pip python3 -m pip install --user --quiet setuptools pytest pytest-cov contextvars python3 -m pip install --upgrade --quiet cython -python3 -m pytest --cov=/gluon-nlp --cov-config=/gluon-nlp/.coveragerc --cov-report=xml --durations=50 --device="gpu" --runslow /gluon-nlp/tests/ +python3 -m pytest --cov=../../gluon-nlp --cov-config=../../gluon-nlp/.coveragerc --cov-report=xml --durations=50 --device="gpu" --runslow ../../gluon-nlp/tests/ diff --git a/tools/batch/batch_states/test_data_pipeline.sh b/tools/batch/batch_states/test_data_pipeline.sh index 86fafeaf9b..649433868e 100644 --- a/tools/batch/batch_states/test_data_pipeline.sh +++ b/tools/batch/batch_states/test_data_pipeline.sh @@ -1,3 +1,6 @@ +#!/bin/bash +# Shell script for testing the data preprocessing on AWS Batch + set -ex echo $PWD From f52fbf6f4330b27a7108b6f4aea60fd42a7ca84e Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Wed, 
14 Oct 2020 12:49:36 -0700 Subject: [PATCH 106/115] update --- tools/batch/batch_states/test.sh | 7 +++---- tools/batch/batch_states/test_data_pipeline.sh | 2 -- tools/docker/install/install_jupyter_lab.sh | 2 +- tools/docker/install/install_python_packages.sh | 11 +++++++++-- tools/docker/install/install_ubuntu18.04_core.sh | 4 ---- 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/tools/batch/batch_states/test.sh b/tools/batch/batch_states/test.sh index da0ba1b5dd..38362b4d5f 100755 --- a/tools/batch/batch_states/test.sh +++ b/tools/batch/batch_states/test.sh @@ -3,8 +3,7 @@ set -ex echo $PWD +DIRNAME=$(dirname $0) +REPODIR=$DIRNAME/../../.. -python3 -m pip install --user --quiet -upgrade pip -python3 -m pip install --user --quiet setuptools pytest pytest-cov contextvars -python3 -m pip install --upgrade --quiet cython -python3 -m pytest --cov=../../gluon-nlp --cov-config=../../gluon-nlp/.coveragerc --cov-report=xml --durations=50 --device="gpu" --runslow ../../gluon-nlp/tests/ +python3 -m pytest --cov=$REPODIR --cov-config=$REPODIR/.coveragerc --cov-report=xml --durations=50 --device="gpu" --runslow $REPODIR/tests/ diff --git a/tools/batch/batch_states/test_data_pipeline.sh b/tools/batch/batch_states/test_data_pipeline.sh index 649433868e..69a478a582 100644 --- a/tools/batch/batch_states/test_data_pipeline.sh +++ b/tools/batch/batch_states/test_data_pipeline.sh @@ -4,8 +4,6 @@ set -ex echo $PWD -python3 -m pip install --pre "mxnet>=2.0.0b20200802" -f https://dist.mxnet.io/python - for MODEL in spm yttm do bash ../../../scripts/datasets/machine_translation/wmt2014_ende.sh ${MODEL} diff --git a/tools/docker/install/install_jupyter_lab.sh b/tools/docker/install/install_jupyter_lab.sh index 5b6f4b5a84..f6a67826cd 100644 --- a/tools/docker/install/install_jupyter_lab.sh +++ b/tools/docker/install/install_jupyter_lab.sh @@ -7,7 +7,7 @@ curl -sL https://deb.nodesource.com/setup_14.x | bash - \ apt-get update && apt-get install -y --no-install-recommends 
libsndfile1-dev -pip3 install --no-cache --upgrade \ +python3 -m pip install --no-cache --upgrade \ soundfile==0.10.2 \ ipywidgets==7.5.1 \ jupyter_tensorboard==0.2.0 \ diff --git a/tools/docker/install/install_python_packages.sh b/tools/docker/install/install_python_packages.sh index eb9232426e..879089acc4 100644 --- a/tools/docker/install/install_python_packages.sh +++ b/tools/docker/install/install_python_packages.sh @@ -1,12 +1,18 @@ set -euo pipefail -pip3 install --no-cache --upgrade wheel + +python3 -m pip --no-cache-dir install --upgrade \ + pip \ + setuptools \ + wheel # python-dateutil==2.8.0 to satisfy botocore associated with latest awscli -pip3 install --no-cache --upgrade \ +python3 -m pip install --no-cache --upgrade \ numpy==1.19.1 \ pandas==0.25.1 \ + cython \ pytest \ + pytest-cov \ Pillow \ requests==2.22.0 \ scikit-learn==0.20.4 \ @@ -17,5 +23,6 @@ pip3 install --no-cache --upgrade \ PyYAML==5.3.1 \ mpi4py==3.0.2 \ jupyterlab==2.2.4 \ + contextvars \ cmake \ awscli --user diff --git a/tools/docker/install/install_ubuntu18.04_core.sh b/tools/docker/install/install_ubuntu18.04_core.sh index 1a57e4af5a..404e00fb0e 100644 --- a/tools/docker/install/install_ubuntu18.04_core.sh +++ b/tools/docker/install/install_ubuntu18.04_core.sh @@ -34,8 +34,4 @@ apt-get update \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* -python3 -m pip --no-cache-dir install --upgrade \ - pip \ - setuptools - ln -s $(which python3) /usr/local/bin/python From bbe13f7a4eb382313cfaf9dbf9c16b8e4ff2482b Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Wed, 14 Oct 2020 12:50:23 -0700 Subject: [PATCH 107/115] Update test.sh --- tools/batch/batch_states/test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/batch/batch_states/test.sh b/tools/batch/batch_states/test.sh index 38362b4d5f..86960ec514 100755 --- a/tools/batch/batch_states/test.sh +++ b/tools/batch/batch_states/test.sh @@ -4,6 +4,6 @@ set -ex echo $PWD DIRNAME=$(dirname $0) 
-REPODIR=$DIRNAME/../../.. +REPODIR=$DIRNAME/../../../ python3 -m pytest --cov=$REPODIR --cov-config=$REPODIR/.coveragerc --cov-report=xml --durations=50 --device="gpu" --runslow $REPODIR/tests/ From b8046a0ddd00ecdf0f3d869c6030d5fea144db21 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Wed, 14 Oct 2020 12:52:21 -0700 Subject: [PATCH 108/115] Update test.sh --- tools/batch/batch_states/test.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/batch/batch_states/test.sh b/tools/batch/batch_states/test.sh index 86960ec514..4ae82ca1b6 100755 --- a/tools/batch/batch_states/test.sh +++ b/tools/batch/batch_states/test.sh @@ -3,7 +3,7 @@ set -ex echo $PWD -DIRNAME=$(dirname $0) -REPODIR=$DIRNAME/../../../ +SCRIPTPATH="$( cd "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" +REPODIR=${SCRIPTPATH}/../../../ python3 -m pytest --cov=$REPODIR --cov-config=$REPODIR/.coveragerc --cov-report=xml --durations=50 --device="gpu" --runslow $REPODIR/tests/ From ce551c8628bf51a004434e0721a49234ab4829d5 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Wed, 14 Oct 2020 12:54:47 -0700 Subject: [PATCH 109/115] Update test.sh --- tools/batch/batch_states/test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/batch/batch_states/test.sh b/tools/batch/batch_states/test.sh index 4ae82ca1b6..85e8d83e23 100755 --- a/tools/batch/batch_states/test.sh +++ b/tools/batch/batch_states/test.sh @@ -4,6 +4,6 @@ set -ex echo $PWD SCRIPTPATH="$( cd "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" -REPODIR=${SCRIPTPATH}/../../../ +REPODIR=${SCRIPTPATH}/../../../../gluon-nlp python3 -m pytest --cov=$REPODIR --cov-config=$REPODIR/.coveragerc --cov-report=xml --durations=50 --device="gpu" --runslow $REPODIR/tests/ From 3ac97b6db117c67f93bef6e6c8355d219e47e599 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Wed, 14 Oct 2020 12:56:38 -0700 Subject: [PATCH 110/115] Update test.sh --- tools/batch/batch_states/test.sh | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/tools/batch/batch_states/test.sh b/tools/batch/batch_states/test.sh index 85e8d83e23..9f285e2262 100755 --- a/tools/batch/batch_states/test.sh +++ b/tools/batch/batch_states/test.sh @@ -4,6 +4,6 @@ set -ex echo $PWD SCRIPTPATH="$( cd "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" -REPODIR=${SCRIPTPATH}/../../../../gluon-nlp +REPODIR="$( readlink -f ${SCRIPTPATH}/../../../../gluon-nlp)" python3 -m pytest --cov=$REPODIR --cov-config=$REPODIR/.coveragerc --cov-report=xml --durations=50 --device="gpu" --runslow $REPODIR/tests/ From d34d69314f1ec02202a63377ead1f5ff9a01ac1e Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Wed, 14 Oct 2020 13:26:55 -0700 Subject: [PATCH 111/115] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index c795778acc..675147a2c5 100644 --- a/README.md +++ b/README.md @@ -94,11 +94,11 @@ You can use Docker to launch a JupyterLab development environment with GluonNLP ``` # GPU Instance docker pull gluonai/gluon-nlp:gpu-latest -docker run --gpus all --rm -it -p 8888:8888 -p 8787:8787 -p 8786:8786 --shm-size=4g gluonai/gluon-nlp:gpu-latest +docker run --gpus all --rm -it -p 8888:8888 -p 8787:8787 -p 8786:8786 --shm-size=2g gluonai/gluon-nlp:gpu-latest # CPU Instance docker pull gluonai/gluon-nlp:cpu-latest -docker run --gpus all --rm -it -p 8888:8888 -p 8787:8787 -p 8786:8786 --shm-size=4g gluonai/gluon-nlp:cpu-latest +docker run --gpus all --rm -it -p 8888:8888 -p 8787:8787 -p 8786:8786 --shm-size=2g gluonai/gluon-nlp:cpu-latest ``` For more details, you can refer to the guidance in [tools/docker](tools/docker). 
From 899c61391234987fd58e95b55cbe20ea5c147ca1 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Wed, 14 Oct 2020 13:37:39 -0700 Subject: [PATCH 112/115] Update test.sh --- tools/batch/batch_states/test.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/batch/batch_states/test.sh b/tools/batch/batch_states/test.sh index 9f285e2262..86ac4f467a 100755 --- a/tools/batch/batch_states/test.sh +++ b/tools/batch/batch_states/test.sh @@ -6,4 +6,5 @@ echo $PWD SCRIPTPATH="$( cd "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" REPODIR="$( readlink -f ${SCRIPTPATH}/../../../../gluon-nlp)" +python3 -m pip install --upgrade --user pytest pytest-cov contextvars python3 -m pytest --cov=$REPODIR --cov-config=$REPODIR/.coveragerc --cov-report=xml --durations=50 --device="gpu" --runslow $REPODIR/tests/ From c73d3ed97b771c17e93f2f514099fb145a41b9b8 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Wed, 14 Oct 2020 13:48:21 -0700 Subject: [PATCH 113/115] fix --- tools/batch/batch_states/compile_notebooks.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/batch/batch_states/compile_notebooks.sh b/tools/batch/batch_states/compile_notebooks.sh index df02734f75..b82a4b14e0 100755 --- a/tools/batch/batch_states/compile_notebooks.sh +++ b/tools/batch/batch_states/compile_notebooks.sh @@ -21,15 +21,15 @@ compile_notebook () { python3 tools/batch/submit-job.py --region us-east-1 \ --wait \ --timeout 3600 \ - --saved-output ./examples \ + --saved-output docs/examples \ --name GluonNLP-Docs-${refs}-${prnumber}-${runnumber} \ --save-path ${runnumber}/gluon-nlp/docs/examples \ - --work-dir docs \ + --work-dir . 
\ --source-ref ${refs} \ --remote https://github.com/${remote} \ --command "python3 -m pip install --quiet nbformat notedown jupyter_client ipykernel && \ python3 -m nltk.downloader perluniprops nonbreaking_prefixes punkt && \ - python3 md2ipynb.py ${MDFILE}" 2>&1 | tee $LOGNAME >/dev/null + python3 docs/md2ipynb.py ${MDFILE}" 2>&1 | tee $LOGNAME >/dev/null BATCH_EXIT_CODE=$? From 42c8e41876ed937c2388b86a612074309b447a5f Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Wed, 14 Oct 2020 13:53:51 -0700 Subject: [PATCH 114/115] Update README.md --- scripts/machine_translation/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/machine_translation/README.md b/scripts/machine_translation/README.md index 4bafcb920c..4b729cc117 100644 --- a/scripts/machine_translation/README.md +++ b/scripts/machine_translation/README.md @@ -7,8 +7,8 @@ to generate the dataset. Then, run `train_transformer.py` to train the model. In the following, we give the training script for WMT2014 EN-DE task with yttm tokenizer. You may first run the following command in [datasets/machine_translation](../datasets/machine_translation). ```bash -bash ../datasets/machine_translation/wmt2014_ende_base.sh yttm (For transformer_base config) -bash ../datasets/machine_translation/wmt2014_ende.sh yttm (For transformer_wmt_en_de_big config) +bash ../datasets/machine_translation/wmt2014_ende_base.sh yttm # (For transformer_base config) +bash ../datasets/machine_translation/wmt2014_ende.sh yttm # (For transformer_wmt_en_de_big config) ``` Then, you can run the experiment. 
From 3e1a326eef6d70eb0bcfd6a8cc7803676b9a2f95 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Wed, 14 Oct 2020 16:57:00 -0700 Subject: [PATCH 115/115] Update gluon_nlp_job.sh --- tools/docker/gluon_nlp_job.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/docker/gluon_nlp_job.sh b/tools/docker/gluon_nlp_job.sh index f692c3fc16..c2f54e371f 100755 --- a/tools/docker/gluon_nlp_job.sh +++ b/tools/docker/gluon_nlp_job.sh @@ -1,5 +1,4 @@ #!/bin/bash -set -x date echo "Args: $@"