
Commit

Merge branch 'master' into refactor/spawn/setup-environment
awaelchli committed Aug 28, 2022
2 parents a7cff0e + 03f2f32 commit 5ec39ad
Showing 93 changed files with 1,027 additions and 1,015 deletions.
39 changes: 21 additions & 18 deletions .azure/app-cloud-e2e.yml
@@ -24,8 +24,10 @@ variables:

jobs:
- job: App_cloud_e2e_testing
pool:
vmImage: 'ubuntu-latest'
pool: azure-cpus
container:
image: mcr.microsoft.com/playwright/python:v1.25.2-focal
options: "--shm-size=2g"
timeoutInMinutes: "30"
cancelTimeoutInMinutes: "2"
strategy:
@@ -56,6 +58,7 @@ jobs:
clean: all
steps:
- bash: |
whoami
python --version
pip --version
displayName: 'Info'
@@ -80,10 +83,10 @@ jobs:

- bash: |
python -m pip install playwright
python -m playwright install --with-deps
python -m playwright install # --with-deps
displayName: 'Install Playwright system dependencies'
- bash: pip install -e .
- bash: pip install -e . --find-links https://download.pytorch.org/whl/cpu/torch_stable.html
displayName: 'Install lightning'

- bash: |
@@ -110,12 +113,12 @@ jobs:
TEST_APP_NAME: $(name)
HAR_LOCATION: './artifacts/hars'
SLOW_MO: '50'
LAI_USER: $(LAI_USER)
LAI_PASS: $(LAI_PASS)
LIGHTNING_USER_ID: $(LIGHTNING_USER_ID)
LIGHTNING_API_KEY: $(LIGHTNING_API_KEY)
# LAI_USER: $(LAI_USER)
# LAI_PASS: $(LAI_PASS)
LIGHTNING_USER_ID: $(LIGHTNING_USER_ID_PROD)
LIGHTNING_API_KEY: $(LIGHTNING_API_KEY_PROD)
LIGHTNING_USERNAME: $(LIGHTNING_USERNAME)
LIGHTNING_CLOUD_URL: $(LIGHTNING_CLOUD_URL)
LIGHTNING_CLOUD_URL: $(LIGHTNING_CLOUD_URL_PROD)
displayName: 'Run the tests'
- publish: '$(Build.ArtifactStagingDirectory)/videos'
@@ -125,16 +128,16 @@ jobs:
- bash: |
time python -c "from lightning.app import testing; testing.delete_cloud_lightning_apps()"
env:
LAI_USER: $(LAI_USER)
LAI_PASS: $(LAI_PASS)
LIGHTNING_USER_ID: $(LIGHTNING_USER_ID)
LIGHTNING_API_KEY: $(LIGHTNING_API_KEY)
# LAI_USER: $(LAI_USER)
# LAI_PASS: $(LAI_PASS)
LIGHTNING_USER_ID: $(LIGHTNING_USER_ID_PROD)
LIGHTNING_API_KEY: $(LIGHTNING_API_KEY_PROD)
LIGHTNING_USERNAME: $(LIGHTNING_USERNAME)
LIGHTNING_CLOUD_URL: $(LIGHTNING_CLOUD_URL)
LIGHTNING_CLOUD_URL: $(LIGHTNING_CLOUD_URL_PROD)
PR_NUMBER: $(local_id)
TEST_APP_NAME: $(name)
GRID_USER_ID: $(LIGHTNING_USER_ID) # TODO: clarify the meaning
GRID_USER_KEY: $(LIGHTNING_API_KEY) # TODO: clarify the meaning
GRID_URL: $(LIGHTNING_CLOUD_URL)
_GRID_USERNAME: $(LIGHTNING_USERNAME)
# GRID_USER_ID: $(LIGHTNING_USER_ID) # TODO: clarify the meaning
# GRID_USER_KEY: $(LIGHTNING_API_KEY) # TODO: clarify the meaning
# GRID_URL: $(LIGHTNING_CLOUD_URL)
# _GRID_USERNAME: $(LIGHTNING_USERNAME)
displayName: 'Clean Previous Apps'
20 changes: 16 additions & 4 deletions .azure/gpu-tests.yml
@@ -44,7 +44,7 @@ jobs:

- bash: |
CHANGED_FILES=$(git diff --name-status origin/master -- . | awk '{print $2}')
FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*'
FILTER='.azure/gpu_*|src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*'
echo $CHANGED_FILES > changed_files.txt
MATCHES=$(cat changed_files.txt | grep -E $FILTER)
echo $MATCHES
@@ -72,12 +72,15 @@ jobs:
set -e
python -c "fname = 'requirements/pytorch/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'horovod' not in line] ; open(fname, 'w').writelines(lines)"
python -c "fname = 'requirements/pytorch/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'bagua' not in line] ; open(fname, 'w').writelines(lines)"
TORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])")
CUDA_VERSION_MM=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))")
CUDA_VERSION_BAGUA=$(python -c "print([ver for ver in [115,113,111,102] if $CUDA_VERSION_MM >= ver][0])")
python ./requirements/pytorch/adjust-versions.py requirements/pytorch/base.txt ${PYTORCH_VERSION}
python ./requirements/pytorch/adjust-versions.py requirements/pytorch/extra.txt ${PYTORCH_VERSION}
python ./requirements/pytorch/adjust-versions.py requirements/pytorch/examples.txt ${PYTORCH_VERSION}
pip install "bagua-cuda$CUDA_VERSION_BAGUA>=0.9.0"
pip install -e .[strategies]
pip install -U deepspeed # TODO: remove when docker images are upgraded
pip install --requirement requirements/pytorch/devel.txt
pip install -e .[strategies] --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html
pip install --requirement requirements/pytorch/devel.txt --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html
pip list
env:
PACKAGE_NAME: pytorch
@@ -120,6 +123,15 @@ jobs:
timeoutInMinutes: "35"
condition: eq(variables['continue'], '1')

- bash: bash run_standalone_tasks.sh
workingDirectory: tests/tests_pytorch
env:
PL_USE_MOCKED_MNIST: "1"
PL_RUN_CUDA_TESTS: "1"
displayName: 'Testing: PyTorch standalone tasks'
timeoutInMinutes: "10"
condition: eq(variables['continue'], '1')

- bash: |
python -m coverage report
python -m coverage xml
2 changes: 1 addition & 1 deletion .azure/hpu-tests.yml
@@ -45,7 +45,7 @@ jobs:
pip --version
sudo pip uninstall -y lightning pytorch-lightning
pip install fire
python .actions/assistant.py requirements-prune-pkgs torch,torchvision,torchtext
python .actions/assistant.py requirements-prune-pkgs torch,torchvision
pip install ".[extra,test]"
pip list
env:
10 changes: 5 additions & 5 deletions .github/workflows/ci-pytorch-dockers.yml
@@ -75,7 +75,7 @@ jobs:
push: ${{ env.PUSH_TO_HUB }}
tags: pytorchlightning/pytorch_lightning:base-xla-py${{ matrix.python_version }}-torch${{ matrix.xla_version }}
timeout-minutes: 60
- uses: ravsamhq/notify-slack-action@v1
- uses: ravsamhq/notify-slack-action@v2
if: failure() && env.PUSH_TO_HUB == 'true'
with:
status: ${{ job.status }}
@@ -117,7 +117,7 @@ jobs:
push: ${{ env.PUSH_TO_HUB }}
tags: pytorchlightning/pytorch_lightning:base-cuda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}-cuda${{ matrix.cuda_version }}
timeout-minutes: 95
- uses: ravsamhq/notify-slack-action@v1
- uses: ravsamhq/notify-slack-action@v2
if: failure() && env.PUSH_TO_HUB == 'true'
with:
status: ${{ job.status }}
@@ -155,7 +155,7 @@ jobs:
push: ${{ env.PUSH_TO_HUB }}
tags: pytorchlightning/pytorch_lightning:base-conda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}
timeout-minutes: 95
- uses: ravsamhq/notify-slack-action@v1
- uses: ravsamhq/notify-slack-action@v2
if: failure() && env.PUSH_TO_HUB == 'true'
with:
status: ${{ job.status }}
@@ -199,7 +199,7 @@ jobs:
push: ${{ env.PUSH_TO_HUB }}
tags: pytorchlightning/pytorch_lightning:ipu-ci-runner-py${{ matrix.python_version }}
timeout-minutes: 10
- uses: ravsamhq/notify-slack-action@v1
- uses: ravsamhq/notify-slack-action@v2
if: failure() && env.PUSH_TO_HUB == 'true'
with:
status: ${{ job.status }}
@@ -235,7 +235,7 @@ jobs:
push: ${{ env.PUSH_TO_HUB }}
tags: pytorchlightning/pytorch_lightning:hpu-ci-runner-gaudi${{ matrix.gaudi_version }}
timeout-minutes: 10
- uses: ravsamhq/notify-slack-action@v1
- uses: ravsamhq/notify-slack-action@v2
if: failure() && env.PUSH_TO_HUB == 'true'
with:
status: ${{ job.status }}
2 changes: 1 addition & 1 deletion .github/workflows/events-nightly.yml
@@ -48,7 +48,7 @@ jobs:
# report failure to Slack
- name: Slack notification
if: failure() && github.event_name == 'schedule'
uses: ravsamhq/notify-slack-action@v1
uses: ravsamhq/notify-slack-action@v2
with:
status: ${{ job.status }}
token: ${{ secrets.GITHUB_TOKEN }}
20 changes: 11 additions & 9 deletions dockers/base-conda/Dockerfile
@@ -34,6 +34,10 @@ RUN \
# https://github.com/NVIDIA/nvidia-docker/issues/1631
apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \
apt-get update -qq --fix-missing && \
NCCL_VER=$(dpkg -s libnccl2 | grep '^Version:' | awk -F ' ' '{print $2}' | awk -F '-' '{print $1}' | grep -ve '^\s*$') && \
CUDA_VERSION_MM="${CUDA_VERSION%.*}" && \
MAX_ALLOWED_NCCL=2.11.4 && \
TO_INSTALL_NCCL=$(echo -e "$MAX_ALLOWED_NCCL\n$NCCL_VER" | sort -V | head -n1)-1+cuda${CUDA_VERSION_MM} && \
apt-get install -y --no-install-recommends \
build-essential \
cmake \
@@ -42,17 +46,15 @@ RUN \
curl \
unzip \
ca-certificates \
libopenmpi-dev

RUN \
libopenmpi-dev \
libnccl2=$TO_INSTALL_NCCL \
libnccl-dev=$TO_INSTALL_NCCL && \
# Install conda and python.
# NOTE new Conda does not forward the exit status... https://github.com/conda/conda/issues/8385
curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-py38_${CONDA_VERSION}-Linux-x86_64.sh && \
chmod +x ~/miniconda.sh && \
~/miniconda.sh -b && \
rm ~/miniconda.sh

RUN \
rm ~/miniconda.sh && \
# Cleaning
apt-get autoremove -y && \
apt-get clean && \
@@ -76,11 +78,11 @@ RUN \
conda update -n base -c defaults conda && \
CUDA_VERSION_MM=$(python -c "print('.'.join('$CUDA_VERSION'.split('.')[:2]))") && \
conda create -y --name $CONDA_ENV \
python=${PYTHON_VERSION} pytorch=${PYTORCH_VERSION} torchvision torchtext cudatoolkit=${CUDA_VERSION_MM} \
python=${PYTHON_VERSION} pytorch=${PYTORCH_VERSION} torchvision cudatoolkit=${CUDA_VERSION_MM} \
-c nvidia -c pytorch -c pytorch-test && \
conda init bash && \
# NOTE: this requires that the channel is presented in the yaml before packages \
printf "import re;\nfname = 'environment.yml';\nreq = open(fname).read();\nfor n in ['python', 'pytorch', 'torchtext', 'torchvision']:\n req = re.sub(rf'- {n}[>=]+', f'# - {n}=', req);\nopen(fname, 'w').write(req)" > prune.py && \
printf "import re;\nfname = 'environment.yml';\nreq = open(fname).read();\nfor n in ['python', 'pytorch', 'torchvision']:\n req = re.sub(rf'- {n}[>=]+', f'# - {n}=', req);\nopen(fname, 'w').write(req)" > prune.py && \
python prune.py && \
rm prune.py && \
cat environment.yml && \
@@ -100,7 +102,7 @@ RUN \
pip list | grep torch && \
python -c "import torch; print(torch.__version__)" && \
pip install -q fire && \
python assistant.py requirements_prune_pkgs torch,torchvision,torchtext && \
python assistant.py requirements_prune_pkgs torch,torchvision && \
# Install remaining requirements
pip install --no-cache-dir -r requirements/pytorch/base.txt \
-r requirements/pytorch/extra.txt \
23 changes: 7 additions & 16 deletions dockers/base-cuda/Dockerfile
@@ -37,7 +37,11 @@ RUN \
# https://github.com/NVIDIA/nvidia-docker/issues/1631
apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \
apt-get update -qq --fix-missing && \
apt-get install -y --no-install-recommends \
NCCL_VER=$(dpkg -s libnccl2 | grep '^Version:' | awk -F ' ' '{print $2}' | awk -F '-' '{print $1}' | grep -ve '^\s*$') && \
CUDA_VERSION_MM="${CUDA_VERSION%.*}" && \
MAX_ALLOWED_NCCL=2.11.4 && \
TO_INSTALL_NCCL=$(echo -e "$MAX_ALLOWED_NCCL\n$NCCL_VER" | sort -V | head -n1)-1+cuda${CUDA_VERSION_MM} && \
apt-get install -y --no-install-recommends --allow-downgrades --allow-change-held-packages \
build-essential \
pkg-config \
cmake \
@@ -50,19 +54,17 @@ RUN \
libopenmpi-dev \
openmpi-bin \
ssh \
&& \

libnccl2=$TO_INSTALL_NCCL \
libnccl-dev=$TO_INSTALL_NCCL && \
# Install python
add-apt-repository ppa:deadsnakes/ppa && \
apt-get install -y \
python${PYTHON_VERSION} \
python${PYTHON_VERSION}-distutils \
python${PYTHON_VERSION}-dev \
&& \

update-alternatives --install /usr/bin/python${PYTHON_VERSION%%.*} python${PYTHON_VERSION%%.*} /usr/bin/python${PYTHON_VERSION} 1 && \
update-alternatives --install /usr/bin/python python /usr/bin/python${PYTHON_VERSION} 1 && \

# Cleaning
apt-get autoremove -y && \
apt-get clean && \
@@ -78,7 +80,6 @@ RUN \
wget https://bootstrap.pypa.io/get-pip.py --progress=bar:force:noscroll --no-check-certificate && \
python${PYTHON_VERSION} get-pip.py && \
rm get-pip.py && \

pip install -q fire && \
# Disable cache \
CUDA_VERSION_MM=$(python -c "print(''.join('$CUDA_VERSION'.split('.')[:2]))") && \
@@ -91,16 +92,6 @@ RUN \
pip install -r requirements/pytorch/devel.txt --no-cache-dir --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html && \
rm assistant.py

RUN \
apt-get purge -y cmake && \
wget -q https://github.com/Kitware/CMake/releases/download/v3.20.2/cmake-3.20.2.tar.gz && \
tar -zxvf cmake-3.20.2.tar.gz && \
cd cmake-3.20.2 && \
./bootstrap -- -DCMAKE_USE_OPENSSL=OFF && \
make && \
make install && \
cmake --version

ENV \
HOROVOD_CUDA_HOME=$CUDA_TOOLKIT_ROOT_DIR \
HOROVOD_GPU_OPERATIONS=NCCL \
34 changes: 34 additions & 0 deletions docs/source-pytorch/accelerators/hpu_basic.rst
@@ -47,6 +47,40 @@ It uses :class:`~pytorch_lightning.strategies.hpu_parallel.HPUParallelStrategy`
----

Scale-out on Gaudis
-------------------

To train a Lightning model using multiple HPU nodes, set the ``num_nodes`` parameter of the ``Trainer`` class to the number of available nodes.

.. code-block:: python

    trainer = Trainer(accelerator="hpu", devices=8, strategy="hpu_parallel", num_nodes=2)

In addition to this, the following environment variables need to be set to establish communication across nodes. Check out the documentation on :doc:`Cluster Environment <../clouds/cluster>` for more details.

- *MASTER_PORT* - required; must be a free port on the machine with NODE_RANK 0
- *MASTER_ADDR* - required (except for NODE_RANK 0); address of the NODE_RANK 0 node
- *WORLD_SIZE* - required; total number of workers in the cluster
- *NODE_RANK* - required; ID of the node in the cluster

The trainer needs to be instantiated on every node participating in the training.

On Node 1:

.. code-block:: bash

    MASTER_ADDR=<MASTER_ADDR> MASTER_PORT=<MASTER_PORT> NODE_RANK=0 WORLD_SIZE=16
    python -m some_model_trainer.py (--arg1 ... train script args...)

On Node 2:

.. code-block:: bash

    MASTER_ADDR=<MASTER_ADDR> MASTER_PORT=<MASTER_PORT> NODE_RANK=1 WORLD_SIZE=16
    python -m some_model_trainer.py (--arg1 ... train script args...)

----

Select Gaudis automatically
---------------------------

29 changes: 23 additions & 6 deletions docs/source-pytorch/advanced/model_parallel.rst
@@ -212,14 +212,31 @@ PyTorch Fully Sharded Training
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

PyTorch has its own version of `FSDP <https://pytorch.org/docs/stable/fsdp.html>`_, which is upstreamed from their `fairscale <https://fairscale.readthedocs.io/en/latest/api/nn/fsdp.html>`__ project.
It was introduced in their `v1.11.0 release <https://pytorch.org/blog/introducing-pytorch-fully-sharded-data-parallel-api/>`_. The API is pretty similar to that of FairScale.
It was introduced in their `v1.11.0 release <https://pytorch.org/blog/introducing-pytorch-fully-sharded-data-parallel-api/>`_, but it is recommended to use it with PyTorch v1.12 or later, which is what
Lightning supports. The API is very similar to that of FairScale.

.. note::
Currently Fully Sharded Training relies on the user to wrap the model with Fully Sharded within the ``LightningModule``.
This means you must create a single model that is treated as a ``torch.nn.Module`` within the ``LightningModule``.
This is a limitation of Fully Sharded Training that will be resolved in the future.

To activate parameter sharding, you must wrap your model using the ``wrap`` function. Internally in Lightning, we enable a context manager around the ``configure_sharded_model`` function to make sure the ``wrap`` parameters are passed correctly.
Auto Wrapping
"""""""""""""
Model layers should be wrapped with FSDP in a nested way to reduce peak memory and to overlap communication with computation. The
simplest way to do this is auto wrapping, which can serve as a drop-in replacement for DDP without changing the rest of the code: you don't
have to ``wrap`` layers manually as you would with manual wrapping.

.. code-block:: python

    model = BoringModel()
    trainer = Trainer(accelerator="gpu", devices=4, strategy="fsdp_native", precision=16)
    trainer.fit(model)

Read more `here <https://pytorch.org/blog/introducing-pytorch-fully-sharded-data-parallel-api/#auto-wrapping>`__.


Manual Wrapping
"""""""""""""""

Manual wrapping can be useful to explore complex sharding strategies by applying ``wrap`` selectively to some parts of the model. To activate
parameter sharding with manual wrapping, you can wrap your model using the ``wrap`` function. Internally in Lightning, we enable a context manager around the ``configure_sharded_model`` function to make sure the ``wrap`` parameters are passed correctly.

When not using Fully Sharded, these wrap functions are a no-op. This means that once the changes have been made, there is no need to remove them for other strategies.
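
A minimal sketch of manual wrapping inside ``configure_sharded_model`` could look like the following; the module, layer size, and Trainer arguments are illustrative assumptions, not taken from this commit:

.. code-block:: python

    import torch.nn as nn
    from torch.distributed.fsdp.wrap import wrap


    class MyModel(BoringModel):
        def configure_sharded_model(self):
            # Lightning opens the FSDP wrapping context around this hook,
            # so ``wrap`` shards this layer across processes.
            self.linear_layer = wrap(nn.Linear(32, 32))


    model = MyModel()
    trainer = Trainer(accelerator="gpu", devices=4, strategy="fsdp_native", precision=16)
    trainer.fit(model)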

2 changes: 1 addition & 1 deletion docs/source-pytorch/common/checkpointing_intermediate.rst
@@ -120,7 +120,7 @@ What
Where
=====

- It gives you the ability to specify the ``dirpath`` and ``filename`` for your checkpoints. Filename can also be dynamic so you can inject the metrics that are being logged using :meth:`~pytorch_lightning.core.module.LightningModule.log`.
- By default, the ``ModelCheckpoint`` will save files into the ``Trainer.log_dir``. It gives you the ability to specify the ``dirpath`` and ``filename`` for your checkpoints. The filename can also be dynamic so you can inject the metrics that are being logged using :meth:`~pytorch_lightning.core.module.LightningModule.log` (see the sketch below).

|
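
A minimal sketch of pointing ``ModelCheckpoint`` at a custom ``dirpath`` and dynamic ``filename`` might look like this; the directory, metric name, and filename pattern are illustrative assumptions:

.. code-block:: python

    from pytorch_lightning import Trainer
    from pytorch_lightning.callbacks import ModelCheckpoint

    # Save checkpoints under ./checkpoints/ and inject the logged "val_loss" metric
    # into the filename; "val_loss" must be logged with self.log("val_loss", ...)
    # in the LightningModule for the placeholder to resolve.
    checkpoint_callback = ModelCheckpoint(
        dirpath="checkpoints/",
        filename="sample-{epoch:02d}-{val_loss:.2f}",
        monitor="val_loss",
    )

    trainer = Trainer(callbacks=[checkpoint_callback])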