
Commit

Merge branch 'master' into refactor/spawn/setup-environment
awaelchli committed Aug 28, 2022
2 parents a7cff0e + 03f2f32 commit 5ec39ad
Showing 93 changed files with 1,027 additions and 1,015 deletions.
39 changes: 21 additions & 18 deletions .azure/app-cloud-e2e.yml
@@ -24,8 +24,10 @@ variables:

jobs:
- job: App_cloud_e2e_testing
pool:
vmImage: 'ubuntu-latest'
pool: azure-cpus
container:
image: mcr.microsoft.com/playwright/python:v1.25.2-focal
options: "--shm-size=2g"
timeoutInMinutes: "30"
cancelTimeoutInMinutes: "2"
strategy:
@@ -56,6 +58,7 @@ jobs:
clean: all
steps:
- bash: |
whoami
python --version
pip --version
displayName: 'Info'
@@ -80,10 +83,10 @@ jobs:

- bash: |
python -m pip install playwright
python -m playwright install --with-deps
python -m playwright install # --with-deps
displayName: 'Install Playwright system dependencies'
- bash: pip install -e .
- bash: pip install -e . --find-links https://download.pytorch.org/whl/cpu/torch_stable.html
displayName: 'Install lightning'

- bash: |
@@ -110,12 +113,12 @@ jobs:
TEST_APP_NAME: $(name)
HAR_LOCATION: './artifacts/hars'
SLOW_MO: '50'
LAI_USER: $(LAI_USER)
LAI_PASS: $(LAI_PASS)
LIGHTNING_USER_ID: $(LIGHTNING_USER_ID)
LIGHTNING_API_KEY: $(LIGHTNING_API_KEY)
# LAI_USER: $(LAI_USER)
# LAI_PASS: $(LAI_PASS)
LIGHTNING_USER_ID: $(LIGHTNING_USER_ID_PROD)
LIGHTNING_API_KEY: $(LIGHTNING_API_KEY_PROD)
LIGHTNING_USERNAME: $(LIGHTNING_USERNAME)
LIGHTNING_CLOUD_URL: $(LIGHTNING_CLOUD_URL)
LIGHTNING_CLOUD_URL: $(LIGHTNING_CLOUD_URL_PROD)
displayName: 'Run the tests'
- publish: '$(Build.ArtifactStagingDirectory)/videos'
@@ -125,16 +128,16 @@ jobs:
- bash: |
time python -c "from lightning.app import testing; testing.delete_cloud_lightning_apps()"
env:
LAI_USER: $(LAI_USER)
LAI_PASS: $(LAI_PASS)
LIGHTNING_USER_ID: $(LIGHTNING_USER_ID)
LIGHTNING_API_KEY: $(LIGHTNING_API_KEY)
# LAI_USER: $(LAI_USER)
# LAI_PASS: $(LAI_PASS)
LIGHTNING_USER_ID: $(LIGHTNING_USER_ID_PROD)
LIGHTNING_API_KEY: $(LIGHTNING_API_KEY_PROD)
LIGHTNING_USERNAME: $(LIGHTNING_USERNAME)
LIGHTNING_CLOUD_URL: $(LIGHTNING_CLOUD_URL)
LIGHTNING_CLOUD_URL: $(LIGHTNING_CLOUD_URL_PROD)
PR_NUMBER: $(local_id)
TEST_APP_NAME: $(name)
GRID_USER_ID: $(LIGHTNING_USER_ID) # TODO: clarify the meaning
GRID_USER_KEY: $(LIGHTNING_API_KEY) # TODO: clarify the meaning
GRID_URL: $(LIGHTNING_CLOUD_URL)
_GRID_USERNAME: $(LIGHTNING_USERNAME)
# GRID_USER_ID: $(LIGHTNING_USER_ID) # TODO: clarify the meaning
# GRID_USER_KEY: $(LIGHTNING_API_KEY) # TODO: clarify the meaning
# GRID_URL: $(LIGHTNING_CLOUD_URL)
# _GRID_USERNAME: $(LIGHTNING_USERNAME)
displayName: 'Clean Previous Apps'
20 changes: 16 additions & 4 deletions .azure/gpu-tests.yml
@@ -44,7 +44,7 @@ jobs:

- bash: |
CHANGED_FILES=$(git diff --name-status origin/master -- . | awk '{print $2}')
FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*'
FILTER='.azure/gpu_*|src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*'
echo $CHANGED_FILES > changed_files.txt
MATCHES=$(cat changed_files.txt | grep -E $FILTER)
echo $MATCHES
@@ -72,12 +72,15 @@ jobs:
set -e
python -c "fname = 'requirements/pytorch/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'horovod' not in line] ; open(fname, 'w').writelines(lines)"
python -c "fname = 'requirements/pytorch/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'bagua' not in line] ; open(fname, 'w').writelines(lines)"
TORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])")
CUDA_VERSION_MM=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))")
CUDA_VERSION_BAGUA=$(python -c "print([ver for ver in [115,113,111,102] if $CUDA_VERSION_MM >= ver][0])")
python ./requirements/pytorch/adjust-versions.py requirements/pytorch/base.txt ${PYTORCH_VERSION}
python ./requirements/pytorch/adjust-versions.py requirements/pytorch/extra.txt ${PYTORCH_VERSION}
python ./requirements/pytorch/adjust-versions.py requirements/pytorch/examples.txt ${PYTORCH_VERSION}
pip install "bagua-cuda$CUDA_VERSION_BAGUA>=0.9.0"
pip install -e .[strategies]
pip install -U deepspeed # TODO: remove when docker images are upgraded
pip install --requirement requirements/pytorch/devel.txt
pip install -e .[strategies] --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html
pip install --requirement requirements/pytorch/devel.txt --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html
pip list
env:
PACKAGE_NAME: pytorch
@@ -120,6 +123,15 @@ jobs:
timeoutInMinutes: "35"
condition: eq(variables['continue'], '1')

- bash: bash run_standalone_tasks.sh
workingDirectory: tests/tests_pytorch
env:
PL_USE_MOCKED_MNIST: "1"
PL_RUN_CUDA_TESTS: "1"
displayName: 'Testing: PyTorch standalone tasks'
timeoutInMinutes: "10"
condition: eq(variables['continue'], '1')

- bash: |
python -m coverage report
python -m coverage xml
2 changes: 1 addition & 1 deletion .azure/hpu-tests.yml
@@ -45,7 +45,7 @@ jobs:
pip --version
sudo pip uninstall -y lightning pytorch-lightning
pip install fire
python .actions/assistant.py requirements-prune-pkgs torch,torchvision,torchtext
python .actions/assistant.py requirements-prune-pkgs torch,torchvision
pip install ".[extra,test]"
pip list
env:
10 changes: 5 additions & 5 deletions .github/workflows/ci-pytorch-dockers.yml
@@ -75,7 +75,7 @@ jobs:
push: ${{ env.PUSH_TO_HUB }}
tags: pytorchlightning/pytorch_lightning:base-xla-py${{ matrix.python_version }}-torch${{ matrix.xla_version }}
timeout-minutes: 60
- uses: ravsamhq/notify-slack-action@v1
- uses: ravsamhq/notify-slack-action@v2
if: failure() && env.PUSH_TO_HUB == 'true'
with:
status: ${{ job.status }}
@@ -117,7 +117,7 @@ jobs:
push: ${{ env.PUSH_TO_HUB }}
tags: pytorchlightning/pytorch_lightning:base-cuda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}-cuda${{ matrix.cuda_version }}
timeout-minutes: 95
- uses: ravsamhq/notify-slack-action@v1
- uses: ravsamhq/notify-slack-action@v2
if: failure() && env.PUSH_TO_HUB == 'true'
with:
status: ${{ job.status }}
@@ -155,7 +155,7 @@ jobs:
push: ${{ env.PUSH_TO_HUB }}
tags: pytorchlightning/pytorch_lightning:base-conda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}
timeout-minutes: 95
- uses: ravsamhq/notify-slack-action@v1
- uses: ravsamhq/notify-slack-action@v2
if: failure() && env.PUSH_TO_HUB == 'true'
with:
status: ${{ job.status }}
@@ -199,7 +199,7 @@ jobs:
push: ${{ env.PUSH_TO_HUB }}
tags: pytorchlightning/pytorch_lightning:ipu-ci-runner-py${{ matrix.python_version }}
timeout-minutes: 10
- uses: ravsamhq/notify-slack-action@v1
- uses: ravsamhq/notify-slack-action@v2
if: failure() && env.PUSH_TO_HUB == 'true'
with:
status: ${{ job.status }}
@@ -235,7 +235,7 @@ jobs:
push: ${{ env.PUSH_TO_HUB }}
tags: pytorchlightning/pytorch_lightning:hpu-ci-runner-gaudi${{ matrix.gaudi_version }}
timeout-minutes: 10
- uses: ravsamhq/notify-slack-action@v1
- uses: ravsamhq/notify-slack-action@v2
if: failure() && env.PUSH_TO_HUB == 'true'
with:
status: ${{ job.status }}
2 changes: 1 addition & 1 deletion .github/workflows/events-nightly.yml
@@ -48,7 +48,7 @@ jobs:
# report failure to Slack
- name: Slack notification
if: failure() && github.event_name == 'schedule'
uses: ravsamhq/notify-slack-action@v1
uses: ravsamhq/notify-slack-action@v2
with:
status: ${{ job.status }}
token: ${{ secrets.GITHUB_TOKEN }}
20 changes: 11 additions & 9 deletions dockers/base-conda/Dockerfile
@@ -34,6 +34,10 @@ RUN \
# https://github.com/NVIDIA/nvidia-docker/issues/1631
apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \
apt-get update -qq --fix-missing && \
NCCL_VER=$(dpkg -s libnccl2 | grep '^Version:' | awk -F ' ' '{print $2}' | awk -F '-' '{print $1}' | grep -ve '^\s*$') && \
CUDA_VERSION_MM="${CUDA_VERSION%.*}" && \
MAX_ALLOWED_NCCL=2.11.4 && \
TO_INSTALL_NCCL=$(echo -e "$MAX_ALLOWED_NCCL\n$NCCL_VER" | sort -V | head -n1)-1+cuda${CUDA_VERSION_MM} && \
apt-get install -y --no-install-recommends \
build-essential \
cmake \
@@ -42,17 +46,15 @@ RUN \
curl \
unzip \
ca-certificates \
libopenmpi-dev

RUN \
libopenmpi-dev \
libnccl2=$TO_INSTALL_NCCL \
libnccl-dev=$TO_INSTALL_NCCL && \
# Install conda and python.
# NOTE new Conda does not forward the exit status... https://github.com/conda/conda/issues/8385
curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-py38_${CONDA_VERSION}-Linux-x86_64.sh && \
chmod +x ~/miniconda.sh && \
~/miniconda.sh -b && \
rm ~/miniconda.sh

RUN \
rm ~/miniconda.sh && \
# Cleaning
apt-get autoremove -y && \
apt-get clean && \
@@ -76,11 +78,11 @@ RUN \
conda update -n base -c defaults conda && \
CUDA_VERSION_MM=$(python -c "print('.'.join('$CUDA_VERSION'.split('.')[:2]))") && \
conda create -y --name $CONDA_ENV \
python=${PYTHON_VERSION} pytorch=${PYTORCH_VERSION} torchvision torchtext cudatoolkit=${CUDA_VERSION_MM} \
python=${PYTHON_VERSION} pytorch=${PYTORCH_VERSION} torchvision cudatoolkit=${CUDA_VERSION_MM} \
-c nvidia -c pytorch -c pytorch-test && \
conda init bash && \
# NOTE: this requires that the channel is presented in the yaml before packages \
printf "import re;\nfname = 'environment.yml';\nreq = open(fname).read();\nfor n in ['python', 'pytorch', 'torchtext', 'torchvision']:\n req = re.sub(rf'- {n}[>=]+', f'# - {n}=', req);\nopen(fname, 'w').write(req)" > prune.py && \
printf "import re;\nfname = 'environment.yml';\nreq = open(fname).read();\nfor n in ['python', 'pytorch', 'torchvision']:\n req = re.sub(rf'- {n}[>=]+', f'# - {n}=', req);\nopen(fname, 'w').write(req)" > prune.py && \
python prune.py && \
rm prune.py && \
cat environment.yml && \
@@ -100,7 +102,7 @@ RUN \
pip list | grep torch && \
python -c "import torch; print(torch.__version__)" && \
pip install -q fire && \
python assistant.py requirements_prune_pkgs torch,torchvision,torchtext && \
python assistant.py requirements_prune_pkgs torch,torchvision && \
# Install remaining requirements
pip install --no-cache-dir -r requirements/pytorch/base.txt \
-r requirements/pytorch/extra.txt \
23 changes: 7 additions & 16 deletions dockers/base-cuda/Dockerfile
@@ -37,7 +37,11 @@ RUN \
# https://github.com/NVIDIA/nvidia-docker/issues/1631
apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \
apt-get update -qq --fix-missing && \
apt-get install -y --no-install-recommends \
NCCL_VER=$(dpkg -s libnccl2 | grep '^Version:' | awk -F ' ' '{print $2}' | awk -F '-' '{print $1}' | grep -ve '^\s*$') && \
CUDA_VERSION_MM="${CUDA_VERSION%.*}" && \
MAX_ALLOWED_NCCL=2.11.4 && \
TO_INSTALL_NCCL=$(echo -e "$MAX_ALLOWED_NCCL\n$NCCL_VER" | sort -V | head -n1)-1+cuda${CUDA_VERSION_MM} && \
apt-get install -y --no-install-recommends --allow-downgrades --allow-change-held-packages \
build-essential \
pkg-config \
cmake \
@@ -50,19 +54,17 @@ RUN \
libopenmpi-dev \
openmpi-bin \
ssh \
&& \

libnccl2=$TO_INSTALL_NCCL \
libnccl-dev=$TO_INSTALL_NCCL && \
# Install python
add-apt-repository ppa:deadsnakes/ppa && \
apt-get install -y \
python${PYTHON_VERSION} \
python${PYTHON_VERSION}-distutils \
python${PYTHON_VERSION}-dev \
&& \

update-alternatives --install /usr/bin/python${PYTHON_VERSION%%.*} python${PYTHON_VERSION%%.*} /usr/bin/python${PYTHON_VERSION} 1 && \
update-alternatives --install /usr/bin/python python /usr/bin/python${PYTHON_VERSION} 1 && \

# Cleaning
apt-get autoremove -y && \
apt-get clean && \
@@ -78,7 +80,6 @@ RUN \
wget https://bootstrap.pypa.io/get-pip.py --progress=bar:force:noscroll --no-check-certificate && \
python${PYTHON_VERSION} get-pip.py && \
rm get-pip.py && \

pip install -q fire && \
# Disable cache \
CUDA_VERSION_MM=$(python -c "print(''.join('$CUDA_VERSION'.split('.')[:2]))") && \
@@ -91,16 +92,6 @@ RUN \
pip install -r requirements/pytorch/devel.txt --no-cache-dir --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html && \
rm assistant.py

RUN \
apt-get purge -y cmake && \
wget -q https://github.com/Kitware/CMake/releases/download/v3.20.2/cmake-3.20.2.tar.gz && \
tar -zxvf cmake-3.20.2.tar.gz && \
cd cmake-3.20.2 && \
./bootstrap -- -DCMAKE_USE_OPENSSL=OFF && \
make && \
make install && \
cmake --version

ENV \
HOROVOD_CUDA_HOME=$CUDA_TOOLKIT_ROOT_DIR \
HOROVOD_GPU_OPERATIONS=NCCL \
34 changes: 34 additions & 0 deletions docs/source-pytorch/accelerators/hpu_basic.rst
@@ -47,6 +47,40 @@ It uses :class:`~pytorch_lightning.strategies.hpu_parallel.HPUParallelStrategy`
----

Scale-out on Gaudis
-------------------

To train a Lightning model using multiple HPU nodes, set the ``num_nodes`` parameter of the ``Trainer`` class to the number of available nodes.

.. code-block:: python

    trainer = Trainer(accelerator="hpu", devices=8, strategy="hpu_parallel", num_nodes=2)

In addition to this, the following environment variables need to be set to establish communication across nodes. Check out the documentation on :doc:`Cluster Environment <../clouds/cluster>` for more details.

- *MASTER_PORT* - required; must be a free port on the machine with NODE_RANK 0
- *MASTER_ADDR* - required (except for NODE_RANK 0); address of the NODE_RANK 0 node
- *WORLD_SIZE* - required; total number of workers in the cluster
- *NODE_RANK* - required; ID of the node in the cluster

The trainer needs to be instantiated on every node participating in the training.

On Node 1:

.. code-block:: bash

    MASTER_ADDR=<MASTER_ADDR> MASTER_PORT=<MASTER_PORT> NODE_RANK=0 WORLD_SIZE=16
    python -m some_model_trainer.py (--arg1 ... train script args...)

On Node 2:

.. code-block:: bash

    MASTER_ADDR=<MASTER_ADDR> MASTER_PORT=<MASTER_PORT> NODE_RANK=1 WORLD_SIZE=16
    python -m some_model_trainer.py (--arg1 ... train script args...)

----

Select Gaudis automatically
---------------------------

29 changes: 23 additions & 6 deletions docs/source-pytorch/advanced/model_parallel.rst
@@ -212,14 +212,31 @@ PyTorch Fully Sharded Training
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

PyTorch has its own version of `FSDP <https://pytorch.org/docs/stable/fsdp.html>`_, which is upstreamed from their `fairscale <https://fairscale.readthedocs.io/en/latest/api/nn/fsdp.html>`__ project.
It was introduced in their `v1.11.0 release <https://pytorch.org/blog/introducing-pytorch-fully-sharded-data-parallel-api/>`_. The API is pretty similar to that of FairScale.
It was introduced in their `v1.11.0 release <https://pytorch.org/blog/introducing-pytorch-fully-sharded-data-parallel-api/>`_, but it is recommended to use it with PyTorch v1.12 or later, which is what
Lightning supports. The API is very similar to that of FairScale.

.. note::
Currently Fully Sharded Training relies on the user to wrap the model with Fully Sharded within the ``LightningModule``.
This means you must create a single model that is treated as a ``torch.nn.Module`` within the ``LightningModule``.
This is a limitation of Fully Sharded Training that will be resolved in the future.

To activate parameter sharding, you must wrap your model using the ``wrap`` function. Internally in Lightning, we enable a context manager around the ``configure_sharded_model`` function to make sure the ``wrap`` parameters are passed correctly.
Auto Wrapping
"""""""""""""
Model layers should be wrapped with FSDP in a nested way to reduce peak memory and to overlap communication with computation. The
simplest way to do this is auto wrapping, which can serve as a drop-in replacement for DDP without changing the rest of the code: you don't
have to ``wrap`` layers manually as you would with manual wrapping.

.. code-block:: python

    model = BoringModel()
    trainer = Trainer(accelerator="gpu", devices=4, strategy="fsdp_native", precision=16)
    trainer.fit(model)

Read more `here <https://pytorch.org/blog/introducing-pytorch-fully-sharded-data-parallel-api/#auto-wrapping>`__.


Manual Wrapping
"""""""""""""""

Manual wrapping can be useful to explore complex sharding strategies by applying ``wrap`` selectively to some parts of the model. To activate
parameter sharding with manual wrapping, you can wrap your model using the ``wrap`` function. Internally in Lightning, we enable a context manager around the ``configure_sharded_model`` function to make sure the ``wrap`` parameters are passed correctly.

When not using Fully Sharded, these wrap functions are a no-op. This means that once the changes have been made, there is no need to remove them for other strategies.
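
A minimal sketch of manual wrapping inside ``configure_sharded_model`` could look like the following; the module, layer size, and Trainer arguments are illustrative assumptions, not taken from this commit:

.. code-block:: python

    import torch.nn as nn
    from torch.distributed.fsdp.wrap import wrap


    class MyModel(BoringModel):
        def configure_sharded_model(self):
            # Lightning opens the FSDP wrapping context around this hook,
            # so ``wrap`` shards this layer across processes.
            self.linear_layer = wrap(nn.Linear(32, 32))


    model = MyModel()
    trainer = Trainer(accelerator="gpu", devices=4, strategy="fsdp_native", precision=16)
    trainer.fit(model)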

2 changes: 1 addition & 1 deletion docs/source-pytorch/common/checkpointing_intermediate.rst
@@ -120,7 +120,7 @@ What
Where
=====

- It gives you the ability to specify the ``dirpath`` and ``filename`` for your checkpoints. Filename can also be dynamic so you can inject the metrics that are being logged using :meth:`~pytorch_lightning.core.module.LightningModule.log`.
- By default, the ``ModelCheckpoint`` will save files into the ``Trainer.log_dir``. It gives you the ability to specify the ``dirpath`` and ``filename`` for your checkpoints. The filename can also be dynamic so you can inject the metrics that are being logged using :meth:`~pytorch_lightning.core.module.LightningModule.log` (see the sketch below).

|
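
A minimal sketch of pointing ``ModelCheckpoint`` at a custom ``dirpath`` and dynamic ``filename`` might look like this; the directory, metric name, and filename pattern are illustrative assumptions:

.. code-block:: python

    from pytorch_lightning import Trainer
    from pytorch_lightning.callbacks import ModelCheckpoint

    # Save checkpoints under ./checkpoints/ and inject the logged "val_loss" metric
    # into the filename; "val_loss" must be logged with self.log("val_loss", ...)
    # in the LightningModule for the placeholder to resolve.
    checkpoint_callback = ModelCheckpoint(
        dirpath="checkpoints/",
        filename="sample-{epoch:02d}-{val_loss:.2f}",
        monitor="val_loss",
    )

    trainer = Trainer(callbacks=[checkpoint_callback])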