
Commit

Merge remote-tracking branch 'origin/master' into feat/auto_wrap_fsdp_f
rohitgr7 committed Aug 25, 2022
2 parents 4714e8c + 1ae14ca commit 9f6cc32
Showing 3 changed files with 22 additions and 26 deletions.
11 changes: 7 additions & 4 deletions .azure/gpu-tests.yml
@@ -44,7 +44,7 @@ jobs:

- bash: |
CHANGED_FILES=$(git diff --name-status origin/master -- . | awk '{print $2}')
FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*'
FILTER='.azure/gpu_*|src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*'
echo $CHANGED_FILES > changed_files.txt
MATCHES=$(cat changed_files.txt | grep -E $FILTER)
echo $MATCHES
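For context (not part of the diff): the FILTER change above adds the Azure GPU pipeline config itself to the set of paths that trigger this job. A minimal sketch of the matching step, using an assumed list of changed files; note that in an extended regex `_*` matches zero or more underscores, so the `.azure/gpu` prefix alone is enough to match `.azure/gpu-tests.yml`.

    # Sketch only: run the path filter against an assumed changed-file list.
    printf '.azure/gpu-tests.yml\nREADME.md\n' > changed_files.txt
    FILTER='.azure/gpu_*|src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*'
    MATCHES=$(grep -E "$FILTER" changed_files.txt)
    echo "$MATCHES"   # -> .azure/gpu-tests.yml ; a non-empty result presumably lets the job continue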
@@ -72,12 +72,15 @@ jobs:
set -e
python -c "fname = 'requirements/pytorch/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'horovod' not in line] ; open(fname, 'w').writelines(lines)"
python -c "fname = 'requirements/pytorch/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'bagua' not in line] ; open(fname, 'w').writelines(lines)"
TORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])")
CUDA_VERSION_MM=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))")
CUDA_VERSION_BAGUA=$(python -c "print([ver for ver in [115,113,111,102] if $CUDA_VERSION_MM >= ver][0])")
python ./requirements/pytorch/adjust-versions.py requirements/pytorch/base.txt ${PYTORCH_VERSION}
python ./requirements/pytorch/adjust-versions.py requirements/pytorch/extra.txt ${PYTORCH_VERSION}
python ./requirements/pytorch/adjust-versions.py requirements/pytorch/examples.txt ${PYTORCH_VERSION}
pip install "bagua-cuda$CUDA_VERSION_BAGUA>=0.9.0"
pip install -e .[strategies]
pip install -U deepspeed # TODO: remove when docker images are upgraded
pip install --requirement requirements/pytorch/devel.txt
pip install -e .[strategies] --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html
pip install --requirement requirements/pytorch/devel.txt --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html
pip list
env:
PACKAGE_NAME: pytorch
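For context: the Bagua install step above picks the newest pre-built CUDA variant from the list 115/113/111/102 that does not exceed the CUDA version torch was built against. A minimal sketch of that selection, assuming CUDA 11.6 (CUDA_VERSION_MM=116):

    # Sketch only: derive the bagua-cuda wheel suffix from an assumed CUDA version.
    CUDA_VERSION_MM=116   # assumed example; the CI step reads it from torch.version.cuda
    CUDA_VERSION_BAGUA=$(python -c "print([ver for ver in [115,113,111,102] if $CUDA_VERSION_MM >= ver][0])")
    echo "bagua-cuda${CUDA_VERSION_BAGUA}>=0.9.0"   # -> bagua-cuda115>=0.9.0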
14 changes: 8 additions & 6 deletions dockers/base-conda/Dockerfile
@@ -34,6 +34,10 @@ RUN \
# https://github.com/NVIDIA/nvidia-docker/issues/1631
apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \
apt-get update -qq --fix-missing && \
NCCL_VER=$(dpkg -s libnccl2 | grep '^Version:' | awk -F ' ' '{print $2}' | awk -F '-' '{print $1}' | grep -ve '^\s*$') && \
CUDA_VERSION_MM="${CUDA_VERSION%.*}" && \
MAX_ALLOWED_NCCL=2.11.4 && \
TO_INSTALL_NCCL=$(echo -e "$MAX_ALLOWED_NCCL\n$NCCL_VER" | sort -V | head -n1)-1+cuda${CUDA_VERSION_MM} && \
apt-get install -y --no-install-recommends \
build-essential \
cmake \
@@ -42,17 +46,15 @@
curl \
unzip \
ca-certificates \
libopenmpi-dev

RUN \
libopenmpi-dev \
libnccl2=$TO_INSTALL_NCCL \
libnccl-dev=$TO_INSTALL_NCCL && \
# Install conda and python.
# NOTE new Conda does not forward the exit status... https://github.com/conda/conda/issues/8385
curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-py38_${CONDA_VERSION}-Linux-x86_64.sh && \
chmod +x ~/miniconda.sh && \
~/miniconda.sh -b && \
rm ~/miniconda.sh

RUN \
rm ~/miniconda.sh && \
# Cleaning
apt-get autoremove -y && \
apt-get clean && \
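For context: the NCCL pin added to both Dockerfiles installs the smaller of the already-available libnccl2 version and MAX_ALLOWED_NCCL (2.11.4), with -1+cuda<major.minor> appended to form the apt version string. A minimal sketch with assumed version values:

    # Sketch only: build the libnccl2/libnccl-dev pin string from assumed inputs.
    NCCL_VER=2.13.4            # assumed output of the dpkg -s libnccl2 query
    MAX_ALLOWED_NCCL=2.11.4
    CUDA_VERSION_MM=11.3       # assumed value of ${CUDA_VERSION%.*}
    TO_INSTALL_NCCL=$(echo -e "$MAX_ALLOWED_NCCL\n$NCCL_VER" | sort -V | head -n1)-1+cuda${CUDA_VERSION_MM}
    echo "$TO_INSTALL_NCCL"    # -> 2.11.4-1+cuda11.3 (sort -V | head -n1 keeps the lower version)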
23 changes: 7 additions & 16 deletions dockers/base-cuda/Dockerfile
@@ -37,7 +37,11 @@ RUN \
# https://github.com/NVIDIA/nvidia-docker/issues/1631
apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \
apt-get update -qq --fix-missing && \
apt-get install -y --no-install-recommends \
NCCL_VER=$(dpkg -s libnccl2 | grep '^Version:' | awk -F ' ' '{print $2}' | awk -F '-' '{print $1}' | grep -ve '^\s*$') && \
CUDA_VERSION_MM="${CUDA_VERSION%.*}" && \
MAX_ALLOWED_NCCL=2.11.4 && \
TO_INSTALL_NCCL=$(echo -e "$MAX_ALLOWED_NCCL\n$NCCL_VER" | sort -V | head -n1)-1+cuda${CUDA_VERSION_MM} && \
apt-get install -y --no-install-recommends --allow-downgrades --allow-change-held-packages \
build-essential \
pkg-config \
cmake \
@@ -50,19 +54,17 @@
libopenmpi-dev \
openmpi-bin \
ssh \
&& \

libnccl2=$TO_INSTALL_NCCL \
libnccl-dev=$TO_INSTALL_NCCL && \
# Install python
add-apt-repository ppa:deadsnakes/ppa && \
apt-get install -y \
python${PYTHON_VERSION} \
python${PYTHON_VERSION}-distutils \
python${PYTHON_VERSION}-dev \
&& \

update-alternatives --install /usr/bin/python${PYTHON_VERSION%%.*} python${PYTHON_VERSION%%.*} /usr/bin/python${PYTHON_VERSION} 1 && \
update-alternatives --install /usr/bin/python python /usr/bin/python${PYTHON_VERSION} 1 && \

# Cleaning
apt-get autoremove -y && \
apt-get clean && \
@@ -78,7 +80,6 @@ RUN \
wget https://bootstrap.pypa.io/get-pip.py --progress=bar:force:noscroll --no-check-certificate && \
python${PYTHON_VERSION} get-pip.py && \
rm get-pip.py && \

pip install -q fire && \
# Disable cache \
CUDA_VERSION_MM=$(python -c "print(''.join('$CUDA_VERSION'.split('.')[:2]))") && \
@@ -91,16 +92,6 @@ RUN \
pip install -r requirements/pytorch/devel.txt --no-cache-dir --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html && \
rm assistant.py

RUN \
apt-get purge -y cmake && \
wget -q https://github.com/Kitware/CMake/releases/download/v3.20.2/cmake-3.20.2.tar.gz && \
tar -zxvf cmake-3.20.2.tar.gz && \
cd cmake-3.20.2 && \
./bootstrap -- -DCMAKE_USE_OPENSSL=OFF && \
make && \
make install && \
cmake --version

ENV \
HOROVOD_CUDA_HOME=$CUDA_TOOLKIT_ROOT_DIR \
HOROVOD_GPU_OPERATIONS=NCCL \
