# Lightning v2.3: Tensor Parallelism and 2D Parallelism #1714
# Workflow file for this run
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Workflow display name.
name: Docker builds

# Triggers:
#   - pushes to `master` and `release/*`
#   - pull requests targeting those branches, limited to changes under the listed paths
#   - a daily schedule
#   - published releases
#   - manual dispatch
on:
  push:
    branches: [master, "release/*"]
  pull_request:
    branches: [master, "release/*"]
    # `ready_for_review` is listed because the jobs skip draft pull requests
    types: [opened, reopened, ready_for_review, synchronize]
    paths:
      - ".actions/*"
      - ".github/workflows/docker-build.yml"
      - "dockers/**"
      - "requirements/*.txt"
      - "requirements/pytorch/**"
      - "requirements/fabric/**"
      - "setup.py"
      # negated patterns: documentation-only changes do not trigger a run
      - "!requirements/*/docs.txt"
      - "!*.md"
      - "!**/*.md"
  schedule:
    - cron: "0 0 * * *"  # at the end of every day
  release:
    types: [published]
  workflow_dispatch: {}
# Runs are grouped per workflow / ref / head ref / event name.
# Only pull_request runs cancel an in-progress run of the same group; for every other event
# `cancel-in-progress` evaluates to false.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }}-${{ github.event_name }}
  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
# Workflow-wide flags; steps compare them against the string 'true' to gate registry login and image pushes.
env:
  # true for scheduled runs and manual dispatches
  PUSH_NIGHTLY: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
  # true when the ref is a tag or the run was triggered by a release event
  PUSH_RELEASE: ${{ startsWith(github.ref, 'refs/tags/') || github.event_name == 'release' }}
jobs:
  # Builds the release image from dockers/release/Dockerfile for every matrix entry.
  # The image is pushed only when PUSH_RELEASE is 'true'.
  build-pl:
    # the images generated by this job are not used anywhere in this repository. they are just meant to be available
    # for users
    if: github.event.pull_request.draft == false
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        include:
          # We only release one docker image per PyTorch version.
          # Make sure the matrix here matches the one below.
          # NOTE(review): `build-cuda` below additionally lists py3.11 / torch2.3 / cuda12.1.0, which is
          # absent here — confirm whether the omission is intentional.
          - { python_version: "3.10", pytorch_version: "2.0", cuda_version: "11.8.0" }
          - { python_version: "3.10", pytorch_version: "2.1", cuda_version: "12.1.0" }
          - { python_version: "3.10", pytorch_version: "2.2", cuda_version: "12.1.0" }
          - { python_version: "3.11", pytorch_version: "2.1", cuda_version: "12.1.0" }
          - { python_version: "3.11", pytorch_version: "2.2", cuda_version: "12.1.0" }
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: true
      - uses: docker/setup-buildx-action@v3
      # Log in only when an image will be pushed and the run belongs to the Lightning-AI organisation.
      - uses: docker/login-action@v3
        if: env.PUSH_RELEASE == 'true' && github.repository_owner == 'Lightning-AI'
        with:
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_PASSWORD }}
      - name: Get release version
        if: github.event_name == 'release'
        # For workflows triggered by release, `GITHUB_REF` is the release tag created.
        # `${GITHUB_REF##*/}` keeps only the part after the last `/`.
        run: echo "RELEASE_VERSION=${GITHUB_REF##*/}" >> "$GITHUB_ENV"
      - name: Set tags
        # Exports DOCKER_TAGS: a comma-separated list of fully qualified image tags for this matrix entry.
        run: |
          import os

          repo = "pytorchlightning/pytorch_lightning"
          # RELEASE_VERSION is only exported by the previous step on release events
          ver = os.getenv('RELEASE_VERSION')
          py_ver = "${{ matrix.python_version }}"
          pt_ver = "${{ matrix.pytorch_version }}"
          cuda_ver = "${{ matrix.cuda_version }}"
          tags = [f"latest-py{py_ver}-torch{pt_ver}-cuda{cuda_ver}"]
          if ver:
              tags += [f"{ver}-py{py_ver}-torch{pt_ver}-cuda{cuda_ver}"]
          # this single combination additionally receives the plain `latest` tag
          if py_ver == '3.10' and pt_ver == '2.1' and cuda_ver == '12.1.0':
              tags += ["latest"]
          tags = [f"{repo}:{tag}" for tag in tags]
          with open(os.getenv('GITHUB_ENV'), "a") as gh_env:
              # terminate the entry with a newline so that anything appended later starts on its own line
              gh_env.write("DOCKER_TAGS=" + ",".join(tags) + "\n")
        shell: python
      - uses: docker/build-push-action@v5
        with:
          build-args: |
            PYTHON_VERSION=${{ matrix.python_version }}
            PYTORCH_VERSION=${{ matrix.pytorch_version }}
            CUDA_VERSION=${{ matrix.cuda_version }}
            LIGHTNING_VERSION=${{ env.RELEASE_VERSION }}
          file: dockers/release/Dockerfile
          push: ${{ env.PUSH_RELEASE }}  # pushed in release-docker.yml only when PL is released
          tags: ${{ env.DOCKER_TAGS }}
        timeout-minutes: 35

  # Builds the base CUDA image from dockers/base-cuda/Dockerfile for every matrix entry.
  # The image is pushed only when PUSH_NIGHTLY is 'true'; a Slack notification is sent when such a run fails.
  build-cuda:
    if: github.event.pull_request.draft == false
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        include:
          # These are the base images for PL release docker images.
          # Make sure the matrix here matches the one above.
          - { python_version: "3.10", pytorch_version: "2.0", cuda_version: "11.8.0" }
          - { python_version: "3.10", pytorch_version: "2.1", cuda_version: "12.1.0" }
          - { python_version: "3.10", pytorch_version: "2.2", cuda_version: "12.1.0" }
          - { python_version: "3.11", pytorch_version: "2.1", cuda_version: "12.1.0" }
          - { python_version: "3.11", pytorch_version: "2.2", cuda_version: "12.1.0" }
          - { python_version: "3.11", pytorch_version: "2.3", cuda_version: "12.1.0" }
          # - { python_version: "3.12", pytorch_version: "2.2", cuda_version: "12.1.0" } # todo: pending on `onnxruntime`
    steps:
      - uses: actions/checkout@v4
      - uses: docker/setup-buildx-action@v3
      # Log in only when an image will be pushed and the run belongs to the Lightning-AI organisation.
      - uses: docker/login-action@v3
        if: env.PUSH_NIGHTLY == 'true' && github.repository_owner == 'Lightning-AI'
        with:
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_PASSWORD }}
      - uses: docker/build-push-action@v5
        with:
          build-args: |
            PYTHON_VERSION=${{ matrix.python_version }}
            PYTORCH_VERSION=${{ matrix.pytorch_version }}
            CUDA_VERSION=${{ matrix.cuda_version }}
          file: dockers/base-cuda/Dockerfile
          push: ${{ env.PUSH_NIGHTLY }}
          tags: "pytorchlightning/pytorch_lightning:base-cuda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}-cuda${{ matrix.cuda_version }}"
        timeout-minutes: 95
      # Runs only after a failed step, and only for runs where PUSH_NIGHTLY is 'true'.
      - uses: ravsamhq/notify-slack-action@v2
        if: failure() && env.PUSH_NIGHTLY == 'true'
        with:
          status: ${{ job.status }}
          token: ${{ secrets.GITHUB_TOKEN }}
          notification_title: ${{ format('CUDA; {0} py{1} for *{2}*', runner.os, matrix.python_version, matrix.pytorch_version) }}
          message_format: "{emoji} *{workflow}* {status_message}, see <{run_url}|detail>, cc: <@U01A5T7EY9M>"  # akihironitta
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}

  # Builds dockers/nvidia/Dockerfile without pushing the result.
  build-NGC:
    if: github.event.pull_request.draft == false
    # fixme: use larger machine or optimize image size
    # runs-on: ubuntu-latest-4-cores
    # then drop continue-on-error
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Build Conda Docker
        # publish master/release
        # `continue-on-error` keeps a failure of this step from failing the job (see the fixme above)
        continue-on-error: true
        uses: docker/build-push-action@v5
        with:
          file: dockers/nvidia/Dockerfile
          push: false
        timeout-minutes: 55