Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CI: parameterize TPU tests #15876

Merged
merged 14 commits into from
Dec 6, 2022
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 26 additions & 7 deletions .github/workflows/tpu-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ name: Test PyTorch - TPU
on:
push:
branches: [master, "release/*"]
pull_request_target:
pull_request: # FIXME: use `pull_request_target`
Borda marked this conversation as resolved.
Show resolved Hide resolved
branches: [master, "release/*"]
types: [opened, reopened, ready_for_review, synchronize] # added `ready_for_review` since draft is skipped
paths:
Expand All @@ -30,13 +30,22 @@ env:
GKE_CLUSTER: lightning-cluster
GKE_ZONE: us-central1-a

defaults:
run:
shell: bash

jobs:
# TODO: package parametrization
test-on-tpus:
runs-on: ubuntu-22.04
if: github.event.pull_request.draft == false
env:
PYTHON_VER: 3.7
strategy:
fail-fast: false
max-parallel: 1 # run sequential
matrix:
# TODO: add also lightning
pkg-name: ["lite", "pytorch"]
timeout-minutes: 100 # should match the timeout in `tpu_workflow.jsonnet`

steps:
Expand Down Expand Up @@ -64,14 +73,24 @@ jobs:

- name: Update jsonnet
env:
SCOPE: ${{ matrix.pkg-name }}
XLA_VER: 1.12
PR_NUMBER: ${{ github.event.pull_request.number }}
SHA: ${{ github.event.pull_request.head.sha }}
run: |
python -c "fname = 'dockers/base-xla/tpu_workflow.jsonnet' ; data = open(fname).read().replace('{PYTORCH_VERSION}', '$XLA_VER')
data = data.replace('{PYTHON_VERSION}', '$PYTHON_VER').replace('{PR_NUMBER}', '$PR_NUMBER').replace('{SHA}', '$SHA') ; open(fname, 'w').write(data)"
cat dockers/base-xla/tpu_workflow.jsonnet
shell: bash
import os
fname = f'dockers/base-xla/tpu_workflow_{os.getenv("SCOPE")}.jsonnet'
with open(fname) as fo:
data = fo.read()
data = data.replace('{PYTORCH_VERSION}', os.getenv("XLA_VER"))
data = data.replace('{PYTHON_VERSION}', os.getenv("PYTHON_VER"))
data = data.replace('{PR_NUMBER}', os.getenv("PR_NUMBER"))
data = data.replace('{SHA}', os.getenv("SHA"))
with open(fname, "w") as fw:
fw.write(data)
shell: python
- name: Show jsonnet
run: cat dockers/base-xla/tpu_workflow_${{ matrix.pkg-name }}.jsonnet

- uses: google-github-actions/auth@v1
with:
Expand All @@ -86,7 +105,7 @@ jobs:
- name: Deploy cluster
run: |
export PATH=$PATH:$HOME/go/bin
job_name=$(jsonnet -J ml-testing-accelerators/ dockers/base-xla/tpu_workflow.jsonnet | kubectl create -f -)
job_name=$(jsonnet -J ml-testing-accelerators/ dockers/base-xla/tpu_workflow_${{ matrix.pkg-name }}.jsonnet | kubectl create -f -)
job_name=${job_name#job.batch/}
job_name=${job_name% created}
pod_name=$(kubectl get po -l controller-uid=`kubectl get job $job_name -o "jsonpath={.metadata.labels.controller-uid}"` | awk 'match($0,!/NAME/) {print $1}')
Expand Down
6 changes: 3 additions & 3 deletions dockers/base-xla/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -86,11 +86,11 @@ RUN pip --version && \
rm *.whl

# Get package
COPY ./ ./pytorch-lightning/
COPY ./ ./lightning/

RUN \
python --version && \
cd pytorch-lightning && \
cd lightning && \
pip install -q -r .actions/requirements.txt && \
# Pin mkl version to avoid OSError on torch import
# OSError: libmkl_intel_lp64.so.1: cannot open shared object file: No such file or directory
Expand All @@ -103,7 +103,7 @@ RUN \
# install PL dependencies
pip install --requirement ./requirements/pytorch/devel.txt --no-cache-dir && \
cd .. && \
rm -rf pytorch-lightning && \
rm -rf lightning && \
rm -rf /root/.cache

RUN \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,26 +26,26 @@ local tputests = base.BaseTest {
source ~/.bashrc
conda activate lightning

echo "--- Fetch the SHA's changes ---"
echo "--- Cloning lightning repo ---"
git clone --single-branch --depth 1 https://github.com/Lightning-AI/lightning.git
cd lightning
# PR triggered it, check it out
if [ -n "{PR_NUMBER}" ]; then # if PR number is not empty
# PR triggered it, check it out
echo "--- Fetch the PR changes ---"
git fetch origin --depth 1 pull/{PR_NUMBER}/head:test/{PR_NUMBER}
echo "--- Checkout PR changes ---"
git -c advice.detachedHead=false checkout {SHA}
fi

echo "--- Install packages ---"
PACKAGE_NAME=lite pip install -e .[dev]
PACKAGE_NAME=pytorch pip install -e .[dev]
PACKAGE_NAME=lite pip install .[dev]
pip list

echo $KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS
export XRT_TPU_CONFIG="tpu_worker;0;${KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS:7}"

echo "--- Sanity check TPU availability ---"
python -c "from lightning_lite.accelerators import TPUAccelerator; assert TPUAccelerator.is_available()"
python -c "from pytorch_lightning.accelerators import TPUAccelerator; assert TPUAccelerator.is_available()"
echo "Sanity check passed!"

echo "--- Running Lite tests ---"
Expand All @@ -55,13 +55,6 @@ local tputests = base.BaseTest {
echo "--- Running standalone Lite tests ---"
PL_STANDALONE_TESTS_SOURCE=lightning_lite PL_STANDALONE_TESTS_BATCH_SIZE=1 bash run_standalone_tests.sh

echo "--- Running PL tests ---"
cd ../tests_pytorch
PL_RUN_TPU_TESTS=1 coverage run --source=pytorch_lightning -m pytest -vv --durations=0 ./

echo "--- Running standalone PL tests ---"
PL_STANDALONE_TESTS_SOURCE=pytorch_lightning PL_STANDALONE_TESTS_BATCH_SIZE=1 bash run_standalone_tests.sh

echo "--- Generating coverage ---"
coverage xml
cat coverage.xml | tr -d '\t'
Expand Down
65 changes: 65 additions & 0 deletions dockers/base-xla/tpu_workflow_pytorch.jsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# TPU test job for the `pytorch` package; this file evaluates to `tputests.oneshotJob` (last line).
# The `{PYTHON_VERSION}`, `{PYTORCH_VERSION}`, `{PR_NUMBER}` and `{SHA}` tokens are plain-text
# placeholders, not Jsonnet syntax: they must be substituted before this file is evaluated
# (the "Update jsonnet" step of the TPU CI workflow does this with Python `str.replace`).
local base = import 'templates/base.libsonnet';
local tpus = import 'templates/tpus.libsonnet';
local utils = import "templates/utils.libsonnet";

# Job definition: extends `base.BaseTest` and overrides the fields listed below.
local tputests = base.BaseTest {
frameworkPrefix: 'pl',
modelName: 'tpu-tests',
mode: 'postsubmit',
configMaps: [],

timeout: 6000, # 100 minutes, in seconds.

# Container image and tag; the tag is only complete after placeholder substitution.
image: 'pytorchlightning/pytorch_lightning',
imageTag: 'base-xla-py{PYTHON_VERSION}-torch{PYTORCH_VERSION}',

# `+:` extends the inherited `tpuSettings` object instead of replacing it.
tpuSettings+: {
softwareVersion: 'pytorch-{PYTORCH_VERSION}',
},
# NOTE(review): presumably a TPU v3-8 device; defined in `templates/tpus.libsonnet` - confirm.
accelerator: tpus.v3_8,

# Shell script passed to `utils.scriptCommand`. Everything between the `|||` markers is script
# text (including its `#` lines), so edit it as shell, not as Jsonnet.
# Steps: activate the `lightning` conda env, clone the repo, check out the PR commit when a PR
# number was substituted, install the `pytorch` package, assert that a TPU is available, run
# the test suite and the standalone tests, then print the coverage XML with tabs removed.
command: utils.scriptCommand(
|||
set +x # turn off tracing, spammy
set -e # exit on error

source ~/.bashrc
conda activate lightning

echo "--- Cloning lightning repo ---"
git clone --single-branch --depth 1 https://github.com/Lightning-AI/lightning.git
cd lightning
# PR triggered it, check it out
if [ -n "{PR_NUMBER}" ]; then # if PR number is not empty
echo "--- Fetch the PR changes ---"
git fetch origin --depth 1 pull/{PR_NUMBER}/head:test/{PR_NUMBER}
echo "--- Checkout PR changes ---"
git -c advice.detachedHead=false checkout {SHA}
fi

echo "--- Install packages ---"
PACKAGE_NAME=pytorch pip install .[dev]
pip list

echo $KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS
export XRT_TPU_CONFIG="tpu_worker;0;${KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS:7}"

echo "--- Sanity check TPU availability ---"
python -c "from pytorch_lightning.accelerators import TPUAccelerator; assert TPUAccelerator.is_available()"
echo "Sanity check passed!"

echo "--- Running PL tests ---"
cd tests/tests_pytorch
PL_RUN_TPU_TESTS=1 coverage run --source=pytorch_lightning -m pytest -vv --durations=0 ./

echo "--- Running standalone PL tests ---"
PL_STANDALONE_TESTS_SOURCE=pytorch_lightning PL_STANDALONE_TESTS_BATCH_SIZE=1 bash run_standalone_tests.sh

echo "--- Generating coverage ---"
coverage xml
cat coverage.xml | tr -d '\t'
|||
),
};

# Value of this file: the `oneshotJob` field of the object defined above.
tputests.oneshotJob
10 changes: 2 additions & 8 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,21 +32,15 @@ dependencies:
- pytorch>=1.10.*
- future>=0.17.1
- PyYAML>=5.1
- tqdm>=4.41.0
- tqdm>=4.57.0
- fsspec[http]>=2021.06.1
#- tensorboard>=2.2.0 # not needed, already included in pytorch

# Optional
#- nvidia-apex # missing for py3.8
- scikit-learn>=0.20.0
- scikit-learn >0.22.1
- matplotlib>=3.1.1
- omegaconf>=2.0.5

# Examples
- torchvision>=0.11.*

- pip:
- mlflow>=1.0.0
- comet_ml>=3.1.12
- wandb>=0.10.22
- neptune-client>=0.10.0