Skip to content

Commit

Permalink
CI: parameterize TPU tests (#15876)
Browse files Browse the repository at this point in the history
* update
* param
* Apply suggestions from code review

(cherry picked from commit 77006a2)
  • Loading branch information
Borda committed Dec 6, 2022
1 parent 6ed0e00 commit c2ab7f8
Show file tree
Hide file tree
Showing 5 changed files with 100 additions and 29 deletions.
31 changes: 25 additions & 6 deletions .github/workflows/tpu-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,22 @@ env:
GKE_CLUSTER: lightning-cluster
GKE_ZONE: us-central1-a

defaults:
run:
shell: bash

jobs:
# TODO: package parametrization
test-on-tpus:
runs-on: ubuntu-22.04
if: github.event.pull_request.draft == false
env:
PYTHON_VER: 3.7
strategy:
fail-fast: false
max-parallel: 1 # run sequential
matrix:
# TODO: also add the lightning package
pkg-name: ["lite", "pytorch"]
timeout-minutes: 100 # should match the timeout in `tpu_workflow.jsonnet`

steps:
Expand Down Expand Up @@ -64,14 +73,24 @@ jobs:

- name: Update jsonnet
env:
SCOPE: ${{ matrix.pkg-name }}
XLA_VER: 1.12
PR_NUMBER: ${{ github.event.pull_request.number }}
SHA: ${{ github.event.pull_request.head.sha }}
run: |
python -c "fname = 'dockers/base-xla/tpu_workflow.jsonnet' ; data = open(fname).read().replace('{PYTORCH_VERSION}', '$XLA_VER')
data = data.replace('{PYTHON_VERSION}', '$PYTHON_VER').replace('{PR_NUMBER}', '$PR_NUMBER').replace('{SHA}', '$SHA') ; open(fname, 'w').write(data)"
cat dockers/base-xla/tpu_workflow.jsonnet
shell: bash
import os
fname = f'dockers/base-xla/tpu_workflow_{os.getenv("SCOPE")}.jsonnet'
with open(fname) as fo:
data = fo.read()
data = data.replace('{PYTORCH_VERSION}', os.getenv("XLA_VER"))
data = data.replace('{PYTHON_VERSION}', os.getenv("PYTHON_VER"))
data = data.replace('{PR_NUMBER}', os.getenv("PR_NUMBER"))
data = data.replace('{SHA}', os.getenv("SHA"))
with open(fname, "w") as fw:
fw.write(data)
shell: python
- name: Show jsonnet
run: cat dockers/base-xla/tpu_workflow_${{ matrix.pkg-name }}.jsonnet

- uses: google-github-actions/auth@v1
with:
Expand All @@ -86,7 +105,7 @@ jobs:
- name: Deploy cluster
run: |
export PATH=$PATH:$HOME/go/bin
job_name=$(jsonnet -J ml-testing-accelerators/ dockers/base-xla/tpu_workflow.jsonnet | kubectl create -f -)
job_name=$(jsonnet -J ml-testing-accelerators/ dockers/base-xla/tpu_workflow_${{ matrix.pkg-name }}.jsonnet | kubectl create -f -)
job_name=${job_name#job.batch/}
job_name=${job_name% created}
pod_name=$(kubectl get po -l controller-uid=`kubectl get job $job_name -o "jsonpath={.metadata.labels.controller-uid}"` | awk 'match($0,!/NAME/) {print $1}')
Expand Down
6 changes: 3 additions & 3 deletions dockers/base-xla/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -86,11 +86,11 @@ RUN pip --version && \
rm *.whl

# Get package
COPY ./ ./pytorch-lightning/
COPY ./ ./lightning/

RUN \
python --version && \
cd pytorch-lightning && \
cd lightning && \
pip install -q -r .actions/requirements.txt && \
# Pin mkl version to avoid OSError on torch import
# OSError: libmkl_intel_lp64.so.1: cannot open shared object file: No such file or directory
Expand All @@ -103,7 +103,7 @@ RUN \
# install PL dependencies
pip install --requirement ./requirements/pytorch/devel.txt --no-cache-dir && \
cd .. && \
rm -rf pytorch-lightning && \
rm -rf lightning && \
rm -rf /root/.cache

RUN \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,26 +26,26 @@ local tputests = base.BaseTest {
source ~/.bashrc
conda activate lightning
echo "--- Fetch the SHA's changes ---"
echo "--- Cloning lightning repo ---"
git clone --single-branch --depth 1 https://github.com/Lightning-AI/lightning.git
cd lightning
# PR triggered it, check it out
if [ -n "{PR_NUMBER}" ]; then # if PR number is not empty
# PR triggered it, check it out
echo "--- Fetch the PR changes ---"
git fetch origin --depth 1 pull/{PR_NUMBER}/head:test/{PR_NUMBER}
echo "--- Checkout PR changes ---"
git -c advice.detachedHead=false checkout {SHA}
fi
echo "--- Install packages ---"
PACKAGE_NAME=lite pip install -e .[dev]
PACKAGE_NAME=pytorch pip install -e .[dev]
PACKAGE_NAME=lite pip install .[dev]
pip list
echo $KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS
export XRT_TPU_CONFIG="tpu_worker;0;${KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS:7}"
echo "--- Sanity check TPU availability ---"
python -c "from lightning_lite.accelerators import TPUAccelerator; assert TPUAccelerator.is_available()"
python -c "from pytorch_lightning.accelerators import TPUAccelerator; assert TPUAccelerator.is_available()"
echo "Sanity check passed!"
echo "--- Running Lite tests ---"
Expand All @@ -55,13 +55,6 @@ local tputests = base.BaseTest {
echo "--- Running standalone Lite tests ---"
PL_STANDALONE_TESTS_SOURCE=lightning_lite PL_STANDALONE_TESTS_BATCH_SIZE=1 bash run_standalone_tests.sh
echo "--- Running PL tests ---"
cd ../tests_pytorch
PL_RUN_TPU_TESTS=1 coverage run --source=pytorch_lightning -m pytest -vv --durations=0 ./
echo "--- Running standalone PL tests ---"
PL_STANDALONE_TESTS_SOURCE=pytorch_lightning PL_STANDALONE_TESTS_BATCH_SIZE=1 bash run_standalone_tests.sh
echo "--- Generating coverage ---"
coverage xml
cat coverage.xml | tr -d '\t'
Expand Down
65 changes: 65 additions & 0 deletions dockers/base-xla/tpu_workflow_pytorch.jsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# TPU test job definition for the `pytorch_lightning` package.
#
# The `{PYTHON_VERSION}`, `{PYTORCH_VERSION}`, `{PR_NUMBER}` and `{SHA}` tokens below are
# not Jsonnet syntax: the "Update jsonnet" step of `.github/workflows/tpu-tests.yml`
# replaces them textually in this file before it is evaluated.

# Shared templates. The workflow evaluates this file with `-J ml-testing-accelerators/`,
# which is presumably where these imports resolve from -- TODO confirm.
local base = import 'templates/base.libsonnet';
local tpus = import 'templates/tpus.libsonnet';
local utils = import "templates/utils.libsonnet";

# Test object: extends `base.BaseTest` and overrides the fields listed below.
local tputests = base.BaseTest {
# NOTE(review): the meaning of the next four fields is defined by `base.BaseTest`,
# which is not visible here; they look like job naming/labelling settings -- confirm.
frameworkPrefix: 'pl',
modelName: 'tpu-tests',
mode: 'postsubmit',
configMaps: [],

# The workflow's `timeout-minutes: 100` is annotated as needing to match this value,
# so change both together.
timeout: 6000, # 100 minutes, in seconds.

# Docker image the job runs in; the tag is completed by the placeholder substitution.
image: 'pytorchlightning/pytorch_lightning',
imageTag: 'base-xla-py{PYTHON_VERSION}-torch{PYTORCH_VERSION}',

# `+:` extends the inherited `tpuSettings` object instead of replacing it.
tpuSettings+: {
softwareVersion: 'pytorch-{PYTORCH_VERSION}',
},
# NOTE(review): presumably selects a v3-8 TPU; the value is defined in
# `templates/tpus.libsonnet` -- confirm.
accelerator: tpus.v3_8,

# Shell script handed to `utils.scriptCommand`. In order, it: activates the `lightning`
# conda env, clones the repo, checks out `{SHA}` when `{PR_NUMBER}` is non-empty, installs
# the `pytorch` package with its `dev` extra, asserts that a TPU is available, runs the
# regular and standalone test suites under `tests/tests_pytorch`, and prints the
# coverage XML.
# Everything between the `|||` markers is script text, not Jsonnet: do not add Jsonnet
# comments inside it.
command: utils.scriptCommand(
|||
set +x # turn off tracing, spammy
set -e # exit on error
source ~/.bashrc
conda activate lightning
echo "--- Cloning lightning repo ---"
git clone --single-branch --depth 1 https://github.com/Lightning-AI/lightning.git
cd lightning
# PR triggered it, check it out
if [ -n "{PR_NUMBER}" ]; then # if PR number is not empty
echo "--- Fetch the PR changes ---"
git fetch origin --depth 1 pull/{PR_NUMBER}/head:test/{PR_NUMBER}
echo "--- Checkout PR changes ---"
git -c advice.detachedHead=false checkout {SHA}
fi
echo "--- Install packages ---"
PACKAGE_NAME=pytorch pip install .[dev]
pip list
echo $KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS
export XRT_TPU_CONFIG="tpu_worker;0;${KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS:7}"
echo "--- Sanity check TPU availability ---"
python -c "from pytorch_lightning.accelerators import TPUAccelerator; assert TPUAccelerator.is_available()"
echo "Sanity check passed!"
echo "--- Running PL tests ---"
cd tests/tests_pytorch
PL_RUN_TPU_TESTS=1 coverage run --source=pytorch_lightning -m pytest -vv --durations=0 ./
echo "--- Running standalone PL tests ---"
PL_STANDALONE_TESTS_SOURCE=pytorch_lightning PL_STANDALONE_TESTS_BATCH_SIZE=1 bash run_standalone_tests.sh
echo "--- Generating coverage ---"
coverage xml
cat coverage.xml | tr -d '\t'
|||
),
};

# The file evaluates to the `oneshotJob` field of the test object; the workflow's
# "Deploy cluster" step pipes that output into `kubectl create -f -`.
tputests.oneshotJob
10 changes: 2 additions & 8 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,21 +32,15 @@ dependencies:
- pytorch>=1.9.*
- future>=0.17.1
- PyYAML>=5.1
- tqdm>=4.41.0
- tqdm>=4.57.0
- fsspec[http]>=2021.06.1
#- tensorboard>=2.2.0 # not needed, already included in pytorch

# Optional
#- nvidia-apex # missing for py3.8
- scikit-learn>=0.20.0
- scikit-learn >0.22.1
- matplotlib>=3.1.1
- omegaconf>=2.0.5

# Examples
- torchvision>=0.10.*

- pip:
- mlflow>=1.0.0
- comet_ml>=3.1.12
- wandb>=0.10.22
- neptune-client>=0.10.0

0 comments on commit c2ab7f8

Please sign in to comment.