diff --git a/.github/workflows/tpu-tests.yml b/.github/workflows/tpu-tests.yml index 6ce21d0f9ebc7..24c5fa1a205d5 100644 --- a/.github/workflows/tpu-tests.yml +++ b/.github/workflows/tpu-tests.yml @@ -30,13 +30,22 @@ env: GKE_CLUSTER: lightning-cluster GKE_ZONE: us-central1-a +defaults: + run: + shell: bash + jobs: - # TODO: package parametrization test-on-tpus: runs-on: ubuntu-22.04 if: github.event.pull_request.draft == false env: PYTHON_VER: 3.7 + strategy: + fail-fast: false + max-parallel: 1 # run sequential + matrix: + # TODO: add also lightning + pkg-name: ["lite", "pytorch"] timeout-minutes: 100 # should match the timeout in `tpu_workflow.jsonnet` steps: @@ -64,14 +73,24 @@ jobs: - name: Update jsonnet env: + SCOPE: ${{ matrix.pkg-name }} XLA_VER: 1.12 PR_NUMBER: ${{ github.event.pull_request.number }} SHA: ${{ github.event.pull_request.head.sha }} run: | - python -c "fname = 'dockers/base-xla/tpu_workflow.jsonnet' ; data = open(fname).read().replace('{PYTORCH_VERSION}', '$XLA_VER') - data = data.replace('{PYTHON_VERSION}', '$PYTHON_VER').replace('{PR_NUMBER}', '$PR_NUMBER').replace('{SHA}', '$SHA') ; open(fname, 'w').write(data)" - cat dockers/base-xla/tpu_workflow.jsonnet - shell: bash + import os + fname = f'dockers/base-xla/tpu_workflow_{os.getenv("SCOPE")}.jsonnet' + with open(fname) as fo: + data = fo.read() + data = data.replace('{PYTORCH_VERSION}', os.getenv("XLA_VER")) + data = data.replace('{PYTHON_VERSION}', os.getenv("PYTHON_VER")) + data = data.replace('{PR_NUMBER}', os.getenv("PR_NUMBER")) + data = data.replace('{SHA}', os.getenv("SHA")) + with open(fname, "w") as fw: + fw.write(data) + shell: python + - name: Show jsonnet + run: cat dockers/base-xla/tpu_workflow_${{ matrix.pkg-name }}.jsonnet - uses: google-github-actions/auth@v1 with: @@ -86,7 +105,7 @@ jobs: - name: Deploy cluster run: | export PATH=$PATH:$HOME/go/bin - job_name=$(jsonnet -J ml-testing-accelerators/ dockers/base-xla/tpu_workflow.jsonnet | kubectl create -f -) + job_name=$(jsonnet -J ml-testing-accelerators/ dockers/base-xla/tpu_workflow_${{ matrix.pkg-name }}.jsonnet | kubectl create -f -) job_name=${job_name#job.batch/} job_name=${job_name% created} pod_name=$(kubectl get po -l controller-uid=`kubectl get job $job_name -o "jsonpath={.metadata.labels.controller-uid}"` | awk 'match($0,!/NAME/) {print $1}') diff --git a/dockers/base-xla/Dockerfile b/dockers/base-xla/Dockerfile index b8ac175044f80..267453b7d56f5 100644 --- a/dockers/base-xla/Dockerfile +++ b/dockers/base-xla/Dockerfile @@ -86,11 +86,11 @@ RUN pip --version && \ rm *.whl # Get package -COPY ./ ./pytorch-lightning/ +COPY ./ ./lightning/ RUN \ python --version && \ - cd pytorch-lightning && \ + cd lightning && \ pip install -q -r .actions/requirements.txt && \ # Pin mkl version to avoid OSError on torch import # OSError: libmkl_intel_lp64.so.1: cannot open shared object file: No such file or directory @@ -103,7 +103,7 @@ RUN \ # install PL dependencies pip install --requirement ./requirements/pytorch/devel.txt --no-cache-dir && \ cd .. && \ - rm -rf pytorch-lightning && \ + rm -rf lightning && \ rm -rf /root/.cache RUN \ diff --git a/dockers/base-xla/tpu_workflow.jsonnet b/dockers/base-xla/tpu_workflow_lite.jsonnet similarity index 74% rename from dockers/base-xla/tpu_workflow.jsonnet rename to dockers/base-xla/tpu_workflow_lite.jsonnet index 1d006fec89b6a..887f0f744a206 100644 --- a/dockers/base-xla/tpu_workflow.jsonnet +++ b/dockers/base-xla/tpu_workflow_lite.jsonnet @@ -26,18 +26,19 @@ local tputests = base.BaseTest { source ~/.bashrc conda activate lightning - echo "--- Fetch the SHA's changes ---" + echo "--- Cloning lightning repo ---" git clone --single-branch --depth 1 https://github.com/Lightning-AI/lightning.git cd lightning + # PR triggered it, check it out if [ -n "{PR_NUMBER}" ]; then # if PR number is not empty - # PR triggered it, check it out + echo "--- Fetch the PR changes ---" git fetch origin --depth 1 pull/{PR_NUMBER}/head:test/{PR_NUMBER} + echo "--- Checkout PR changes ---" git -c advice.detachedHead=false checkout {SHA} fi echo "--- Install packages ---" - PACKAGE_NAME=lite pip install -e .[dev] - PACKAGE_NAME=pytorch pip install -e .[dev] + PACKAGE_NAME=lite pip install .[dev] pip list echo $KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS @@ -45,7 +46,6 @@ local tputests = base.BaseTest { echo "--- Sanity check TPU availability ---" python -c "from lightning_lite.accelerators import TPUAccelerator; assert TPUAccelerator.is_available()" - python -c "from pytorch_lightning.accelerators import TPUAccelerator; assert TPUAccelerator.is_available()" echo "Sanity check passed!" echo "--- Running Lite tests ---" @@ -55,13 +55,6 @@ local tputests = base.BaseTest { echo "--- Running standalone Lite tests ---" PL_STANDALONE_TESTS_SOURCE=lightning_lite PL_STANDALONE_TESTS_BATCH_SIZE=1 bash run_standalone_tests.sh - echo "--- Running PL tests ---" - cd ../tests_pytorch - PL_RUN_TPU_TESTS=1 coverage run --source=pytorch_lightning -m pytest -vv --durations=0 ./ - - echo "--- Running standalone PL tests ---" - PL_STANDALONE_TESTS_SOURCE=pytorch_lightning PL_STANDALONE_TESTS_BATCH_SIZE=1 bash run_standalone_tests.sh - echo "--- Generating coverage ---" coverage xml cat coverage.xml | tr -d '\t' diff --git a/dockers/base-xla/tpu_workflow_pytorch.jsonnet b/dockers/base-xla/tpu_workflow_pytorch.jsonnet new file mode 100644 index 0000000000000..5acaf5ce99f34 --- /dev/null +++ b/dockers/base-xla/tpu_workflow_pytorch.jsonnet @@ -0,0 +1,65 @@ +local base = import 'templates/base.libsonnet'; +local tpus = import 'templates/tpus.libsonnet'; +local utils = import "templates/utils.libsonnet"; + +local tputests = base.BaseTest { + frameworkPrefix: 'pl', + modelName: 'tpu-tests', + mode: 'postsubmit', + configMaps: [], + + timeout: 6000, # 100 minutes, in seconds. + + image: 'pytorchlightning/pytorch_lightning', + imageTag: 'base-xla-py{PYTHON_VERSION}-torch{PYTORCH_VERSION}', + + tpuSettings+: { + softwareVersion: 'pytorch-{PYTORCH_VERSION}', + }, + accelerator: tpus.v3_8, + + command: utils.scriptCommand( + ||| + set +x # turn off tracing, spammy + set -e # exit on error + + source ~/.bashrc + conda activate lightning + + echo "--- Cloning lightning repo ---" + git clone --single-branch --depth 1 https://github.com/Lightning-AI/lightning.git + cd lightning + # PR triggered it, check it out + if [ -n "{PR_NUMBER}" ]; then # if PR number is not empty + echo "--- Fetch the PR changes ---" + git fetch origin --depth 1 pull/{PR_NUMBER}/head:test/{PR_NUMBER} + echo "--- Checkout PR changes ---" + git -c advice.detachedHead=false checkout {SHA} + fi + + echo "--- Install packages ---" + PACKAGE_NAME=pytorch pip install .[dev] + pip list + + echo $KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS + export XRT_TPU_CONFIG="tpu_worker;0;${KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS:7}" + + echo "--- Sanity check TPU availability ---" + python -c "from pytorch_lightning.accelerators import TPUAccelerator; assert TPUAccelerator.is_available()" + echo "Sanity check passed!" + + echo "--- Running PL tests ---" + cd tests/tests_pytorch + PL_RUN_TPU_TESTS=1 coverage run --source=pytorch_lightning -m pytest -vv --durations=0 ./ + + echo "--- Running standalone PL tests ---" + PL_STANDALONE_TESTS_SOURCE=pytorch_lightning PL_STANDALONE_TESTS_BATCH_SIZE=1 bash run_standalone_tests.sh + + echo "--- Generating coverage ---" + coverage xml + cat coverage.xml | tr -d '\t' + ||| + ), +}; + +tputests.oneshotJob diff --git a/environment.yml b/environment.yml index 330a42f581767..b05061146a72a 100644 --- a/environment.yml +++ b/environment.yml @@ -32,21 +32,15 @@ dependencies: - pytorch>=1.10.* - future>=0.17.1 - PyYAML>=5.1 - - tqdm>=4.41.0 + - tqdm>=4.57.0 - fsspec[http]>=2021.06.1 #- tensorboard>=2.2.0 # not needed, already included in pytorch # Optional #- nvidia-apex # missing for py3.8 - - scikit-learn>=0.20.0 + - scikit-learn >0.22.1 - matplotlib>=3.1.1 - omegaconf>=2.0.5 # Examples - torchvision>=0.11.* - - - pip: - - mlflow>=1.0.0 - - comet_ml>=3.1.12 - - wandb>=0.10.22 - - neptune-client>=0.10.0