Lightning-AI · Borda · Dec 6, 2022 · Nov 30, 2022 · Nov 30, 2022 · Nov 30, 2022
@@ -3,7 +3,7 @@ name: Test PyTorch - TPU
 on:
   push:
     branches: [master, "release/*"]
-  pull_request_target:
+  pull_request:  # FIXME: use `pull_request_target`
     branches: [master, "release/*"]
     types: [opened, reopened, ready_for_review, synchronize]  # added `ready_for_review` since draft is skipped
     paths:
@@ -30,13 +30,22 @@ env:
   GKE_CLUSTER: lightning-cluster
   GKE_ZONE: us-central1-a
 
+defaults:
+  run:
+    shell: bash  # used by every `run` step that does not set its own `shell`
+
 jobs:
-  # TODO: package parametrization
   test-on-tpus:
     runs-on: ubuntu-22.04
     if: github.event.pull_request.draft == false
     env:
       PYTHON_VER: 3.7
+    strategy:
+      fail-fast: false
+      max-parallel: 1  # run the matrix jobs sequentially
+      matrix:
+        # TODO: also add lightning
+        pkg-name: ["lite", "pytorch"]
     timeout-minutes: 100  # should match the timeout in `tpu_workflow.jsonnet`
 
     steps:
@@ -64,14 +73,24 @@ jobs:
 
     - name: Update jsonnet
       env:
+        SCOPE: ${{ matrix.pkg-name }}
         XLA_VER: 1.12
         PR_NUMBER: ${{ github.event.pull_request.number }}
         SHA: ${{ github.event.pull_request.head.sha }}
       run: |
-       python -c "fname = 'dockers/base-xla/tpu_workflow.jsonnet' ; data = open(fname).read().replace('{PYTORCH_VERSION}', '$XLA_VER')
-       data = data.replace('{PYTHON_VERSION}', '$PYTHON_VER').replace('{PR_NUMBER}', '$PR_NUMBER').replace('{SHA}', '$SHA') ; open(fname, 'w').write(data)"
-       cat dockers/base-xla/tpu_workflow.jsonnet
-      shell: bash
+        import os
+        fname = f'dockers/base-xla/tpu_workflow_{os.getenv("SCOPE")}.jsonnet'
+        with open(fname) as fo:
+            data = fo.read()
+        data = data.replace('{PYTORCH_VERSION}', os.getenv("XLA_VER"))
+        data = data.replace('{PYTHON_VERSION}', os.getenv("PYTHON_VER"))
+        data = data.replace('{PR_NUMBER}', os.getenv("PR_NUMBER"))
+        data = data.replace('{SHA}', os.getenv("SHA"))
+        with open(fname, "w") as fw:
+            fw.write(data)
+      shell: python
+    - name: Show jsonnet  # print the substituted template to the job log
+      run: cat dockers/base-xla/tpu_workflow_${{ matrix.pkg-name }}.jsonnet
 
     - uses: google-github-actions/auth@v1
       with:
@@ -86,7 +105,7 @@ jobs:
     - name: Deploy cluster
       run: |
         export PATH=$PATH:$HOME/go/bin
-        job_name=$(jsonnet -J ml-testing-accelerators/ dockers/base-xla/tpu_workflow.jsonnet | kubectl create -f -)
+        job_name=$(jsonnet -J ml-testing-accelerators/ dockers/base-xla/tpu_workflow_${{ matrix.pkg-name }}.jsonnet | kubectl create -f -)
         job_name=${job_name#job.batch/}
         job_name=${job_name% created}
         pod_name=$(kubectl get po -l controller-uid=`kubectl get job $job_name -o "jsonpath={.metadata.labels.controller-uid}"` | awk 'match($0,!/NAME/) {print $1}')

@@ -86,11 +86,11 @@ RUN pip --version && \
     rm *.whl
 
 # Get package
-COPY ./ ./pytorch-lightning/
+COPY ./ ./lightning/
 
 RUN \
     python --version && \
-    cd pytorch-lightning && \
+    cd lightning && \
     pip install -q -r .actions/requirements.txt && \
     # Pin mkl version to avoid OSError on torch import
     # OSError: libmkl_intel_lp64.so.1: cannot open shared object file: No such file or directory
@@ -103,7 +103,7 @@ RUN \
     # install PL dependencies
     pip install --requirement ./requirements/pytorch/devel.txt --no-cache-dir && \
     cd .. && \
-    rm -rf pytorch-lightning && \
+    rm -rf lightning && \
     rm -rf /root/.cache
 
 RUN \

@@ -26,26 +26,26 @@ local tputests = base.BaseTest {
       source ~/.bashrc
       conda activate lightning
 
-      echo "--- Fetch the SHA's changes ---"
+      echo "--- Cloning lightning repo ---"
       git clone --single-branch --depth 1 https://github.com/Lightning-AI/lightning.git
       cd lightning
+      # PR triggered it, check it out
       if [ -n "{PR_NUMBER}" ]; then  # if PR number is not empty
-        # PR triggered it, check it out
+        echo "--- Fetch the PR changes ---"
         git fetch origin --depth 1 pull/{PR_NUMBER}/head:test/{PR_NUMBER}
+        echo "--- Checkout PR changes ---"
         git -c advice.detachedHead=false checkout {SHA}
       fi
 
       echo "--- Install packages ---"
-      PACKAGE_NAME=lite pip install -e .[dev]
-      PACKAGE_NAME=pytorch pip install -e .[dev]
+      PACKAGE_NAME=lite pip install .[dev]
       pip list
 
       echo $KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS
       export XRT_TPU_CONFIG="tpu_worker;0;${KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS:7}"
 
       echo "--- Sanity check TPU availability ---"
       python -c "from lightning_lite.accelerators import TPUAccelerator; assert TPUAccelerator.is_available()"
-      python -c "from pytorch_lightning.accelerators import TPUAccelerator; assert TPUAccelerator.is_available()"
       echo "Sanity check passed!"
 
       echo "--- Running Lite tests ---"
@@ -55,13 +55,6 @@ local tputests = base.BaseTest {
       echo "--- Running standalone Lite tests ---"
       PL_STANDALONE_TESTS_SOURCE=lightning_lite PL_STANDALONE_TESTS_BATCH_SIZE=1 bash run_standalone_tests.sh
 
-      echo "--- Running PL tests ---"
-      cd ../tests_pytorch
-      PL_RUN_TPU_TESTS=1 coverage run --source=pytorch_lightning -m pytest -vv --durations=0 ./
-
-      echo "--- Running standalone PL tests ---"
-      PL_STANDALONE_TESTS_SOURCE=pytorch_lightning PL_STANDALONE_TESTS_BATCH_SIZE=1 bash run_standalone_tests.sh
-
       echo "--- Generating coverage ---"
       coverage xml
       cat coverage.xml | tr -d '\t'

@@ -0,0 +1,65 @@
+local base = import 'templates/base.libsonnet';
+local tpus = import 'templates/tpus.libsonnet';
+local utils = import "templates/utils.libsonnet";
+
+local tputests = base.BaseTest {  # TPU test job: extends the imported base template with the fields below
+  frameworkPrefix: 'pl',
+  modelName: 'tpu-tests',
+  mode: 'postsubmit',
+  configMaps: [],
+
+  timeout: 6000, # 100 minutes, in seconds; the workflow's `timeout-minutes` is meant to match this value
+
+  image: 'pytorchlightning/pytorch_lightning',
+  imageTag: 'base-xla-py{PYTHON_VERSION}-torch{PYTORCH_VERSION}',  # NOTE(review): `{...}` placeholders look like the ones the workflow's "Update jsonnet" step substitutes - confirm
+
+  tpuSettings+: {  # `+:` extends the inherited `tpuSettings` object rather than replacing it
+    softwareVersion: 'pytorch-{PYTORCH_VERSION}',
+  },
+  accelerator: tpus.v3_8,
+
+  command: utils.scriptCommand(  # the text block below is the shell script passed to `utils.scriptCommand`
+    |||
+      set +x  # turn off tracing, spammy
+      set -e  # exit on error
+
+      source ~/.bashrc
+      conda activate lightning
+
+      echo "--- Cloning lightning repo ---"
+      git clone --single-branch --depth 1 https://github.com/Lightning-AI/lightning.git
+      cd lightning
+      # PR triggered it, check it out
+      if [ -n "{PR_NUMBER}" ]; then  # if PR number is not empty
+        echo "--- Fetch the PR changes ---"
+        git fetch origin --depth 1 pull/{PR_NUMBER}/head:test/{PR_NUMBER}
+        echo "--- Checkout PR changes ---"
+        git -c advice.detachedHead=false checkout {SHA}
+      fi
+
+      echo "--- Install packages ---"
+      PACKAGE_NAME=pytorch pip install .[dev]
+      pip list
+
+      echo $KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS
+      export XRT_TPU_CONFIG="tpu_worker;0;${KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS:7}"
+
+      echo "--- Sanity check TPU availability ---"
+      python -c "from pytorch_lightning.accelerators import TPUAccelerator; assert TPUAccelerator.is_available()"
+      echo "Sanity check passed!"
+
+      echo "--- Running PL tests ---"
+      cd tests/tests_pytorch
+      PL_RUN_TPU_TESTS=1 coverage run --source=pytorch_lightning -m pytest -vv --durations=0 ./
+
+      echo "--- Running standalone PL tests ---"
+      PL_STANDALONE_TESTS_SOURCE=pytorch_lightning PL_STANDALONE_TESTS_BATCH_SIZE=1 bash run_standalone_tests.sh
+
+      echo "--- Generating coverage ---"
+      coverage xml
+      cat coverage.xml | tr -d '\t'
+    |||
+  ),
+};
+
+tputests.oneshotJob
@@ -32,21 +32,15 @@ dependencies:
     - pytorch>=1.10.*
     - future>=0.17.1
     - PyYAML>=5.1
-    - tqdm>=4.41.0
+    - tqdm>=4.57.0
     - fsspec[http]>=2021.06.1
     #- tensorboard>=2.2.0  # not needed, already included in pytorch
 
     # Optional
     #- nvidia-apex  # missing for py3.8
-    - scikit-learn>=0.20.0
+    - scikit-learn >0.22.1
     - matplotlib>=3.1.1
     - omegaconf>=2.0.5
 
     # Examples
     - torchvision>=0.11.*
-
-    - pip:
-        - mlflow>=1.0.0
-        - comet_ml>=3.1.12
-        - wandb>=0.10.22
-        - neptune-client>=0.10.0