diff --git a/.github/workflows/self-push.yml b/.github/workflows/self-push.yml index 9025999cfc4ab7..3321fcb6b2b5bf 100644 --- a/.github/workflows/self-push.yml +++ b/.github/workflows/self-push.yml @@ -20,37 +20,32 @@ env: OMP_NUM_THREADS: 8 MKL_NUM_THREADS: 8 PYTEST_TIMEOUT: 60 + TF_FORCE_GPU_ALLOW_GROWTH: true + RUN_PT_TF_CROSS_TESTS: 1 jobs: - run_tests_torch_gpu: - runs-on: [self-hosted, docker-gpu, single-gpu] - container: - image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime - options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + setup: + name: Setup + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.set-matrix.outputs.matrix }} + test_map: ${{ steps.set-matrix.outputs.test_map }} steps: - - name: Install dependencies - run: | - apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git - apt install -y libsndfile1-dev espeak-ng - pip install --upgrade pip - pip install .[sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm] - pip install https://github.com/kpu/kenlm/archive/master.zip - - - name: Launcher docker + - name: Checkout transformers uses: actions/checkout@v2 with: fetch-depth: 2 - - name: NVIDIA-SMI + - name: Cleanup run: | - nvidia-smi - - - name: Are GPUs recognized by our DL frameworks - run: | - utils/print_env_pt.py + rm -rf tests/__pycache__ + rm -rf tests/models/__pycache__ + rm -rf reports - name: Fetch the tests to run + # TODO: add `git-python` in the docker images run: | + pip install --upgrade git-python python utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt - name: Report fetched tests @@ -59,319 +54,158 @@ jobs: name: test_fetched path: test_preparation.txt - - name: Run all non-slow tests on GPU + - id: set-matrix + name: Organize tests into models + # The `keys` is used as GitHub actions matrix for jobs, i.e. `models/bert`, `tokenization`, `pipeline`, etc. + # The `test_map` is used to get the actual identified test files under each key. 
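+        # For example, the key `models/bert` maps to identified test files such as `tests/models/bert/test_modeling_bert.py`.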
+ # If no test to run (so no `test_map.json` file), create a dummy map (empty matrix will fail) run: | - if [ -f test_list.txt ]; then - python -m pytest -n 2 --dist=loadfile -v --make-reports=tests_torch_gpu $(cat test_list.txt) + if [ -f test_map.json ]; then + keys=$(python3 -c 'import json; fp = open("test_map.json"); test_map = json.load(fp); fp.close(); d = list(test_map.keys()); print(d)') + test_map=$(python3 -c 'import json; fp = open("test_map.json"); test_map = json.load(fp); fp.close(); print(test_map)') + else + keys=$(python3 -c 'keys = ["dummy"]; print(keys)') + test_map=$(python3 -c 'test_map = {"dummy": []}; print(test_map)') fi + echo $keys + echo $test_map + echo "::set-output name=matrix::$keys" + echo "::set-output name=test_map::$test_map" + + run_tests_single_gpu: + name: Model Tests on single GPU + needs: setup + # `dummy` means there is no test to run + if: contains(fromJson(needs.setup.outputs.matrix), 'dummy') != true + strategy: + fail-fast: false + matrix: + folders: ${{ fromJson(needs.setup.outputs.matrix) }} + machines: [single-gpu] + runs-on: [self-hosted, docker-gpu, '${{ matrix.machines }}'] + container: + image: huggingface/transformers-all-latest-gpu + options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + steps: + - name: NVIDIA-SMI + run: | + nvidia-smi + + - name: Are GPUs recognized by our DL frameworks + working-directory: /transformers + run: | + utils/print_env_pt.py + TF_CPP_MIN_LOG_LEVEL=3 python3 -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))" + TF_CPP_MIN_LOG_LEVEL=3 python3 -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))" + + - name: Echo folder ${{ matrix.folders }} + shell: bash + # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to + # set the artifact folder names (because the character `/` is not allowed). 
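+        # e.g. `models/bert` becomes `models_bert`; the bash parameter expansion below does the replacement.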
+ run: | + echo "${{ matrix.folders }}" + echo "${{ fromJson(needs.setup.outputs.test_map)[matrix.folders] }}" + matrix_folders=${{ matrix.folders }} + matrix_folders=${matrix_folders/'models/'/'models_'} + echo "$matrix_folders" + echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV + + - name: Update clone + working-directory: /transformers + run: git fetch && git checkout ${{ github.sha }} + + - name: Run all non-slow selected tests on GPU + working-directory: /transformers + run: | + python3 -m pytest -n 2 --dist=loadfile -v --make-reports=${{ matrix.machines }}_tests_gpu_${{ matrix.folders }} ${{ fromJson(needs.setup.outputs.test_map)[matrix.folders] }} - name: Failure short reports if: ${{ failure() }} - run: cat reports/tests_torch_gpu/failures_short.txt + continue-on-error: true + run: cat /transformers/reports/${{ matrix.machines }}_tests_gpu_${{ matrix.folders }}/failures_short.txt - name: Test suite reports artifacts if: ${{ always() }} uses: actions/upload-artifact@v2 with: - name: run_all_tests_torch_gpu_test_reports - path: reports - -# run_tests_flax_gpu: -# runs-on: [self-hosted, docker-gpu-test, single-gpu] -# container: -# image: tensorflow/tensorflow:2.4.1-gpu -# options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ -# steps: -# - name: Set up Python 3.7 -# uses: actions/setup-python@v2 -# with: -# python-version: 3.7 -# -# - name: Install dependencies -# run: | -# apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git espeak-ng -# pip install --upgrade "jax[cuda111]" -f https://storage.googleapis.com/jax-releases/jax_releases.html -# pip install --upgrade pip -# pip install .[sklearn,testing,sentencepiece,flax,flax-speech,vision] -# -# - name: Launcher docker -# uses: actions/checkout@v2 -# with: -# fetch-depth: 2 -# -# - name: NVIDIA-SMI -# continue-on-error: true -# run: | -# nvidia-smi -# -# - name: Are GPUs recognized by our DL frameworks -# run: | -# python -c "from jax.lib import xla_bridge; print('GPU available:', xla_bridge.get_backend().platform)" -# python -c "import jax; print('Number of GPUs available:', len(jax.local_devices()))" -# -# - name: Fetch the tests to run -# run: | -# python utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt -# -# - name: Report fetched tests -# uses: actions/upload-artifact@v2 -# with: -# name: test_fetched -# path: test_preparation.txt -# -# - name: Run all non-slow tests on GPU -# run: | -# if [ -f test_list.txt ]; then -# python -m pytest -n 2 --dist=loadfile -v --make-reports=tests_flax_gpu $(cat test_list.txt) -# fi -# -# - name: Failure short reports -# if: ${{ failure() }} -# run: cat reports/tests_flax_gpu/failures_short.txt -# -# - name: Test suite reports artifacts -# if: ${{ always() }} -# uses: actions/upload-artifact@v2 -# with: -# name: run_all_tests_flax_gpu_test_reports -# path: reports -# -# run_tests_tf_gpu: -# runs-on: [self-hosted, docker-gpu, single-gpu] -# timeout-minutes: 120 -# container: -# image: tensorflow/tensorflow:2.4.1-gpu -# options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ -# steps: -# - name: Install dependencies -# run: | -# apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git espeak-ng -# pip install --upgrade pip -# pip install .[sklearn,testing,onnxruntime,sentencepiece,tf-speech] -# pip 
install https://github.com/kpu/kenlm/archive/master.zip -# -# - name: Launcher docker -# uses: actions/checkout@v2 -# with: -# fetch-depth: 2 -# -# - name: NVIDIA-SMI -# run: | -# nvidia-smi -# -# - name: Are GPUs recognized by our DL frameworks -# run: | -# TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))" -# TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))" -# -# - name: Fetch the tests to run -# run: | -# python utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt -# -# - name: Report fetched tests -# uses: actions/upload-artifact@v2 -# with: -# name: test_fetched -# path: test_preparation.txt -# -# - name: Run all non-slow tests on GPU -# env: -# TF_NUM_INTRAOP_THREADS: 8 -# TF_NUM_INTEROP_THREADS: 1 -# run: | -# if [ -f test_list.txt ]; then -# python -m pytest -n 1 --dist=loadfile --make-reports=tests_tf_gpu $(cat test_list.txt) -# fi -# -# - name: Failure short reports -# if: ${{ failure() }} -# run: cat reports/tests_tf_gpu/failures_short.txt -# -# - name: Test suite reports artifacts -# if: ${{ always() }} -# uses: actions/upload-artifact@v2 -# with: -# name: run_all_tests_tf_gpu_test_reports -# path: reports - - - run_tests_torch_multi_gpu: - runs-on: [self-hosted, docker-gpu, multi-gpu] + name: ${{ matrix.machines }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports + path: /transformers/reports/${{ matrix.machines }}_tests_gpu_${{ matrix.folders }} + + run_tests_multi_gpu: + name: Model Tests on multi GPUs + needs: setup + # `dummy` means there is no test to run + if: contains(fromJson(needs.setup.outputs.matrix), 'dummy') != true + strategy: + fail-fast: false + matrix: + folders: ${{ fromJson(needs.setup.outputs.matrix) }} + machines: [multi-gpu] + runs-on: [self-hosted, docker-gpu, '${{ matrix.machines }}'] container: - image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime + image: huggingface/transformers-all-latest-gpu options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ steps: - - name: Install dependencies - run: | - apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git espeak-ng - apt install -y libsndfile1-dev espeak-ng - pip install --upgrade pip - pip install .[sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm] - pip install https://github.com/kpu/kenlm/archive/master.zip - - name: Launcher docker - uses: actions/checkout@v2 - with: - fetch-depth: 2 - - name: NVIDIA-SMI - continue-on-error: true run: | nvidia-smi - name: Are GPUs recognized by our DL frameworks + working-directory: /transformers run: | utils/print_env_pt.py + TF_CPP_MIN_LOG_LEVEL=3 python3 -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))" + TF_CPP_MIN_LOG_LEVEL=3 python3 -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))" - - name: Fetch the tests to run + - name: Echo folder ${{ matrix.folders }} + shell: bash + # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to + # set the artifact folder names (because the character `/` is not allowed). 
run: | - python utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt - - - name: Report fetched tests - uses: actions/upload-artifact@v2 - with: - name: test_fetched - path: test_preparation.txt - - - name: Run all non-slow tests on GPU + echo "${{ matrix.folders }}" + echo "${{ fromJson(needs.setup.outputs.test_map)[matrix.folders] }}" + matrix_folders=${{ matrix.folders }} + matrix_folders=${matrix_folders/'models/'/'models_'} + echo "$matrix_folders" + echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV + + - name: Update clone + working-directory: /transformers + run: git fetch && git checkout ${{ github.sha }} + + - name: Run all non-slow selected tests on GPU env: MKL_SERVICE_FORCE_INTEL: 1 + working-directory: /transformers run: | - if [ -f test_list.txt ]; then - python -m pytest -n 2 --dist=loadfile -v --make-reports=tests_torch_multi_gpu $(cat test_list.txt) - fi + python3 -m pytest -n 2 --dist=loadfile -v --make-reports=${{ matrix.machines }}_tests_gpu_${{ matrix.folders }} ${{ fromJson(needs.setup.outputs.test_map)[matrix.folders] }} - name: Failure short reports if: ${{ failure() }} - run: cat reports/tests_torch_multi_gpu/failures_short.txt + continue-on-error: true + run: cat /transformers/reports/${{ matrix.machines }}_tests_gpu_${{ matrix.folders }}/failures_short.txt - name: Test suite reports artifacts if: ${{ always() }} uses: actions/upload-artifact@v2 with: - name: run_all_tests_torch_multi_gpu_test_reports - path: reports - -# run_tests_flax_multi_gpu: -# runs-on: [self-hosted, docker-gpu, multi-gpu] -# container: -# image: tensorflow/tensorflow:2.4.1-gpu -# options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ -# steps: -# - name: Install dependencies -# run: | -# apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git espeak-ng -# pip install --upgrade "jax[cuda111]" -f https://storage.googleapis.com/jax-releases/jax_releases.html -# pip install --upgrade pip -# pip install .[sklearn,testing,sentencepiece,flax,flax-speech,vision] -# pip install https://github.com/kpu/kenlm/archive/master.zip -# -# - name: Launcher docker -# uses: actions/checkout@v2 -# with: -# fetch-depth: 2 -# -# - name: NVIDIA-SMI -# continue-on-error: true -# run: | -# nvidia-smi -# -# - name: Are GPUs recognized by our DL frameworks -# run: | -# python -c "from jax.lib import xla_bridge; print('GPU available:', xla_bridge.get_backend().platform)" -# python -c "import jax; print('Number of GPUs available:', len(jax.local_devices()))" -# -# - name: Fetch the tests to run -# run: | -# python utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt -# -# - name: Report fetched tests -# uses: actions/upload-artifact@v2 -# with: -# name: test_fetched -# path: test_preparation.txt -# -# - name: Run all non-slow tests on GPU -# run: | -# if [ -f test_list.txt ]; then -# python -m pytest -n 2 --dist=loadfile -v --make-reports=tests_flax_multi_gpu $(cat test_list.txt) -# fi -# -# - name: Failure short reports -# if: ${{ failure() }} -# run: cat reports/tests_flax_multi_gpu/failures_short.txt -# -# - name: Test suite reports artifacts -# if: ${{ always() }} -# uses: actions/upload-artifact@v2 -# with: -# name: run_all_tests_flax_multi_gpu_test_reports -# path: reports - -# run_tests_tf_multi_gpu: -# runs-on: [self-hosted, docker-gpu, multi-gpu] -# timeout-minutes: 120 -# container: -# image: tensorflow/tensorflow:2.4.1-gpu -# 
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ -# steps: -# - name: Install dependencies -# run: | -# apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git espeak-ng -# pip install --upgrade pip -# pip install .[sklearn,testing,onnxruntime,sentencepiece,tf-speech] -# pip install https://github.com/kpu/kenlm/archive/master.zip -# -# - name: Launcher docker -# uses: actions/checkout@v2 -# with: -# fetch-depth: 2 -# -# - name: NVIDIA-SMI -# run: | -# nvidia-smi -# -# - name: Are GPUs recognized by our DL frameworks -# run: | -# TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))" -# TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))" -# -# - name: Fetch the tests to run -# run: | -# python utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt -# -# - name: Report fetched tests -# uses: actions/upload-artifact@v2 -# with: -# name: test_fetched -# path: test_preparation.txt -# -# - name: Run all non-slow tests on GPU -# env: -# TF_NUM_INTRAOP_THREADS: 8 -# TF_NUM_INTEROP_THREADS: 1 -# run: | -# if [ -f test_list.txt ]; then -# python -m pytest -n 1 --dist=loadfile --make-reports=tests_tf_multi_gpu $(cat test_list.txt) -# fi -# -# - name: Failure short reports -# if: ${{ failure() }} -# run: cat reports/tests_tf_multi_gpu/failures_short.txt -# -# - name: Test suite reports artifacts -# if: ${{ always() }} -# uses: actions/upload-artifact@v2 -# with: -# name: run_all_tests_tf_multi_gpu_test_reports -# path: reports - - run_tests_torch_cuda_extensions_gpu: - runs-on: [self-hosted, docker-gpu, single-gpu] + name: ${{ matrix.machines }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports + path: /transformers/reports/${{ matrix.machines }}_tests_gpu_${{ matrix.folders }} + + run_tests_torch_cuda_extensions_single_gpu: + name: Torch CUDA extension tests on single GPU + needs: setup + if: contains(fromJson(needs.setup.outputs.matrix), 'deepspeed') || contains(fromJson(needs.setup.outputs.matrix), 'extended') + strategy: + fail-fast: false + matrix: + machines: [single-gpu] + runs-on: [self-hosted, docker-gpu, '${{ matrix.machines }}'] container: image: nvcr.io/nvidia/pytorch:21.03-py3 options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ steps: - - name: Launcher docker + - name: Checkout transformers uses: actions/checkout@v2 with: fetch-depth: 2 @@ -390,46 +224,42 @@ jobs: run: | utils/print_env_pt.py - - name: Fetch the tests to run + - name: Run all non-slow selected tests on GPU + # TODO: Here we pass all tests in the 2 folders for simplicity. It's better to pass only the identified tests. 
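+        # This job only runs when the `setup` matrix contains a `deepspeed` or `extended` key (see the job-level `if:` above).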
run: | - python utils/tests_fetcher.py --diff_with_last_commit --filters tests/deepspeed tests/extended | tee test_preparation.txt - - - name: Report fetched tests - uses: actions/upload-artifact@v2 - with: - name: test_fetched - path: test_preparation.txt - - - name: Run all tests on GPU - run: | - if [ -f test_list.txt ]; then - python -m pytest -n 1 --dist=loadfile -v --make-reports=tests_torch_cuda_extensions_gpu $(cat test_list.txt) - fi + python -m pytest -n 1 --dist=loadfile -v --make-reports=${{ matrix.machines }}_tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended - name: Failure short reports if: ${{ failure() }} - run: cat reports/tests_torch_cuda_extensions_gpu/failures_short.txt + continue-on-error: true + run: cat reports/${{ matrix.machines }}_tests_torch_cuda_extensions_gpu/failures_short.txt - name: Test suite reports artifacts if: ${{ always() }} uses: actions/upload-artifact@v2 with: - name: run_tests_torch_cuda_extensions_gpu_test_reports - path: reports + name: ${{ matrix.machines }}_run_tests_torch_cuda_extensions_gpu_test_reports + path: reports/${{ matrix.machines }}_tests_torch_cuda_extensions_gpu run_tests_torch_cuda_extensions_multi_gpu: - runs-on: [self-hosted, docker-gpu, multi-gpu] + name: Torch CUDA extension tests on multi GPUs + needs: setup + if: contains(fromJson(needs.setup.outputs.matrix), 'deepspeed') || contains(fromJson(needs.setup.outputs.matrix), 'extended') + strategy: + fail-fast: false + matrix: + machines: [multi-gpu] + runs-on: [self-hosted, docker-gpu, '${{ matrix.machines }}'] container: image: nvcr.io/nvidia/pytorch:21.03-py3 - options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ steps: - - name: Launcher docker + - name: Checkout transformers uses: actions/checkout@v2 with: fetch-depth: 2 - name: NVIDIA-SMI - continue-on-error: true run: | nvidia-smi @@ -444,56 +274,46 @@ jobs: run: | utils/print_env_pt.py - - name: Fetch the tests to run - run: | - python utils/tests_fetcher.py --diff_with_last_commit --filters tests/deepspeed tests/extended | tee test_preparation.txt - - - name: Report fetched tests - uses: actions/upload-artifact@v2 - with: - name: test_fetched - path: test_preparation.txt - - - name: Run all tests on GPU + - name: Run all non-slow selected tests on GPU + # TODO: Here we pass all tests in the 2 folders for simplicity. It's better to pass only the identified tests. 
run: | - if [ -f test_list.txt ]; then - python -m pytest -n 1 --dist=loadfile -v --make-reports=tests_torch_cuda_extensions_multi_gpu $(cat test_list.txt) - fi + python -m pytest -n 1 --dist=loadfile -v --make-reports=${{ matrix.machines }}_tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended - name: Failure short reports if: ${{ failure() }} - run: cat reports/tests_torch_cuda_extensions_multi_gpu/failures_short.txt + continue-on-error: true + run: cat reports/${{ matrix.machines }}_tests_torch_cuda_extensions_gpu/failures_short.txt - name: Test suite reports artifacts if: ${{ always() }} uses: actions/upload-artifact@v2 with: - name: run_tests_torch_cuda_extensions_multi_gpu_test_reports - path: reports - + name: ${{ matrix.machines }}_run_tests_torch_cuda_extensions_gpu_test_reports + path: reports/${{ matrix.machines }}_tests_torch_cuda_extensions_gpu send_results: name: Send results to webhook runs-on: ubuntu-latest if: always() needs: [ - run_tests_torch_gpu, -# run_tests_tf_gpu, - run_tests_torch_multi_gpu, -# run_tests_tf_multi_gpu, - run_tests_torch_cuda_extensions_gpu, + setup, + run_tests_single_gpu, + run_tests_multi_gpu, + run_tests_torch_cuda_extensions_single_gpu, run_tests_torch_cuda_extensions_multi_gpu ] steps: - uses: actions/checkout@v2 - - uses: actions/download-artifact@v2 - - name: Send message to Slack env: CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }} CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }} - + CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }} + CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }} + CI_EVENT: push + # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change + # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`. run: | pip install slack_sdk - python utils/notification_service_deprecated.py push + python utils/notification_service.py "${{ needs.setup.outputs.matrix }}" diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml index fd68b9cabc4337..62469f8e83331d 100644 --- a/.github/workflows/self-scheduled.yml +++ b/.github/workflows/self-scheduled.yml @@ -83,30 +83,38 @@ jobs: run: | echo "${{ matrix.folders }}" matrix_folders=${{ matrix.folders }} - echo "$matrix_folders" matrix_folders=${matrix_folders/'models/'/'models_'} echo "$matrix_folders" echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV + # Set machine type, i.e. `single-gpu` or `multi-gpu`. Here we just remove `-docker`. 
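+      # e.g. `single-gpu-docker` becomes `single-gpu`, so report and artifact names match what `notification_service.py` now expects.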
+ - name: Set machine type from ${{ matrix.machines }} + shell: bash + run: | + machine_type=${{ matrix.machines }} + machine_type=${machine_type/'-docker'/''} + echo "machine_type=$machine_type" + echo "machine_type=$machine_type" >> $GITHUB_ENV + - name: Update clone working-directory: /transformers run: git fetch && git checkout ${{ github.sha }} - name: Run all tests on GPU working-directory: /transformers - run: python3 -m pytest -v --make-reports=${{ matrix.machines }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }} + run: python3 -m pytest -v --make-reports=${{ env.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }} - name: Failure short reports if: ${{ failure() }} continue-on-error: true - run: cat /transformers/reports/${{ matrix.machines }}_tests_gpu_${{ matrix.folders }}/failures_short.txt + run: cat /transformers/reports/${{ env.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt - name: Test suite reports artifacts if: ${{ always() }} uses: actions/upload-artifact@v2 with: - name: ${{ matrix.machines }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports - path: /transformers/reports/${{ matrix.machines }}_tests_gpu_${{ matrix.folders }} + name: ${{ env.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports + path: /transformers/reports/${{ env.machine_type }}_tests_gpu_${{ matrix.folders }} run_tests_multi_gpu: name: Model tests @@ -128,30 +136,38 @@ jobs: run: | echo "${{ matrix.folders }}" matrix_folders=${{ matrix.folders }} - echo "$matrix_folders" matrix_folders=${matrix_folders/'models/'/'models_'} echo "$matrix_folders" echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV + # Set machine type, i.e. `single-gpu` or `multi-gpu`. Here we just remove `-docker`. + - name: Set machine type from ${{ matrix.machines }} + shell: bash + run: | + machine_type=${{ matrix.machines }} + machine_type=${machine_type/'-docker'/''} + echo "machine_type=$machine_type" + echo "machine_type=$machine_type" >> $GITHUB_ENV + - name: Update clone working-directory: /transformers run: git fetch && git checkout ${{ github.sha }} - name: Run all tests on GPU working-directory: /transformers - run: python3 -m pytest -v --make-reports=${{ matrix.machines }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }} + run: python3 -m pytest -v --make-reports=${{ env.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }} - name: Failure short reports if: ${{ failure() }} continue-on-error: true - run: cat /transformers/reports/${{ matrix.machines }}_tests_gpu_${{ matrix.folders }}/failures_short.txt + run: cat /transformers/reports/${{ env.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt - name: Test suite reports artifacts if: ${{ always() }} uses: actions/upload-artifact@v2 with: - name: ${{ matrix.machines }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports - path: /transformers/reports/${{ matrix.machines }}_tests_gpu_${{ matrix.folders }} + name: ${{ env.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports + path: /transformers/reports/${{ env.machine_type }}_tests_gpu_${{ matrix.folders }} run_examples_gpu: name: Examples directory @@ -195,6 +211,15 @@ jobs: options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ needs: setup steps: + # Set machine type, i.e. `single-gpu` or `multi-gpu`. Here we just remove `-docker`. 
+ - name: Set machine type from ${{ matrix.machines }} + shell: bash + run: | + machine_type=${{ matrix.machines }} + machine_type=${machine_type/'-docker'/''} + echo "machine_type=$machine_type" + echo "machine_type=$machine_type" >> $GITHUB_ENV + - name: Update clone working-directory: /transformers run: git fetch && git checkout ${{ github.sha }} @@ -204,19 +229,19 @@ jobs: env: RUN_PIPELINE_TESTS: yes run: | - python3 -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=${{ matrix.machines }}_tests_torch_pipeline_gpu tests + python3 -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=${{ env.machine_type }}_tests_torch_pipeline_gpu tests - name: Failure short reports if: ${{ failure() }} continue-on-error: true - run: cat /transformers/reports/${{ matrix.machines }}_tests_torch_pipeline_gpu/failures_short.txt + run: cat /transformers/reports/${{ env.machine_type }}_tests_torch_pipeline_gpu/failures_short.txt - name: Test suite reports artifacts if: ${{ always() }} uses: actions/upload-artifact@v2 with: - name: ${{ matrix.machines }}_run_tests_torch_pipeline_gpu - path: /transformers/reports/${{ matrix.machines }}_tests_torch_pipeline_gpu + name: ${{ env.machine_type }}_run_tests_torch_pipeline_gpu + path: /transformers/reports/${{ env.machine_type }}_tests_torch_pipeline_gpu run_pipelines_tf_gpu: name: TensorFlow pipelines @@ -230,6 +255,15 @@ jobs: options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ needs: setup steps: + # Set machine type, i.e. `single-gpu` or `multi-gpu`. Here we just remove `-docker`. + - name: Set machine type from ${{ matrix.machines }} + shell: bash + run: | + machine_type=${{ matrix.machines }} + machine_type=${machine_type/'-docker'/''} + echo "machine_type=$machine_type" + echo "machine_type=$machine_type" >> $GITHUB_ENV + - name: Update clone working-directory: /transformers run: | @@ -240,19 +274,19 @@ jobs: env: RUN_PIPELINE_TESTS: yes run: | - python3 -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=${{ matrix.machines }}_tests_tf_pipeline_gpu tests + python3 -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=${{ env.machine_type }}_tests_tf_pipeline_gpu tests - name: Failure short reports if: ${{ always() }} run: | - cat /transformers/reports/${{ matrix.machines }}_tests_tf_pipeline_gpu/failures_short.txt + cat /transformers/reports/${{ env.machine_type }}_tests_tf_pipeline_gpu/failures_short.txt - name: Test suite reports artifacts if: ${{ always() }} uses: actions/upload-artifact@v2 with: - name: ${{ matrix.machines }}_run_tests_tf_pipeline_gpu - path: /transformers/reports/${{ matrix.machines }}_tests_tf_pipeline_gpu + name: ${{ env.machine_type }}_run_tests_tf_pipeline_gpu + path: /transformers/reports/${{ env.machine_type }}_tests_tf_pipeline_gpu run_all_tests_torch_cuda_extensions_gpu: name: Torch CUDA extension tests @@ -266,6 +300,15 @@ jobs: image: huggingface/transformers-pytorch-deepspeed-latest-gpu options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ steps: + # Set machine type, i.e. `single-gpu` or `multi-gpu`. Here we just remove `-docker`. 
+ - name: Set machine type from ${{ matrix.machines }} + shell: bash + run: | + machine_type=${{ matrix.machines }} + machine_type=${machine_type/'-docker'/''} + echo "machine_type=$machine_type" + echo "machine_type=$machine_type" >> $GITHUB_ENV + - name: Update clone working-directory: /workspace/transformers run: git fetch && git checkout ${{ github.sha }} @@ -281,19 +324,19 @@ jobs: - name: Run all tests on GPU working-directory: /workspace/transformers run: | - python -m pytest -v --make-reports=${{ matrix.machines }}_tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended + python -m pytest -v --make-reports=${{ env.machine_type }}_tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended - name: Failure short reports if: ${{ failure() }} continue-on-error: true - run: cat /workspace/transformers/reports/${{ matrix.machines }}_tests_torch_cuda_extensions_gpu/failures_short.txt + run: cat /workspace/transformers/reports/${{ env.machine_type }}_tests_torch_cuda_extensions_gpu/failures_short.txt - name: Test suite reports artifacts if: ${{ always() }} uses: actions/upload-artifact@v2 with: - name: ${{ matrix.machines }}_run_tests_torch_cuda_extensions_gpu_test_reports - path: /workspace/transformers/reports/${{ matrix.machines }}_tests_torch_cuda_extensions_gpu + name: ${{ env.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports + path: /workspace/transformers/reports/${{ env.machine_type }}_tests_torch_cuda_extensions_gpu send_results: @@ -310,6 +353,7 @@ jobs: CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }} CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }} CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }} + CI_EVENT: scheduled # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`. run: | diff --git a/tests/extended/test_trainer_ext.py b/tests/extended/test_trainer_ext.py index c555e0381e25f2..64c244ae8ed2ee 100644 --- a/tests/extended/test_trainer_ext.py +++ b/tests/extended/test_trainer_ext.py @@ -105,6 +105,7 @@ def test_run_seq2seq_ddp(self): self.run_seq2seq_quick(distributed=True) # test --sharded_ddp w/o --fp16 + @unittest.skip("Requires an update of the env running those tests") @require_torch_multi_gpu @require_fairscale def test_run_seq2seq_sharded_ddp(self): @@ -118,6 +119,7 @@ def test_run_seq2seq_sharded_ddp_fp16(self): self.run_seq2seq_quick(distributed=True, extra_args_str="--sharded_ddp simple --fp16") # test --sharded_ddp zero_dp_2 w/o --fp16 + @unittest.skip("Requires an update of the env running those tests") @require_torch_multi_gpu @require_fairscale def test_run_seq2seq_fully_sharded_ddp(self): diff --git a/utils/notification_service.py b/utils/notification_service.py index 8a50c745ef2f69..628cc76048071f 100644 --- a/utils/notification_service.py +++ b/utils/notification_service.py @@ -497,7 +497,7 @@ def retrieve_artifact(name: str, gpu: Optional[str]): raise ValueError(f"Invalid GPU for artifact. 
Passed GPU: `{gpu}`.") if gpu is not None: - name = f"{gpu}-gpu-docker_{name}" + name = f"{gpu}-gpu_{name}" _artifact = {} @@ -531,8 +531,8 @@ def add_path(self, path: str, gpu: str = None): directories = filter(os.path.isdir, os.listdir()) for directory in directories: - if directory.startswith("single-gpu-docker"): - artifact_name = directory[len("single-gpu-docker") + 1 :] + if directory.startswith("single-gpu"): + artifact_name = directory[len("single-gpu") + 1 :] if artifact_name in _available_artifacts: _available_artifacts[artifact_name].single_gpu = True @@ -541,8 +541,8 @@ def add_path(self, path: str, gpu: str = None): _available_artifacts[artifact_name].add_path(directory, gpu="single") - elif directory.startswith("multi-gpu-docker"): - artifact_name = directory[len("multi-gpu-docker") + 1 :] + elif directory.startswith("multi-gpu"): + artifact_name = directory[len("multi-gpu") + 1 :] if artifact_name in _available_artifacts: _available_artifacts[artifact_name].multi_gpu = True @@ -561,6 +561,10 @@ def add_path(self, path: str, gpu: str = None): if __name__ == "__main__": + + # This env. variable is set in workflow file (under the job `send_results`). + ci_event = os.environ["CI_EVENT"] + arguments = sys.argv[1:][0] try: models = ast.literal_eval(arguments) @@ -609,7 +613,7 @@ def add_path(self, path: str, gpu: str = None): if "stats" in artifact: # Link to the GitHub Action job model_results[model]["job_link"] = github_actions_job_links.get( - f"Model tests ({model}, {artifact_path['gpu']}-gpu-docker)" + f"Model tests ({model}, {artifact_path['gpu']}-gpu)" ) failed, success, time_spent = handle_test_results(artifact["stats"]) @@ -667,6 +671,11 @@ def add_path(self, path: str, gpu: str = None): "Torch CUDA extension tests": "run_tests_torch_cuda_extensions_gpu_test_reports", } + if ci_event == "push": + del additional_files["Examples directory"] + del additional_files["PyTorch pipelines"] + del additional_files["TensorFlow pipelines"] + additional_results = { key: { "failed": {"unclassified": 0, "single": 0, "multi": 0}, @@ -689,7 +698,7 @@ def add_path(self, path: str, gpu: str = None): for artifact_path in available_artifacts[additional_files[key]].paths: if artifact_path["gpu"] is not None: additional_results[key]["job_link"] = github_actions_job_links.get( - f"{key} ({artifact_path['gpu']}-gpu-docker)" + f"{key} ({artifact_path['gpu']}-gpu)" ) artifact = retrieve_artifact(artifact_path["name"], artifact_path["gpu"]) stacktraces = handle_stacktraces(artifact["failures_line"]) @@ -715,7 +724,7 @@ def add_path(self, path: str, gpu: str = None): artifact_path["gpu"] ] += f"*{line}*\n_{stacktraces.pop(0)}_\n\n" - message = Message("🤗 Results of the scheduled tests.", model_results, additional_results) + message = Message(f"🤗 Results of the {ci_event} tests.", model_results, additional_results) message.post() message.post_reply() diff --git a/utils/tests_fetcher.py b/utils/tests_fetcher.py index 1eda2be47f5725..cdc79f371163ca 100644 --- a/utils/tests_fetcher.py +++ b/utils/tests_fetcher.py @@ -15,6 +15,7 @@ import argparse import collections +import json import os import re from contextlib import contextmanager @@ -65,6 +66,32 @@ def clean_code(content): return "\n".join(lines_to_keep) +def get_all_tests(): + """ + Return a list of paths to all test folders and files under `tests`. All paths are rooted at `tests`. + + - folders under `tests`: `tokenization`, `pipelines`, etc. The folder `models` is excluded. + - folders under `tests/models`: `bert`, `gpt2`, etc. 
+ - test files under `tests`: `test_modeling_common.py`, `test_tokenization_common.py`, etc. + """ + test_root_dir = os.path.join(PATH_TO_TRANFORMERS, "tests") + + # test folders/files directly under `tests` folder + tests = os.listdir(test_root_dir) + tests = sorted( + list(filter(lambda x: os.path.isdir(x) or x.startswith("tests/test_"), [f"tests/{x}" for x in tests])) + ) + + # model specific test folders + model_tests_folders = os.listdir(os.path.join(test_root_dir, "models")) + model_test_folders = sorted(list(filter(os.path.isdir, [f"tests/models/{x}" for x in model_tests_folders]))) + + tests.remove("tests/models") + tests = model_test_folders + tests + + return tests + + def diff_is_docstring_only(repo, branching_point, filename): """ Check if the diff is only in docstrings in a filename. @@ -441,7 +468,7 @@ def sanity_check(): ) -def infer_tests_to_run(output_file, diff_with_last_commit=False, filters=None): +def infer_tests_to_run(output_file, diff_with_last_commit=False, filters=None, json_output_file=None): modified_files = get_modified_python_files(diff_with_last_commit=diff_with_last_commit) print(f"\n### MODIFIED FILES ###\n{_print_list(modified_files)}") @@ -495,6 +522,42 @@ def infer_tests_to_run(output_file, diff_with_last_commit=False, filters=None): with open(output_file, "w", encoding="utf-8") as f: f.write(" ".join(test_files_to_run)) + # Create a map that maps test categories to test files, i.e. `models/bert` -> [...test_modeling_bert.py, ...] + + # Get all test directories (and some common test files) under `tests` and `tests/models` if `test_files_to_run` + # contains `tests` (i.e. when `setup.py` is changed). + if "tests" in test_files_to_run: + test_files_to_run = get_all_tests() + + if json_output_file is not None: + test_map = {} + for test_file in test_files_to_run: + # `test_file` is a path to a test folder/file, starting with `tests/`. For example, + # - `tests/models/bert/test_modeling_bert.py` or `tests/models/bert` + # - `tests/trainer/test_trainer.py` or `tests/trainer` + # - `tests/test_modeling_common.py` + names = test_file.split(os.path.sep) + if names[1] == "models": + # take the part like `models/bert` for modeling tests + key = "/".join(names[1:3]) + elif len(names) > 2 or not test_file.endswith(".py"): + # test folders under `tests` or python files under them + # take the part like tokenization, `pipeline`, etc. 
for other test categories + key = "/".join(names[1:2]) + else: + # common test files directly under `tests/` + key = "common" + + if key not in test_map: + test_map[key] = [] + test_map[key].append(test_file) + + # sort the keys & values + keys = sorted(test_map.keys()) + test_map = {k: " ".join(sorted(test_map[k])) for k in keys} + with open(json_output_file, "w", encoding="UTF-8") as fp: + json.dump(test_map, fp, ensure_ascii=False) + if __name__ == "__main__": parser = argparse.ArgumentParser() @@ -504,6 +567,12 @@ def infer_tests_to_run(output_file, diff_with_last_commit=False, filters=None): parser.add_argument( "--output_file", type=str, default="test_list.txt", help="Where to store the list of tests to run" ) + parser.add_argument( + "--json_output_file", + type=str, + default="test_map.json", + help="Where to store the tests to run in a dictionary format mapping test categories to test files", + ) parser.add_argument( "--diff_with_last_commit", action="store_true", @@ -528,7 +597,12 @@ def infer_tests_to_run(output_file, diff_with_last_commit=False, filters=None): diff_with_last_commit = True try: - infer_tests_to_run(args.output_file, diff_with_last_commit=diff_with_last_commit, filters=args.filters) + infer_tests_to_run( + args.output_file, + diff_with_last_commit=diff_with_last_commit, + filters=args.filters, + json_output_file=args.json_output_file, + ) except Exception as e: print(f"\nError when trying to grab the relevant tests: {e}\n\nRunning all tests.") with open(args.output_file, "w", encoding="utf-8") as f:
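
To make the new flow concrete: `tests_fetcher.py` writes `test_map.json`, the `setup` job turns its keys into the jobs matrix, and each matrix job looks up its own test files under its key. Below is a minimal, runnable sketch of the key derivation, mirroring the logic added to `infer_tests_to_run` above. It assumes POSIX-style `/` separators (the patch itself splits on `os.path.sep`); the helper name `test_key` and the sample paths are illustrative, not part of the patch.

    import json

    def test_key(test_file: str) -> str:
        # Reduce a test path rooted at `tests/` to a matrix key.
        names = test_file.split("/")
        if names[1] == "models":
            # modeling tests: take `models/<model>`, e.g. `models/bert`
            return "/".join(names[1:3])
        elif len(names) > 2 or not test_file.endswith(".py"):
            # test folders under `tests` (or files inside them), e.g. `trainer`
            return names[1]
        # common test files directly under `tests/`
        return "common"

    test_map = {}
    for f in [
        "tests/models/bert/test_modeling_bert.py",
        "tests/trainer/test_trainer.py",
        "tests/test_modeling_common.py",
    ]:
        test_map.setdefault(test_key(f), []).append(f)

    # Same shape as `test_map.json`: sorted keys, space-joined file lists.
    print(json.dumps({k: " ".join(sorted(v)) for k, v in sorted(test_map.items())}))
    # -> {"common": "tests/test_modeling_common.py", "models/bert": ..., "trainer": ...}

Keying on folders such as `models/bert` lets the same map drive both the job matrix and the per-job pytest invocation.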