# Self-hosted runner (scheduled) #774
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Scheduled run of the test suites on self-hosted GPU runners.
name: Self-hosted runner (scheduled)

# `on` stays unquoted: GitHub's workflow loader expects the bare key.
on:
  # A push to any branch whose name matches this glob starts the workflow.
  push:
    branches:
      - multi_ci_*
  # No activity types listed, so no type filter is applied here.
  repository_dispatch:
  # Once a day at 00:00; GitHub evaluates `schedule` cron expressions in UTC.
  schedule:
    - cron: "0 0 * * *"

# Workflow-level environment, inherited by every step of every job.
# Every value that looks like a boolean or a number is quoted so that it is a
# string under any YAML loader (a bare `yes` is a boolean in YAML 1.1).
env:
  # Same path the jobs use as the container-side target of their `-v` mount.
  HF_HOME: /mnt/cache
  # NOTE(review): presumably read by the test suite to detect CI - confirm.
  TRANSFORMERS_IS_CI: "yes"
  # NOTE(review): presumably enables tests marked as slow - confirm.
  RUN_SLOW: "yes"
  OMP_NUM_THREADS: "16"
  MKL_NUM_THREADS: "16"
  # NOTE(review): presumably a per-test timeout in seconds - confirm.
  PYTEST_TIMEOUT: "600"

jobs:
# Job: run_all_tests_torch_gpu
  # Runs pytest over `tests`, over `examples`, and over the tests selected by
  # the `is_pipeline_test` marker inside a PyTorch container, prints the short
  # failure report of each run, and uploads the `reports` directory.
  run_all_tests_torch_gpu:
    runs-on: [self-hosted, docker-gpu, single-gpu]
    container:
      image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
      # Folded scalar: the lines below are joined with single spaces.
      # The -v flag mounts the host cache directory at /mnt/cache/.
      options: >-
        --gpus 0
        --shm-size "16gb"
        --ipc host
        -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: Launcher docker
        uses: actions/checkout@v2

      - name: NVIDIA-SMI
        run: nvidia-smi

      - name: Install dependencies
        run: |
          apt -y update && apt install -y libsndfile1-dev git
          pip install --upgrade pip
          pip install .[integrations,sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]

      - name: Are GPUs recognized by our DL frameworks
        run: |
          python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
          python -c "import torch; print('Cuda version:', torch.version.cuda)"
          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"

      - name: Run all tests on GPU
        run: python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_gpu tests

      # `always()` keeps this step running even when the test step failed.
      - name: Failure short reports
        if: always()
        run: cat reports/tests_torch_gpu_failures_short.txt

      # No step-level `env` here: OMP_NUM_THREADS, MKL_NUM_THREADS, RUN_SLOW,
      # HF_HOME and TRANSFORMERS_IS_CI are already set to the same values by
      # the workflow-level `env` block, so repeating them only invited drift.
      - name: Run examples tests on GPU
        if: always()
        run: |
          pip install -r examples/pytorch/_tests_requirements.txt
          python -m pytest -n 1 -v --dist=loadfile --make-reports=examples_torch_gpu examples

      - name: Failure short reports
        if: always()
        run: cat reports/examples_torch_gpu_failures_short.txt

      - name: Run all pipeline tests on GPU
        if: always()
        env:
          # Quoted so the value is the string "yes" under any YAML loader.
          RUN_PIPELINE_TESTS: "yes"
        run: |
          python -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_gpu tests

      - name: Failure short reports
        if: always()
        run: cat reports/tests_torch_pipeline_gpu_failures_short.txt

      # The `send_results` job downloads the uploaded artifacts.
      - name: Test suite reports artifacts
        if: always()
        uses: actions/upload-artifact@v2
        with:
          name: run_all_tests_torch_gpu_test_reports
          path: reports
# Job: run_all_tests_flax_gpu
  # Installs JAX with CUDA support on top of a TensorFlow GPU image, runs pytest
  # over `tests`, prints the short failure report and uploads `reports`.
  # Unlike the other jobs it targets runners labelled `docker-gpu-test`.
  run_all_tests_flax_gpu:
    runs-on: [self-hosted, docker-gpu-test, single-gpu]
    container:
      image: tensorflow/tensorflow:2.4.1-gpu
      # Folded scalar: the lines below are joined with single spaces.
      options: >-
        --gpus 0
        --shm-size "16gb"
        --ipc host
        -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: Launcher docker
        uses: actions/checkout@v2

      # A failing `nvidia-smi` does not fail the job.
      - name: NVIDIA-SMI
        continue-on-error: true
        run: nvidia-smi

      - name: Install dependencies
        run: |
          pip install --upgrade pip
          pip install --upgrade "jax[cuda111]" -f https://storage.googleapis.com/jax-releases/jax_releases.html
          pip install .[flax,integrations,sklearn,testing,sentencepiece,flax-speech,vision]

      - name: Are GPUs recognized by our DL frameworks
        run: |
          python -c "from jax.lib import xla_bridge; print('GPU available:', xla_bridge.get_backend().platform)"
          python -c "import jax; print('Number of GPUs available:', len(jax.local_devices()))"

      - name: Run all tests on GPU
        run: python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_flax_gpu tests

      # `always()` keeps this step running even when the test step failed.
      - name: Failure short reports
        if: always()
        run: cat reports/tests_flax_gpu_failures_short.txt

      - name: Test suite reports artifacts
        if: always()
        uses: actions/upload-artifact@v2
        with:
          name: run_all_tests_flax_gpu_test_reports
          path: reports
# Job: run_all_tests_tf_gpu
  # Runs pytest over `tests` and over the tests selected by the
  # `is_pipeline_test` marker inside a TensorFlow GPU container, prints the
  # short failure report of each run, and uploads the `reports` directory.
  run_all_tests_tf_gpu:
    runs-on: [self-hosted, docker-gpu, single-gpu]
    container:
      image: tensorflow/tensorflow:2.4.1-gpu
      # Folded scalar: the lines below are joined with single spaces.
      options: >-
        --gpus 0
        --shm-size "16gb"
        --ipc host
        -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: Launcher docker
        uses: actions/checkout@v2

      - name: NVIDIA-SMI
        run: nvidia-smi

      - name: Install dependencies
        run: |
          apt -y update && apt install -y libsndfile1-dev git
          pip install --upgrade pip
          pip install .[sklearn,testing,onnx,sentencepiece,tf-speech]

      # TF_CPP_MIN_LOG_LEVEL is set inline, so it applies to these two
      # commands only.
      - name: Are GPUs recognized by our DL frameworks
        run: |
          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"

      - name: Run all tests on GPU
        env:
          # Environment values are strings; quoting makes that explicit.
          TF_NUM_INTEROP_THREADS: "1"
          TF_NUM_INTRAOP_THREADS: "16"
        run: python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_tf_gpu tests

      # `always()` keeps this step running even when the test step failed.
      - name: Failure short reports
        if: always()
        run: cat reports/tests_tf_gpu_failures_short.txt

      - name: Run all pipeline tests on GPU
        if: always()
        env:
          # Quoted so the value is the string "yes" under any YAML loader.
          RUN_PIPELINE_TESTS: "yes"
          TF_NUM_INTEROP_THREADS: "1"
          TF_NUM_INTRAOP_THREADS: "16"
        run: |
          python -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=tests_tf_pipeline_gpu tests

      - name: Failure short reports
        if: always()
        run: cat reports/tests_tf_pipeline_gpu_failures_short.txt

      # The `send_results` job downloads the uploaded artifacts.
      - name: Test suite reports artifacts
        if: always()
        uses: actions/upload-artifact@v2
        with:
          name: run_all_tests_tf_gpu_test_reports
          path: reports
# Job: run_all_tests_torch_multi_gpu
  # Multi-GPU counterpart of the PyTorch test job: runs pytest over `tests`
  # and over the tests selected by the `is_pipeline_test` marker on a runner
  # labelled `multi-gpu`, with every GPU requested for the container.
  run_all_tests_torch_multi_gpu:
    runs-on: [self-hosted, docker-gpu, multi-gpu]
    container:
      image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
      # Folded scalar: the lines below are joined with single spaces.
      options: >-
        --gpus all
        --shm-size "16gb"
        --ipc host
        -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: Launcher docker
        uses: actions/checkout@v2

      # A failing `nvidia-smi` does not fail the job.
      - name: NVIDIA-SMI
        continue-on-error: true
        run: nvidia-smi

      - name: Install dependencies
        run: |
          apt -y update && apt install -y libsndfile1-dev git
          pip install --upgrade pip
          pip install .[integrations,sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]

      - name: Are GPUs recognized by our DL frameworks
        run: |
          python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
          python -c "import torch; print('Cuda version:', torch.version.cuda)"
          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"

      - name: Run all tests on GPU
        env:
          # Environment values are strings; quoting makes that explicit.
          MKL_SERVICE_FORCE_INTEL: "1"
        run: python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_multi_gpu tests

      # `always()` keeps this step running even when the test step failed.
      - name: Failure short reports
        if: always()
        run: cat reports/tests_torch_multi_gpu_failures_short.txt

      - name: Run all pipeline tests on GPU
        if: always()
        env:
          # Quoted so the value is the string "yes" under any YAML loader.
          RUN_PIPELINE_TESTS: "yes"
        run: |
          python -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_multi_gpu tests

      - name: Failure short reports
        if: always()
        run: cat reports/tests_torch_pipeline_multi_gpu_failures_short.txt

      # The `send_results` job downloads the uploaded artifacts.
      - name: Test suite reports artifacts
        if: always()
        uses: actions/upload-artifact@v2
        with:
          name: run_all_tests_torch_multi_gpu_test_reports
          path: reports
# Job: run_all_tests_tf_multi_gpu
  # Multi-GPU counterpart of the TensorFlow test job: runs pytest over `tests`
  # and over the tests selected by the `is_pipeline_test` marker on a runner
  # labelled `multi-gpu`, with every GPU requested for the container.
  run_all_tests_tf_multi_gpu:
    runs-on: [self-hosted, docker-gpu, multi-gpu]
    container:
      image: tensorflow/tensorflow:2.4.1-gpu
      # Folded scalar: the lines below are joined with single spaces.
      options: >-
        --gpus all
        --shm-size "16gb"
        --ipc host
        -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: Launcher docker
        uses: actions/checkout@v2

      # A failing `nvidia-smi` does not fail the job.
      - name: NVIDIA-SMI
        continue-on-error: true
        run: nvidia-smi

      - name: Install dependencies
        run: |
          apt -y update && apt install -y libsndfile1-dev git
          pip install --upgrade pip
          pip install .[sklearn,testing,onnx,sentencepiece,tf-speech]

      # TF_CPP_MIN_LOG_LEVEL is set inline, so it applies to these two
      # commands only.
      - name: Are GPUs recognized by our DL frameworks
        run: |
          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"

      - name: Run all tests on GPU
        env:
          # Environment values are strings; quoting makes that explicit.
          TF_NUM_INTEROP_THREADS: "1"
          TF_NUM_INTRAOP_THREADS: "16"
        run: python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_tf_multi_gpu tests

      # `always()` keeps this step running even when the test step failed.
      - name: Failure short reports
        if: always()
        run: cat reports/tests_tf_multi_gpu_failures_short.txt

      - name: Run all pipeline tests on GPU
        if: always()
        env:
          # Quoted so the value is the string "yes" under any YAML loader.
          RUN_PIPELINE_TESTS: "yes"
          TF_NUM_INTEROP_THREADS: "1"
          TF_NUM_INTRAOP_THREADS: "16"
        run: |
          python -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=tests_tf_pipeline_multi_gpu tests

      - name: Failure short reports
        if: always()
        run: cat reports/tests_tf_pipeline_multi_gpu_failures_short.txt

      # The `send_results` job downloads the uploaded artifacts.
      - name: Test suite reports artifacts
        if: always()
        uses: actions/upload-artifact@v2
        with:
          name: run_all_tests_tf_multi_gpu_test_reports
          path: reports
#   run_all_tests_flax_multi_gpu:
#     runs-on: [self-hosted, docker-gpu, multi-gpu]
#     container:
#       image: tensorflow/tensorflow:2.4.1-gpu
#       options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
#     steps:
#       - name: Launcher docker
#         uses: actions/checkout@v2
#
#       - name: NVIDIA-SMI
#         run: |
#           nvidia-smi
#
#       - name: Install dependencies
#         run: |
#           pip install --upgrade pip
#           pip install --upgrade "jax[cuda111]" -f https://storage.googleapis.com/jax-releases/jax_releases.html
#           pip install .[flax,integrations,sklearn,testing,sentencepiece,flax-speech,vision]
#
#       - name: Are GPUs recognized by our DL frameworks
#         run: |
#           python -c "from jax.lib import xla_bridge; print('GPU available:', xla_bridge.get_backend().platform)"
#           python -c "import jax; print('Number of GPUs available:', len(jax.local_devices()))"
#
#       - name: Run all tests on GPU
#         run: |
#           python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_flax_gpu tests
#
#       - name: Failure short reports
#         if: ${{ always() }}
#         run: cat reports/tests_flax_gpu_failures_short.txt
#
#       - name: Test suite reports artifacts
#         if: ${{ always() }}
#         uses: actions/upload-artifact@v2
#         with:
#           name: run_all_tests_flax_gpu_test_reports
#           path: reports
# Job: run_all_tests_torch_cuda_extensions_gpu
  # Installs the `testing` and `deepspeed` extras in an NVIDIA PyTorch image,
  # runs pytest over `tests/deepspeed` and `tests/extended`, prints the short
  # failure report and uploads the `reports` directory.
  run_all_tests_torch_cuda_extensions_gpu:
    runs-on: [self-hosted, docker-gpu, single-gpu]
    container:
      image: nvcr.io/nvidia/pytorch:21.03-py3
      # Folded scalar: the lines below are joined with single spaces.
      options: >-
        --gpus 0
        --shm-size "16gb"
        --ipc host
        -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: Launcher docker
        uses: actions/checkout@v2

      - name: NVIDIA-SMI
        run: nvidia-smi

      - name: Install dependencies
        run: |
          apt -y update && apt install -y libaio-dev
          pip install --upgrade pip
          pip install .[testing,deepspeed]

      - name: Are GPUs recognized by our DL frameworks
        run: |
          python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
          python -c "import torch; print('Cuda version:', torch.version.cuda)"
          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"

      - name: Run all tests on GPU
        run: |
          python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended

      # `always()` keeps this step running even when the test step failed.
      - name: Failure short reports
        if: always()
        run: cat reports/tests_torch_cuda_extensions_gpu_failures_short.txt

      # The artifact name starts with `run_tests_` (no `all`), unlike the job id.
      - name: Test suite reports artifacts
        if: always()
        uses: actions/upload-artifact@v2
        with:
          name: run_tests_torch_cuda_extensions_gpu_test_reports
          path: reports
# Job: run_all_tests_torch_cuda_extensions_multi_gpu
  # Multi-GPU counterpart of the CUDA-extensions job: installs the `testing`,
  # `deepspeed` and `fairscale` extras in an NVIDIA PyTorch image and runs
  # pytest over `tests/deepspeed` and `tests/extended` on a `multi-gpu` runner.
  run_all_tests_torch_cuda_extensions_multi_gpu:
    runs-on: [self-hosted, docker-gpu, multi-gpu]
    container:
      image: nvcr.io/nvidia/pytorch:21.03-py3
      # Folded scalar: the lines below are joined with single spaces.
      # `--gpus all` matches the other multi-GPU jobs in this workflow; this
      # job previously passed `--gpus 0`, the value the single-GPU jobs use.
      options: >-
        --gpus all
        --shm-size "16gb"
        --ipc host
        -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: Launcher docker
        uses: actions/checkout@v2

      # A failing `nvidia-smi` does not fail the job.
      - name: NVIDIA-SMI
        continue-on-error: true
        run: nvidia-smi

      - name: Install dependencies
        run: |
          apt -y update && apt install -y libaio-dev
          pip install --upgrade pip
          pip install .[testing,deepspeed,fairscale]

      - name: Are GPUs recognized by our DL frameworks
        run: |
          python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
          python -c "import torch; print('Cuda version:', torch.version.cuda)"
          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"

      - name: Run all tests on GPU
        run: |
          python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_cuda_extensions_multi_gpu tests/deepspeed tests/extended

      # `always()` keeps this step running even when the test step failed.
      - name: Failure short reports
        if: always()
        run: cat reports/tests_torch_cuda_extensions_multi_gpu_failures_short.txt

      # The artifact name starts with `run_tests_` (no `all`), unlike the job id.
      - name: Test suite reports artifacts
        if: always()
        uses: actions/upload-artifact@v2
        with:
          name: run_tests_torch_cuda_extensions_multi_gpu_test_reports
          path: reports
# Job: send_results
  # Runs on a GitHub-hosted runner after the listed test jobs have finished,
  # whatever their outcome, downloads the uploaded artifacts and calls
  # `utils/notification_service.py scheduled` with the Slack secrets exported.
  send_results:
    name: Send results to webhook
    runs-on: ubuntu-latest
    # Run even when one of the `needs` jobs failed.
    if: always()
    # NOTE(review): `run_all_tests_flax_gpu` is not listed here, so this job
    # does not wait for it - confirm that this is intentional.
    needs:
      - run_all_tests_torch_gpu
      - run_all_tests_tf_gpu
      - run_all_tests_torch_multi_gpu
      - run_all_tests_tf_multi_gpu
      - run_all_tests_torch_cuda_extensions_gpu
      - run_all_tests_torch_cuda_extensions_multi_gpu
    steps:
      - uses: actions/checkout@v2

      # No `name` input is given to the download action.
      - uses: actions/download-artifact@v2

      - name: Send message to Slack
        env:
          CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
          CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
          CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
        run: |
          pip install slack_sdk
          python utils/notification_service.py scheduled