Self-hosted runner (scheduled) #803

Workflow file for this run

.github/workflows/self-scheduled.yml at c429589

	# Scheduled test run on the self-hosted GPU runners. Also triggered by pushes
	# to multi_ci_* branches and by repository_dispatch events.
	name: Self-hosted runner (scheduled)

	on:
	  push:
	    branches:
	      - multi_ci_*
	  repository_dispatch:
	  schedule:
	    # Once a day at minute 0, hour 0.
	    - cron: "0 0 * * *"

	# Workflow-level environment, inherited by every step of every job.
	env:
	  HF_HOME: /mnt/cache
	  # Quoted so the value stays the string "yes" even under YAML 1.1 parsers,
	  # which would otherwise read a bare yes as a boolean.
	  TRANSFORMERS_IS_CI: "yes"
	  RUN_SLOW: "yes"
	  OMP_NUM_THREADS: 16
	  MKL_NUM_THREADS: 16
	  PYTEST_TIMEOUT: 600

	jobs:
	# PyTorch tests, examples tests and pipeline tests on a single-gpu runner.
	run_all_tests_torch_gpu:
	  runs-on: [self-hosted, docker-gpu, single-gpu]
	  container:
	    image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
	    options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
	  steps:
	    - name: Launcher docker
	      uses: actions/checkout@v2

	    - name: NVIDIA-SMI
	      run: |
	        nvidia-smi

	    - name: Install dependencies
	      run: |
	        apt -y update && apt install -y libsndfile1-dev git
	        pip install --upgrade pip
	        pip install .[integrations,sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]

	    - name: Are GPUs recognized by our DL frameworks
	      run: |
	        python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
	        python -c "import torch; print('Cuda version:', torch.version.cuda)"
	        python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
	        python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"

	    - name: Run all tests on GPU
	      run: |
	        python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_gpu tests

	    # always() keeps the report steps running after a failed test step.
	    - name: Failure short reports
	      if: ${{ always() }}
	      run: cat reports/tests_torch_gpu_failures_short.txt

	    # No step-level env here: the workflow-level env already provides
	    # OMP_NUM_THREADS, MKL_NUM_THREADS, RUN_SLOW, HF_HOME and
	    # TRANSFORMERS_IS_CI with the same values.
	    - name: Run examples tests on GPU
	      if: ${{ always() }}
	      run: |
	        pip install -r examples/pytorch/_tests_requirements.txt
	        python -m pytest -n 1 -v --dist=loadfile --make-reports=examples_torch_gpu examples

	    - name: Failure short reports
	      if: ${{ always() }}
	      run: cat reports/examples_torch_gpu_failures_short.txt

	    - name: Run all pipeline tests on GPU
	      if: ${{ always() }}
	      env:
	        RUN_PIPELINE_TESTS: "yes"
	      run: |
	        python -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_gpu tests

	    - name: Failure short reports
	      if: ${{ always() }}
	      run: cat reports/tests_torch_pipeline_gpu_failures_short.txt

	    # Uploads the whole reports directory as one artifact.
	    - name: Test suite reports artifacts
	      if: ${{ always() }}
	      uses: actions/upload-artifact@v2
	      with:
	        name: run_all_tests_torch_gpu_test_reports
	        path: reports

	# Flax/JAX tests on a single-gpu runner carrying the docker-gpu-test label.
	run_all_tests_flax_gpu:
	  runs-on: [self-hosted, docker-gpu-test, single-gpu]
	  container:
	    image: tensorflow/tensorflow:2.4.1-gpu
	    options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
	  steps:
	    - name: Launcher docker
	      uses: actions/checkout@v2

	    # A failing nvidia-smi does not fail the job.
	    - name: NVIDIA-SMI
	      continue-on-error: true
	      run: |
	        nvidia-smi

	    - name: Install dependencies
	      run: |
	        pip install --upgrade pip
	        pip install --upgrade "jax[cuda111]" -f https://storage.googleapis.com/jax-releases/jax_releases.html
	        pip install .[flax,integrations,sklearn,testing,sentencepiece,flax-speech,vision]

	    - name: Are GPUs recognized by our DL frameworks
	      run: |
	        python -c "from jax.lib import xla_bridge; print('GPU available:', xla_bridge.get_backend().platform)"
	        python -c "import jax; print('Number of GPUs available:', len(jax.local_devices()))"

	    - name: Run all tests on GPU
	      run: |
	        python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_flax_gpu tests

	    # always() keeps the report steps running after a failed test step.
	    - name: Failure short reports
	      if: ${{ always() }}
	      run: cat reports/tests_flax_gpu_failures_short.txt

	    - name: Test suite reports artifacts
	      if: ${{ always() }}
	      uses: actions/upload-artifact@v2
	      with:
	        name: run_all_tests_flax_gpu_test_reports
	        path: reports

	# TensorFlow tests and pipeline tests on a single-gpu runner.
	run_all_tests_tf_gpu:
	  runs-on: [self-hosted, docker-gpu, single-gpu]
	  container:
	    image: tensorflow/tensorflow:2.4.1-gpu
	    options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
	  steps:
	    - name: Launcher docker
	      uses: actions/checkout@v2

	    - name: NVIDIA-SMI
	      run: |
	        nvidia-smi

	    - name: Install dependencies
	      run: |
	        apt -y update && apt install -y libsndfile1-dev git
	        pip install --upgrade pip
	        pip install .[sklearn,testing,onnx,sentencepiece,tf-speech]

	    - name: Are GPUs recognized by our DL frameworks
	      run: |
	        TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
	        TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"

	    - name: Run all tests on GPU
	      env:
	        TF_NUM_INTEROP_THREADS: 1
	        TF_NUM_INTRAOP_THREADS: 16
	      run: |
	        python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_tf_gpu tests

	    # always() keeps the report steps running after a failed test step.
	    - name: Failure short reports
	      if: ${{ always() }}
	      run: cat reports/tests_tf_gpu_failures_short.txt

	    - name: Run all pipeline tests on GPU
	      if: ${{ always() }}
	      env:
	        RUN_PIPELINE_TESTS: "yes"
	        TF_NUM_INTEROP_THREADS: 1
	        TF_NUM_INTRAOP_THREADS: 16
	      run: |
	        python -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=tests_tf_pipeline_gpu tests

	    - name: Failure short reports
	      if: ${{ always() }}
	      run: cat reports/tests_tf_pipeline_gpu_failures_short.txt

	    - name: Test suite reports artifacts
	      if: ${{ always() }}
	      uses: actions/upload-artifact@v2
	      with:
	        name: run_all_tests_tf_gpu_test_reports
	        path: reports

	# PyTorch tests and pipeline tests on a multi-gpu runner (--gpus all).
	run_all_tests_torch_multi_gpu:
	  runs-on: [self-hosted, docker-gpu, multi-gpu]
	  container:
	    image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
	    options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
	  steps:
	    - name: Launcher docker
	      uses: actions/checkout@v2

	    # A failing nvidia-smi does not fail the job.
	    - name: NVIDIA-SMI
	      continue-on-error: true
	      run: |
	        nvidia-smi

	    - name: Install dependencies
	      run: |
	        apt -y update && apt install -y libsndfile1-dev git
	        pip install --upgrade pip
	        pip install .[integrations,sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]

	    - name: Are GPUs recognized by our DL frameworks
	      run: |
	        python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
	        python -c "import torch; print('Cuda version:', torch.version.cuda)"
	        python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
	        python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"

	    - name: Run all tests on GPU
	      env:
	        MKL_SERVICE_FORCE_INTEL: 1
	      run: |
	        python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_multi_gpu tests

	    # always() keeps the report steps running after a failed test step.
	    - name: Failure short reports
	      if: ${{ always() }}
	      run: cat reports/tests_torch_multi_gpu_failures_short.txt

	    - name: Run all pipeline tests on GPU
	      if: ${{ always() }}
	      env:
	        RUN_PIPELINE_TESTS: "yes"
	      run: |
	        python -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_multi_gpu tests

	    - name: Failure short reports
	      if: ${{ always() }}
	      run: cat reports/tests_torch_pipeline_multi_gpu_failures_short.txt

	    - name: Test suite reports artifacts
	      if: ${{ always() }}
	      uses: actions/upload-artifact@v2
	      with:
	        name: run_all_tests_torch_multi_gpu_test_reports
	        path: reports

	# TensorFlow tests and pipeline tests on a multi-gpu runner (--gpus all).
	run_all_tests_tf_multi_gpu:
	  runs-on: [self-hosted, docker-gpu, multi-gpu]
	  container:
	    image: tensorflow/tensorflow:2.4.1-gpu
	    options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
	  steps:
	    - name: Launcher docker
	      uses: actions/checkout@v2

	    # A failing nvidia-smi does not fail the job.
	    - name: NVIDIA-SMI
	      continue-on-error: true
	      run: |
	        nvidia-smi

	    - name: Install dependencies
	      run: |
	        apt -y update && apt install -y libsndfile1-dev git
	        pip install --upgrade pip
	        pip install .[sklearn,testing,onnx,sentencepiece,tf-speech]

	    - name: Are GPUs recognized by our DL frameworks
	      run: |
	        TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
	        TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"

	    - name: Run all tests on GPU
	      env:
	        TF_NUM_INTEROP_THREADS: 1
	        TF_NUM_INTRAOP_THREADS: 16
	      run: |
	        python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_tf_multi_gpu tests

	    # always() keeps the report steps running after a failed test step.
	    - name: Failure short reports
	      if: ${{ always() }}
	      run: cat reports/tests_tf_multi_gpu_failures_short.txt

	    - name: Run all pipeline tests on GPU
	      if: ${{ always() }}
	      env:
	        RUN_PIPELINE_TESTS: "yes"
	        TF_NUM_INTEROP_THREADS: 1
	        TF_NUM_INTRAOP_THREADS: 16
	      run: |
	        python -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=tests_tf_pipeline_multi_gpu tests

	    - name: Failure short reports
	      if: ${{ always() }}
	      run: cat reports/tests_tf_pipeline_multi_gpu_failures_short.txt

	    - name: Test suite reports artifacts
	      if: ${{ always() }}
	      uses: actions/upload-artifact@v2
	      with:
	        name: run_all_tests_tf_multi_gpu_test_reports
	        path: reports

	# Disabled job: everything below is commented out and does not run.
	# NOTE(review): as written it targets a multi-gpu runner but passes
	# `--gpus 0`, and it reuses the tests_flax_gpu report name and the
	# run_all_tests_flax_gpu_test_reports artifact name of the single-gpu flax
	# job — confirm and adjust before re-enabling.
	#
	# run_all_tests_flax_multi_gpu:
	#   runs-on: [self-hosted, docker-gpu, multi-gpu]
	#   container:
	#     image: tensorflow/tensorflow:2.4.1-gpu
	#     options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
	#   steps:
	#     - name: Launcher docker
	#       uses: actions/checkout@v2
	#
	#     - name: NVIDIA-SMI
	#       run: |
	#         nvidia-smi
	#
	#     - name: Install dependencies
	#       run: |
	#         pip install --upgrade pip
	#         pip install --upgrade "jax[cuda111]" -f https://storage.googleapis.com/jax-releases/jax_releases.html
	#         pip install .[flax,integrations,sklearn,testing,sentencepiece,flax-speech,vision]
	#
	#     - name: Are GPUs recognized by our DL frameworks
	#       run: |
	#         python -c "from jax.lib import xla_bridge; print('GPU available:', xla_bridge.get_backend().platform)"
	#         python -c "import jax; print('Number of GPUs available:', len(jax.local_devices()))"
	#
	#     - name: Run all tests on GPU
	#       run: |
	#         python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_flax_gpu tests
	#
	#     - name: Failure short reports
	#       if: ${{ always() }}
	#       run: cat reports/tests_flax_gpu_failures_short.txt
	#
	#     - name: Test suite reports artifacts
	#       if: ${{ always() }}
	#       uses: actions/upload-artifact@v2
	#       with:
	#         name: run_all_tests_flax_gpu_test_reports
	#         path: reports

	# tests/deepspeed and tests/extended on a single-gpu runner, using the
	# NVIDIA PyTorch container image.
	run_all_tests_torch_cuda_extensions_gpu:
	  runs-on: [self-hosted, docker-gpu, single-gpu]
	  container:
	    image: nvcr.io/nvidia/pytorch:21.03-py3
	    options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
	  steps:
	    - name: Launcher docker
	      uses: actions/checkout@v2

	    - name: NVIDIA-SMI
	      run: |
	        nvidia-smi

	    - name: Install dependencies
	      run: |
	        apt -y update && apt install -y libaio-dev
	        pip install --upgrade pip
	        pip install .[testing,deepspeed]

	    - name: Are GPUs recognized by our DL frameworks
	      run: |
	        python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
	        python -c "import torch; print('Cuda version:', torch.version.cuda)"
	        python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
	        python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"

	    - name: Run all tests on GPU
	      run: |
	        python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended

	    # always() keeps the report steps running after a failed test step.
	    - name: Failure short reports
	      if: ${{ always() }}
	      run: cat reports/tests_torch_cuda_extensions_gpu_failures_short.txt

	    # The artifact name starts with run_tests_ (not run_all_tests_ like the
	    # job id); kept as-is because consumers may look it up by this name.
	    - name: Test suite reports artifacts
	      if: ${{ always() }}
	      uses: actions/upload-artifact@v2
	      with:
	        name: run_tests_torch_cuda_extensions_gpu_test_reports
	        path: reports

	# tests/deepspeed and tests/extended on a multi-gpu runner, using the
	# NVIDIA PyTorch container image.
	run_all_tests_torch_cuda_extensions_multi_gpu:
	  runs-on: [self-hosted, docker-gpu, multi-gpu]
	  container:
	    image: nvcr.io/nvidia/pytorch:21.03-py3
	    # Expose every GPU to the container, as the other multi-gpu jobs in this
	    # workflow do; `--gpus 0` here did not match the multi-gpu runner.
	    options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
	  steps:
	    - name: Launcher docker
	      uses: actions/checkout@v2

	    # A failing nvidia-smi does not fail the job.
	    - name: NVIDIA-SMI
	      continue-on-error: true
	      run: |
	        nvidia-smi

	    - name: Install dependencies
	      run: |
	        apt -y update && apt install -y libaio-dev
	        pip install --upgrade pip
	        pip install .[testing,deepspeed,fairscale]

	    - name: Are GPUs recognized by our DL frameworks
	      run: |
	        python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
	        python -c "import torch; print('Cuda version:', torch.version.cuda)"
	        python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
	        python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"

	    - name: Run all tests on GPU
	      run: |
	        python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_cuda_extensions_multi_gpu tests/deepspeed tests/extended

	    # always() keeps the report steps running after a failed test step.
	    - name: Failure short reports
	      if: ${{ always() }}
	      run: cat reports/tests_torch_cuda_extensions_multi_gpu_failures_short.txt

	    # The artifact name starts with run_tests_ (not run_all_tests_ like the
	    # job id); kept as-is because consumers may look it up by this name.
	    - name: Test suite reports artifacts
	      if: ${{ always() }}
	      uses: actions/upload-artifact@v2
	      with:
	        name: run_tests_torch_cuda_extensions_multi_gpu_test_reports
	        path: reports

	# Runs utils/notification_service.py once the jobs listed under `needs`
	# have finished.
	send_results:
	  name: Send results to webhook
	  runs-on: ubuntu-latest
	  # Run even when some of the needed jobs failed.
	  if: always()
	  # NOTE(review): run_all_tests_flax_gpu is not in this list — confirm that
	  # leaving it out is intentional.
	  needs:
	    - run_all_tests_torch_gpu
	    - run_all_tests_tf_gpu
	    - run_all_tests_torch_multi_gpu
	    - run_all_tests_tf_multi_gpu
	    - run_all_tests_torch_cuda_extensions_gpu
	    - run_all_tests_torch_cuda_extensions_multi_gpu
	  steps:
	    - uses: actions/checkout@v2

	    # No `name` input is given, so no single artifact is selected.
	    - uses: actions/download-artifact@v2

	    - name: Send message to Slack
	      env:
	        CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
	        CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
	        CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
	      run: |
	        pip install slack_sdk
	        python utils/notification_service.py scheduled

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Self-hosted runner (scheduled) #803

Workflow file

Self-hosted runner (scheduled) #803

Jobs

Run details

Workflow file for this run